diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..841b043 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +cmake_minimum_required (VERSION 3.1) +project(amdovx) + +add_subdirectory(openvx) +add_subdirectory(runvx) diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt new file mode 100644 index 0000000..548cb05 --- /dev/null +++ b/COPYRIGHT.txt @@ -0,0 +1,19 @@ +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..fdfdf29 --- /dev/null +++ b/README.md @@ -0,0 +1,30 @@ +# AMD OpenVX (AMDOVX) +AMD OpenVX (beta preview) is a highly optimized open source implementation of the [Khronos OpenVX](https://www.khronos.org/registry/vx/) computer vision specification. It allows for rapid prototyping as well as fast execution on a wide range of computer hardware, including small embedded x86 CPUs and large workstation discrete GPUs. 
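+
+As a quick illustration of what this library implements, the minimal sketch below builds and runs a small graph through the standard OpenVX C API. The image size and the two kernels are arbitrary placeholders, not a prescribed workflow:
+
+```c
+#include <VX/vx.h>
+
+int main()
+{
+    /* create a context and a graph */
+    vx_context context = vxCreateContext();
+    vx_graph graph = vxCreateGraph(context);
+
+    /* input/output images plus a virtual intermediate image owned by the graph */
+    vx_image input  = vxCreateImage(context, 640, 480, VX_DF_IMAGE_RGB);
+    vx_image green  = vxCreateVirtualImage(graph, 640, 480, VX_DF_IMAGE_U8);
+    vx_image output = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
+
+    /* a two-node pipeline: extract the green channel, then apply a 3x3 Gaussian */
+    vxChannelExtractNode(graph, input, VX_CHANNEL_G, green);
+    vxGaussian3x3Node(graph, green, output);
+
+    /* the graph optimizer runs during verification; vxProcessGraph executes the graph */
+    if (vxVerifyGraph(graph) == VX_SUCCESS)
+        vxProcessGraph(graph);
+
+    vxReleaseGraph(&graph);
+    vxReleaseContext(&context);
+    return 0;
+}
+```
+
+The same kind of pipeline can also be described in a RUNVX script for rapid prototyping without recompiling (see below).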
+
+#### Features
+* The code is highly optimized for x86 CPUs and, via OpenCL, for GPUs
+* Supported hardware spans the range from low-power embedded APUs (like the new G series) to laptop, desktop and workstation graphics
+* Supports Windows and Linux
+* Includes a “graph optimizer” that looks at the entire processing pipeline and removes/replaces/merges functions to improve performance and minimize bandwidth at runtime
+* Scripting support allows rapid prototyping, without recompiling, at production performance levels
+* Interoperates with the popular open source library OpenCV
+
+The current release version is 0.9 (beta preview).
+
+Build this project to generate the AMD OpenVX library and the RUNVX executable.
+* Refer to openvx/include/vx_ext_amd.h for extensions in the AMD OpenVX library.
+* Refer to runvx/README.md for RUNVX details.
+
+## Build Instructions
+
+#### Prerequisites
+* AMD APP SDK 3.0 [download](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/).
+* OpenCV 3.0 [download](http://opencv.org/downloads.html).
+* CMake 3.1 or newer [download](http://cmake.org/download/).
+* The OpenCV_DIR environment variable should point to the OpenCV/build folder
+
+#### Build using Visual Studio Professional 2013 on 64-bit Windows 10/8.1/7
+* Use amdovx-core/amdovx.sln to build for the x64 platform
+
+#### Build using CMake on Linux (Ubuntu 15.10 64-bit)
+* Use CMake to configure the project and generate a Makefile
diff --git a/amdovx.sln b/amdovx.sln
new file mode 100644
index 0000000..63a1dc6
--- /dev/null
+++ b/amdovx.sln
@@ -0,0 +1,31 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2013
+VisualStudioVersion = 12.0.31101.0
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "openvx", "openvx\openvx.vcxproj", "{973F2004-2215-431F-8A2C-93ABAAFB6A24}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "runvx", "runvx\runvx.vcxproj", "{E14F83E9-2295-466C-9647-7BD0D03ECE4B}"
+ ProjectSection(ProjectDependencies) = postProject
+ {973F2004-2215-431F-8A2C-93ABAAFB6A24} = {973F2004-2215-431F-8A2C-93ABAAFB6A24}
+ EndProjectSection
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {973F2004-2215-431F-8A2C-93ABAAFB6A24}.Debug|x64.ActiveCfg = Debug|x64
+ {973F2004-2215-431F-8A2C-93ABAAFB6A24}.Debug|x64.Build.0 = Debug|x64
+ {973F2004-2215-431F-8A2C-93ABAAFB6A24}.Release|x64.ActiveCfg = Release|x64
+ {973F2004-2215-431F-8A2C-93ABAAFB6A24}.Release|x64.Build.0 = Release|x64
+ {E14F83E9-2295-466C-9647-7BD0D03ECE4B}.Debug|x64.ActiveCfg = Debug|x64
+ {E14F83E9-2295-466C-9647-7BD0D03ECE4B}.Debug|x64.Build.0 = Debug|x64
+ {E14F83E9-2295-466C-9647-7BD0D03ECE4B}.Release|x64.ActiveCfg = Release|x64
+ {E14F83E9-2295-466C-9647-7BD0D03ECE4B}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/openvx/CMakeLists.txt b/openvx/CMakeLists.txt
new file mode 100644
index 0000000..f2e5eb9
--- /dev/null
+++ b/openvx/CMakeLists.txt
@@ -0,0 +1,82 @@
+# Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +cmake_minimum_required (VERSION 3.1) +project (openvx) + +set (CMAKE_CXX_STANDARD 11) + +find_package(OpenCL) + +include_directories(include ago api) + +list(APPEND SOURCES + ago/ago_drama.cpp + ago/ago_drama_alloc.cpp + ago/ago_drama_analyze.cpp + ago/ago_drama_divide.cpp + ago/ago_drama_merge.cpp + ago/ago_drama_remove.cpp + ago/ago_haf_cpu.cpp + ago/ago_haf_cpu_arithmetic.cpp + ago/ago_haf_cpu_canny.cpp + ago/ago_haf_cpu_ch_extract_combine.cpp + ago/ago_haf_cpu_color_convert.cpp + ago/ago_haf_cpu_fast_corners.cpp + ago/ago_haf_cpu_filter.cpp + ago/ago_haf_cpu_geometric.cpp + ago/ago_haf_cpu_harris.cpp + ago/ago_haf_cpu_histogram.cpp + ago/ago_haf_cpu_logical.cpp + ago/ago_haf_cpu_opticalflow.cpp + ago/ago_haf_cpu_pyramid.cpp + ago/ago_haf_gpu_common.cpp + ago/ago_haf_gpu_conversion.cpp + ago/ago_haf_gpu_corners.cpp + ago/ago_haf_gpu_linear_filter.cpp + ago/ago_haf_gpu_special_filters.cpp + ago/ago_interface.cpp + ago/ago_kernel_api.cpp + ago/ago_kernel_list.cpp + ago/ago_platform.cpp + ago/ago_util.cpp + ago/ago_util_opencl.cpp + api/vxu.cpp + api/vx_api.cpp + api/vx_nodes.cpp +) + +add_library(openvx STATIC ${SOURCES}) + +if (OpenCL_FOUND) + target_compile_definitions(openvx PUBLIC ENABLE_OPENCL=1) + include_directories(${OpenCL_INCLUDE_DIRS}) + target_link_libraries(openvx ${OpenCL_LIBRARIES}) +else(OpenCL_FOUND) + target_compile_definitions(openvx PUBLIC ENABLE_OPENCL=0) +endif(OpenCL_FOUND) + +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mxop") + target_link_libraries(openvx dl) +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") +endif() diff --git a/openvx/ago/ago_drama.cpp b/openvx/ago/ago_drama.cpp new file mode 100644 index 0000000..278f2ae --- /dev/null +++ b/openvx/ago/ago_drama.cpp @@ -0,0 +1,572 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#include "ago_internal.h"
+
+int agoOptimizeDramaCheckArgs(AgoGraph * agraph)
+{
+ int astatus = 0;
+ for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next)
+ {
+ AgoKernel * akernel = anode->akernel;
+ for (vx_uint32 arg = 0; arg < AGO_MAX_PARAMS; arg++) {
+ if (!anode->paramList[arg] || (arg >= anode->paramCount))
+ {
+ if (((akernel->argConfig[arg] & AGO_KERNEL_ARG_OPTIONAL_FLAG) == 0) && ((akernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) != 0))
+ {
+ agoAddLogEntry(&akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaCheckArgs: kernel %s: missing argument#%d\n", akernel->name, arg);
+ astatus = -1;
+ }
+ }
+ else if ((akernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == 0)
+ {
+ agoAddLogEntry(&akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaCheckArgs: kernel %s: unexpected argument#%d\n", akernel->name, arg);
+ astatus = -1;
+ }
+ }
+ }
+ return astatus;
+}
+
+static bool DetectRectOverlap(vx_rectangle_t& a, vx_rectangle_t& b)
+{
+ vx_rectangle_t c;
+ c.start_x = max(a.start_x, b.start_x);
+ c.start_y = max(a.start_y, b.start_y);
+ c.end_x = min(a.end_x, b.end_x);
+ c.end_y = min(a.end_y, b.end_y);
+ return (c.start_x < c.end_x) && (c.start_y < c.end_y) ? true : false;
+}
+
+void agoOptimizeDramaGetDataUsageOfROI(AgoGraph * agraph, AgoData * roiMasterImage, vx_uint32& inputUsageCount, vx_uint32& outputUsageCount, vx_uint32& inoutUsageCount)
+{
+ std::list<vx_rectangle_t> rectList;
+ vx_uint32 outputUsageCount_ = 0;
+ for (int isVirtual = 0; isVirtual <= 1; isVirtual++) {
+ for (AgoData * data = isVirtual ?
agraph->ref.context->dataList.head : agraph->dataList.head; data; data = data->next) { + if (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI && data->u.img.roiMasterImage == roiMasterImage) { + inputUsageCount += data->inputUsageCount; + inoutUsageCount += data->inoutUsageCount; + if (data->outputUsageCount > 0) { + if (outputUsageCount == 0) { + bool detectedOverlap = false; + for (auto it = rectList.begin(); it != rectList.end(); it++) { + if (DetectRectOverlap(*it, data->u.img.rect_roi)) { + detectedOverlap = true; + break; + } + } + rectList.push_back(data->u.img.rect_roi); + if (detectedOverlap) { + outputUsageCount_ += data->outputUsageCount; + } + else { + outputUsageCount_ = max(outputUsageCount_, data->outputUsageCount); + } + } + else { + outputUsageCount_ += data->outputUsageCount; + } + } + } + } + } + outputUsageCount += outputUsageCount_; +} + +void agoOptimizeDramaMarkDataUsageOfROI(AgoGraph * agraph, AgoData * roiMasterImage, vx_uint32 inputUsageCount, vx_uint32 outputUsageCount, vx_uint32 inoutUsageCount) +{ + for (int isVirtual = 0; isVirtual <= 1; isVirtual++) { + for (AgoData * data = isVirtual ? agraph->ref.context->dataList.head : agraph->dataList.head; data; data = data->next) { + if (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI && data->u.img.roiMasterImage == roiMasterImage) { + data->inputUsageCount = inputUsageCount; + data->outputUsageCount = outputUsageCount; + data->inoutUsageCount = inoutUsageCount; + } + } + } +} + +void agoOptimizeDramaMarkDataUsage(AgoGraph * agraph) +{ + // reset the data usage in all data elements + for (int isVirtual = 0; isVirtual <= 1; isVirtual++) { + for (AgoData * data = isVirtual ? agraph->ref.context->dataList.head : agraph->dataList.head; data; data = data->next) { + data->inputUsageCount = 0; + data->outputUsageCount = 0; + data->inoutUsageCount = 0; + for (vx_uint32 i = 0; i < data->numChildren; i++) { + AgoData * idata = data->children[i]; + if (idata) { + idata->inputUsageCount = 0; + idata->outputUsageCount = 0; + idata->inoutUsageCount = 0; + for (vx_uint32 j = 0; j < idata->numChildren; j++) { + AgoData * jdata = idata->children[j]; + if (jdata) { + jdata->inputUsageCount = 0; + jdata->outputUsageCount = 0; + jdata->inoutUsageCount = 0; + for (vx_uint32 k = 0; k < jdata->numChildren; k++) { + AgoData * kdata = jdata->children[k]; + if (kdata) { + kdata->inputUsageCount = 0; + kdata->outputUsageCount = 0; + kdata->inoutUsageCount = 0; + } + } + } + } + } + } + } + } + // update the data usage by this graph + for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) { + AgoKernel * akernel = anode->akernel; + for (vx_uint32 arg = 0; arg < anode->paramCount; arg++) { + AgoData * adata = anode->paramList[arg]; + if (adata) { + // mark the usage of the data item + if ((akernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) + adata->inoutUsageCount++; + else if (akernel->argConfig[arg] & AGO_KERNEL_ARG_OUTPUT_FLAG) + adata->outputUsageCount++; + else if (akernel->argConfig[arg] & AGO_KERNEL_ARG_INPUT_FLAG) + adata->inputUsageCount++; + // get image plane input/output non-usage count to compensate propagation in the next step + if (akernel->func && adata->ref.type == VX_TYPE_IMAGE && adata->numChildren > 1) { + if ((akernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) != (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) { + anode->funcExchange[0] = arg; + for (vx_uint32 
plane = 0; plane < adata->numChildren; plane++) + anode->funcExchange[1 + plane] = 0; + if (!akernel->func(anode, ago_kernel_cmd_get_image_plane_nonusage)) { + for (vx_uint32 plane = 0; plane < adata->numChildren; plane++) { + if (adata->children[plane] && anode->funcExchange[1 + plane]) { + if (akernel->argConfig[arg] & AGO_KERNEL_ARG_OUTPUT_FLAG) + adata->children[plane]->outputUsageCount--; + else if (akernel->argConfig[arg] & AGO_KERNEL_ARG_INPUT_FLAG) + adata->children[plane]->inputUsageCount--; + } + } + } + } + } + } + } + } + // propagate usage counts from top-level to children (e.g., PYRAMID to IMAGE) + for (int isVirtual = 0; isVirtual <= 1; isVirtual++) { + for (AgoData * data = isVirtual ? agraph->ref.context->dataList.head : agraph->dataList.head; data; data = data->next) { + if (!data->parent) { + vx_uint32 min_outputUsageCount = INT_MAX; + for (vx_uint32 i = 0; i < data->numChildren; i++) { + AgoData * idata = data->children[i]; + if (idata) { + idata->outputUsageCount += data->outputUsageCount; + idata->inoutUsageCount += data->inoutUsageCount; + idata->inputUsageCount += data->inputUsageCount; + vx_uint32 imin_outputUsageCount = INT_MAX; + for (vx_uint32 j = 0; j < idata->numChildren; j++) { + AgoData * jdata = idata->children[j]; + if (jdata) { + jdata->outputUsageCount += idata->outputUsageCount; + jdata->inoutUsageCount += idata->inoutUsageCount; + jdata->inputUsageCount += idata->inputUsageCount; + vx_uint32 jmin_outputUsageCount = INT_MAX; + for (vx_uint32 k = 0; k < jdata->numChildren; k++) { + AgoData * kdata = jdata->children[k]; + if (kdata) { + kdata->outputUsageCount += jdata->outputUsageCount; + kdata->inoutUsageCount += jdata->inoutUsageCount; + kdata->inputUsageCount += jdata->inputUsageCount; + // IMPORTANT: parent check is needed to deal with image aliasing inside pyramids (result of agoReplaceDataInGraph) + if (kdata->parent == jdata && jmin_outputUsageCount > kdata->outputUsageCount) jmin_outputUsageCount = kdata->outputUsageCount; + } + } + if (!jdata->outputUsageCount && jmin_outputUsageCount != INT_MAX) jdata->outputUsageCount = jmin_outputUsageCount; + // IMPORTANT: parent check is needed to deal with image aliasing inside pyramids (result of agoReplaceDataInGraph) + if (jdata->parent == idata && imin_outputUsageCount > jdata->outputUsageCount) imin_outputUsageCount = jdata->outputUsageCount; + } + } + if (!idata->outputUsageCount && imin_outputUsageCount != INT_MAX) idata->outputUsageCount = imin_outputUsageCount; + // IMPORTANT: parent check is needed to deal with image aliasing inside pyramids (result of agoReplaceDataInGraph) + if (idata->parent == data && min_outputUsageCount > idata->outputUsageCount) min_outputUsageCount = idata->outputUsageCount; + } + } + if (!data->outputUsageCount && min_outputUsageCount != INT_MAX) data->outputUsageCount = min_outputUsageCount; + } + } + } + // add up ROI data usage + for (int isVirtual = 0; isVirtual <= 1; isVirtual++) { + for (AgoData * data = isVirtual ? 
agraph->ref.context->dataList.head : agraph->dataList.head; data; data = data->next) { + if (data->ref.type == VX_TYPE_IMAGE && !data->u.img.isROI) { + agoOptimizeDramaGetDataUsageOfROI(agraph, data, data->inputUsageCount, data->outputUsageCount, data->inoutUsageCount); + agoOptimizeDramaMarkDataUsageOfROI(agraph, data, data->inputUsageCount, data->outputUsageCount, data->inoutUsageCount); + } + } + } +} + +static int agoSetDataHierarchicalLevel(AgoData * data, vx_uint32 hierarchical_level) +{ + data->hierarchical_level = hierarchical_level; + if(!hierarchical_level) { + data->hierarchical_life_start = data->hierarchical_life_end = 0; + } +#if SHOW_DEBUG_HIERARCHICAL_LEVELS + if (data->hierarchical_level) { + char name[1024]; + agoGetDataName(name, data); + printf("DEBUG: HIERARCHICAL DATA %3d %s\n", data->hierarchical_level, name); + } +#endif + // propagate hierarchical_level to all of its children (if available) + for (vx_uint32 child = 0; child < data->numChildren; child++) { + if (data->children[child]) { + agoSetDataHierarchicalLevel(data->children[child], hierarchical_level); + } + } + // propagate hierarchical_level to image-ROI master (if available) + if (data->ref.type == VX_TYPE_IMAGE) { + if (data->u.img.isROI) { + if (data->u.img.roiMasterImage && !data->u.img.roiMasterImage->hierarchical_level) { + agoSetDataHierarchicalLevel(data->u.img.roiMasterImage, hierarchical_level); + } + } + else if (hierarchical_level) { + for (AgoData * pdata = data->isVirtual ? ((AgoGraph *)data->ref.scope)->dataList.head : data->ref.context->dataList.head; pdata; pdata = pdata->next) { + if (pdata->ref.type == VX_TYPE_IMAGE && pdata->u.img.isROI && pdata->u.img.roiMasterImage == data && !pdata->hierarchical_level) { + agoSetDataHierarchicalLevel(pdata, hierarchical_level); + } + } + } + } + // propagate hierarchical_level to parent (if possible) + if (hierarchical_level) { + if (data->parent) { + vx_uint32 hierarchical_level_sibling_min = INT_MAX, hierarchical_level_sibling_max = 0; + for (vx_uint32 child = 0; child < data->parent->numChildren; child++) { + if (data->parent->children[child]) { + vx_uint32 hierarchical_level_sibling = data->parent->children[child]->hierarchical_level; + if (hierarchical_level_sibling_min > hierarchical_level_sibling) + hierarchical_level_sibling_min = hierarchical_level_sibling; + if (hierarchical_level_sibling_max < hierarchical_level_sibling) + hierarchical_level_sibling_max = hierarchical_level_sibling; + } + } + // make sure that all siblings has hierarchical_level the parent hierarchical_level is max of all siblings + if (hierarchical_level_sibling_min > 0 && hierarchical_level_sibling_max > 0) + data->parent->hierarchical_level = hierarchical_level_sibling_max; + } + } + return 0; +} + +int agoOptimizeDramaComputeGraphHierarchy(AgoGraph * graph) +{ +#if SHOW_DEBUG_HIERARCHICAL_LEVELS + printf("DEBUG: HIERARCHICAL **** *** **************************************\n"); +#endif + + agoOptimizeDramaMarkDataUsage(graph); + + //////////////////////////////////////////////// + // make sure that there is only one writer and + // make sure that virtual buffers always have a writer + //////////////////////////////////////////////// + for (AgoNode * node = graph->nodeList.head; node; node = node->next) + { + node->hierarchical_level = 0; + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + AgoData * data = node->paramList[arg]; + if (data) { +#if SHOW_DEBUG_HIERARCHICAL_LEVELS + char name[1024]; + agoGetDataName(name, data); + printf("DEBUG: DATA USAGE #%d [ %d 
%d %d ] %s %s\n", arg, data->inputUsageCount, data->outputUsageCount, data->inoutUsageCount, node->akernel->name, name); +#endif + if (data->outputUsageCount > 1) { + vx_status status = VX_ERROR_MULTIPLE_WRITERS; + agoAddLogEntry(&graph->ref, status, "ERROR: vxVerifyGraph: kernel %s: multiple writers for argument#%d (%s)\n", node->akernel->name, arg, data->name); + return status; + } + else if (data->isVirtual && data->outputUsageCount == 0 && !data->isInitialized) { + vx_status status = VX_ERROR_MULTIPLE_WRITERS; + agoAddLogEntry(&graph->ref, status, "ERROR: vxVerifyGraph: kernel %s: no writer/initializer for virtual buffer at argument#%d (%s)\n", node->akernel->name, arg, data->name); + return status; + } + } + } + } + + //////////////////////////////////////////////// + // reset hierarchical_level = 0 for all data + //////////////////////////////////////////////// + for (int isVirtual = 0; isVirtual <= 1; isVirtual++) { + for (AgoData * data = isVirtual ? graph->ref.context->dataList.head : graph->dataList.head; data; data = data->next) { + agoSetDataHierarchicalLevel(data, 0); + } + } + + //////////////////////////////////////////////// + // identify object for nodes with hierarchical_level = 1 (head nodes) + // (i.e., nodes that only take objects not updated by this graph) + //////////////////////////////////////////////// + for (AgoNode * node = graph->nodeList.head; node; node = node->next) + { + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + AgoData * data = node->paramList[arg]; + if (data) { + if (data->parent && data->parent->ref.type != VX_TYPE_DELAY) + data = data->parent; + vx_uint32 inputUsageCount = data->inputUsageCount; + vx_uint32 inoutUsageCount = data->inoutUsageCount; + vx_uint32 outputUsageCount = data->outputUsageCount; + for (vx_uint32 i = 0; i < data->numChildren; i++) { + AgoData * idata = data->children[i]; + if (idata) { + if (outputUsageCount < idata->outputUsageCount) { + inputUsageCount = idata->inputUsageCount; + inoutUsageCount = idata->inoutUsageCount; + outputUsageCount = idata->outputUsageCount; + } + for (vx_uint32 j = 0; j < idata->numChildren; j++) { + AgoData * jdata = idata->children[j]; + if (jdata) { + if (outputUsageCount < jdata->outputUsageCount) { + inputUsageCount = jdata->inputUsageCount; + inoutUsageCount = jdata->inoutUsageCount; + outputUsageCount = jdata->outputUsageCount; + } + for (vx_uint32 k = 0; k < jdata->numChildren; k++) { + AgoData * kdata = jdata->children[k]; + if (kdata) { + if (outputUsageCount < kdata->outputUsageCount) { + inputUsageCount = kdata->inputUsageCount; + inoutUsageCount = kdata->inoutUsageCount; + outputUsageCount = kdata->outputUsageCount; + } + } + } + } + } + } + } +#if 0 // TBD: disabled temporarily as a quick workaround for Birdirectional buffer issue + if (inoutUsageCount > 0 && inputUsageCount > 0) { + // can't support a data as input an input parameter as well as bidirectional parameter in a single graph + printf("ERROR: agoVerifyGraph: detected a buffer used a input parameter as well as bidirectional parameter -- not supported\n"); + return -1; + } + else +#endif + if (outputUsageCount == 0) { + // mark that this data object can be input to nodes with hierarchical_level = 1 + agoSetDataHierarchicalLevel(data, 1); + } + } + } + } + + //////////////////////////////////////////////// + // identify nodes for hierarchical_level = 1 (head nodes) + // (i.e., nodes with hierarchical_level = 1 for all of its inputs) + //////////////////////////////////////////////// + vx_uint32 num_nodes_marked = 
0; + for (AgoNode * node = graph->nodeList.head; node; node = node->next) + { + AgoKernel * kernel = node->akernel; + // a node is a head node if all its inputs have hierarchical_level == 1 + bool is_head_node = true; + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + AgoData * data = node->paramList[arg]; + if (data && (kernel->argConfig[arg] & AGO_KERNEL_ARG_INPUT_FLAG) && !(data->hierarchical_level == 1)) + is_head_node = false; + } + if (is_head_node) { + // mark that node is a head node + node->hierarchical_level = 1; + num_nodes_marked++; +#if SHOW_DEBUG_HIERARCHICAL_LEVELS + printf("DEBUG: HIERARCHICAL NODE %3d %s\n", node->hierarchical_level, node->akernel->name); +#endif + // set the hierarchical_level of outputs to 2 + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + AgoData * data = node->paramList[arg]; + if (data && (kernel->argConfig[arg] & AGO_KERNEL_ARG_OUTPUT_FLAG)) + agoSetDataHierarchicalLevel(data, node->hierarchical_level + 1); + } + } + } + + //////////////////////////////////////////////// + // calculate hierarchical_level for rest of the nodes + //////////////////////////////////////////////// + for (;;) + { + bool found_change = false; + for (AgoNode * node = graph->nodeList.head; node; node = node->next) + { + if (node->hierarchical_level == 0) { + // find min and max hierarchical_level of inputs + AgoKernel * kernel = node->akernel; + vx_uint32 hierarchical_level_min = INT_MAX, hierarchical_level_max = 0; + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + AgoData * data = node->paramList[arg]; + if (data && (kernel->argConfig[arg] & AGO_KERNEL_ARG_INPUT_FLAG)) { + vx_uint32 hierarchical_level = data->hierarchical_level; + if (hierarchical_level_min > hierarchical_level) + hierarchical_level_min = hierarchical_level; + if (hierarchical_level_max < hierarchical_level) + hierarchical_level_max = hierarchical_level; + } + } + // check if all inputs have hierarchical_level set + if (hierarchical_level_min > 0) { + found_change = true; + // mark that node is at highest hierarchical_level of all its inputs + node->hierarchical_level = hierarchical_level_max; + num_nodes_marked++; +#if SHOW_DEBUG_HIERARCHICAL_LEVELS + printf("DEBUG: HIERARCHICAL NODE %3d %s\n", node->hierarchical_level, node->akernel->name); +#endif + // set the hierarchical_level of outputs to (node->hierarchical_level + 1) + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + AgoData * data = node->paramList[arg]; + if (data && (kernel->argConfig[arg] & AGO_KERNEL_ARG_OUTPUT_FLAG)) + agoSetDataHierarchicalLevel(data, node->hierarchical_level + 1); + } + } + } + } + if (!found_change) + break; + } + if (num_nodes_marked != graph->nodeList.count) { + vx_status status = VX_ERROR_INVALID_GRAPH; + vxAddLogEntry(&graph->ref, status, "ERROR: vxVerifyGraph: invalid graph: possible cycles? 
[%d|%d]\n", num_nodes_marked, graph->nodeList.count); + return status; + } + return VX_SUCCESS; +} + +void agoOptimizeDramaSortGraphHierarchy(AgoGraph * graph) +{ + if (graph->nodeList.count > 1) { + for (;;) { + bool swapped = false; + AgoNode * prev_node = graph->nodeList.head; + AgoNode * node = prev_node->next; + // check for the order of hierarchical_level + if (node->hierarchical_level < prev_node->hierarchical_level) { + // swap prev_node and node in the list + prev_node->next = node->next; + node->next = prev_node; + prev_node = node; + node = prev_node->next; + graph->nodeList.head = prev_node; + swapped = true; + } + for (; node->next; prev_node = prev_node->next, node = node->next) { + AgoNode * next_node = node->next; + // check for the order of hierarchical_level + if (next_node->hierarchical_level < node->hierarchical_level) { + // swap node and next_node in the list + node->next = next_node->next; + next_node->next = node; + prev_node->next = next_node; + node = next_node; + swapped = true; + } + } + graph->nodeList.tail = node; + if (!swapped) + break; + } + } +} + +int agoOptimizeDrama(AgoGraph * agraph) +{ + // get optimization level requested by user + +#if ENABLE_DEBUG_MESSAGES + agoWriteGraph(agraph, NULL, 0, stdout, "input-to-drama"); +#endif + // perform divide + if (agoOptimizeDramaCheckArgs(agraph)) + return -1; + if (!(agraph->optimizer_flags & AGO_GRAPH_OPTIMIZER_FLAG_NO_DIVIDE)) { + if(agoOptimizeDramaDivide(agraph)) + return -1; + } +#if ENABLE_DEBUG_MESSAGES + agoWriteGraph(agraph, NULL, 0, stdout, "after-divide"); +#endif + if (agoOptimizeDramaComputeGraphHierarchy(agraph)) + return -1; + agoOptimizeDramaSortGraphHierarchy(agraph); + + // perform remove + if (agoOptimizeDramaCheckArgs(agraph)) + return -1; + if (agoOptimizeDramaRemove(agraph)) + return -1; +#if ENABLE_DEBUG_MESSAGES + agoWriteGraph(agraph, NULL, 0, stdout, "after-remove"); +#endif + if (agoOptimizeDramaComputeGraphHierarchy(agraph)) + return -1; + agoOptimizeDramaSortGraphHierarchy(agraph); + + // perform analyze + if (agoOptimizeDramaCheckArgs(agraph)) + return -1; + if (agoOptimizeDramaAnalyze(agraph)) + return -1; +#if ENABLE_DEBUG_MESSAGES + agoWriteGraph(agraph, NULL, 0, stdout, "after-analyze"); +#endif + + // perform merge + if (agoOptimizeDramaCheckArgs(agraph)) + return -1; + if (agoOptimizeDramaMerge(agraph)) + return -1; +#if ENABLE_DEBUG_MESSAGES + agoWriteGraph(agraph, NULL, 0, stdout, "after-merge"); +#endif + + // perform alloc + if (agoOptimizeDramaCheckArgs(agraph)) + return -1; + if (agoOptimizeDramaAlloc(agraph)) + return -1; +#if ENABLE_DEBUG_MESSAGES + agoWriteGraph(agraph, NULL, 0, stdout, "after-alloc"); +#endif + + return 0; +} diff --git a/openvx/ago/ago_drama_alloc.cpp b/openvx/ago/ago_drama_alloc.cpp new file mode 100644 index 0000000..3e3b430 --- /dev/null +++ b/openvx/ago/ago_drama_alloc.cpp @@ -0,0 +1,570 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +static int agoOptimizeDramaAllocRemoveUnusedData(AgoGraph * agraph) +{ + for (;;) + { + bool doRepeat = false; + + // check and mark data usage + agoOptimizeDramaMarkDataUsage(agraph); + + // find and remove virtual nodes that are not used + for (AgoData * adata = agraph->dataList.head; adata;) { + bool relatedToDelayElement = false; + if (adata->ref.type == VX_TYPE_DELAY) { + // object can't be removed since it is a delay element + relatedToDelayElement = true; + } + else { + // object can't be removed since it is part of a delay element + relatedToDelayElement = agoIsPartOfDelay(adata); + } + if (!relatedToDelayElement && adata->isVirtual && (adata->outputUsageCount == 0) && (adata->inputUsageCount == 0) && (adata->inoutUsageCount == 0)) { + AgoData * next = adata->next; + agoRemoveDataInGraph(agraph, adata); + adata = next; + doRepeat = true; // to repeat the removal process again + continue; + } + adata = adata->next; + } + if (doRepeat) + continue; + + break; + } + return 0; +} + +#if ENABLE_OPENCL +static int agoOptimizeDramaAllocGpuResources(AgoGraph * graph) +{ + // check to make sure that GPU resources are needed + bool gpuNeeded = false; + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU) { + gpuNeeded = true; + break; + } + } + if (gpuNeeded) { + // make sure to allocate context and command queue + if (!graph->opencl_cmdq) { + // make sure that the context has been created + vx_context context = graph->ref.context; + if (!context->opencl_context) { + if (agoGpuOclCreateContext(context, nullptr) < 0) { + return -1; + } + } + // create command queue: for now use device#0 -- TBD: this needs to be changed in future + cl_int err = -1; + graph->opencl_device = context->opencl_device_list[0]; + graph->opencl_cmdq = clCreateCommandQueueWithProperties(context->opencl_context, graph->opencl_device, NULL, &err); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clCreateCommandQueueWithProperties(%p,%p,0,*) => %d\n", context->opencl_context, graph->opencl_device, err); + return -1; + } + } + } + + // identify GPU groups and make sure that they all have same affinity + std::map groupMap; + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (node->attr_affinity.group > 0) { + if (groupMap.find(node->attr_affinity.group) == groupMap.end()) { + groupMap.insert(std::pair(node->attr_affinity.group, node->attr_affinity)); + } + if (memcmp(&groupMap[node->attr_affinity.group], &node->attr_affinity, sizeof(node->attr_affinity)) != 0) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: agoOptimizeDramaAllocGpuResources: mismatched affinity in nodes of group#%d\n", node->attr_affinity.group); + return -1; + } + } + else if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU) { + node->opencl_build_options = node->ref.context->opencl_build_options; + if (node->akernel->func) { + // generate kernel function code + int status = node->akernel->func(node, 
ago_kernel_cmd_opencl_codegen); + if (status == VX_SUCCESS) { + if (node->opencl_type & NODE_OPENCL_TYPE_FULL_KERNEL) { + strcpy(node->opencl_name, NODE_OPENCL_KERNEL_NAME); + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaAllocGpuResources: doesn't support kernel %s as a standalone OpenCL kernel\n", node->akernel->name); + return -1; + } + } + else if (status != AGO_ERROR_KERNEL_NOT_IMPLEMENTED) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaAllocGpuResources: kernel %s failed to generate OpenCL code (error %d)\n", node->akernel->name, status); + return -1; + } + } + else if (node->akernel->opencl_codegen_callback_f) { + // generate kernel function + node->opencl_name[0] = 0; + node->opencl_work_dim = 0; + node->opencl_global_work[0] = 0; + node->opencl_global_work[1] = 0; + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = 0; + node->opencl_local_work[1] = 0; + node->opencl_local_work[2] = 0; + node->opencl_param_mem2reg_mask = 0; + node->opencl_param_discard_mask = 0; + node->opencl_param_atomic_mask = 0; + node->opencl_compute_work_multiplier = 0; + node->opencl_compute_work_param_index = 0; + node->opencl_output_array_param_index_plus1 = 0; + node->opencl_local_buffer_usage_mask = 0; + node->opencl_local_buffer_size_in_bytes = 0; + node->opencl_code = ""; + int status = node->akernel->opencl_codegen_callback_f(node, false, node->opencl_name, node->opencl_code, node->opencl_build_options, node->opencl_work_dim, node->opencl_global_work, + node->opencl_local_work, node->opencl_local_buffer_usage_mask, node->opencl_local_buffer_size_in_bytes); + if (status == VX_SUCCESS) { + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + if (agoGpuOclSingleNodeFinalize(graph, node) < 0) { + return -1; + } + } + else if (status != AGO_ERROR_KERNEL_NOT_IMPLEMENTED) { + agoAddLogEntry(&node->akernel->ref, status, "ERROR: agoOptimizeDramaAllocGpuResources: kernel %s failed to generate OpenCL code (error %d)\n", node->akernel->name, status); + return -1; + } + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaAllocGpuResources: doesn't support kernel %s on GPU\n", node->akernel->name); + return -1; + } + } + } + // create a supernode for each group + for (auto itgroup = groupMap.begin(); itgroup != groupMap.end(); itgroup++) { + AgoSuperNode * supernode = NULL; + // add individual nodes into supernode + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU && node->attr_affinity.group == itgroup->first) { + // make sure supernode is created for GPU + if (!supernode) { + supernode = new AgoSuperNode; if (!supernode) return -1; + supernode->group = itgroup->first; + } + // link supernode into node + node->supernode = supernode; + // make sure that the GPU buffer resources are allocated in node + if (agoGpuOclAllocBuffers(graph, node) < 0) { + return -1; + } + // initialize supernode with OpenCL information + supernode->isGpuOclSuperNode = true; + supernode->opencl_cmdq = graph->opencl_cmdq; + // add node functionality into supernode + if (agoGpuOclSuperNodeMerge(graph, supernode, node) < 0) { + return -1; + } + } + } + if (supernode) { + // finalize + if (agoGpuOclSuperNodeFinalize(graph, supernode) < 0) { + return -1; + } + // add supernode to the master list + supernode->next = graph->supernodeList; + graph->supernodeList = supernode; + } + } + // allocate and finalize single nodes with GPU + for (AgoNode * 
node = graph->nodeList.head; node; node = node->next) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU && node->attr_affinity.group == 0) { + // make sure that the GPU buffer resources are allocated in node + if (agoGpuOclAllocBuffers(graph, node) < 0) { + return -1; + } + if (agoGpuOclSingleNodeFinalize(graph, node) < 0) { + return -1; + } + } + } + + return 0; +} +#endif + +static int agoOptimizeDramaAllocSetDefaultTargets(AgoGraph * agraph) +{ + // get unused GPU group ID + vx_uint32 nextAvailGroupId = 1; + for (AgoNode * node = agraph->nodeList.head; node; node = node->next) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU) { + if (node->attr_affinity.group >= nextAvailGroupId) { + nextAvailGroupId = node->attr_affinity.group + 1; + } + } + } + + // get default target + vx_uint32 default_target = AGO_KERNEL_TARGET_DEFAULT; + char textBuffer[1024]; + if (agoGetEnvironmentVariable("AGO_DEFAULT_TARGET", textBuffer, sizeof(textBuffer))) { + if (!strcmp(textBuffer, "GPU")) { + default_target = AGO_KERNEL_FLAG_DEVICE_GPU; + } + else if (!strcmp(textBuffer, "CPU")) { + default_target = AGO_KERNEL_FLAG_DEVICE_CPU; + } + } + + for (AgoNode * node = agraph->nodeList.head; node; node = node->next) { + // get target support info + node->target_support_flags = 0; + if (node->akernel->func) { + node->akernel->func(node, ago_kernel_cmd_query_target_support); + } + else if (node->akernel->query_target_support_f) { + vx_uint32 supported_target_affinity = 0; +#if ENABLE_OPENCL + vx_bool use_opencl_1_2 = (agraph->ref.context->opencl_config_flags & CONFIG_OPENCL_USE_1_2) ? vx_true_e : vx_false_e; + vx_status status = node->akernel->query_target_support_f(agraph, node, use_opencl_1_2, supported_target_affinity); + if (status) { + agoAddLogEntry(&node->akernel->ref, status, "ERROR: kernel %s: query_target_support_f(*,*,%d,*) => %d\n", node->akernel->name, use_opencl_1_2, status); + return -1; + } +#else + vx_status status = node->akernel->query_target_support_f(agraph, node, vx_false_e, supported_target_affinity); + if (status) { + printf("ERROR: kernel %s: query_target_support_f(*,*,%d,*) => %d\n", node->akernel->name, vx_false_e, status); + return -1; + } + supported_target_affinity &= ~AGO_KERNEL_FLAG_DEVICE_GPU; +#endif + node->target_support_flags = 0; + if (supported_target_affinity & AGO_KERNEL_FLAG_DEVICE_CPU) { + // mark that CPU target affinity is supported + node->target_support_flags |= AGO_KERNEL_FLAG_DEVICE_CPU; + supported_target_affinity &= ~AGO_KERNEL_FLAG_DEVICE_CPU; + } + if (supported_target_affinity & AGO_KERNEL_FLAG_DEVICE_GPU) { + // mark that GPU target affinity is supported with full kernels + node->target_support_flags |= AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL; + supported_target_affinity &= ~AGO_KERNEL_FLAG_DEVICE_GPU; + } + if (supported_target_affinity) { + agoAddLogEntry(&node->akernel->ref, status, "ERROR: kernel %s: query_target_support_f returned unsupported affinity flags: 0x%08x\n", node->akernel->name, supported_target_affinity); + return -1; + } + } + else { + // default: only CPU is supported + node->target_support_flags = AGO_KERNEL_FLAG_DEVICE_CPU; + } + + // check to make sure that kernel supports CPU and/or GPU + if (!(node->target_support_flags & (AGO_KERNEL_FLAG_DEVICE_CPU | AGO_KERNEL_FLAG_DEVICE_GPU))) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: kernel %s not supported yet\n", node->akernel->name); + return -1; + } + + // set default targets + if (node->attr_affinity.device_type == 
AGO_KERNEL_FLAG_DEVICE_CPU) {
+ if (node->target_support_flags & AGO_KERNEL_FLAG_DEVICE_CPU) {
+ // reset group
+ node->attr_affinity.device_info = 0;
+ node->attr_affinity.group = 0;
+ }
+ else {
+ // fall back to GPU
+ node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_GPU;
+ node->attr_affinity.device_info = 0;
+ node->attr_affinity.group = 0;
+ if (node->target_support_flags & (AGO_KERNEL_FLAG_GPU_INTEG_R2R | AGO_KERNEL_FLAG_GPU_INTEG_M2R)) {
+ // use an unused group Id
+ node->attr_affinity.group = nextAvailGroupId++;
+ }
+ //TBD: these messages may be useful when requested by user
+ //printf("WARNING: kernel %s not supported on CPU -- falling back to GPU\n", node->akernel->name);
+ }
+ }
+ else if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU) {
+ if (node->target_support_flags & AGO_KERNEL_FLAG_DEVICE_GPU) {
+ if (node->target_support_flags & AGO_KERNEL_FLAG_GPU_INTEG_FULL) {
+ if (node->attr_affinity.group != 0) {
+ agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: kernel %s can't be grouped with other kernels on GPU\n", node->akernel->name);
+ return -1;
+ }
+ }
+ else if (node->target_support_flags & (AGO_KERNEL_FLAG_GPU_INTEG_R2R | AGO_KERNEL_FLAG_GPU_INTEG_M2R)) {
+ if (node->attr_affinity.group == 0) {
+ // use an unused group Id
+ node->attr_affinity.group = nextAvailGroupId++;
+ }
+ }
+ // set default target as GPU
+ node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_GPU;
+ node->attr_affinity.device_info = 0;
+ }
+ else {
+ // fall back to CPU
+ //TBD: these messages may be useful when requested by user
+ //printf("WARNING: kernel %s not supported on GPU -- falling back to CPU\n", node->akernel->name);
+ // set default target as CPU
+ node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_CPU;
+ node->attr_affinity.device_info = 0;
+ node->attr_affinity.group = 0;
+ }
+ }
+ else {
+ if (default_target == AGO_KERNEL_FLAG_DEVICE_GPU) {
+ // choose GPU as default if supported
+ if (node->target_support_flags & AGO_KERNEL_FLAG_DEVICE_GPU) {
+ // set default target as GPU
+ node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_GPU;
+ node->attr_affinity.device_info = 0;
+ node->attr_affinity.group = 0;
+ if (node->target_support_flags & (AGO_KERNEL_FLAG_GPU_INTEG_R2R | AGO_KERNEL_FLAG_GPU_INTEG_M2R)) {
+ // use an unused group Id
+ node->attr_affinity.group = nextAvailGroupId++;
+ }
+ }
+ else {
+ // set default target as CPU
+ node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_CPU;
+ node->attr_affinity.device_info = 0;
+ node->attr_affinity.group = 0;
+ }
+ }
+ else {
+ // choose CPU as default if supported
+ if (node->target_support_flags & AGO_KERNEL_FLAG_DEVICE_CPU) {
+ // set default target as CPU
+ node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_CPU;
+ node->attr_affinity.device_info = 0;
+ node->attr_affinity.group = 0;
+ }
+ else {
+ // set default target as GPU
+ node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_GPU;
+ node->attr_affinity.device_info = 0;
+ node->attr_affinity.group = 0;
+ if (node->target_support_flags & (AGO_KERNEL_FLAG_GPU_INTEG_R2R | AGO_KERNEL_FLAG_GPU_INTEG_M2R)) {
+ // use an unused group Id
+ node->attr_affinity.group = nextAvailGroupId++;
+ }
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+#if ENABLE_OPENCL
+static int agoOptimizeDramaAllocMergeSuperNodes(AgoGraph * graph)
+{
+ // initialize groupInfo list with SuperNodeInfo
+ class SuperNodeInfo {
+ public:
+ vx_uint32 integ_flags;
+ vx_uint32 min_hierarchical_level;
+ vx_uint32 max_hierarchical_level;
+ std::list<AgoNode *> nodeList;
+ std::list<AgoData *> inputList;
+ std::list<AgoData *> outputList;
+ };
+ std::map<vx_uint32, SuperNodeInfo *> groupInfo;
+ for (auto node = graph->nodeList.head; node; node = node->next) {
+ vx_uint32 group = node->attr_affinity.group;
+ if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU && group > 0) {
+ auto it = groupInfo.find(group);
+ // create/get superNodeInfo for the current group
+ SuperNodeInfo * superNodeInfo = nullptr;
+ if (it == groupInfo.end()) {
+ superNodeInfo = new SuperNodeInfo;
+ superNodeInfo->integ_flags = 0;
+ superNodeInfo->min_hierarchical_level = INT_MAX;
+ superNodeInfo->max_hierarchical_level = 0;
+ groupInfo[group] = superNodeInfo;
+ }
+ else {
+ superNodeInfo = groupInfo[group];
+ }
+ // update superNodeInfo
+ superNodeInfo->integ_flags |= node->target_support_flags & AGO_KERNEL_FLAG_GPU_INTEG_MASK;
+ if (node->hierarchical_level < superNodeInfo->min_hierarchical_level) superNodeInfo->min_hierarchical_level = node->hierarchical_level;
+ if (node->hierarchical_level > superNodeInfo->max_hierarchical_level) superNodeInfo->max_hierarchical_level = node->hierarchical_level;
+ superNodeInfo->nodeList.push_back(node);
+ for (vx_uint32 i = 0; i < node->paramCount; i++) {
+ auto data = node->paramList[i];
+ if (data) {
+ auto it = std::find(superNodeInfo->inputList.begin(), superNodeInfo->inputList.end(), data);
+ if (it == superNodeInfo->inputList.end() && (node->parameters[i].direction == VX_INPUT || node->parameters[i].direction == VX_BIDIRECTIONAL))
+ superNodeInfo->inputList.push_back(data);
+ it = std::find(superNodeInfo->outputList.begin(), superNodeInfo->outputList.end(), data);
+ if (it == superNodeInfo->outputList.end() && (node->parameters[i].direction == VX_OUTPUT || node->parameters[i].direction == VX_BIDIRECTIONAL))
+ superNodeInfo->outputList.push_back(data);
+ }
+ }
+ }
+ }
+ // perform one hierarchical level at a time
+ for (auto enode = graph->nodeList.head; enode;) {
+ // get snode..enode with next hierarchical_level
+ auto hierarchical_level = enode->hierarchical_level;
+ auto snode = enode; enode = enode->next;
+ while (enode && enode->hierarchical_level == hierarchical_level)
+ enode = enode->next;
+ // try to merge with supernodes from previous hierarchical levels
+ for (auto cnode = snode; cnode != enode; cnode = cnode->next) {
+ if (cnode->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU && cnode->attr_affinity.group > 0) {
+ SuperNodeInfo * csuperNodeInfo = groupInfo[cnode->attr_affinity.group];
+ for (auto pnode = graph->nodeList.head; pnode != cnode; pnode = pnode->next) {
+ if (pnode->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU && pnode->attr_affinity.group > 0 && pnode->attr_affinity.group != cnode->attr_affinity.group) {
+ SuperNodeInfo * psuperNodeInfo = groupInfo[pnode->attr_affinity.group];
+ // check whether csuperNodeInfo can be merged with psuperNodeInfo
+ auto conflictDetected = false;
+ if (cnode->target_support_flags & pnode->target_support_flags & AGO_KERNEL_FLAG_GPU_INTEG_M2R) {
+ // only one M2R allowed per supernode at this time
+ conflictDetected = true;
+ }
+ else if (pnode->paramList[0]->u.img.width != cnode->paramList[0]->u.img.width || pnode->paramList[0]->u.img.height != cnode->paramList[0]->u.img.height) {
+ // all destination images shall have same dimensions
+ conflictDetected = true;
+ }
+ else {
+ for (auto cit = csuperNodeInfo->inputList.begin(); cit != csuperNodeInfo->inputList.end(); cit++) {
+ auto pit = std::find(psuperNodeInfo->outputList.begin(), psuperNodeInfo->outputList.end(), *cit);
+ if (pit == psuperNodeInfo->outputList.end()) {
+ if
((*cit)->hierarchical_level > psuperNodeInfo->min_hierarchical_level) { + conflictDetected = true; + break; + } + } + else if (cnode->target_support_flags & AGO_KERNEL_FLAG_GPU_INTEG_M2R) { + // can't gather from output from the same supernode + conflictDetected = true; + break; + } + } + } + if (!conflictDetected) { + auto cgroup = cnode->attr_affinity.group; + psuperNodeInfo->integ_flags |= csuperNodeInfo->integ_flags; + psuperNodeInfo->min_hierarchical_level = min(psuperNodeInfo->min_hierarchical_level, csuperNodeInfo->min_hierarchical_level); + psuperNodeInfo->max_hierarchical_level = max(psuperNodeInfo->max_hierarchical_level, csuperNodeInfo->max_hierarchical_level); + for (auto it = csuperNodeInfo->nodeList.begin(); it != csuperNodeInfo->nodeList.end(); it++) { + (*it)->attr_affinity.group = pnode->attr_affinity.group; + psuperNodeInfo->nodeList.push_back(*it); + } + for (auto it = csuperNodeInfo->inputList.begin(); it != csuperNodeInfo->inputList.end(); it++) + psuperNodeInfo->inputList.push_back(*it); + for (auto it = csuperNodeInfo->outputList.begin(); it != csuperNodeInfo->outputList.end(); it++) + psuperNodeInfo->outputList.push_back(*it); + groupInfo.erase(cgroup); + delete csuperNodeInfo; + csuperNodeInfo = nullptr; + break; + } + } + } + } + } + } + // release + for (auto it = groupInfo.begin(); it != groupInfo.end(); it++) { + delete it->second; + } +#if _DEBUG + // count number of CPU & GPU scheduled nodes + int nodeCpuCount = 0, nodeGpuCount = 0; + for (auto node = graph->nodeList.head; node; node = node->next) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU) + nodeGpuCount++; + else + nodeCpuCount++; + } + agoAddLogEntry(NULL, VX_SUCCESS, "OK: OpenVX scheduling %d nodes on CPU and %d nodes on GPU\n", nodeCpuCount, nodeGpuCount); +#endif + return 0; +} +#endif + +int agoOptimizeDramaAlloc(AgoGraph * agraph) +{ + // return success if there is nothing to do + if (!agraph->nodeList.head) + return 0; + + // make sure all buffers are properly checked and updated + for (AgoData * adata = agraph->dataList.head; adata; adata = adata->next) { + if (!adata->buffer && agoDataSanityCheckAndUpdate(adata)) { + return -1; + } + } + for (AgoData * adata = agraph->ref.context->dataList.head; adata; adata = adata->next) { + if (!adata->buffer && agoDataSanityCheckAndUpdate(adata)) { + return -1; + } + } + + // set default target assignments + if (agoOptimizeDramaAllocSetDefaultTargets(agraph) < 0) { + return -1; + } + +#if ENABLE_OPENCL + if (!(agraph->optimizer_flags & AGO_GRAPH_OPTIMIZER_FLAG_NO_SUPERNODE_MERGE)) { + // merge super nodes + if (agoOptimizeDramaAllocMergeSuperNodes(agraph) < 0) { + return -1; + } + } + + // allocate GPU resources + if (agoOptimizeDramaAllocGpuResources(agraph) < 0) { + return -1; + } +#endif + + // remove unused data + if (agoOptimizeDramaAllocRemoveUnusedData(agraph)) return -1; + + // make sure all buffers are allocated and initialized + for (AgoData * adata = agraph->dataList.head; adata; adata = adata->next) { + if (agoAllocData(adata)) { + vx_char name[256]; agoGetDataName(name, adata); + agoAddLogEntry(&adata->ref, VX_FAILURE, "ERROR: agoOptimizeDramaAlloc: data allocation failed for %s\n", name); + return -1; + } + } + for (AgoData * adata = agraph->ref.context->dataList.head; adata; adata = adata->next) { + if (agoAllocData(adata)) { + vx_char name[256]; agoGetDataName(name, adata); + agoAddLogEntry(&adata->ref, VX_FAILURE, "ERROR: agoOptimizeDramaAlloc: data allocation failed for %s\n", name); + return -1; + } + } + + return 
0; +} diff --git a/openvx/ago/ago_drama_analyze.cpp b/openvx/ago/ago_drama_analyze.cpp new file mode 100644 index 0000000..c6d717d --- /dev/null +++ b/openvx/ago/ago_drama_analyze.cpp @@ -0,0 +1,29 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +int agoOptimizeDramaAnalyze(AgoGraph * agraph) +{ + return 0; +} diff --git a/openvx/ago/ago_drama_divide.cpp b/openvx/ago/ago_drama_divide.cpp new file mode 100644 index 0000000..907d27e --- /dev/null +++ b/openvx/ago/ago_drama_divide.cpp @@ -0,0 +1,1982 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#include "ago_internal.h" + +#define SANITY_CHECK_DATA_TYPE(data,data_type) if(!data || data->ref.type != data_type) return -1 +#define SANITY_CHECK_DATA_TYPE_OPTIONAL(data,data_type) if( data && data->ref.type != data_type) return -1 + +int agoDramaDivideAppend(AgoNodeList * nodeList, AgoNode * anode, vx_enum new_kernel_id) +{ + if (new_kernel_id == VX_KERNEL_AMD_INVALID) { + // TBD: error handling + agoAddLogEntry(&anode->akernel->ref, VX_FAILURE, "ERROR: agoDramaDivideAppend(*,0x%08x[%s],INVALID) not implemented\n", anode->akernel->id, anode->akernel->name); + return -1; + } + // create a new AgoNode and add it to the nodeList + AgoNode * childnode = agoCreateNode((AgoGraph *)anode->ref.scope, new_kernel_id); + for (vx_uint32 i = 0; i < anode->paramCount; i++) { + childnode->paramList[i] = anode->paramList[i]; + } + // transfer attributes from anode to childnode + agoImportNodeConfig(childnode, anode); + // verify the node + return agoVerifyNode(childnode); +} + +int agoDramaDivideColorConvertNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // get params + AgoData * srcParam = anode->paramList[0]; + AgoData * dstParam = anode->paramList[1]; + vx_df_image itype = srcParam->u.img.format; + vx_df_image otype = dstParam->u.img.format; + // divide the node + if (otype == VX_DF_IMAGE_RGB) { + if (itype == VX_DF_IMAGE_RGBX) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGB_RGBX); + } + else if (itype == VX_DF_IMAGE_UYVY) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGB_UYVY); + } + else if (itype == VX_DF_IMAGE_YUYV) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGB_YUYV); + } + else if (itype == VX_DF_IMAGE_NV12) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam->children[0]; + anode->paramList[2] = srcParam->children[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV12); + } + else if (itype == VX_DF_IMAGE_NV21) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam->children[0]; + anode->paramList[2] = srcParam->children[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV21); + } + else if (itype == VX_DF_IMAGE_IYUV) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam->children[0]; + anode->paramList[2] = srcParam->children[1]; + anode->paramList[3] = srcParam->children[2]; + anode->paramCount = 4; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGB_IYUV); + } + } + else if (otype == VX_DF_IMAGE_RGBX) { + if (itype == VX_DF_IMAGE_RGB) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGBX_RGB); + } + else if (itype == VX_DF_IMAGE_UYVY) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGBX_UYVY); + } + 
else if (itype == VX_DF_IMAGE_YUYV) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGBX_YUYV); + } + else if (itype == VX_DF_IMAGE_NV12) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam->children[0]; + anode->paramList[2] = srcParam->children[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV12); + } + else if (itype == VX_DF_IMAGE_NV21) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam->children[0]; + anode->paramList[2] = srcParam->children[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV21); + } + else if (itype == VX_DF_IMAGE_IYUV) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam->children[0]; + anode->paramList[2] = srcParam->children[1]; + anode->paramList[3] = srcParam->children[2]; + anode->paramCount = 4; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_RGBX_IYUV); + } + } + else if (otype == VX_DF_IMAGE_NV12) { + if (itype == VX_DF_IMAGE_UYVY) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = dstParam->children[1]; + anode->paramList[2] = srcParam; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_FORMAT_CONVERT_NV12_UYVY); + } + else if (itype == VX_DF_IMAGE_YUYV) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = dstParam->children[1]; + anode->paramList[2] = srcParam; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_FORMAT_CONVERT_NV12_YUYV); + } + else if (itype == VX_DF_IMAGE_IYUV) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam->children[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = dstParam->children[1]; + anode->paramList[1] = srcParam->children[1]; + anode->paramList[2] = srcParam->children[2]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_FORMAT_CONVERT_UV12_IUV); + } + else if (itype == VX_DF_IMAGE_RGB) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_Y_RGB)) return -1; + anode->paramList[0] = dstParam->children[1]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGB); + } + else if (itype == VX_DF_IMAGE_RGBX) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_Y_RGBX)) return -1; + anode->paramList[0] = dstParam->children[1]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGBX); + } + } + else if (otype == VX_DF_IMAGE_IYUV) { + if (itype == VX_DF_IMAGE_UYVY) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = dstParam->children[1]; + anode->paramList[2] = dstParam->children[2]; + anode->paramList[3] = srcParam; + anode->paramCount = 4; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_UYVY); + } + else if (itype == VX_DF_IMAGE_YUYV) { + anode->paramList[0] = 
dstParam->children[0]; + anode->paramList[1] = dstParam->children[1]; + anode->paramList[2] = dstParam->children[2]; + anode->paramList[3] = srcParam; + anode->paramCount = 4; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_YUYV); + } + else if (itype == VX_DF_IMAGE_NV12) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam->children[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = dstParam->children[1]; + anode->paramList[1] = dstParam->children[2]; + anode->paramList[2] = srcParam->children[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_FORMAT_CONVERT_IUV_UV12); + } + else if (itype == VX_DF_IMAGE_NV21) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam->children[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = dstParam->children[2]; + anode->paramList[1] = dstParam->children[1]; + anode->paramList[2] = srcParam->children[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_FORMAT_CONVERT_IUV_UV12); + } + else if (itype == VX_DF_IMAGE_RGB) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_Y_RGB)) return -1; + anode->paramList[0] = dstParam->children[1]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_IU_RGB)) return -1; + anode->paramList[0] = dstParam->children[2]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_IV_RGB); + } + else if (itype == VX_DF_IMAGE_RGBX) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_Y_RGBX)) return -1; + anode->paramList[0] = dstParam->children[1]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_IU_RGBX)) return -1; + anode->paramList[0] = dstParam->children[2]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_IV_RGBX); + } + } + else if (otype == VX_DF_IMAGE_YUV4) { + if (itype == VX_DF_IMAGE_IYUV) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam->children[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = dstParam->children[1]; + anode->paramList[1] = srcParam->children[1]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_SCALE_UP_2x2_U8_U8)) return -1; + anode->paramList[0] = dstParam->children[2]; + anode->paramList[1] = srcParam->children[2]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_SCALE_UP_2x2_U8_U8); + } + else if (itype == VX_DF_IMAGE_NV12) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam->children[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = dstParam->children[1]; + 
anode->paramList[1] = dstParam->children[2]; + anode->paramList[2] = srcParam->children[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_FORMAT_CONVERT_UV_UV12); + } + else if (itype == VX_DF_IMAGE_NV21) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam->children[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = dstParam->children[2]; + anode->paramList[1] = dstParam->children[1]; + anode->paramList[2] = srcParam->children[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_FORMAT_CONVERT_UV_UV12); + } + else if (itype == VX_DF_IMAGE_RGB) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_Y_RGB)) return -1; + anode->paramList[0] = dstParam->children[1]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_U_RGB)) return -1; + anode->paramList[0] = dstParam->children[2]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_V_RGB); + } + else if (itype == VX_DF_IMAGE_RGBX) { + anode->paramList[0] = dstParam->children[0]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_Y_RGBX)) return -1; + anode->paramList[0] = dstParam->children[1]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_U_RGBX)) return -1; + anode->paramList[0] = dstParam->children[2]; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_COLOR_CONVERT_V_RGBX); + } + } + return -1; +} + +int agoDramaDivideChannelExtractNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // get params + AgoData * srcParam = anode->paramList[0]; + AgoData * channelParam = anode->paramList[1]; + AgoData * dstParam = anode->paramList[2]; + vx_df_image itype = srcParam->u.img.format; + vx_enum channel_e = channelParam->u.scalar.u.e; + // divide the node + if (itype == VX_DF_IMAGE_RGB) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (channel_e == VX_CHANNEL_R) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS0); + else if (channel_e == VX_CHANNEL_G) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS1); + else if (channel_e == VX_CHANNEL_B) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS2); + } + else if (itype == VX_DF_IMAGE_RGBX) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (channel_e == VX_CHANNEL_R) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS0); + else if (channel_e == VX_CHANNEL_G) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS1); + else if (channel_e == VX_CHANNEL_B) return agoDramaDivideAppend(nodeList, 
anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS2); + else if (channel_e == VX_CHANNEL_A) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS3); + } + else if (itype == VX_DF_IMAGE_NV12) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam->children[(channel_e != VX_CHANNEL_Y) ? 1 : 0]; + anode->paramCount = 2; + if (channel_e == VX_CHANNEL_Y) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8); + else if (channel_e == VX_CHANNEL_U) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS0); + else if (channel_e == VX_CHANNEL_V) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS1); + } + else if (itype == VX_DF_IMAGE_NV21) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam->children[(channel_e != VX_CHANNEL_Y) ? 1 : 0]; + anode->paramCount = 2; + if (channel_e == VX_CHANNEL_Y) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8); + else if (channel_e == VX_CHANNEL_U) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS1); + else if (channel_e == VX_CHANNEL_V) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS0); + } + else if (itype == VX_DF_IMAGE_UYVY) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (channel_e == VX_CHANNEL_Y) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS1); + else if (channel_e == VX_CHANNEL_U) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS0); + else if (channel_e == VX_CHANNEL_V) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS2); + } + else if (itype == VX_DF_IMAGE_YUYV) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam; + anode->paramCount = 2; + if (channel_e == VX_CHANNEL_Y) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS0); + else if (channel_e == VX_CHANNEL_U) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS1); + else if (channel_e == VX_CHANNEL_V) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS3); + } + else if (itype == VX_DF_IMAGE_IYUV || itype == VX_DF_IMAGE_YUV4) { + anode->paramList[0] = dstParam; + anode->paramList[1] = srcParam->children[channel_e - VX_CHANNEL_Y]; + anode->paramCount = 2; + if (channel_e == VX_CHANNEL_Y) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8); + else if (channel_e == VX_CHANNEL_U) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8); + else if (channel_e == VX_CHANNEL_V) return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8); + } + return -1; +} + +int agoDramaDivideChannelCombineNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 5) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE_OPTIONAL(anode->paramList[2], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE_OPTIONAL(anode->paramList[3], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[4], VX_TYPE_IMAGE); + int inputMask = 3 | (anode->paramList[2] ? 4 : 0) | (anode->paramList[3] ? 
8 : 0); + // perform the divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + vx_uint32 paramCount = anode->paramCount; + vx_df_image otype = paramList[4]->u.img.format; + if (otype == VX_DF_IMAGE_RGB) { + if (inputMask != 7) return -1; + anode->paramList[0] = paramList[4]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramList[3] = paramList[2]; + anode->paramCount = 4; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COMBINE_U24_U8U8U8_RGB); + } + else if (otype == VX_DF_IMAGE_RGBX) { + if (inputMask != 15) return -1; + anode->paramList[0] = paramList[4]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramList[3] = paramList[2]; + anode->paramList[4] = paramList[3]; + anode->paramCount = 5; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8U8_RGBX); + } + else if (otype == VX_DF_IMAGE_UYVY) { + if (inputMask != 7) return -1; + anode->paramList[0] = paramList[4]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramList[3] = paramList[2]; + anode->paramCount = 4; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_UYVY); + } + else if (otype == VX_DF_IMAGE_YUYV) { + if (inputMask != 7) return -1; + anode->paramList[0] = paramList[4]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramList[3] = paramList[2]; + anode->paramCount = 4; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_YUYV); + } + else if (otype == VX_DF_IMAGE_NV12) { + if (inputMask != 7) return -1; + anode->paramList[0] = paramList[4]->children[0]; + anode->paramList[1] = paramList[0]->children ? paramList[0]->children[0] : paramList[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = paramList[4]->children[1]; + anode->paramList[1] = paramList[1]; + anode->paramList[2] = paramList[2]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COMBINE_U16_U8U8); + } + else if (otype == VX_DF_IMAGE_NV21) { + if (inputMask != 7) return -1; + anode->paramList[0] = paramList[4]->children[0]; + anode->paramList[1] = paramList[0]->children ? paramList[0]->children[0] : paramList[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = paramList[4]->children[1]; + anode->paramList[1] = paramList[2]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COMBINE_U16_U8U8); + } + else if ((otype == VX_DF_IMAGE_IYUV) || (otype == VX_DF_IMAGE_YUV4)) { + if (inputMask != 7) return -1; + anode->paramList[0] = paramList[4]->children[0]; + anode->paramList[1] = paramList[0]->children ? paramList[0]->children[0] : paramList[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = paramList[4]->children[1]; + anode->paramList[1] = paramList[1]->children ? paramList[1]->children[0] : paramList[1]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8)) return -1; + anode->paramList[0] = paramList[4]->children[2]; + anode->paramList[1] = paramList[2]->children ? 
paramList[2]->children[0] : paramList[2]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8); + } + return -1; +} + +int agoDramaDivideSobel3x3Node(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + if (agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GX)) return -1; + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GY); +} + +int agoDramaDivideMagnitudeNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MAGNITUDE_S16_S16S16); +} + +int agoDramaDividePhaseNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_PHASE_U8_S16S16); +} + +int agoDramaDivideScaleImageNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + if (anode->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || anode->paramList[1]->u.img.format != VX_DF_IMAGE_U8) return -1; + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + // check for special no-scale case + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if ((paramList[0]->u.img.width == paramList[1]->u.img.width) && (paramList[0]->u.img.height == paramList[1]->u.img.height)) { + // just perform copy + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + new_kernel_id = VX_KERNEL_AMD_CHANNEL_COPY_U8_U8; + } + else { + vx_enum interpolation = paramList[2]->u.scalar.u.e; + // identify scale kernel + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + if (anode->attr_border_mode.mode == VX_BORDER_MODE_UNDEFINED) { + if (interpolation == VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) 
new_kernel_id = VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_NEAREST; + else if (interpolation == VX_INTERPOLATION_TYPE_BILINEAR) new_kernel_id = VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR; + else if (interpolation == VX_INTERPOLATION_TYPE_AREA) new_kernel_id = VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_AREA; + } + else if (anode->attr_border_mode.mode == VX_BORDER_MODE_REPLICATE) { + if (interpolation == VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) new_kernel_id = VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_NEAREST; // TBD remove -- this should be an error + else if (interpolation == VX_INTERPOLATION_TYPE_BILINEAR) new_kernel_id = VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR_REPLICATE; + else if (interpolation == VX_INTERPOLATION_TYPE_AREA) new_kernel_id = VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_AREA; // TBD remove -- this should be an error + } + else if (anode->attr_border_mode.mode == VX_BORDER_MODE_CONSTANT) { + if (interpolation == VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) new_kernel_id = VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_NEAREST; // TBD remove -- this should be an error + else if (interpolation == VX_INTERPOLATION_TYPE_BILINEAR) { + new_kernel_id = VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR_CONSTANT; + // create scalar object for border mode + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + char desc[64]; sprintf(desc, "scalar-virtual:UINT8,%d", anode->attr_border_mode.constant_value); + AgoData * dataBorder = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); + if (!dataBorder) return -1; + agoGenerateVirtualDataName(agraph, "scalar", dataBorder->name); + agoAddData(&agraph->dataList, dataBorder); + // make it 3rd argument + anode->paramList[anode->paramCount++] = dataBorder; + } + else if (interpolation == VX_INTERPOLATION_TYPE_AREA) new_kernel_id = VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_AREA; // TBD remove -- this should be an error + } + } + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideTableLookupNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_LUT); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_LUT_U8_U8); +} + +int agoDramaDivideHistogramNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_DISTRIBUTION); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_HISTOGRAM_DATA_U8); +} + +int agoDramaDivideEqualizeHistogramNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + // create virtual histogram and 
look-up table objects + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + AgoData * hist = agoCreateDataFromDescription(anode->ref.context, agraph, "distribution-virtual:256,0,256", false); + AgoData * lut = agoCreateDataFromDescription(anode->ref.context, agraph, "lut-virtual:UINT8,256", false); + if (!hist || !lut) return -1; + agoGenerateVirtualDataName(agraph, "histogram", hist->name); + agoGenerateVirtualDataName(agraph, "lut", lut->name); + agoAddData(&agraph->dataList, hist); + agoAddData(&agraph->dataList, lut); + // histogram + anode->paramList[0] = hist; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + int status = agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_HISTOGRAM_DATA_U8); + // equalization + anode->paramList[0] = lut; + anode->paramList[1] = hist; + anode->paramCount = 2; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_EQUALIZE_DATA_DATA); + // table lookup + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = lut; + anode->paramCount = 3; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_LUT_U8_U8); + return status; +} + +int agoDramaDivideAbsdiffNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (paramList[2]->u.img.format == VX_DF_IMAGE_U8) new_kernel_id = VX_KERNEL_AMD_ABS_DIFF_U8_U8U8; + else if (paramList[2]->u.img.format == VX_DF_IMAGE_S16) new_kernel_id = VX_KERNEL_AMD_ABS_DIFF_S16_S16S16_SAT; + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideMeanStddevNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + // create virtual AGO_TYPE_MEANSTDDEV_DATA + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + AgoData * data = agoCreateDataFromDescription(anode->ref.context, agraph, "ago-meanstddev-data-virtual:", false); + if (!data) return -1; + agoGenerateVirtualDataName(agraph, "meanstddev", data->name); + agoAddData(&agraph->dataList, data); + // compute sum and sum-of-squares + anode->paramList[0] = data; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + int status = agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MEAN_STD_DEV_DATA_U8); + // compute mean and average + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[2]; + anode->paramList[2] = data; + anode->paramCount = 3; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MEAN_STD_DEV_MERGE_DATA_DATA); + return status; +} + +int agoDramaDivideThresholdNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + 
SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_THRESHOLD); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (paramList[1]->u.thr.thresh_type == VX_THRESHOLD_TYPE_BINARY) new_kernel_id = VX_KERNEL_AMD_THRESHOLD_U8_U8_BINARY; + else if (paramList[1]->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) new_kernel_id = VX_KERNEL_AMD_THRESHOLD_U8_U8_RANGE; + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideIntegralImageNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_INTEGRAL_IMAGE_U32_U8); +} + +int agoDramaDivideDilate3x3Node(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_DILATE_U8_U8_3x3); +} + +int agoDramaDivideErode3x3Node(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_ERODE_U8_U8_3x3); +} + +int agoDramaDivideMedian3x3Node(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MEDIAN_U8_U8_3x3); +} + +int agoDramaDivideBox3x3Node(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_BOX_U8_U8_3x3); +} + 
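+// ----------------------------------------------------------------------------
+// NOTE: the agoDramaDivide*Node() helpers in this file follow a common pattern:
+//   1. sanity-check paramCount and the object type of each parameter,
+//   2. snapshot anode->paramList into a local paramList[] copy,
+//   3. rewrite anode->paramList so the destination data comes first, followed
+//      by the source(s) -- the argument order expected by the AMD low-level
+//      kernels -- and set anode->paramCount to match,
+//   4. call agoDramaDivideAppend() once per low-level kernel; multi-kernel
+//      divides (equalize histogram, min/max loc, etc.) chain several appends
+//      through virtual data objects created via agoCreateDataFromDescription().
+// A minimal sketch of steps 2-4, mirroring the single-input point-kernel case
+// (see agoDramaDivideNotNode further down in this file):
+//
+//     AgoData * paramList[AGO_MAX_PARAMS];
+//     memcpy(paramList, anode->paramList, sizeof(paramList));
+//     anode->paramList[0] = paramList[1];   // destination image first
+//     anode->paramList[1] = paramList[0];   // then the source image
+//     anode->paramCount = 2;
+//     return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_NOT_U8_U8);
+// ----------------------------------------------------------------------------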
+int agoDramaDivideGaussian3x3Node(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_GAUSSIAN_U8_U8_3x3); +} + +int agoDramaDivideCustomConvolutionNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_CONVOLUTION); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + vx_df_image dst_image_format = paramList[2]->u.img.format; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if ((paramList[1]->u.conv.rows & 1) && (paramList[1]->u.conv.columns & 1)) new_kernel_id = (dst_image_format == VX_DF_IMAGE_U8) ? VX_KERNEL_AMD_CONVOLVE_U8_U8 : VX_KERNEL_AMD_CONVOLVE_S16_U8; + else { + agoAddLogEntry(&paramList[1]->ref, VX_FAILURE, "ERROR: agoDramaDivideCustomConvolutionNode: convolution size " VX_FMT_SIZE "x" VX_FMT_SIZE " not supported\n", paramList[1]->u.conv.rows, paramList[1]->u.conv.columns); + return -1; + } + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideGaussianPyramidNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_PYRAMID); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + AgoData * nextInput = paramList[0]->children ? 
paramList[0]->children[0] : paramList[0]; + int status = 0; + for (vx_uint32 level = 0; level < paramList[1]->numChildren; level++) { + anode->paramList[0] = paramList[1]->children[level]; + anode->paramList[1] = nextInput; + anode->paramCount = 2; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (level == 0) new_kernel_id = VX_KERNEL_AMD_CHANNEL_COPY_U8_U8; + else if (paramList[1]->u.pyr.scale == VX_SCALE_PYRAMID_HALF) new_kernel_id = VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_5x5; + else if (paramList[1]->u.pyr.scale == VX_SCALE_PYRAMID_ORB) new_kernel_id = VX_KERNEL_AMD_SCALE_GAUSSIAN_ORB_U8_U8_5x5; + status |= agoDramaDivideAppend(nodeList, anode, new_kernel_id); + nextInput = paramList[1]->children[level]; + } + return status; +} + +int agoDramaDivideAccumulateNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_ACCUMULATE_S16_S16U8_SAT); +} + +int agoDramaDivideAccumulateWeightedNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_ACCUMULATE_WEIGHTED_U8_U8U8); +} + +int agoDramaDivideAccumulateSquareNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_ACCUMULATE_SQUARED_S16_S16U8_SAT); +} + +int agoDramaDivideMinmaxlocNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount < 3 || anode->paramCount > 7) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE_OPTIONAL(anode->paramList[3], VX_TYPE_ARRAY); + SANITY_CHECK_DATA_TYPE_OPTIONAL(anode->paramList[4], VX_TYPE_ARRAY); + SANITY_CHECK_DATA_TYPE_OPTIONAL(anode->paramList[5], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE_OPTIONAL(anode->paramList[6], VX_TYPE_SCALAR); + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + // create virtual AGO_TYPE_MINMAXLOC_DATA + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + AgoData * data = 
agoCreateDataFromDescription(anode->ref.context, agraph, "ago-minmaxloc-data-virtual:", false); + AgoData * data_final = agoCreateDataFromDescription(anode->ref.context, agraph, "ago-minmaxloc-data-virtual:", false); + if (!data || !data_final) return -1; + agoGenerateVirtualDataName(agraph, "minmaxloc", data->name); + agoGenerateVirtualDataName(agraph, "minmaxloc-final", data_final->name); + agoAddData(&agraph->dataList, data); + agoAddData(&agraph->dataList, data_final); + // perform divide + int status = 0; + if (paramList[0]->u.img.format == VX_DF_IMAGE_U8) { + anode->paramList[0] = data; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_DATA_U8); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[2]; + anode->paramList[2] = data_final; + anode->paramList[3] = data; + anode->paramCount = 4; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_MERGE_DATA_DATA); + if (paramList[3] && paramList[4]) { + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[4]; + anode->paramList[2] = paramList[5]; + anode->paramList[3] = paramList[6]; + anode->paramList[4] = paramList[0]; + anode->paramList[5] = data_final; + anode->paramCount = 6; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MINMAX_COUNT_MINMAX); + } + else if(paramList[3]) { + if (paramList[5] && !paramList[6]) { + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[5]; + anode->paramList[2] = paramList[0]; + anode->paramList[3] = data_final; + anode->paramCount = 4; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MIN_COUNT_MIN); + } + else { + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[5]; + anode->paramList[2] = paramList[6]; + anode->paramList[3] = paramList[0]; + anode->paramList[4] = data_final; + anode->paramCount = 5; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MIN_COUNT_MINMAX); + } + } + else if (paramList[4]) { + if (!paramList[5] && paramList[6]) { + anode->paramList[0] = paramList[4]; + anode->paramList[1] = paramList[6]; + anode->paramList[2] = paramList[0]; + anode->paramList[3] = data_final; + anode->paramCount = 4; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MAX_COUNT_MAX); + } + else { + anode->paramList[0] = paramList[4]; + anode->paramList[1] = paramList[5]; + anode->paramList[2] = paramList[6]; + anode->paramList[3] = paramList[0]; + anode->paramList[4] = data_final; + anode->paramCount = 5; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MAX_COUNT_MINMAX); + } + } + else { + if (paramList[5] && paramList[6]) { + anode->paramList[0] = paramList[5]; + anode->paramList[1] = paramList[6]; + anode->paramList[2] = paramList[0]; + anode->paramList[3] = data_final; + anode->paramCount = 4; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_NONE_COUNT_MINMAX); + } + else if (paramList[5]) { + anode->paramList[0] = paramList[5]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = data_final; + anode->paramCount = 3; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_NONE_COUNT_MIN); + } + else if (paramList[6]) { + anode->paramList[0] = paramList[6]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = data_final; + 
anode->paramCount = 3; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_NONE_COUNT_MAX); + } + } + } + else if (paramList[0]->u.img.format == VX_DF_IMAGE_S16) { + anode->paramList[0] = data; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_DATA_S16); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[2]; + anode->paramList[2] = data_final; + anode->paramList[3] = data; + anode->paramCount = 4; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_MERGE_DATA_DATA); + if (paramList[3] && paramList[4]) { + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[4]; + anode->paramList[2] = paramList[5]; + anode->paramList[3] = paramList[6]; + anode->paramList[4] = paramList[0]; + anode->paramList[5] = data_final; + anode->paramCount = 6; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MINMAX_COUNT_MINMAX); + } + else if (paramList[3]) { + if (paramList[5] && !paramList[6]) { + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[5]; + anode->paramList[2] = paramList[0]; + anode->paramList[3] = data_final; + anode->paramCount = 4; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MIN_COUNT_MIN); + } + else { + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[5]; + anode->paramList[2] = paramList[6]; + anode->paramList[3] = paramList[0]; + anode->paramList[4] = data_final; + anode->paramCount = 5; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MIN_COUNT_MINMAX); + } + } + else if (paramList[4]) { + if (!paramList[5] && paramList[6]) { + anode->paramList[0] = paramList[4]; + anode->paramList[1] = paramList[6]; + anode->paramList[2] = paramList[0]; + anode->paramList[3] = data_final; + anode->paramCount = 4; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MAX_COUNT_MAX); + } + else { + anode->paramList[0] = paramList[4]; + anode->paramList[1] = paramList[5]; + anode->paramList[2] = paramList[6]; + anode->paramList[3] = paramList[0]; + anode->paramList[4] = data_final; + anode->paramCount = 5; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MAX_COUNT_MINMAX); + } + } + else { + if (paramList[5] && paramList[6]) { + anode->paramList[0] = paramList[5]; + anode->paramList[1] = paramList[6]; + anode->paramList[2] = paramList[0]; + anode->paramList[3] = data_final; + anode->paramCount = 4; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_NONE_COUNT_MINMAX); + } + else if (paramList[5]) { + anode->paramList[0] = paramList[5]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = data_final; + anode->paramCount = 3; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_NONE_COUNT_MIN); + } + else if (paramList[6]) { + anode->paramList[0] = paramList[6]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = data_final; + anode->paramCount = 3; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_NONE_COUNT_MAX); + } + } + } + else status = -1; + return status; +} + +int agoDramaDivideConvertDepthNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 4) return -1; + 
SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_SCALAR); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[3]; + anode->paramCount = 3; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (paramList[1]->u.img.format == VX_DF_IMAGE_S16 || paramList[0]->u.img.format == VX_DF_IMAGE_U8) { + new_kernel_id = VX_KERNEL_AMD_COLOR_DEPTH_S16_U8; + } + else if (paramList[1]->u.img.format == VX_DF_IMAGE_U8 || paramList[0]->u.img.format == VX_DF_IMAGE_S16) { + if (paramList[2]->u.scalar.u.e == VX_CONVERT_POLICY_WRAP) new_kernel_id = VX_KERNEL_AMD_COLOR_DEPTH_U8_S16_WRAP; + else if (paramList[2]->u.scalar.u.e == VX_CONVERT_POLICY_SATURATE) new_kernel_id = VX_KERNEL_AMD_COLOR_DEPTH_U8_S16_SAT; + } + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideAndNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_AND_U8_U8U8); +} + +int agoDramaDivideOrNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_OR_U8_U8U8); +} + +int agoDramaDivideXorNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[2]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_XOR_U8_U8U8); +} + +int agoDramaDivideNotNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 2) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + 
return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_NOT_U8_U8); +} + +int agoDramaDivideMultiplyNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 6) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[4], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[5], VX_TYPE_IMAGE); + // get and re-order parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + vx_uint32 paramCount = anode->paramCount; + vx_df_image otype = paramList[5]->u.img.format; + vx_df_image itypeA = paramList[0]->u.img.format; + vx_df_image itypeB = paramList[1]->u.img.format; + vx_enum overflow_policy = paramList[3]->u.scalar.u.e; + vx_enum rounding_policy = paramList[4]->u.scalar.u.e; + anode->paramList[0] = paramList[5]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramList[3] = paramList[2]; + anode->paramCount = 4; + // divide + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if ((itypeA == VX_DF_IMAGE_U8) && (itypeB == VX_DF_IMAGE_U8) && (otype == VX_DF_IMAGE_U8)) { + if (rounding_policy == VX_ROUND_POLICY_TO_ZERO) + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_MUL_U8_U8U8_SAT_TRUNC : VX_KERNEL_AMD_MUL_U8_U8U8_WRAP_TRUNC; + else + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_MUL_U8_U8U8_SAT_ROUND : VX_KERNEL_AMD_MUL_U8_U8U8_WRAP_ROUND; + } + else if ((itypeA == VX_DF_IMAGE_U8) && (itypeB == VX_DF_IMAGE_U8) && (otype == VX_DF_IMAGE_S16)) { + if (rounding_policy == VX_ROUND_POLICY_TO_ZERO) + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_MUL_S16_U8U8_SAT_TRUNC : VX_KERNEL_AMD_MUL_S16_U8U8_WRAP_TRUNC; + else + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_MUL_S16_U8U8_SAT_ROUND : VX_KERNEL_AMD_MUL_S16_U8U8_WRAP_ROUND; + } + else if ((itypeA == VX_DF_IMAGE_S16) && (itypeB == VX_DF_IMAGE_U8) && (otype == VX_DF_IMAGE_S16)) { + if (rounding_policy == VX_ROUND_POLICY_TO_ZERO) + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_MUL_S16_S16U8_SAT_TRUNC : VX_KERNEL_AMD_MUL_S16_S16U8_WRAP_TRUNC; + else + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_MUL_S16_S16U8_SAT_ROUND : VX_KERNEL_AMD_MUL_S16_S16U8_WRAP_ROUND; + } + else if ((itypeA == VX_DF_IMAGE_U8) && (itypeB == VX_DF_IMAGE_S16) && (otype == VX_DF_IMAGE_S16)) { + if (rounding_policy == VX_ROUND_POLICY_TO_ZERO) + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_MUL_S16_S16U8_SAT_TRUNC : VX_KERNEL_AMD_MUL_S16_S16U8_WRAP_TRUNC; + else + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_MUL_S16_S16U8_SAT_ROUND : VX_KERNEL_AMD_MUL_S16_S16U8_WRAP_ROUND; + // switch A & B parameters + anode->paramList[1] = paramList[1]; + anode->paramList[2] = paramList[0]; + } + else if ((itypeA == VX_DF_IMAGE_S16) && (itypeB == VX_DF_IMAGE_S16) && (otype == VX_DF_IMAGE_S16)) { + if (rounding_policy == VX_ROUND_POLICY_TO_ZERO) + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? 
VX_KERNEL_AMD_MUL_S16_S16S16_SAT_TRUNC : VX_KERNEL_AMD_MUL_S16_S16S16_WRAP_TRUNC; + else + new_kernel_id = (overflow_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_MUL_S16_S16S16_SAT_ROUND : VX_KERNEL_AMD_MUL_S16_S16S16_WRAP_ROUND; + } + else if ((itypeA == VX_DF_IMAGE_RGB) && (itypeB == VX_DF_IMAGE_U8) && (otype == VX_DF_IMAGE_RGB)) { + if (rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN && overflow_policy == VX_CONVERT_POLICY_SATURATE) + new_kernel_id = VX_KERNEL_AMD_MUL_U24_U24U8_SAT_ROUND; + } + else if ((itypeA == VX_DF_IMAGE_RGBX) && (itypeB == VX_DF_IMAGE_U8) && (otype == VX_DF_IMAGE_RGBX)) { + if (rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN && overflow_policy == VX_CONVERT_POLICY_SATURATE) + new_kernel_id = VX_KERNEL_AMD_MUL_U32_U32U8_SAT_ROUND; + } + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideAddNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 4) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_IMAGE); + // get and re-order parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + vx_uint32 paramCount = anode->paramCount; + vx_df_image otype = paramList[3]->u.img.format; + vx_df_image itypeA = paramList[0]->u.img.format; + vx_df_image itypeB = paramList[1]->u.img.format; + vx_enum convert_policy = paramList[2]->u.scalar.u.e; + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + // divide + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if ((itypeA == VX_DF_IMAGE_U8) && (itypeB == VX_DF_IMAGE_U8) && (otype == VX_DF_IMAGE_U8)) { + new_kernel_id = (convert_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_ADD_U8_U8U8_SAT : VX_KERNEL_AMD_ADD_U8_U8U8_WRAP; + } + else if ((itypeA == VX_DF_IMAGE_U8) && (itypeB == VX_DF_IMAGE_U8) && (otype == VX_DF_IMAGE_S16)) { + new_kernel_id = VX_KERNEL_AMD_ADD_S16_U8U8; + } + else if ((itypeA == VX_DF_IMAGE_S16) && (itypeB == VX_DF_IMAGE_U8) && (otype == VX_DF_IMAGE_S16)) { + new_kernel_id = (convert_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_ADD_S16_S16U8_SAT : VX_KERNEL_AMD_ADD_S16_S16U8_WRAP; + } + else if ((itypeA == VX_DF_IMAGE_U8) && (itypeB == VX_DF_IMAGE_S16) && (otype == VX_DF_IMAGE_S16)) { + new_kernel_id = (convert_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_ADD_S16_S16U8_SAT : VX_KERNEL_AMD_ADD_S16_S16U8_WRAP; + // switch A & B parameters + anode->paramList[1] = paramList[1]; + anode->paramList[2] = paramList[0]; + } + else if ((itypeA == VX_DF_IMAGE_S16) && (itypeB == VX_DF_IMAGE_S16) && (otype == VX_DF_IMAGE_S16)) { + new_kernel_id = (convert_policy == VX_CONVERT_POLICY_SATURATE) ? 
VX_KERNEL_AMD_ADD_S16_S16S16_SAT : VX_KERNEL_AMD_ADD_S16_S16S16_WRAP; + } + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideSubtractNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 4) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_IMAGE); + // get and re-order parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + vx_uint32 paramCount = anode->paramCount; + vx_df_image otype = paramList[3]->u.img.format; + vx_df_image itypeA = paramList[0]->u.img.format; + vx_df_image itypeB = paramList[1]->u.img.format; + vx_enum convert_policy = paramList[2]->u.scalar.u.e; + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + // divide + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (otype == VX_DF_IMAGE_U8) { + if ((itypeA == VX_DF_IMAGE_U8) && (itypeB == VX_DF_IMAGE_U8)) { + new_kernel_id = (convert_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_SUB_U8_U8U8_SAT : VX_KERNEL_AMD_SUB_U8_U8U8_WRAP; + } + } + else if (otype == VX_DF_IMAGE_S16) { + if ((itypeA == VX_DF_IMAGE_U8) && (itypeB == VX_DF_IMAGE_U8)) { + new_kernel_id = VX_KERNEL_AMD_SUB_S16_U8U8; + } + else if ((itypeA == VX_DF_IMAGE_S16) && (itypeB == VX_DF_IMAGE_U8)) { + new_kernel_id = (convert_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_SUB_S16_S16U8_SAT : VX_KERNEL_AMD_SUB_S16_S16U8_WRAP; + } + else if ((itypeA == VX_DF_IMAGE_U8) && (itypeB == VX_DF_IMAGE_S16)) { + new_kernel_id = (convert_policy == VX_CONVERT_POLICY_SATURATE) ? VX_KERNEL_AMD_SUB_S16_U8S16_SAT : VX_KERNEL_AMD_SUB_S16_U8S16_WRAP; + } + else if ((itypeA == VX_DF_IMAGE_S16) && (itypeB == VX_DF_IMAGE_S16)) { + new_kernel_id = (convert_policy == VX_CONVERT_POLICY_SATURATE) ? 
VX_KERNEL_AMD_SUB_S16_S16S16_SAT : VX_KERNEL_AMD_SUB_S16_S16S16_WRAP; + } + } + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideHalfscaleGaussianNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 3) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (paramList[2]->u.scalar.u.i == 3) new_kernel_id = VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_3x3; + else if (paramList[2]->u.scalar.u.i == 5) new_kernel_id = VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_5x5; + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideRemapNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 4) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_REMAP); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + vx_enum interpolation = paramList[2]->u.scalar.u.e; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (anode->paramList[0]->u.img.format == VX_DF_IMAGE_U8 && anode->paramList[1]->u.img.format == VX_DF_IMAGE_U8) { + if (anode->attr_border_mode.mode == VX_BORDER_MODE_UNDEFINED) { + if (interpolation == VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) new_kernel_id = VX_KERNEL_AMD_REMAP_U8_U8_NEAREST; + else if (interpolation == VX_INTERPOLATION_TYPE_BILINEAR) new_kernel_id = VX_KERNEL_AMD_REMAP_U8_U8_BILINEAR; + } + else if (anode->attr_border_mode.mode == VX_BORDER_MODE_CONSTANT) { + if (interpolation == VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) new_kernel_id = VX_KERNEL_AMD_REMAP_U8_U8_NEAREST_CONSTANT; + else if (interpolation == VX_INTERPOLATION_TYPE_BILINEAR) new_kernel_id = VX_KERNEL_AMD_REMAP_U8_U8_BILINEAR_CONSTANT; + if (new_kernel_id != VX_KERNEL_AMD_INVALID) { + // create scalar object for border mode + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + char desc[64]; sprintf(desc, "scalar-virtual:UINT8,%d", anode->attr_border_mode.constant_value); + AgoData * dataBorder = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); + if (!dataBorder) return -1; + agoGenerateVirtualDataName(agraph, "scalar", dataBorder->name); + agoAddData(&agraph->dataList, dataBorder); + // make it 4th argument + anode->paramList[anode->paramCount++] = dataBorder; + } + } + } + else if (anode->paramList[0]->u.img.format == VX_DF_IMAGE_RGB && anode->paramList[1]->u.img.format == VX_DF_IMAGE_RGB) { + if (anode->attr_border_mode.mode == VX_BORDER_MODE_UNDEFINED && interpolation == VX_INTERPOLATION_TYPE_BILINEAR) + new_kernel_id = VX_KERNEL_AMD_REMAP_U24_U24_BILINEAR; + } + else if (anode->paramList[0]->u.img.format == VX_DF_IMAGE_RGB && anode->paramList[1]->u.img.format == VX_DF_IMAGE_RGBX) { + if (anode->attr_border_mode.mode == VX_BORDER_MODE_UNDEFINED && interpolation 
== VX_INTERPOLATION_TYPE_BILINEAR) + new_kernel_id = VX_KERNEL_AMD_REMAP_U24_U32_BILINEAR; + } + else if (anode->paramList[0]->u.img.format == VX_DF_IMAGE_RGBX && anode->paramList[1]->u.img.format == VX_DF_IMAGE_RGBX) { + if (anode->attr_border_mode.mode == VX_BORDER_MODE_UNDEFINED && interpolation == VX_INTERPOLATION_TYPE_BILINEAR) + new_kernel_id = VX_KERNEL_AMD_REMAP_U32_U32_BILINEAR; + } + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideWarpAffineNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 4) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_MATRIX); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + vx_enum interpolation = paramList[2]->u.scalar.u.e; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (anode->attr_border_mode.mode == VX_BORDER_MODE_UNDEFINED) { + if (interpolation == VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) new_kernel_id = VX_KERNEL_AMD_WARP_AFFINE_U8_U8_NEAREST; + else if (interpolation == VX_INTERPOLATION_TYPE_BILINEAR) new_kernel_id = VX_KERNEL_AMD_WARP_AFFINE_U8_U8_BILINEAR; + } + else if (anode->attr_border_mode.mode == VX_BORDER_MODE_CONSTANT) { + if (interpolation == VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) new_kernel_id = VX_KERNEL_AMD_WARP_AFFINE_U8_U8_NEAREST_CONSTANT; + else if (interpolation == VX_INTERPOLATION_TYPE_BILINEAR) new_kernel_id = VX_KERNEL_AMD_WARP_AFFINE_U8_U8_BILINEAR_CONSTANT; + if (new_kernel_id != VX_KERNEL_AMD_INVALID) { + // create scalar object for border mode + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + char desc[64]; sprintf(desc, "scalar-virtual:UINT8,%d", anode->attr_border_mode.constant_value); + AgoData * dataBorder = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); + if (!dataBorder) return -1; + agoGenerateVirtualDataName(agraph, "scalar", dataBorder->name); + agoAddData(&agraph->dataList, dataBorder); + // make it 4th argument + anode->paramList[anode->paramCount++] = dataBorder; + } + } + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideWarpPerspectiveNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 4) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_MATRIX); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_IMAGE); + // perform divide + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramCount = 3; + vx_enum interpolation = paramList[2]->u.scalar.u.e; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (anode->attr_border_mode.mode == VX_BORDER_MODE_UNDEFINED) { + if (interpolation == VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) new_kernel_id = VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_NEAREST; + else if (interpolation == VX_INTERPOLATION_TYPE_BILINEAR) new_kernel_id = VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_BILINEAR; + } + else if 
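+ // constant border mode: besides selecting the *_CONSTANT kernel variant, the border value is appended below as an extra UINT8 scalar argument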
(anode->attr_border_mode.mode == VX_BORDER_MODE_CONSTANT) { + if (interpolation == VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) new_kernel_id = VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_NEAREST_CONSTANT; + else if (interpolation == VX_INTERPOLATION_TYPE_BILINEAR) new_kernel_id = VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_BILINEAR_CONSTANT; + if (new_kernel_id != VX_KERNEL_AMD_INVALID) { + // create scalar object for border mode + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + char desc[64]; sprintf(desc, "scalar-virtual:UINT8,%d", anode->attr_border_mode.constant_value); + AgoData * dataBorder = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); + if (!dataBorder) return -1; + agoGenerateVirtualDataName(agraph, "scalar", dataBorder->name); + agoAddData(&agraph->dataList, dataBorder); + // make it 4th argument + anode->paramList[anode->paramCount++] = dataBorder; + } + } + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideCannyEdgeDetectorNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 5) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_THRESHOLD); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[4], VX_TYPE_IMAGE); + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + vx_int32 gradient_size = paramList[2]->u.scalar.u.i; + vx_enum norm_type = paramList[3]->u.scalar.u.e; + // create virtual stack data for canny edges + // stack size: TBD (currently set the size of the image) + vx_uint32 canny_stack_size = paramList[0]->u.img.width * paramList[0]->u.img.height; + char desc[256]; sprintf(desc, "ago-canny-stack-virtual:%u", canny_stack_size); + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + AgoData * data = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); + if (!data) return -1; + agoGenerateVirtualDataName(agraph, "canny-stack", data->name); + agoAddData(&agraph->dataList, data); +#if USE_AGO_CANNY_SOBEL_SUPP_THRESHOLD + // compute sobel, nonmax-supression, and threshold + anode->paramList[0] = paramList[4]; + anode->paramList[1] = data; + anode->paramList[2] = paramList[0]; + anode->paramList[3] = paramList[1]; + anode->paramCount = 4; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (norm_type == VX_NORM_L1) { + if (gradient_size == 3) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_3x3_L1NORM; + else if (gradient_size == 5) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_5x5_L1NORM; + else if (gradient_size == 7) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_7x7_L1NORM; + } + else if (norm_type == VX_NORM_L2) { + if (gradient_size == 3) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_3x3_L2NORM; + else if (gradient_size == 5) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_5x5_L2NORM; + else if (gradient_size == 7) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_7x7_L2NORM; + } + int status = agoDramaDivideAppend(nodeList, anode, new_kernel_id); +#else + // create virtual data for sobel output + char descSobel[64]; sprintf(descSobel, "image-virtual:U016,%d,%d", paramList[0]->u.img.width, paramList[0]->u.img.height); + AgoData * dataSobel = agoCreateDataFromDescription(anode->ref.context, agraph, descSobel, 
false); + if (!dataSobel) return -1; + agoGenerateVirtualDataName(agraph, "canny-sobel", dataSobel->name); + agoAddData(&agraph->dataList, dataSobel); + // compute sobel + anode->paramList[0] = dataSobel; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (norm_type == VX_NORM_L1) { + if (gradient_size == 3) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L1NORM; + else if (gradient_size == 5) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L1NORM; + else if (gradient_size == 7) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L1NORM; + } + else if (norm_type == VX_NORM_L2) { + if (gradient_size == 3) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L2NORM; + else if (gradient_size == 5) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L2NORM; + else if (gradient_size == 7) new_kernel_id = VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L2NORM; + } + int status = agoDramaDivideAppend(nodeList, anode, new_kernel_id); + // compute nonmax-supression and threshold + anode->paramList[0] = paramList[4]; + anode->paramList[1] = data; + anode->paramList[2] = dataSobel; + anode->paramList[3] = paramList[1]; + anode->paramList[4] = paramList[2]; + anode->paramCount = 5; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8XY_U16_3x3); +#endif + // run edge trace + anode->paramList[0] = paramList[4]; + anode->paramList[1] = data; + anode->paramCount = 2; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_CANNY_EDGE_TRACE_U8_U8XY); + return status; +} + +int agoDramaDivideHarrisCornersNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 7 && anode->paramCount != 8) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[4], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[5], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[6], VX_TYPE_ARRAY); + SANITY_CHECK_DATA_TYPE_OPTIONAL(anode->paramList[7], VX_TYPE_SCALAR); + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + vx_int32 window_size = paramList[4]->u.scalar.u.i; + vx_int32 block_size = paramList[5]->u.scalar.u.i; + // create virtual images for HG3, HVC, and XYS + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + char desc[64]; + sprintf(desc, "image-virtual:F332,%d,%d", paramList[0]->u.img.width, paramList[0]->u.img.height); + AgoData * dataHG3 = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); + sprintf(desc, "image-virtual:F032,%d,%d", paramList[0]->u.img.width, paramList[0]->u.img.height); + AgoData * dataHVC = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); + sprintf(desc, "array-virtual:KEYPOINT_XYS,%d", paramList[0]->u.img.width * paramList[0]->u.img.height); // TBD: this array can have smaller capacity + AgoData * dataXYS = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); + sprintf(desc, "scalar-virtual:UINT32,%d", paramList[0]->u.img.width); + AgoData * dataWidth = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); + sprintf(desc, "scalar-virtual:UINT32,%d", paramList[0]->u.img.height); + AgoData * dataHeight = agoCreateDataFromDescription(anode->ref.context, 
agraph, desc, false); + if (!dataHG3 || !dataHVC || !dataXYS || !dataWidth || !dataHeight) return -1; + agoGenerateVirtualDataName(agraph, "HG3", dataHG3->name); + agoGenerateVirtualDataName(agraph, "HVC", dataHVC->name); + agoGenerateVirtualDataName(agraph, "XYS", dataXYS->name); + agoGenerateVirtualDataName(agraph, "Width", dataWidth->name); + agoGenerateVirtualDataName(agraph, "Height", dataHeight->name); + agoAddData(&agraph->dataList, dataHG3); + agoAddData(&agraph->dataList, dataHVC); + agoAddData(&agraph->dataList, dataXYS); + agoAddData(&agraph->dataList, dataWidth); + agoAddData(&agraph->dataList, dataHeight); + // compute HG3 + anode->paramList[0] = dataHG3; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + vx_enum new_kernel_id = VX_KERNEL_AMD_INVALID; + if (window_size == 3) new_kernel_id = VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_3x3; + else if (window_size == 5) new_kernel_id = VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_5x5; + else if (window_size == 7) new_kernel_id = VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_7x7; + else { + agoAddLogEntry(&anode->ref, VX_FAILURE, "ERROR: agoDramaDivideHarrisCornersNode: unsupported windows size: %d\n", window_size); + return -1; + } + int status = agoDramaDivideAppend(nodeList, anode, new_kernel_id); + // compute HVC + anode->paramList[0] = dataHVC; + anode->paramList[1] = dataHG3; + anode->paramList[2] = paramList[3]; + anode->paramList[3] = paramList[1]; + anode->paramList[4] = paramList[4]; + anode->paramCount = 5; + new_kernel_id = VX_KERNEL_AMD_INVALID; + if (block_size == 3) new_kernel_id = VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_3x3; + else if (block_size == 5) new_kernel_id = VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_5x5; + else if (block_size == 7) new_kernel_id = VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_7x7; + else { + agoAddLogEntry(&anode->ref, VX_FAILURE, "ERROR: agoDramaDivideHarrisCornersNode: unsupported block size: %d\n", block_size); + return -1; + } + status |= agoDramaDivideAppend(nodeList, anode, new_kernel_id); + // non-max suppression + anode->paramList[0] = dataXYS; + anode->paramList[1] = dataHVC; + anode->paramCount = 2; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_NON_MAX_SUPP_XY_ANY_3x3); + // sort and pick corners + anode->paramList[0] = paramList[6]; + anode->paramList[1] = paramList[7]; + anode->paramList[2] = dataXYS; + anode->paramList[3] = paramList[2]; + anode->paramList[4] = dataWidth; + anode->paramList[5] = dataHeight; + anode->paramCount = 6; + status |= agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_HARRIS_MERGE_SORT_AND_PICK_XY_XYS); + return status; +} + +int agoDramaDivideFastCornersNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount < 4 || anode->paramCount > 5) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_IMAGE); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_ARRAY); + SANITY_CHECK_DATA_TYPE_OPTIONAL(anode->paramList[4], VX_TYPE_SCALAR); + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[4]; + anode->paramList[2] = paramList[0]; + anode->paramList[3] = paramList[1]; + anode->paramCount = 4; + vx_enum new_kernel_id = VX_KERNEL_AMD_FAST_CORNERS_XY_U8_SUPRESSION; + if (paramList[2]->u.scalar.u.i == 0) new_kernel_id = VX_KERNEL_AMD_FAST_CORNERS_XY_U8_NOSUPRESSION; + return 
agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideOpticalFlowPyrLkNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + if (anode->paramCount != 10) return -1; + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_PYRAMID); + SANITY_CHECK_DATA_TYPE(anode->paramList[1], VX_TYPE_PYRAMID); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], VX_TYPE_ARRAY); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], VX_TYPE_ARRAY); + SANITY_CHECK_DATA_TYPE(anode->paramList[4], VX_TYPE_ARRAY); + SANITY_CHECK_DATA_TYPE(anode->paramList[5], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[6], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[7], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[8], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[9], VX_TYPE_SCALAR); + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); +#if 0 // TBD -- enable this when low-level primitives are ready + AgoGraph * agraph = (AgoGraph *)anode->ref.scope; + vx_status status; + char desc[256]; + // add VX_KERNEL_AMD_OPTICAL_FLOW_PREPARE_LK_XY_XY node + sprintf(desc, "array-virtual:INT32,%d", paramList[1]->u.arr.capacity); + AgoData * dataXYmap = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); if (!dataXYmap) return -1; + dataXYmap->name = agoGenerateVirtualDataName(agraph, "XYmap"); agoAddData(&agraph->dataList, dataXYmap); + sprintf(desc, "array-virtual:COORDINATES2D,%d", paramList[1]->u.arr.capacity); + AgoData * dataXY0 = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); if (!dataXY0) return -1; + dataXY0->name = agoGenerateVirtualDataName(agraph, "XY"); agoAddData(&agraph->dataList, dataXY0); + anode->paramList[0] = dataXY0; // tmpXY + anode->paramList[1] = dataXYmap; // XYmap + anode->paramList[2] = paramList[2]; // old_points + anode->paramList[3] = paramList[3]; // new_points_estimates + anode->paramList[4] = paramList[8]; // use_initial_estimate + anode->paramCount = 5; + status = agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_OPTICAL_FLOW_PREPARE_LK_XY_XY); if (status) return status; + // add VX_KERNEL_AMD_OPTICAL_FLOW_IMAGE_LK_XY_XY node per each image in reverse order from the pyramids + float scale = anode->paramList[0]->u.pyr.scale * anode->paramList[0]->u.pyr.levels; + for (vx_int32 child = (vx_int32)anode->paramList[0]->u.pyr.levels - 1; child >= 0; child--) { + AgoData * imgOld = paramList[0]->children[child]; if (!imgOld) return VX_ERROR_INVALID_REFERENCE; + AgoData * imgNew = paramList[1]->children[child]; if (!imgNew) return VX_ERROR_INVALID_REFERENCE; + sprintf(desc, "array-virtual:COORDINATES2D,%d", paramList[1]->u.arr.capacity); + AgoData * dataXY1 = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); if (!dataXY1) return -1; + dataXY1->name = agoGenerateVirtualDataName(agraph, "XY"); agoAddData(&agraph->dataList, dataXY1); + sprintf(desc, "scalar-virtual:FLOAT,%g", scale); + AgoData * dataScale = agoCreateDataFromDescription(anode->ref.context, agraph, desc, false); if (!dataScale) return -1; + dataScale->name = agoGenerateVirtualDataName(agraph, "scale"); agoAddData(&agraph->dataList, dataScale); + anode->paramList[0] = dataXY1; // new points + anode->paramList[1] = dataXY0; // old points + anode->paramList[2] = imgOld; // old image + anode->paramList[3] = imgNew; // new image + anode->paramList[4] = paramList[5]; // termination + anode->paramList[5] = paramList[6]; // epsilon + anode->paramList[6] = 
paramList[7]; // num_iterations + anode->paramList[7] = paramList[9]; // window_dimension + anode->paramList[8] = dataScale; // scale factor + anode->paramCount = 9; + status = agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_OPTICAL_FLOW_IMAGE_LK_XY_XY); if (status) return status; + // save dataXY1 for future reference and set scale factor to inverse of pyramid scale + dataXY0 = dataXY1; + scale = 1.0f / anode->paramList[0]->u.pyr.scale; + } + // add VX_KERNEL_AMD_OPTICAL_FLOW_FINAL_LK_XY_XY node + anode->paramList[0] = paramList[4]; // new_points + anode->paramList[1] = dataXY0; // tmpXY + anode->paramList[2] = dataXYmap; // XYmap + anode->paramCount = 3; + status = agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_OPTICAL_FLOW_FINAL_LK_XY_XY); if (status) return status; + return status; +#else + anode->paramList[0] = paramList[4]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramList[3] = paramList[2]; + anode->paramList[4] = paramList[3]; + anode->paramList[5] = paramList[5]; + anode->paramList[6] = paramList[6]; + anode->paramList[7] = paramList[7]; + anode->paramList[8] = paramList[8]; + anode->paramList[9] = paramList[9]; + anode->paramCount = 10; + return agoDramaDivideAppend(nodeList, anode, VX_KERNEL_AMD_OPTICAL_FLOW_PYR_LK_XY_XY); +#endif +} + +int agoDramaDivideNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // save parameter list + AgoData * paramList[AGO_MAX_PARAMS]; + memcpy(paramList, anode->paramList, sizeof(paramList)); + // divide the node depending on the type + int status = -1; + switch (anode->akernel->id) + { + case VX_KERNEL_COLOR_CONVERT: + status = agoDramaDivideColorConvertNode(nodeList, anode); + break; + case VX_KERNEL_CHANNEL_EXTRACT: + status = agoDramaDivideChannelExtractNode(nodeList, anode); + break; + case VX_KERNEL_CHANNEL_COMBINE: + status = agoDramaDivideChannelCombineNode(nodeList, anode); + break; + case VX_KERNEL_SOBEL_3x3: + status = agoDramaDivideSobel3x3Node(nodeList, anode); + break; + case VX_KERNEL_MAGNITUDE: + status = agoDramaDivideMagnitudeNode(nodeList, anode); + break; + case VX_KERNEL_PHASE: + status = agoDramaDividePhaseNode(nodeList, anode); + break; + case VX_KERNEL_SCALE_IMAGE: + status = agoDramaDivideScaleImageNode(nodeList, anode); + break; + case VX_KERNEL_TABLE_LOOKUP: + status = agoDramaDivideTableLookupNode(nodeList, anode); + break; + case VX_KERNEL_HISTOGRAM: + status = agoDramaDivideHistogramNode(nodeList, anode); + break; + case VX_KERNEL_EQUALIZE_HISTOGRAM: + status = agoDramaDivideEqualizeHistogramNode(nodeList, anode); + break; + case VX_KERNEL_ABSDIFF: + status = agoDramaDivideAbsdiffNode(nodeList, anode); + break; + case VX_KERNEL_MEAN_STDDEV: + status = agoDramaDivideMeanStddevNode(nodeList, anode); + break; + case VX_KERNEL_THRESHOLD: + status = agoDramaDivideThresholdNode(nodeList, anode); + break; + case VX_KERNEL_INTEGRAL_IMAGE: + status = agoDramaDivideIntegralImageNode(nodeList, anode); + break; + case VX_KERNEL_DILATE_3x3: + status = agoDramaDivideDilate3x3Node(nodeList, anode); + break; + case VX_KERNEL_ERODE_3x3: + status = agoDramaDivideErode3x3Node(nodeList, anode); + break; + case VX_KERNEL_MEDIAN_3x3: + status = agoDramaDivideMedian3x3Node(nodeList, anode); + break; + case VX_KERNEL_BOX_3x3: + status = agoDramaDivideBox3x3Node(nodeList, anode); + break; + case VX_KERNEL_GAUSSIAN_3x3: + status = agoDramaDivideGaussian3x3Node(nodeList, anode); + break; + case VX_KERNEL_CUSTOM_CONVOLUTION: + status = agoDramaDivideCustomConvolutionNode(nodeList, anode); + 
break; + case VX_KERNEL_GAUSSIAN_PYRAMID: + status = agoDramaDivideGaussianPyramidNode(nodeList, anode); + break; + case VX_KERNEL_ACCUMULATE: + status = agoDramaDivideAccumulateNode(nodeList, anode); + break; + case VX_KERNEL_ACCUMULATE_WEIGHTED: + status = agoDramaDivideAccumulateWeightedNode(nodeList, anode); + break; + case VX_KERNEL_ACCUMULATE_SQUARE: + status = agoDramaDivideAccumulateSquareNode(nodeList, anode); + break; + case VX_KERNEL_MINMAXLOC: + status = agoDramaDivideMinmaxlocNode(nodeList, anode); + break; + case VX_KERNEL_CONVERTDEPTH: + status = agoDramaDivideConvertDepthNode(nodeList, anode); + break; + case VX_KERNEL_CANNY_EDGE_DETECTOR: + status = agoDramaDivideCannyEdgeDetectorNode(nodeList, anode); + break; + case VX_KERNEL_AND: + status = agoDramaDivideAndNode(nodeList, anode); + break; + case VX_KERNEL_OR: + status = agoDramaDivideOrNode(nodeList, anode); + break; + case VX_KERNEL_XOR: + status = agoDramaDivideXorNode(nodeList, anode); + break; + case VX_KERNEL_NOT: + status = agoDramaDivideNotNode(nodeList, anode); + break; + case VX_KERNEL_MULTIPLY: + status = agoDramaDivideMultiplyNode(nodeList, anode); + break; + case VX_KERNEL_ADD: + status = agoDramaDivideAddNode(nodeList, anode); + break; + case VX_KERNEL_SUBTRACT: + status = agoDramaDivideSubtractNode(nodeList, anode); + break; + case VX_KERNEL_WARP_AFFINE: + status = agoDramaDivideWarpAffineNode(nodeList, anode); + break; + case VX_KERNEL_WARP_PERSPECTIVE: + status = agoDramaDivideWarpPerspectiveNode(nodeList, anode); + break; + case VX_KERNEL_HARRIS_CORNERS: + status = agoDramaDivideHarrisCornersNode(nodeList, anode); + break; + case VX_KERNEL_FAST_CORNERS: + status = agoDramaDivideFastCornersNode(nodeList, anode); + break; + case VX_KERNEL_OPTICAL_FLOW_PYR_LK: + status = agoDramaDivideOpticalFlowPyrLkNode(nodeList, anode); + break; + case VX_KERNEL_REMAP: + status = agoDramaDivideRemapNode(nodeList, anode); + break; + case VX_KERNEL_HALFSCALE_GAUSSIAN: + status = agoDramaDivideHalfscaleGaussianNode(nodeList, anode); + break; + default: + break; + } + // revert parameter list + memcpy(anode->paramList, paramList, sizeof(anode->paramList)); + return status; +} + +int agoOptimizeDramaDivide(AgoGraph * agraph) +{ + int astatus = 0; + for (AgoNode * anode = agraph->nodeList.head, *aprev = 0; anode;) { + // check if current node is a general VX node, that needs division + if ((anode->akernel->flags & AGO_KERNEL_FLAG_GROUP_MASK) == AGO_KERNEL_FLAG_GROUP_OVX10) { + // divide the current node + if (!agoDramaDivideNode(&agraph->nodeList, anode)) { + // remove and release the current node + if (aprev) aprev->next = anode->next; + else agraph->nodeList.head = anode->next; + agraph->nodeList.count--; + if (agraph->nodeList.tail == anode) { + agraph->nodeList.tail = aprev; + } + AgoNode * next = anode->next; + // move anode to trash + anode->ref.internal_count = 0; + anode->next = agraph->nodeList.trash; + agraph->nodeList.trash = anode; + // advance to next node + anode = next; + } + else { + if (anode->akernel->id == VX_KERNEL_INVALID) { + agraph->detectedInvalidNode = true; + } + else { + // TBD: error handling + agoAddLogEntry(&anode->akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaDivide: failed for node %s\n", anode->akernel->name); + astatus = -1; + } + // advance to next node, since node divide failed + aprev = anode; + anode = anode->next; + } + } + else if (anode->akernel->regen_callback_f) { + // try regenerating the node + vx_bool regen_not_needed = vx_true_e; + vx_status status = 
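+ // the kernel-supplied regen callback rebuilds the node as it sees fit; if it returns VX_SUCCESS with regen_not_needed == vx_false_e, the original node is removed below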
anode->akernel->regen_callback_f(agraph, anode, regen_not_needed); + if (status == VX_SUCCESS) { + if (regen_not_needed == vx_false_e) { + // remove and release the current node + if (aprev) aprev->next = anode->next; + else agraph->nodeList.head = anode->next; + agraph->nodeList.count--; + if (agraph->nodeList.tail == anode) { + agraph->nodeList.tail = aprev; + } + AgoNode * next = anode->next; + // move anode to trash + anode->ref.internal_count = 0; + anode->next = agraph->nodeList.trash; + agraph->nodeList.trash = anode; + // advance to next node + anode = next; + } + else { + // advance to next node + aprev = anode; + anode = anode->next; + } + } + else { + // TBD: error handling + agoAddLogEntry(&anode->akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaDivide: failed for node %s\n", anode->akernel->name); + astatus = -1; + // advance to next node, since node divide failed + aprev = anode; + anode = anode->next; + } + } + else { + // advance to next node + aprev = anode; + anode = anode->next; + } + } + return astatus; +} diff --git a/openvx/ago/ago_drama_merge.cpp b/openvx/ago/ago_drama_merge.cpp new file mode 100644 index 0000000..0cb9a2a --- /dev/null +++ b/openvx/ago/ago_drama_merge.cpp @@ -0,0 +1,37 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +int agoOptimizeDramaMerge(AgoGraph * agraph) +{ + for (int graphGotModified = !0; graphGotModified;) + { + // check and mark data usage + agoOptimizeDramaMarkDataUsage(agraph); + + // TBD + graphGotModified = 0; + } + return 0; +} diff --git a/openvx/ago/ago_drama_remove.cpp b/openvx/ago/ago_drama_remove.cpp new file mode 100644 index 0000000..6f92522 --- /dev/null +++ b/openvx/ago/ago_drama_remove.cpp @@ -0,0 +1,1470 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +/////////////////////////////////////////////////////////////////////////////// +// rule book for merge part of remove +#define CHILD(n) (0x10 | ((n) << 8)) +#define SOLITARY AGO_MERGE_RULE_SOLITARY_FLAG // This should be 0x20 +#define BYTE2U1 (0x40) +#define WRITEONLY (0x80) +#define ARG_INDEX(arg_spec) ((arg_spec) & 0x0f) +#define ARG_HAS_CHILD(arg_spec) ((arg_spec) & 0x10) +#define ARG_GET_CHILD(arg_spec) ((arg_spec) >> 8) +#define ARG_IS_SOLITARY(arg_spec) ((arg_spec) & SOLITARY) +#define ARG_IS_BYTE2U1(arg_spec) ((arg_spec) & BYTE2U1) +#define ARG_IS_WRITEONLY(arg_spec) ((arg_spec) & WRITEONLY) +static AgoNodeMergeRule s_merge_rule[] = { + { // RGB to YUV4 + { + { VX_KERNEL_AMD_COLOR_CONVERT_Y_RGB, { 2, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_U_RGB, { 3, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_V_RGB, { 4, 1 } }, + }, + { + { VX_KERNEL_AMD_COLOR_CONVERT_YUV4_RGB, { 2, 3, 4, 1 } }, + } + }, + { // RGB to NV12 + { + { VX_KERNEL_AMD_COLOR_CONVERT_Y_RGB, { 2, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGB, { 3, 1 } }, + }, + { + { VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGB, { 2, 3, 1 } }, + } + }, + { // RGB to IYUV + { + { VX_KERNEL_AMD_COLOR_CONVERT_Y_RGB, { 2, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_IU_RGB, { 3, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_IV_RGB, { 4, 1 } }, + }, + { + { VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGB, { 2, 3, 4, 1 } }, + } + }, + { // RGB to IUV + { + { VX_KERNEL_AMD_COLOR_CONVERT_IU_RGB, { 2, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_IV_RGB, { 3, 1 } }, + }, + { + { VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGB, { 2, 3, 1 } }, + } + }, + { // RGBX to YUV4 + { + { VX_KERNEL_AMD_COLOR_CONVERT_Y_RGBX, { 2, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_U_RGBX, { 3, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_V_RGBX, { 4, 1 } }, + }, + { + { VX_KERNEL_AMD_COLOR_CONVERT_YUV4_RGBX, { 2, 3, 4, 1 } }, + } + }, + { // RGBX to NV12 + { + { VX_KERNEL_AMD_COLOR_CONVERT_Y_RGBX, { 2, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGBX, { 3, 1 } }, + }, + { + { VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGBX, { 2, 3, 1 } }, + } + }, + { // RGBX to IYUV + { + { VX_KERNEL_AMD_COLOR_CONVERT_Y_RGBX, { 2, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_IU_RGBX, { 3, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_IV_RGBX, { 4, 1 } }, + }, + { + { VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGBX, { 2, 3, 4, 1 } }, + } + }, + { // RGBX to IUV + { + { VX_KERNEL_AMD_COLOR_CONVERT_IU_RGBX, { 2, 1 } }, + { VX_KERNEL_AMD_COLOR_CONVERT_IV_RGBX, { 3, 1 } }, + }, + { + { VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGBX, { 2, 3, 1 } }, + } + }, + { // combined channel extract of RGBX from RGBX + { + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS0, { 2, 1 } }, + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS1, { 3, 1 } }, + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS2, { 4, 1 } }, + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS3, { 5, 1 } }, + }, + { + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8U8U8U8_U32, { 2, 3, 4, 5, 1 } }, + } + }, + { // combined channel extract of RGB from RGBX + { + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS0, { 2, 1 } }, + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS1, { 3, 1 } }, + { 
VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS2, { 4, 1 } }, + }, + { + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8U8U8_U32, { 2, 3, 4, 1 } }, + } + }, + { // combined channel extract of RGB from RGB + { + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS0, { 2, 1 } }, + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS1, { 3, 1 } }, + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS2, { 4, 1 } }, + }, + { + { VX_KERNEL_AMD_CHANNEL_EXTRACT_U8U8U8_U24, { 2, 3, 4, 1 } }, + } + }, + { // SOBEL GX + GY = GXY + { + { VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GX, { 2, 1 } }, + { VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GY, { 3, 1 } }, + }, + { + { VX_KERNEL_AMD_SOBEL_S16S16_U8_3x3_GXY, { 2, 3, 1 } }, + } + }, + { // SOBEL + MAGNITUDE + PHASE = SOBEL_MAGNITUDE_PHASE + { + { VX_KERNEL_AMD_SOBEL_S16S16_U8_3x3_GXY, { 2 | SOLITARY, 3 | SOLITARY, 1 } }, + { VX_KERNEL_AMD_MAGNITUDE_S16_S16S16, { 4, 2 | SOLITARY, 3 | SOLITARY } }, + { VX_KERNEL_AMD_PHASE_U8_S16S16, { 5, 2 | SOLITARY, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_SOBEL_MAGNITUDE_PHASE_S16U8_U8_3x3, { 4, 5, 1 } }, + } + }, + { // SOBEL + MAGNITUDE = SOBEL_MAGNITUDE + { + { VX_KERNEL_AMD_SOBEL_S16S16_U8_3x3_GXY, { 2 | SOLITARY, 3 | SOLITARY, 1 } }, + { VX_KERNEL_AMD_MAGNITUDE_S16_S16S16, { 4, 2 | SOLITARY, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_SOBEL_MAGNITUDE_S16_U8_3x3, { 4, 1 } }, + } + }, + { // SOBEL + PHASE = SOBEL_PHASE + { + { VX_KERNEL_AMD_SOBEL_S16S16_U8_3x3_GXY, { 2 | SOLITARY, 3 | SOLITARY, 1 } }, + { VX_KERNEL_AMD_PHASE_U8_S16S16, { 4, 2 | SOLITARY, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_SOBEL_PHASE_U8_U8_3x3, { 4, 1 } }, + } + }, + { // AND_U8_U8U8 + NOT_U8_U8 = NAND_U8_U8U8 + { + { VX_KERNEL_AMD_AND_U8_U8U8, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_NAND_U8_U8U8, { 4, 2, 1 } }, + } + }, + { // OR_U8_U8U8 + NOT_U8_U8 = NOR_U8_U8U8 + { + { VX_KERNEL_AMD_OR_U8_U8U8, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_NOR_U8_U8U8, { 4, 2, 1 } }, + } + }, + { // XOR_U8_U8U8 + NOT_U8_U8 = XNOR_U8_U8U8 + { + { VX_KERNEL_AMD_XOR_U8_U8U8, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_XNOR_U8_U8U8, { 4, 2, 1 } }, + } + }, + { // NAND_U8_U8U8 + NOT_U8_U8 = AND_U8_U8U8 + { + { VX_KERNEL_AMD_NAND_U8_U8U8, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_AND_U8_U8U8, { 4, 2, 1 } }, + } + }, + { // NOR_U8_U8U8 + NOT_U8_U8 = OR_U8_U8U8 + { + { VX_KERNEL_AMD_NOR_U8_U8U8, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_OR_U8_U8U8, { 4, 2, 1 } }, + } + }, + { // XNOR_U8_U8U8 + NOT_U8_U8 = XOR_U8_U8U8 + { + { VX_KERNEL_AMD_XNOR_U8_U8U8, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_XOR_U8_U8U8, { 4, 2, 1 } }, + } + }, + { // THRESHOLD + NOT = THRESHOLD_NOT (U8 U8 U8 BINARY) + { + { VX_KERNEL_AMD_THRESHOLD_U8_U8_BINARY, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_BINARY, { 4, 2, 1 } }, + } + }, + { // THRESHOLD + NOT = THRESHOLD_NOT (U8 U8 U8 RANGE) + { + { VX_KERNEL_AMD_THRESHOLD_U8_U8_RANGE, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_RANGE, { 4, 2, 1 } }, + } + }, + { // THRESHOLD + NOT = THRESHOLD_NOT (U8 U1 U8 BINARY) + { + { VX_KERNEL_AMD_THRESHOLD_U1_U8_BINARY, { 3 | SOLITARY, 2, 1 } 
}, + { VX_KERNEL_AMD_NOT_U8_U1, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_BINARY, { 4, 2, 1 } }, + } + }, + { // THRESHOLD + NOT = THRESHOLD_NOT (U8 U1 U8 RANGE) + { + { VX_KERNEL_AMD_THRESHOLD_U1_U8_RANGE, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U1, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_RANGE, { 4, 2, 1 } }, + } + }, + { // THRESHOLD + NOT = THRESHOLD_NOT (U1 U1 U8 BINARY) + { + { VX_KERNEL_AMD_THRESHOLD_U1_U8_BINARY, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U1_U1, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_THRESHOLD_NOT_U1_U8_BINARY, { 4, 2, 1 } }, + } + }, + { // THRESHOLD + NOT = THRESHOLD_NOT (U1 U1 U8 RANGE) + { + { VX_KERNEL_AMD_THRESHOLD_U1_U8_RANGE, { 3 | SOLITARY, 2, 1 } }, + { VX_KERNEL_AMD_NOT_U1_U1, { 4, 3 | SOLITARY } }, + }, + { + { VX_KERNEL_AMD_THRESHOLD_NOT_U1_U8_RANGE, { 4, 2, 1 } }, + } + }, + { // AND_U8_U8U8(same-inputs) = COPY + { + { VX_KERNEL_AMD_AND_U8_U8U8, { 2, 1, 1 } }, + }, + { + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 2, 1 } }, + } + }, + { // OR_U8_U8U8(same-inputs) = COPY + { + { VX_KERNEL_AMD_OR_U8_U8U8, { 2, 1, 1 } }, + }, + { + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 2, 1 } }, + } + }, + { // XOR_U8_U8U8(same-inputs) = ZERO + { + { VX_KERNEL_AMD_XOR_U8_U8U8, { 2, 1, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + } + }, + { // NAND_U8_U8U8(same-inputs) = NOT_U8_U8 + { + { VX_KERNEL_AMD_NAND_U8_U8U8, { 2, 1, 1 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + } + }, + { // NOR_U8_U8U8(same-inputs) = NOT_U8_U8 + { + { VX_KERNEL_AMD_NOR_U8_U8U8, { 2, 1, 1 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + } + }, + { // XNOR_U8_U8U8(same-inputs) = FF + { + { VX_KERNEL_AMD_XNOR_U8_U8U8, { 2, 1, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + } + }, + { // 00-NOT to 00-FF + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + } + }, + { // 00-AND to 00-00 + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_AND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // 00-AND to 00-00 + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_AND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // FF-AND to FF-COPY + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_AND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 2 } }, + } + }, + { // FF-AND to FF-COPY + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_AND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 1 } }, + } + }, + { // FF-OR to FF-FF + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_OR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // FF-OR to FF-FF + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_OR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // 00-OR to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_OR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 2 } }, + } + }, + { // 00-OR to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, 
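+ // note: numbers in each arg_spec are shared data-slot indices (ARG_INDEX); the same index in find[] and replace[] binds to the same data object, and a SOLITARY slot must be virtual data used only by the matched nodes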
+ { VX_KERNEL_AMD_OR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 1 } }, + } + }, + { // 00-XOR to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_XOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 2 } }, + } + }, + { // 00-XOR to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_XOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 1 } }, + } + }, + { // FF-XOR to FF-NOT + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_XOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 3, 2 } }, + } + }, + { // FF-XOR to FF-NOT + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_XOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 3, 1 } }, + } + }, + { // 00-NAND to 00-FF + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_NAND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // 00-NAND to 00-FF + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_NAND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // FF-NAND to FF-NOT + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_NAND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 3, 2 } }, + } + }, + { // FF-NAND to FF-NOT + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_NAND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 3, 1 } }, + } + }, + { // FF-NOR to FF-00 + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_NOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // FF-NOR to FF-00 + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_NOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // 00-NOR to 00-NOT + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_NOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 3, 2 } }, + } + }, + { // 00-NOR to 00-NOT + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_NOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 3, 1 } }, + } + }, + { // 00-XNOR to 00-NOT + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_XNOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 3, 2 } }, + } + }, + { // 00-XNOR to 00-NOT + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_XNOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 3, 1 } }, + } + }, + { // FF-XNOR to FF-COPY + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_XNOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 2 } }, + } + }, + { // FF-XNOR to FF-COPY + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_XNOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 1 } }, + } + }, + { // 00-ADD(wrap) 
to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_ADD_U8_U8U8_WRAP, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 2 } }, + } + }, + { // 00-ADD(wrap) to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_ADD_U8_U8U8_WRAP, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 1 } }, + } + }, + { // 00-ADD(sat) to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_ADD_U8_U8U8_SAT, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 2 } }, + } + }, + { // 00-ADD(sat) to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_ADD_U8_U8U8_SAT, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 1 } }, + } + }, + { // 00-SUB(wrap) to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_SUB_U8_U8U8_WRAP, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 2 } }, + } + }, + { // 00-SUB(sat) to 00-COPY + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_SUB_U8_U8U8_SAT, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 2 } }, + } + }, + { // 00-ACCUMULATE to 00 + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_ACCUMULATE_S16_S16U8_SAT, { 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + } + }, + { // 00-ACCUMULATE_SQUARED to 00 + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + { VX_KERNEL_AMD_ACCUMULATE_SQUARED_S16_S16U8_SAT, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + } + }, + { // NOT-NOT to NOT-COPY + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_NOT_U8_U8, { 3, 2 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, { 3, 1 } }, + } + }, + { // NOT-AND to NOT-00 + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_AND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // NOT-AND to NOT-00 + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_AND_U8_U8U8, { 3, 1, 2 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // NOT-OR to NOT-FF + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_OR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // NOT-OR to NOT-FF + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_OR_U8_U8U8, { 3, 1, 2 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // NOT-XOR to NOT-FF + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_XOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // NOT-XOR to NOT-FF + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_XOR_U8_U8U8, { 3, 1, 2 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // NOT-NAND to NOT-FF + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_NAND_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // NOT-NAND to NOT-FF + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_NAND_U8_U8U8, { 3, 
1, 2 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 3 } }, + } + }, + { // NOT-NOR to NOT-00 + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_NOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // NOT-NOR to NOT-00 + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_NOR_U8_U8U8, { 3, 1, 2 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // NOT-XNOR to NOT-00 + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_XNOR_U8_U8U8, { 3, 2, 1 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // NOT-XNOR to NOT-00 + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_XNOR_U8_U8U8, { 3, 1, 2 } }, + }, + { + { VX_KERNEL_AMD_NOT_U8_U8, { 2, 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 3 } }, + } + }, + { // 00-DILATE to 00-00 + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_ERODE_U8_U8_3x3, { 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + } + }, + { // FF-DILATE to FF-FF + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_ERODE_U8_U8_3x3, { 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + } + }, + { // 00-ERODE to 00-00 + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_ERODE_U8_U8_3x3, { 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_00_U8, { 1 } }, + { VX_KERNEL_AMD_SET_00_U8, { 2 } }, + } + }, + { // FF-ERODE to FF-FF + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_ERODE_U8_U8_3x3, { 2, 1 } }, + }, + { + { VX_KERNEL_AMD_SET_FF_U8, { 1 } }, + { VX_KERNEL_AMD_SET_FF_U8, { 2 } }, + } + }, +}; +static vx_uint32 s_merge_rule_count = sizeof(s_merge_rule) / sizeof(s_merge_rule[0]); + +/////////////////////////////////////////////////////////////////////////////// +// rule book for VX_DF_IMAGE_U8 to VX_DF_IMAGE_U1_AMD conversion +typedef struct AgoImageU8toU1Rule_t { + vx_enum find_kernel_id; + vx_int32 arg_index; + vx_enum replace_kernel_id; +} AgoImageU8toU1Rule; +static AgoImageU8toU1Rule s_U8toU1_rule[] = { + // VX_KERNEL_AMD_NOT_* kernels + { VX_KERNEL_AMD_NOT_U8_U8, 1, VX_KERNEL_AMD_NOT_U8_U1 }, + { VX_KERNEL_AMD_NOT_U1_U8, 1, VX_KERNEL_AMD_NOT_U1_U1 }, + { VX_KERNEL_AMD_NOT_U8_U1, 0, VX_KERNEL_AMD_NOT_U1_U1 }, + // VX_KERNEL_AMD_AND_* kernels + { VX_KERNEL_AMD_AND_U8_U8U8, 2, VX_KERNEL_AMD_AND_U8_U8U1 }, + { VX_KERNEL_AMD_AND_U8_U8U8, 1, VX_KERNEL_AMD_AND_U8_U1U8 }, + { VX_KERNEL_AMD_AND_U8_U8U1, 1, VX_KERNEL_AMD_AND_U8_U1U1 }, + { VX_KERNEL_AMD_AND_U8_U1U8, 2, VX_KERNEL_AMD_AND_U8_U1U1 }, + { VX_KERNEL_AMD_AND_U8_U1U1, 0, VX_KERNEL_AMD_AND_U1_U1U1 }, + // VX_KERNEL_AMD_OR_* kernels + { VX_KERNEL_AMD_OR_U8_U8U8, 2, VX_KERNEL_AMD_OR_U8_U8U1 }, + { VX_KERNEL_AMD_OR_U8_U8U8, 1, VX_KERNEL_AMD_OR_U8_U1U8 }, + { VX_KERNEL_AMD_OR_U8_U8U1, 1, VX_KERNEL_AMD_OR_U8_U1U1 }, + { VX_KERNEL_AMD_OR_U8_U1U8, 2, VX_KERNEL_AMD_OR_U8_U1U1 }, + { VX_KERNEL_AMD_OR_U8_U1U1, 0, VX_KERNEL_AMD_OR_U1_U1U1 }, + // VX_KERNEL_AMD_XOR_* kernels + { VX_KERNEL_AMD_XOR_U8_U8U8, 2, VX_KERNEL_AMD_XOR_U8_U8U1 }, + { VX_KERNEL_AMD_XOR_U8_U8U8, 1, VX_KERNEL_AMD_XOR_U8_U1U8 }, + { VX_KERNEL_AMD_XOR_U8_U8U1, 1, VX_KERNEL_AMD_XOR_U8_U1U1 }, + { VX_KERNEL_AMD_XOR_U8_U1U8, 2, VX_KERNEL_AMD_XOR_U8_U1U1 }, + { VX_KERNEL_AMD_XOR_U8_U1U1, 0, VX_KERNEL_AMD_XOR_U1_U1U1 }, + // VX_KERNEL_AMD_NAND_* kernels + { VX_KERNEL_AMD_NAND_U8_U8U8, 2, 
VX_KERNEL_AMD_NAND_U8_U8U1 }, + { VX_KERNEL_AMD_NAND_U8_U8U8, 1, VX_KERNEL_AMD_NAND_U8_U1U8 }, + { VX_KERNEL_AMD_NAND_U8_U8U1, 1, VX_KERNEL_AMD_NAND_U8_U1U1 }, + { VX_KERNEL_AMD_NAND_U8_U1U8, 2, VX_KERNEL_AMD_NAND_U8_U1U1 }, + { VX_KERNEL_AMD_NAND_U8_U1U1, 0, VX_KERNEL_AMD_NAND_U1_U1U1 }, + // VX_KERNEL_AMD_NOR_* kernels + { VX_KERNEL_AMD_NOR_U8_U8U8, 2, VX_KERNEL_AMD_NOR_U8_U8U1 }, + { VX_KERNEL_AMD_NOR_U8_U8U8, 1, VX_KERNEL_AMD_NOR_U8_U1U8 }, + { VX_KERNEL_AMD_NOR_U8_U8U1, 1, VX_KERNEL_AMD_NOR_U8_U1U1 }, + { VX_KERNEL_AMD_NOR_U8_U1U8, 2, VX_KERNEL_AMD_NOR_U8_U1U1 }, + { VX_KERNEL_AMD_NOR_U8_U1U1, 0, VX_KERNEL_AMD_NOR_U1_U1U1 }, + // VX_KERNEL_AMD_XNOR_* kernels + { VX_KERNEL_AMD_XNOR_U8_U8U8, 2, VX_KERNEL_AMD_XNOR_U8_U8U1 }, + { VX_KERNEL_AMD_XNOR_U8_U8U8, 1, VX_KERNEL_AMD_XNOR_U8_U1U8 }, + { VX_KERNEL_AMD_XNOR_U8_U8U1, 1, VX_KERNEL_AMD_XNOR_U8_U1U1 }, + { VX_KERNEL_AMD_XNOR_U8_U1U8, 2, VX_KERNEL_AMD_XNOR_U8_U1U1 }, + { VX_KERNEL_AMD_XNOR_U8_U1U1, 0, VX_KERNEL_AMD_XNOR_U1_U1U1 }, + // VX_KERNEL_AMD_THRESHOLD_* kernels + { VX_KERNEL_AMD_THRESHOLD_U8_U8_BINARY, 0, VX_KERNEL_AMD_THRESHOLD_U1_U8_BINARY }, + { VX_KERNEL_AMD_THRESHOLD_U8_U8_RANGE, 0, VX_KERNEL_AMD_THRESHOLD_U1_U8_RANGE }, + { VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_BINARY, 0, VX_KERNEL_AMD_THRESHOLD_NOT_U1_U8_BINARY }, + { VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_RANGE, 0, VX_KERNEL_AMD_THRESHOLD_NOT_U1_U8_RANGE }, + // VX_KERNEL_AMD_DILATE_* kernels + { VX_KERNEL_AMD_DILATE_U8_U8_3x3, 1, VX_KERNEL_AMD_DILATE_U8_U1_3x3 }, + { VX_KERNEL_AMD_DILATE_U1_U8_3x3, 1, VX_KERNEL_AMD_DILATE_U1_U1_3x3 }, + { VX_KERNEL_AMD_DILATE_U8_U1_3x3, 0, VX_KERNEL_AMD_DILATE_U1_U1_3x3 }, + // VX_KERNEL_AMD_ERODE_* kernels + { VX_KERNEL_AMD_ERODE_U8_U8_3x3, 1, VX_KERNEL_AMD_ERODE_U8_U1_3x3 }, + { VX_KERNEL_AMD_ERODE_U1_U8_3x3, 1, VX_KERNEL_AMD_ERODE_U1_U1_3x3 }, + { VX_KERNEL_AMD_ERODE_U8_U1_3x3, 0, VX_KERNEL_AMD_ERODE_U1_U1_3x3 }, + // VX_KERNEL_AMD_CHANNEL_COPY_* kernels + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, 1, VX_KERNEL_AMD_CHANNEL_COPY_U8_U1 }, + { VX_KERNEL_AMD_CHANNEL_COPY_U1_U8, 1, VX_KERNEL_AMD_CHANNEL_COPY_U1_U1 }, + { VX_KERNEL_AMD_CHANNEL_COPY_U8_U1, 0, VX_KERNEL_AMD_CHANNEL_COPY_U1_U1 }, +}; +static vx_uint32 s_U8toU1_rule_count = sizeof(s_U8toU1_rule) / sizeof(s_U8toU1_rule[0]); + +int agoOptimizeDramaRemoveCopyNodes(AgoGraph * agraph) +{ + // find and remove COPY nodes with virtual buffers + for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) { + AgoKernel * akernel = anode->akernel; + bool nodeCanBeRemoved = false; + if (anode->akernel->id == VX_KERNEL_AMD_CHANNEL_COPY_U8_U8) + { + // copy of a virtual data can be removed by just replacing the virtual data + // TBD: need to handle possible optimizations with buffers in delay object + AgoData * dstParam = anode->paramList[0]; + AgoData * srcParam = anode->paramList[1]; + bool replaceSrc = false; + bool replaceDst = false; + if (dstParam->isVirtual && !agoIsPartOfDelay(dstParam)) { + replaceDst = true; + } + if (srcParam->isVirtual && !agoIsPartOfDelay(srcParam)) { + replaceSrc = true; + } + if (replaceSrc && replaceDst) { + // prioritize between src and dst + if (dstParam->parent && srcParam->parent) { + if (dstParam->parent->ref.type == VX_TYPE_PYRAMID && srcParam->parent->ref.type == VX_TYPE_PYRAMID) { + // if both pyramids are used by a node, needs special handling + if (dstParam->parent->inputUsageCount > 0 && srcParam->parent->inputUsageCount > 0) { + // TBD: this needs to be optimized carefully + replaceDst = false; + replaceSrc = false; + } + } + else if 
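+ // only the destination buffer belongs to a pyramid here: keep it, so the source side gets substituted instead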
(dstParam->parent->ref.type == VX_TYPE_PYRAMID) { + replaceDst = false; + } + } + else if (dstParam->parent) { + replaceDst = false; + } + } + if (replaceDst) { +#if ENABLE_DEBUG_MESSAGES + vx_char srcName[256], dstName[256]; + agoGetDataName(srcName, srcParam); + agoGetDataName(dstName, dstParam); + debug_printf("agoOptimizeDramaRemoveCopyNodes: replacing %s(dst) with %s(src)\n", dstName[0] ? dstName : "", srcName[0] ? srcName : ""); +#endif + nodeCanBeRemoved = true; + // replace all occurances of dstParam with srcParam + agoReplaceDataInGraph(agraph, dstParam, srcParam); + } + else if (replaceSrc) { +#if ENABLE_DEBUG_MESSAGES + vx_char srcName[256], dstName[256]; + agoGetDataName(srcName, srcParam); + agoGetDataName(dstName, dstParam); + debug_printf("agoOptimizeDramaRemoveCopyNodes: replacing %s(src) with %s(dst)\n", srcName[0] ? srcName : "", dstName[0] ? dstName : ""); +#endif + nodeCanBeRemoved = true; + // replace all occurances of srcParam with dstParam + agoReplaceDataInGraph(agraph, srcParam, dstParam); + } + } + if (nodeCanBeRemoved) { + debug_printf("INFO: agoOptimizeDramaRemoveCopyNodes: removing node %s\n", anode->akernel->name); + // remove the node + if (agoRemoveNode(&agraph->nodeList, anode, true)) { + agoAddLogEntry(&anode->akernel->ref, -1, "ERROR: agoOptimizeDramaRemoveCopyNodes: agoRemoveNode(*,%s) failed\n", anode->akernel->name); + return -1; + } + // make only one change at a time + return 1; + } + } + // no changes happened to the graph + return 0; +} + +int agoOptimizeDramaRemoveNodesWithUnusedOutputs(AgoGraph * agraph) +{ + // find and remove nodes who's outputs are not used + for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) { + AgoKernel * akernel = anode->akernel; + bool nodeCanBeRemoved = true; + for (vx_uint32 arg = 0; arg < anode->paramCount; arg++) { + if (anode->paramList[arg]) { + vx_uint32 inputUsageCount = anode->paramList[arg]->inputUsageCount; + for (AgoData * pdata = anode->paramList[arg]->parent; pdata; pdata = pdata->parent) { + inputUsageCount += pdata->inputUsageCount; + } + if (anode->paramList[arg]->isVirtual && (akernel->argConfig[arg] & AGO_KERNEL_ARG_OUTPUT_FLAG) && (inputUsageCount > 0)) { + // found a virtual output data that is being used elsewhere + nodeCanBeRemoved = false; + break; + } + else if (!anode->paramList[arg]->isVirtual && (akernel->argConfig[arg] & AGO_KERNEL_ARG_OUTPUT_FLAG)) { + // found a physical output data that can be accessed by user + nodeCanBeRemoved = false; + break; + } + } + } + if (nodeCanBeRemoved) { + debug_printf("INFO: agoOptimizeDramaRemoveNodesWithUnusedOutputs: removing node %s\n", anode->akernel->name); + // remove the node + if (agoRemoveNode(&agraph->nodeList, anode, true)) { + agoAddLogEntry(&anode->akernel->ref, -1, "ERROR: agoOptimizeDramaRemoveNodesWithUnusedOutputs: agoRemoveNode(*,%s) failed\n", anode->akernel->name); + return -1; + } + // make only one change at a time + return 1; + } + } + // no changes happened to the graph + return 0; +} + +int agoOptimizeDramaRemoveNodeMerge(AgoGraph * agraph) +{ + // apply node merge rules + int ruleSet = 0; + vx_uint32 rule_count = s_merge_rule_count; + for (vx_uint32 iRule = 0; iRule <= rule_count; iRule++) { + AgoNodeMergeRule * rule; + if (iRule == rule_count) { + if (ruleSet++ == 1) + break; + rule_count = (vx_uint32)agraph->ref.context->merge_rules.size(); + if (rule_count == 0) + break; + iRule = 0; + } + if (ruleSet == 0) { + rule = &s_merge_rule[iRule]; + } + else { + rule = &agraph->ref.context->merge_rules[iRule]; + } + 
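+ // backtracking search: stack[i] is the graph node currently tried against find[i] of the rule, and mdata[] records the data object bound to each shared slot index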
// find match + vx_uint32 numMatchNodes = 0; + for (vx_uint32 iNode = 0; iNode < AGO_MERGE_RULE_MAX_FIND && rule->find[iNode].kernel_id; iNode++) { + numMatchNodes++; + } + AgoData * mdata[AGO_MAX_PARAMS] = { 0 }; + AgoNode * stack[AGO_MERGE_RULE_MAX_FIND] { agraph->nodeList.head }; + vx_int32 stackTop = 0; + for (;;) { + bool foundMatch = false; + if (stack[stackTop]->akernel->id == rule->find[stackTop].kernel_id) { + foundMatch = true; + memset(mdata, 0, sizeof(mdata)); + for (vx_int32 iNode = 0; iNode <= stackTop; iNode++) { + for (vx_uint32 arg = 0; arg < AGO_MAX_PARAMS; arg++) { + // get argument specificaiton from the rule of current node + vx_uint32 arg_spec = rule->find[iNode].arg_spec[arg]; + if (arg_spec) { + if (!(arg < stack[iNode]->paramCount && stack[iNode]->paramList[arg])) { + // node doesn't have required argument + foundMatch = false; + break; + } + } + else { + if (arg < stack[iNode]->paramCount && stack[iNode]->paramList[arg]) { + // node has argument that is missing in the rule + foundMatch = false; + break; + } + // this matches the rule + continue; + } + AgoData * data = stack[iNode]->paramList[arg]; + + // get argument info and sanity checks + vx_int32 arg_index = ARG_INDEX(arg_spec); + vx_int32 arg_child = ARG_HAS_CHILD(arg_spec) ? ARG_GET_CHILD(arg_spec) : -1; + if (arg_child >= 0) { + if (!data->parent || !(arg_child < (vx_int32)data->parent->numChildren) || !(data->parent->children[arg_child] == data)) { + // node doesn't have required argument as a child + foundMatch = false; + break; + } + data = data->parent; + } + if (!mdata[arg_index]) { + // save the data object for comparison with other parameter comparision + mdata[arg_index] = data; + } + if (mdata[arg_index] != data) { + // data doesn't match with previously saved parameter as dectated by the rule + foundMatch = false; + break; + } + if ((ARG_IS_SOLITARY(arg_spec) || ARG_IS_WRITEONLY(arg_spec)) && !data->isVirtual) { + // data virtual properties doesn't match with the rule requirements + foundMatch = false; + break; + } + if (ARG_IS_WRITEONLY(arg_spec) && data->inputUsageCount > 0) { + // data write-only properties doesn't match with the rule requirements + foundMatch = false; + break; + } + } + } + } + // check if a match is found, proceed to next step in the search + if (foundMatch) { + if ((stackTop + 1) == numMatchNodes) { + // check for virtual node removal criteria + for (vx_int32 arg_index = 0; arg_index < AGO_MAX_PARAMS; arg_index++) { + if (mdata[arg_index]) { + // check if data in find rule spec is missing in the replace rule spec, or + // solitary check is requested + bool data_missing_in_replace = true; + bool solitary_requested = false; + for (vx_uint32 iNode = 0; iNode < AGO_MERGE_RULE_MAX_REPLACE && rule->replace[iNode].kernel_id; iNode++) { + for (vx_uint32 arg = 0; arg < AGO_MAX_PARAMS; arg++) { + vx_int32 arg_spec = rule->replace[iNode].arg_spec[arg]; + if (arg_spec) { + if (arg_index == ARG_INDEX(arg_spec)) { + data_missing_in_replace = false; + if (ARG_IS_SOLITARY(arg_spec)) { + solitary_requested = true; + } + } + } + } + } + if (data_missing_in_replace || solitary_requested) { + // make sure that the data is virtual and no other nodes except nodes in the stack[] use this data + if (!mdata[arg_index]->isVirtual) + foundMatch = false; + else { + for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) { + bool node_on_stack = false; + for (vx_int32 i = 0; i <= stackTop; i++) { + if (stack[i] == anode) { + node_on_stack = true; + break; + } + } + if (!node_on_stack) 
{ + // check if data used by the node + bool data_used_outside_rule = false; + for (vx_uint32 i = 0; i < anode->paramCount; i++) { + if (anode->paramList[i] == mdata[arg_index]) { + data_used_outside_rule = true; + break; + } + } + if (data_used_outside_rule) { + // the data can't be discarded by this rule + foundMatch = false; + break; + } + } + } + } + } + } + if (foundMatch) { + // found a match to the complete rule + stackTop++; + break; + } + } + if (foundMatch) { + // skip to the next node in the rule and start searching + stack[++stackTop] = agraph->nodeList.head; + } + } + if (!foundMatch) { + // skip to the next node, since no match has been found at the stackTop node in the rule + stack[stackTop] = stack[stackTop]->next; + // when end-of-node-list is reached, go back one node in the rule and try the next node + while (!stack[stackTop]) { + stackTop--; + if (stackTop < 0) { + break; + } + stack[stackTop] = stack[stackTop]->next; + } + if (stackTop < 0) { + // reached the end of the search and no matches were found + break; + } + } + } + + if (stackTop == numMatchNodes) { + // get affinity, border_mode, and callback attributes + AgoTargetAffinityInfo_ attr_affinity = { 0 }; + vx_border_mode_t attr_border_mode = { 0 }; + vx_nodecomplete_f callback = NULL; + for (vx_int32 iNode = 0; iNode < stackTop; iNode++) { + if (stack[iNode]->callback) { + callback = stack[iNode]->callback; + } + if (stack[iNode]->attr_affinity.device_type) { + attr_affinity.device_type = stack[iNode]->attr_affinity.device_type; + } + if (stack[iNode]->attr_border_mode.mode) { + // TBD: check whether to propagate border mode + // attr_border_mode = stack[iNode]->attr_border_mode; + } + } + // add new nodes per rule's replace[] specification + for (vx_uint32 iNode = 0; iNode < AGO_MERGE_RULE_MAX_REPLACE && rule->replace[iNode].kernel_id; iNode++) { + // create a new AgoNode and add it to the nodeList + AgoNode * childnode = agoCreateNode(agraph, rule->replace[iNode].kernel_id); + for (vx_uint32 arg = 0; arg < AGO_MAX_PARAMS; arg++) { + vx_int32 arg_spec = rule->replace[iNode].arg_spec[arg]; + if (arg_spec) { + vx_int32 arg_index = ARG_INDEX(arg_spec); + vx_int32 arg_child = ARG_HAS_CHILD(arg_spec) ?
ARG_GET_CHILD(arg_spec) : -1; + AgoData * data = mdata[arg_index]; + if (arg_child >= 0) { + if (!(arg_child < (vx_int32)data->numChildren) || !data->children[arg_child]) { + // TBD: error handling + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoOptimizeDramaRemoveNodeMerge: invalid child(%d) in arg:%d of replace-node:%d of rule:%d\n", arg_child, arg, iNode, iRule); + return -1; + } + data = data->children[arg_child]; + } + if (ARG_IS_BYTE2U1(arg_spec)) { + // process the request to convert U8 image to U1 image + if (data->ref.type == VX_TYPE_IMAGE && data->u.img.format == VX_DF_IMAGE_U8) { + data->u.img.format = VX_DF_IMAGE_U1_AMD; + } + } + childnode->paramList[arg] = data; + } + } + // transfer configuration from rule to childnode + childnode->attr_affinity = attr_affinity; + childnode->attr_border_mode = attr_border_mode; + childnode->callback = callback; + debug_printf("INFO: agoOptimizeDramaRemoveNodeMerge: added node %s\n", childnode->akernel->name); + // verify the node + if (agoVerifyNode(childnode)) { + return -1; + } + } + // remove the nodes that matched the rule's find[] + for (vx_int32 iNode = 0; iNode < stackTop; iNode++) { + debug_printf("INFO: agoOptimizeDramaRemoveNodeMerge: removing node %s\n", stack[iNode]->akernel->name); + if (agoRemoveNode(&agraph->nodeList, stack[iNode], true)) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoOptimizeDramaRemoveNodeMerge: agoRemoveNode(*,%s) failed\n", stack[iNode]->akernel->name); + return -1; + } + stack[iNode] = 0; + } + // make only one change at a time + return 1; + } + } + + // try special case node mapping + for (AgoNode * node = agraph->nodeList.head; node; node = node->next) + { + AgoKernel * kernel = node->akernel; + if (kernel->id == VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_NEAREST || kernel->id == VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR || + kernel->id == VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR_REPLICATE || kernel->id == VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR_CONSTANT || + kernel->id == VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_AREA) + { + AgoNode * childnode = NULL; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + // 1:1 scale image is the same as a channel copy + vx_float32 offset = (kernel->id == VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_AREA) ?
-0.5f : 0.0f; + if ((iImg->u.img.width == oImg->u.img.width) && (iImg->u.img.height == oImg->u.img.height)) { + // replace the node with VX_KERNEL_AMD_CHANNEL_COPY_U8_U8 + childnode = agoCreateNode(agraph, VX_KERNEL_AMD_CHANNEL_COPY_U8_U8); + childnode->paramList[0] = oImg; + childnode->paramList[1] = iImg; + } + // approaximate AREA interpolation mode with scale factors not greater than 1.0f with BILINEAR + else if (kernel->id == VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_AREA && !((iImg->u.img.width > oImg->u.img.width) && (iImg->u.img.height > oImg->u.img.height))) { + // replace the node with VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_NEAREST + childnode = agoCreateNode(agraph, VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_NEAREST); + childnode->paramList[0] = oImg; + childnode->paramList[1] = iImg; + } + if (childnode) { + // transfer configuration from node to childnode + childnode->attr_affinity = node->attr_affinity; + agoImportNodeConfig(childnode, node); + debug_printf("INFO: agoOptimizeDramaRemoveNodeMerge: added node %s\n", childnode->akernel->name); + // remove the original node + debug_printf("INFO: agoOptimizeDramaRemoveNodeMerge: removing node %s\n", node->akernel->name); + if (agoRemoveNode(&agraph->nodeList, node, true)) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: agoOptimizeDramaRemoveNodeMerge: agoRemoveNode(*,%s) failed\n", node->akernel->name); + return -1; + } + // verify the node + if (agoVerifyNode(childnode)) { + return -1; + } + // make only one change at a time + return 1; + } + } + } + + // no changes happened to the graph + return 0; +} + +int agoOptimizeDramaRemoveImageU8toU1(AgoGraph * agraph) +{ + int status = 0; + // browse through all virtual data in the graph for VX_DF_IMAGE_U8 objects + // that can be potentially converted into VX_DF_IMAGE_U1_AMD + for (AgoData * adata = agraph->dataList.head; adata; adata = adata->next) { + if (adata->ref.type == VX_TYPE_IMAGE && + adata->u.img.format == VX_DF_IMAGE_U8 && + adata->inputUsageCount >= 1 && + adata->outputUsageCount == 1 && + adata->inoutUsageCount == 0) + { + bool U8toU1_possible = true; + + // loop through all connected images, such as ROI + AgoData * pdata = adata->u.img.roiMasterImage ? 
adata->u.img.roiMasterImage : adata; + for (AgoData * data = agraph->dataList.head; data && U8toU1_possible; data = data->next) + { + if (data->ref.type == VX_TYPE_IMAGE && (data == adata || data->u.img.roiMasterImage == pdata)) + { + // if ROI, make sure start_x and end_x are multiple of 8 + if (data->u.img.isROI && ((data->u.img.rect_roi.start_x & 7) || (data->u.img.rect_roi.end_x & 7))) { + // can not convert it to U1 since ROI accesses on non-byte boundaries + U8toU1_possible = false; + break; + } + // make sure all the nodes that access this data can be converted to use VX_DF_IMAGE_U1_AMD + for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) { + vx_int32 arg_index = -1; + for (vx_uint32 i = 0; i < anode->paramCount; i++) { + if (anode->paramList[i] == data) { + arg_index = i; + break; + } + } + // check if data is used by anode + if (arg_index >= 0) { + // check if anode is part of U8toU1 conversion rule + bool matched = false; + for (vx_uint32 rule = 0; rule < s_U8toU1_rule_count; rule++) { + if (s_U8toU1_rule[rule].find_kernel_id == anode->akernel->id && + s_U8toU1_rule[rule].arg_index == arg_index) + { + matched = true; + break; + } + } + if (!matched) { + // data is used by nodes that are not in U8toU1 conversion rule + U8toU1_possible = false; + break; + } + } + } + } + } + + // if U8toU1_possible is TRUE: + // - replace adata image type from VX_DF_IMAGE_U8 to VX_DF_IMAGE_U1_AMD + // - change node type to use VX_DF_IMAGE_U1_AMD instead of VX_DF_IMAGE_U8 + if (U8toU1_possible) { + // loop through all connected images, such as ROI + AgoData * pdata = adata->u.img.roiMasterImage ? adata->u.img.roiMasterImage : adata; + for (AgoData * data = agraph->dataList.head; data && U8toU1_possible; data = data->next) + { + if (data->ref.type == VX_TYPE_IMAGE && (data == adata || data->u.img.roiMasterImage == pdata)) + { + data->u.img.format = VX_DF_IMAGE_U1_AMD; + for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) { + vx_int32 arg_index = -1; + for (vx_uint32 i = 0; i < anode->paramCount; i++) { + if (anode->paramList[i] == data) { + arg_index = i; + break; + } + } + // check if data is used by anode + if (arg_index >= 0) { + // check if anode is part of U8toU1 conversion rule + for (vx_uint32 rule = 0; rule < s_U8toU1_rule_count; rule++) { + if (s_U8toU1_rule[rule].find_kernel_id == anode->akernel->id && + s_U8toU1_rule[rule].arg_index == arg_index) + { + anode->akernel = agoFindKernelByEnum(agraph->ref.context, s_U8toU1_rule[rule].replace_kernel_id); + if (!anode->akernel) { + agoAddLogEntry(&anode->ref, VX_FAILURE, "ERROR: agoOptimizeDramaRemoveImageU8toU1: agoFindKernelByEnum(0x%08x) failed for rule:%d\n", s_U8toU1_rule[rule].replace_kernel_id, rule); + return -1; + } + break; + } + } + } + } + } + } + // mark that graph has been modified + status = 1; + } + } + } + return status; +} + +int agoOptimizeDramaRemove(AgoGraph * agraph) +{ +#if ENABLE_DEBUG_MESSAGES > 1 + int iteration = 0; +#endif + for (int graphGotModified = !0; agraph->nodeList.head && graphGotModified;) + { + // check and mark data usage + agoOptimizeDramaMarkDataUsage(agraph); + +#if ENABLE_DEBUG_MESSAGES > 1 + printf("************************************************************************** agoOptimizeDramaRemove: ITER %04d\n", ++iteration); + agoWriteGraph(agraph, NULL, 0, stdout, "[agoOptimizeDramaRemove]"); +#endif + + if (!(agraph->optimizer_flags & AGO_GRAPH_OPTIMIZER_FLAG_NO_REMOVE_COPY_NODES)) { + // try removing COPY nodes with virtual buffers + if ((graphGotModified 
= agoOptimizeDramaRemoveCopyNodes(agraph)) < 0) + return -1; + if (graphGotModified) + continue; + } + + if (!(agraph->optimizer_flags & AGO_GRAPH_OPTIMIZER_FLAG_NO_REMOVE_UNUSED_OUTPUTS)) { + // try remove nodes who's outputs are not used + if ((graphGotModified = agoOptimizeDramaRemoveNodesWithUnusedOutputs(agraph)) < 0) + return -1; + if (graphGotModified) + continue; + } + + if (!(agraph->optimizer_flags & AGO_GRAPH_OPTIMIZER_FLAG_NO_NODE_MERGE)) { + // try merging nodes that will further result in removal of redundancies + if ((graphGotModified = agoOptimizeDramaRemoveNodeMerge(agraph)) < 0) + return -1; + if (graphGotModified) + continue; + } + + if (!(agraph->optimizer_flags & AGO_GRAPH_OPTIMIZER_FLAG_NO_CONVERT_8BIT_TO_1BIT)) { + // try converting VX_DF_IMAGE_U8 images to VX_DF_IMAGE_U1_AMD images + if ((graphGotModified = agoOptimizeDramaRemoveImageU8toU1(agraph)) < 0) + return -1; + if (graphGotModified) + continue; + } + + graphGotModified = 0; + } + return 0; +} diff --git a/openvx/ago/ago_haf_cpu.cpp b/openvx/ago/ago_haf_cpu.cpp new file mode 100644 index 0000000..273f3c7 --- /dev/null +++ b/openvx/ago/ago_haf_cpu.cpp @@ -0,0 +1,234 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#include "ago_internal.h" + +int HafCpu_ColorConvert_IU_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_ColorConvert_IU_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_ColorConvert_IV_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_ColorConvert_IV_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_ColorConvert_IUV_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_ColorConvert_IUV_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_ColorConvert_UV12_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImageChroma, + vx_uint32 dstImageChromaStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_ColorConvert_UV12_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImageChroma, + vx_uint32 dstImageChromaStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_CannySobelSuppThreshold_U8XY_U8_3x3_L2NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_CannySobelSuppThreshold_U8XY_U8_5x5_L2NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_CannySobelSuppThreshold_U8XY_U8_7x7_L2NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_CannyEdgeTrace_U8_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 
dstImageStrideInBytes, + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[] + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_Convolve_U8_U8_MxN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_uint32 convolutionWidth, + vx_uint32 convolutionHeight, + vx_int32 shift + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_Convolve_S16_U8_MxN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_uint32 convolutionWidth, + vx_uint32 convolutionHeight, + vx_int32 shift + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + diff --git a/openvx/ago/ago_haf_cpu.h b/openvx/ago/ago_haf_cpu.h new file mode 100644 index 0000000..60af1cd --- /dev/null +++ b/openvx/ago/ago_haf_cpu.h @@ -0,0 +1,3025 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __ago_haf_cpu_h__ +#define __ago_haf_cpu_h__ + +#include + +#define TWOPI 6.283185307f +#define PI 3.1415926535898f +#define CAST_S16(x) (int16_t)((x) < -32768 ? -32768 : (x) > 32767 ? 
32767 : (x)) +#define atan2_p0 (0.273*0.3183098862f) +#define atan2_p1 (0.9997878412794807f*57.29577951308232f) +#define atan2_p3 (-0.3258083974640975f*57.29577951308232f) +#define atan2_p5 (0.1555786518463281f*57.29577951308232f) +#define atan2_p7 (-0.04432655554792128f*57.29577951308232f) + + +typedef struct { + vx_uint16 x; + vx_uint16 y; +} ago_coord2d_ushort_t; + +typedef struct { + vx_int16 x; + vx_int16 y; +} ago_coord2d_short_t; + +typedef struct { + vx_int32 x; + vx_int32 y; +} ago_coord2d_int_t; + +typedef struct { + vx_float32 x; + vx_float32 y; +} ago_coord2d_float_t; + +typedef struct { + vx_float32 matrix[3][2]; +} ago_affine_matrix_t; + +typedef struct { + vx_float32 matrix[3][3]; +} ago_perspective_matrix_t; + +typedef struct AgoConfigScaleMatrix ago_scale_matrix_t; + +typedef struct { + vx_int16 x; // x-coordinate + vx_int16 y; // y-coordinate + vx_float32 s; // stregnth +} ago_keypoint_xys_t; + +typedef struct { + vx_uint32 width; + vx_uint32 height; + vx_uint32 strideInBytes; + vx_uint8 * pImage; + vx_bool imageAlreadyComputed; +} ago_pyramid_u8_t; + +typedef struct { + vx_uint32 sampleCount; + vx_float32 sum; + vx_float32 sumSquared; +} ago_meanstddev_data_t; + +typedef struct { + vx_int32 min; + vx_int32 max; +} ago_minmaxloc_data_t; + +typedef struct { + vx_float32 x; + vx_float32 y; +} ago_keypoint_t; + +typedef struct { + vx_uint32 width; + vx_uint32 height; + vx_uint32 cellSize; + vx_uint32 gridBufSize; +} ago_harris_grid_header_t; + +int HafCpu_Not_U8_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Not_U8_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Not_U1_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Not_U1_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Lut_U8_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLut + ); +int HafCpu_Threshold_U8_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ); +int HafCpu_Threshold_U8_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ); +int HafCpu_Threshold_U1_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ); +int HafCpu_Threshold_U1_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ); +int HafCpu_ThresholdNot_U8_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + 
vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ); +int HafCpu_ThresholdNot_U8_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ); +int HafCpu_ThresholdNot_U1_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ); +int HafCpu_ThresholdNot_U1_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ); +int HafCpu_ColorDepth_U8_S16_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int32 shift + ); +int HafCpu_ColorDepth_U8_S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int32 shift + ); +int HafCpu_ColorDepth_S16_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int32 shift + ); +int HafCpu_Add_U8_U8U8_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Add_U8_U8U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Sub_U8_U8U8_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Sub_U8_U8U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Mul_U8_U8U8_Wrap_Trunc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_U8_U8U8_Wrap_Round + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_U8_U8U8_Sat_Trunc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_U8_U8U8_Sat_Round + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + 
vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_And_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_And_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_And_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_And_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_And_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_And_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Or_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Or_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Or_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Or_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Or_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Or_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xor_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xor_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * 
pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xor_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xor_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xor_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xor_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nand_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nand_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nand_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nand_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nand_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nand_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nor_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nor_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nor_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int 
HafCpu_Nor_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nor_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Nor_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xnor_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xnor_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xnor_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xnor_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xnor_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Xnor_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_AbsDiff_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_AccumulateWeighted_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_float32 alpha + ); +int HafCpu_Add_S16_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Sub_S16_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Mul_S16_U8U8_Wrap_Trunc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * 
pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_S16_U8U8_Wrap_Round + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_S16_U8U8_Sat_Trunc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_S16_U8U8_Sat_Round + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Add_S16_S16U8_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Add_S16_S16U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Accumulate_S16_S16U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Sub_S16_S16U8_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Sub_S16_S16U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Mul_S16_S16U8_Wrap_Trunc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_S16_S16U8_Wrap_Round + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_S16_S16U8_Sat_Trunc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_S16_S16U8_Sat_Round + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_AccumulateSquared_S16_S16U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + 
vx_uint32 srcImageStrideInBytes, + vx_uint32 shift + ); +int HafCpu_Sub_S16_U8S16_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Sub_S16_U8S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_AbsDiff_S16_S16S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Add_S16_S16S16_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Add_S16_S16S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Sub_S16_S16S16_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Sub_S16_S16S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_Mul_S16_S16S16_Wrap_Trunc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_S16_S16S16_Wrap_Round + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_S16_S16S16_Sat_Trunc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Mul_S16_S16S16_Sat_Round + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale + ); +int HafCpu_Magnitude_S16_S16S16 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pMagImage, + vx_uint32 magImageStrideInBytes, + vx_int16 * pGxImage, + vx_uint32 gxImageStrideInBytes, + vx_int16 * pGyImage, + vx_uint32 gyImageStrideInBytes + ); +int HafCpu_Phase_U8_S16S16 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pPhaseImage, + vx_uint32 phaseImageStrideInBytes, + vx_int16 * pGxImage, + vx_uint32 gxImageStrideInBytes, + vx_int16 * 
pGyImage, + vx_uint32 gyImageStrideInBytes + ); +int HafCpu_MemSet_U8 + ( + vx_size count, + vx_uint8 * pDstBuf, + vx_uint8 value + ); +int HafCpu_MemSet_U16 + ( + vx_size count, + vx_uint16 * pDstBuf, + vx_uint16 value + ); +int HafCpu_MemSet_U24 + ( + vx_size count, + vx_uint8 * pDstBuf, + vx_uint32 value + ); +int HafCpu_MemSet_U32 + ( + vx_size count, + vx_uint32 * pDstBuf, + vx_uint32 value + ); +int HafCpu_BinaryCopy_U8_U8 + ( + vx_size size, + vx_uint8 * pDstBuf, + vx_uint8 * pSrcBuf + ); +int HafCpu_ChannelCopy_U8_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_BufferCopyDisperseInDst + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint32 pixelSizeInBytes, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideYInBytes, + vx_uint32 dstImageStrideXInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideYInBytes + ); +int HafCpu_BufferCopyDisperseInSrc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint32 pixelSizeInBytes, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideYInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideYInBytes, + vx_uint32 srcImageStrideXInBytes + ); +int HafCpu_ChannelCopy_U8_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelCopy_U1_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelCopy_U1_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8_U16_Pos0 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8_U16_Pos1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8_U24_Pos0 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8_U24_Pos1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8_U24_Pos2 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8_U32_Pos0 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8_U32_Pos1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8_U32_Pos2 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int 
HafCpu_ChannelExtract_U8_U32_Pos3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8U8U8_U24 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage0, + vx_uint8 * pDstImage1, + vx_uint8 * pDstImage2, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8U8U8_U32 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage0, + vx_uint8 * pDstImage1, + vx_uint8 * pDstImage2, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelExtract_U8U8U8U8_U32 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage0, + vx_uint8 * pDstImage1, + vx_uint8 * pDstImage2, + vx_uint8 * pDstImage3, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ChannelCombine_U16_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes + ); +int HafCpu_ChannelCombine_U24_U8U8U8_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_ChannelCombine_U32_U8U8U8_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_ChannelCombine_U32_U8U8U8_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ); +int HafCpu_ChannelCombine_U32_U8U8U8U8_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_uint8 * pSrcImage3, + vx_uint32 srcImage3StrideInBytes + ); +int HafCpu_ColorConvert_RGB_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_RGB_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_RGB_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_RGB_IYUV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcYImage, + vx_uint32 srcYImageStrideInBytes, + vx_uint8 * pSrcUImage, + vx_uint32 
srcUImageStrideInBytes, + vx_uint8 * pSrcVImage, + vx_uint32 srcVImageStrideInBytes + ); +int HafCpu_ColorConvert_RGB_NV12 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcLumaImage, + vx_uint32 srcLumaImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ); +int HafCpu_ColorConvert_RGB_NV21 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcLumaImage, + vx_uint32 srcLumaImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ); +int HafCpu_ColorConvert_RGBX_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_RGBX_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_RGBX_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_RGBX_IYUV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcYImage, + vx_uint32 srcYImageStrideInBytes, + vx_uint8 * pSrcUImage, + vx_uint32 srcUImageStrideInBytes, + vx_uint8 * pSrcVImage, + vx_uint32 srcVImageStrideInBytes + ); +int HafCpu_ColorConvert_RGBX_NV12 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcLumaImage, + vx_uint32 srcLumaImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ); +int HafCpu_ColorConvert_RGBX_NV21 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcLumaImage, + vx_uint32 srcLumaImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ); +int HafCpu_ColorConvert_YUV4_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_YUV4_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ScaleUp2x2_U8_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_FormatConvert_UV_UV12 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ); +int HafCpu_ColorConvert_IYUV_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 
dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_IYUV_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_FormatConvert_IYUV_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_FormatConvert_IYUV_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_FormatConvert_IUV_UV12 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ); +int HafCpu_ColorConvert_NV12_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstLumaImage, + vx_uint32 dstLumaImageStrideInBytes, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_NV12_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstLumaImage, + vx_uint32 dstLumaImageStrideInBytes, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_FormatConvert_NV12_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstLumaImage, + vx_uint32 dstLumaImageStrideInBytes, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_FormatConvert_NV12_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstLumaImage, + vx_uint32 dstLumaImageStrideInBytes, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_FormatConvert_UV12_IUV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcUImage, + vx_uint32 srcUImageStrideInBytes, + vx_uint8 * pSrcVImage, + vx_uint32 srcVImageStrideInBytes + ); +int HafCpu_ColorConvert_Y_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_Y_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_U_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_U_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * 
pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_V_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_V_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_IU_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_IU_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_IV_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_IV_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_IUV_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_IUV_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_UV12_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImageChroma, + vx_uint32 dstImageChromaStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_ColorConvert_UV12_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImageChroma, + vx_uint32 dstImageChromaStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Box_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ); +int HafCpu_Dilate_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Erode_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Median_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Gaussian_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ); +int HafCpu_ScaleGaussianHalf_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ); +int 
HafCpu_ScaleGaussianHalf_U8_U8_5x5 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + bool sampleFirstRow, + bool sampleFirstColumn, + vx_uint8 * pScratch + ); +int HafCpu_ScaleGaussianOrb_U8_U8_5x5 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pLocalData + ); +int HafCpu_Convolve_U8_U8_3xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ); +int HafCpu_Convolve_U8_U8_5xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ); +int HafCpu_Convolve_U8_U8_7xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ); +int HafCpu_Convolve_U8_U8_9xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ); +int HafCpu_Convolve_U8_U8_MxN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_uint32 convolutionWidth, + vx_uint32 convolutionHeight, + vx_int32 shift + ); +int HafCpu_Convolve_S16_U8_3xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ); +int HafCpu_Convolve_S16_U8_5xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ); +int HafCpu_Convolve_S16_U8_7xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ); +int HafCpu_Convolve_S16_U8_9xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ); +int HafCpu_Convolve_S16_U8_MxN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_uint32 convolutionWidth, + vx_uint32 convolutionHeight, + vx_int32 shift + ); +int HafCpu_SobelMagnitude_S16_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstMagImage, + vx_uint32 dstMagImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 
srcImageStrideInBytes + ); +int HafCpu_SobelPhase_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstPhaseImage, + vx_uint32 dstPhaseImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ); +int HafCpu_SobelMagnitudePhase_S16U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstMagImage, + vx_uint32 dstMagImageStrideInBytes, + vx_uint8 * pDstPhaseImage, + vx_uint32 dstPhaseImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Sobel_S16S16_U8_3x3_GXY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstGxImage, + vx_uint32 dstGxImageStrideInBytes, + vx_int16 * pDstGyImage, + vx_uint32 dstGyImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ); +int HafCpu_Sobel_S16_U8_3x3_GX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstGxImage, + vx_uint32 dstGxImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ); +int HafCpu_Sobel_S16_U8_3x3_GY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstGyImage, + vx_uint32 dstGyImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ); +int HafCpu_Dilate_U1_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Erode_U1_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Dilate_U1_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Erode_U1_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Dilate_U8_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Erode_U8_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_FastCorners_XY_U8_Supression + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t dstCorner[], + vx_uint32 * pDstCornerCount, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_float32 strength_threshold, + vx_uint8 * pScratch + ); +int HafCpu_FastCorners_XY_U8_NoSupression + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t dstCorner[], + vx_uint32 * pDstCornerCount, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_float32 strength_threshold + ); +int HafCpu_HarrisSobel_HG3_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstGxy, + vx_uint32 dstGxyStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ); +int HafCpu_HarrisSobel_HG3_U8_5x5 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstGxy, + vx_uint32 dstGxyStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ); +int 
HafCpu_HarrisSobel_HG3_U8_7x7 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstGxy, + vx_uint32 dstGxyStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ); +int HafCpu_HarrisScore_HVC_HG3_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstVc, + vx_uint32 dstVcStrideInBytes, + vx_float32 * pSrcGxy, + vx_uint32 srcGxyStrideInBytes, + vx_float32 sensitivity, + vx_float32 strength_threshold, + vx_float32 normalization_factor + ); +int HafCpu_HarrisScore_HVC_HG3_5x5 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstVc, + vx_uint32 dstVcStrideInBytes, + vx_float32 * pSrcGxy, + vx_uint32 srcGxyStrideInBytes, + vx_float32 sensitivity, + vx_float32 strength_threshold, + vx_float32 normalization_factor + ); +int HafCpu_HarrisScore_HVC_HG3_7x7 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstVc, + vx_uint32 dstVcStrideInBytes, + vx_float32 * pSrcGxy, + vx_uint32 srcGxyStrideInBytes, + vx_float32 sensitivity, + vx_float32 strength_threshold, + vx_float32 normalization_factor + ); +int HafCpu_NonMaxSupp_XY_ANY_3x3 + ( + vx_uint32 capacityOfList, + ago_keypoint_xys_t * dstList, + vx_uint32 * pDstListCount, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_float32 * pSrcImg, + vx_uint32 srcStrideInBytes + ); +int HafCpu_HarrisMergeSortAndPick_XY_XYS + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t * dstCorner, + vx_uint32 * pDstCornerCount, + ago_keypoint_xys_t * srcList, + vx_uint32 srcListCount, + vx_float32 min_distance, + ago_harris_grid_header_t * gridInfo, + ago_coord2d_short_t * gridBuf + ); +int HafCpu_CannySobelSuppThreshold_U8XY_U8_3x3_L1NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper, + vx_uint8 * pScratch + ); +int HafCpu_CannySobelSuppThreshold_U8XY_U8_3x3_L2NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ); +int HafCpu_CannySobelSuppThreshold_U8XY_U8_5x5_L1NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ); +int HafCpu_CannySobelSuppThreshold_U8XY_U8_5x5_L2NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ); +int HafCpu_CannySobelSuppThreshold_U8XY_U8_7x7_L1NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ); +int HafCpu_CannySobelSuppThreshold_U8XY_U8_7x7_L2NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * 
pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ); +int HafCpu_CannySobel_U16_U8_3x3_L1NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ); +int HafCpu_CannySobel_U16_U8_3x3_L2NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ); +int HafCpu_CannySobel_U16_U8_5x5_L1NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ); +int HafCpu_CannySobel_U16_U8_5x5_L2NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ); +int HafCpu_CannySobel_U16_U8_7x7_L1NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ); +int HafCpu_CannySobel_U16_U8_7x7_L2NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ); +int HafCpu_CannySuppThreshold_U8XY_U16_3x3 + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint16 * pSrc, + vx_uint32 srcStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ); +int HafCpu_Remap_U8_U8_Nearest + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_coord2d_ushort_t * pMap, + vx_uint32 mapStrideInBytes + ); +int HafCpu_Remap_U8_U8_Nearest_Constant + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_coord2d_ushort_t * pMap, + vx_uint32 mapStrideInBytes, + vx_uint8 border + ); +int HafCpu_Remap_U8_U8_Bilinear + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_coord2d_ushort_t * pMap, + vx_uint32 mapStrideInBytes + ); +int HafCpu_Remap_U8_U8_Bilinear_Constant + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_coord2d_ushort_t * pMap, + vx_uint32 mapStrideInBytes, + vx_uint8 border + ); +int HafCpu_WarpAffine_U8_U8_Nearest + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_affine_matrix_t * matrix, + 
vx_uint8 * pLocalData + ); +int HafCpu_WarpAffine_U8_U8_Nearest_Constant + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_affine_matrix_t * matrix, + vx_uint8 border, + vx_uint8 * pLocalData + ); +int HafCpu_WarpAffine_U8_U8_Bilinear + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_affine_matrix_t * matrix, + vx_uint8 * pLocalData + ); +int HafCpu_WarpAffine_U8_U8_Bilinear_Constant + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_affine_matrix_t * matrix, + vx_uint8 border, + vx_uint8 * pLocalData + ); +int HafCpu_WarpPerspective_U8_U8_Nearest + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_perspective_matrix_t * matrix, + vx_uint8 * pLocalData + ); +int HafCpu_WarpPerspective_U8_U8_Nearest_Constant + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_perspective_matrix_t * matrix, + vx_uint8 border, + vx_uint8 * pLocalData + ); +int HafCpu_WarpPerspective_U8_U8_Bilinear + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_perspective_matrix_t * matrix, + vx_uint8 * pLocalData + ); +int HafCpu_WarpPerspective_U8_U8_Bilinear_Constant + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_perspective_matrix_t * matrix, + vx_uint8 border, + vx_uint8 * pLocalData + ); +int HafCpu_ScaleImage_U8_U8_Nearest + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_scale_matrix_t * matrix + ); +int HafCpu_ScaleImage_U8_U8_Nearest_Constant + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_scale_matrix_t * matrix, + vx_uint8 border + ); +int HafCpu_ScaleImage_U8_U8_Bilinear + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_scale_matrix_t * matrix + ); +int HafCpu_ScaleImage_U8_U8_Bilinear_Replicate + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_scale_matrix_t * matrix + ); +int 
HafCpu_ScaleImage_U8_U8_Bilinear_Constant + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_scale_matrix_t * matrix, + vx_uint8 border + ); +int HafCpu_ScaleImage_U8_U8_Area + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_scale_matrix_t * matrix + ); +int HafCpu_OpticalFlowPyrLK_XY_XY_Generic +( + vx_keypoint_t newKeyPoint[], + vx_float32 pyramidScale, + vx_uint32 pyramidLevelCount, + ago_pyramid_u8_t * oldPyramid, + ago_pyramid_u8_t * newPyramid, + vx_uint32 keyPointCount, + vx_keypoint_t oldKeyPoint[], + vx_keypoint_t newKeyPointEstimate[], + vx_enum termination, + vx_float32 epsilon, + vx_uint32 num_iterations, + vx_bool use_initial_estimate, + vx_uint32 dataStrideInBytes, + vx_uint8 * DataPtr, + vx_int32 window_dimension +); + +int HafCpu_HarrisMergeSortAndPick_XY_HVC + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t dstCorner[], + vx_uint32 * pDstCornerCount, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_float32 * pSrcVc, + vx_uint32 srcVcStrideInBytes, + vx_float32 min_distance + ); +int HafCpu_FastCornerMerge_XY_XY + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t dstCorner[], + vx_uint32 * pDstCornerCount, + vx_uint32 numSrcCornerBuffers, + vx_keypoint_t * pSrcCorners[], + vx_uint32 numSrcCorners[] + ); +int HafCpu_CannyEdgeTrace_U8_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[] + ); +int HafCpu_CannyEdgeTrace_U8_U8XY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 xyStackTop + ); +int HafCpu_IntegralImage_U32_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint32 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Histogram_DATA_U8 + ( + vx_uint32 dstHist[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_HistogramFixedBins_DATA_U8 + ( + vx_uint32 dstHist[], + vx_uint32 distBinCount, + vx_uint32 distOffset, + vx_uint32 distRange, + vx_uint32 distWindow, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MeanStdDev_DATA_U8 + ( + vx_float32 * pSum, + vx_float32 * pSumOfSquared, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_Equalize_DATA_DATA + ( + vx_uint8 * pLut, + vx_uint32 numPartitions, + vx_uint32 * pPartSrcHist[] + ); +int HafCpu_HistogramMerge_DATA_DATA + ( + vx_uint32 dstHist[], + vx_uint32 numPartitions, + vx_uint32 * pPartSrcHist[] + ); +int HafCpu_MeanStdDevMerge_DATA_DATA + ( + vx_float32 * mean, + vx_float32 * stddev, + vx_uint32 totalSampleCount, + vx_uint32 numPartitions, + vx_float32 partSum[], + vx_float32 partSumOfSquared[] + ); +int HafCpu_MinMax_DATA_U8 + ( + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 srcWidth, + vx_uint32 srcWeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMax_DATA_S16 + ( + vx_int32 * pDstMinValue, + 
vx_int32 * pDstMaxValue, + vx_uint32 srcWidth, + vx_uint32 srcWeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxMerge_DATA_DATA + ( + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[] + ); +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Min + ( + vx_uint32 * pMinLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Max + ( + vx_uint32 * pMaxLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_None_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_Min + ( + vx_uint32 * pMinLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_Max + ( + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_MinMax_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Min + ( + 
vx_uint32 * pMinLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Max + ( + vx_uint32 * pMaxLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_None_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_Min + ( + vx_uint32 * pMinLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_Max + ( + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_MinMax_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ); +int HafCpu_MinMaxLocMerge_DATA_DATA + ( + vx_uint32 * pDstLocCount, + vx_uint32 capacityOfDstLocList, + vx_coordinates2d_t dstLocList[], + vx_uint32 numDataPartitions, + vx_uint32 partLocCount[], + vx_coordinates2d_t * partLocList[] + ); + +// helper functions for phase +float HafCpu_FastAtan2_deg +( + vx_int16 Gx, + vx_int16 Gy +); + +float HafCpu_FastAtan2_rad +( + vx_int16 Gx, + vx_int16 Gy +); + +int 
HafCpu_FastAtan2_Canny +( + vx_int16 Gx, + vx_int16 Gy +); + +#endif // __ago_haf_cpu_h__ diff --git a/openvx/ago/ago_haf_cpu_arithmetic.cpp b/openvx/ago/ago_haf_cpu_arithmetic.cpp new file mode 100644 index 0000000..b82b0a8 --- /dev/null +++ b/openvx/ago/ago_haf_cpu_arithmetic.cpp @@ -0,0 +1,7674 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +int HafCpu_Add_U8_U8U8_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
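+ // This and the sibling pixel-wise kernels below share the same structure: when the
+ // source pointers and the destination pointer are all 16-byte aligned (useAligned),
+ // the main loop uses aligned SSE loads/stores, otherwise it falls back to
+ // _mm_loadu_si128/_mm_storeu_si128. Each iteration processes 16 pixels, and the
+ // remaining (dstWidth % 16) pixels are handled by a scalar tail loop.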
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_add_epi8(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int16 temp = (vx_int16)(*pLocalSrc1++) + (vx_int16)(*pLocalSrc2++); + *pLocalDst++ = (vx_uint8)temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_add_epi8(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int16 temp = (vx_int16)(*pLocalSrc1++) + (vx_int16)(*pLocalSrc2++); + *pLocalDst++ = (vx_uint8)temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + + return AGO_SUCCESS; +} + +int HafCpu_Add_U8_U8U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_adds_epu8(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + int temp = (int)(*pLocalSrc1++) + (int)(*pLocalSrc2++); + *pLocalDst++ = (vx_uint8) min(temp, UINT8_MAX); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_adds_epu8(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + int temp = (int)(*pLocalSrc1++) + (int)(*pLocalSrc2++); + *pLocalDst++ = (vx_uint8)min(temp, UINT8_MAX); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + } + + + return AGO_SUCCESS; +} + +int HafCpu_Sub_U8_U8U8_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_sub_epi8(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int16 temp = (vx_int16)(*pLocalSrc1++) - (vx_int16)(*pLocalSrc2++); + *pLocalDst++ = (vx_uint8)temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_sub_epi8(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + int temp = (int)(*pLocalSrc1++) - (int)(*pLocalSrc2++); + *pLocalDst++ = (vx_uint8)temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + +int HafCpu_Sub_U8_U8U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
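+ // _Sat variant: the SSE loop uses _mm_subs_epu8, which clamps each 8-bit difference
+ // at 0 instead of wrapping; the scalar tail clamps with max(min(temp, UINT8_MAX), 0)
+ // to match.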
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_subs_epu8(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + int temp = (int)(*pLocalSrc1++) - (int)(*pLocalSrc2++); + *pLocalDst++ = (vx_uint8)max(min(temp, UINT8_MAX), 0); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_subs_epu8(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + int temp = (int)(*pLocalSrc1++) - (int)(*pLocalSrc2++); + *pLocalDst++ = (vx_uint8)max(min(temp, UINT8_MAX), 0); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +int HafCpu_Add_S16_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2; + vx_int16 *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int) dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_load_si128(pLocalSrc1_xmm++); + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_load_si128(pLocalSrc2_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_add_epi16(pixels1L, pixels2L); + pixels1H = _mm_add_epi16(pixels1H, pixels2H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + _mm_store_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (int)(*pLocalSrc1++) + (int)(*pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + { + for (int height = 0; height < (int) dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_add_epi16(pixels1L, pixels2L); + pixels1H = _mm_add_epi16(pixels1H, pixels2H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + _mm_storeu_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (int)(*pLocalSrc1++) + (int)(*pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + } + + return AGO_SUCCESS; +} + +int HafCpu_Sub_S16_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2; + vx_int16 *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_load_si128(pLocalSrc1_xmm++); + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_load_si128(pLocalSrc2_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_sub_epi16(pixels1L, pixels2L); + pixels1H = _mm_sub_epi16(pixels1H, pixels2H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + _mm_store_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (vx_int16)(*pLocalSrc1++) - (vx_int16)(*pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_sub_epi16(pixels1L, pixels2L); + pixels1H = _mm_sub_epi16(pixels1H, pixels2H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + _mm_storeu_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (int)(*pLocalSrc1++) - (int)(*pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_Add_S16_S16U8_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
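+ // The S16 = S16 + U8 kernels widen each 16-byte block of U8 pixels into two vectors
+ // of eight 16-bit values (_mm_cvtepu8_epi16 for the low half, _mm_unpackhi_epi8 with
+ // a zero vector for the high half) before adding them to the S16 source.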
true : false; + + __m128i *pLocalSrc16_xmm, *pLocalSrc8_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc8; + vx_int16 *pLocalSrc16, *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc16_xmm = (__m128i*) pSrcImage1; + pLocalSrc8_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_load_si128(pLocalSrc16_xmm++); + pixels1H = _mm_load_si128(pLocalSrc16_xmm++); + pixels2L = _mm_load_si128(pLocalSrc8_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_add_epi16(pixels1L, pixels2L); + pixels1H = _mm_add_epi16(pixels1H, pixels2H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + _mm_store_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc16++ + (vx_int16)(*pLocalSrc8++); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc16_xmm = (__m128i*) pSrcImage1; + pLocalSrc8_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_loadu_si128(pLocalSrc16_xmm++); + pixels1H = _mm_loadu_si128(pLocalSrc16_xmm++); + pixels2L = _mm_loadu_si128(pLocalSrc8_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_add_epi16(pixels1L, pixels2L); + pixels1H = _mm_add_epi16(pixels1H, pixels2H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + _mm_storeu_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc16++ + (vx_int16)(*pLocalSrc8++); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_Add_S16_S16U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ?
true : false; + + __m128i *pLocalSrc16_xmm, *pLocalSrc8_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc8; + vx_int16 *pLocalSrc16, *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc16_xmm = (__m128i*) pSrcImage1; + pLocalSrc8_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_load_si128(pLocalSrc16_xmm++); + pixels1H = _mm_load_si128(pLocalSrc16_xmm++);; + pixels2L = _mm_load_si128(pLocalSrc8_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_adds_epi16(pixels1L, pixels2L); + pixels1H = _mm_adds_epi16(pixels1H, pixels2H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + _mm_store_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc16++) + (vx_int32)(*pLocalSrc8++); + *pLocalDst++ = (vx_int16)max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc16_xmm = (__m128i*) pSrcImage1; + pLocalSrc8_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_loadu_si128(pLocalSrc16_xmm++); + pixels1H = _mm_loadu_si128(pLocalSrc16_xmm++);; + pixels2L = _mm_loadu_si128(pLocalSrc8_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_adds_epi16(pixels1L, pixels2L); + pixels1H = _mm_adds_epi16(pixels1H, pixels2H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + _mm_storeu_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc16++) + (vx_int32)(*pLocalSrc8++); + *pLocalDst++ = (vx_int16)max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_Sub_S16_S16U8_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc16_xmm, *pLocalSrc8_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc8; + vx_int16 *pLocalSrc16, *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc16_xmm = (__m128i*) pSrcImage1; + pLocalSrc8_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_load_si128(pLocalSrc16_xmm++); + pixels1H = _mm_load_si128(pLocalSrc16_xmm++);; + pixels2L = _mm_load_si128(pLocalSrc8_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_sub_epi16(pixels1L, pixels2L); + pixels1H = _mm_sub_epi16(pixels1H, pixels2H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + _mm_store_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc16++ - (vx_int16)(*pLocalSrc8++); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc16_xmm = (__m128i*) pSrcImage1; + pLocalSrc8_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_loadu_si128(pLocalSrc16_xmm++); + pixels1H = _mm_loadu_si128(pLocalSrc16_xmm++);; + pixels2L = _mm_loadu_si128(pLocalSrc8_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_sub_epi16(pixels1L, pixels2L); + pixels1H = _mm_sub_epi16(pixels1H, pixels2H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + _mm_storeu_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc16++ - (vx_int16)(*pLocalSrc8++); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + + } + } + return AGO_SUCCESS; +} + +int HafCpu_Sub_S16_S16U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc16_xmm, *pLocalSrc8_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc8; + vx_int16 *pLocalSrc16, *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc16_xmm = (__m128i*) pSrcImage1; + pLocalSrc8_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_load_si128(pLocalSrc16_xmm++); + pixels1H = _mm_load_si128(pLocalSrc16_xmm++);; + pixels2L = _mm_load_si128(pLocalSrc8_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_subs_epi16(pixels1L, pixels2L); + pixels1H = _mm_subs_epi16(pixels1H, pixels2H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + _mm_store_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc16++) - (vx_int32)(*pLocalSrc8++); + *pLocalDst++ = max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc16_xmm = (__m128i*) pSrcImage1; + pLocalSrc8_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_loadu_si128(pLocalSrc16_xmm++); + pixels1H = _mm_loadu_si128(pLocalSrc16_xmm++);; + pixels2L = _mm_loadu_si128(pLocalSrc8_xmm++); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + pixels1L = _mm_subs_epi16(pixels1L, pixels2L); + pixels1H = _mm_subs_epi16(pixels1H, pixels2H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + _mm_storeu_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc16++) - (vx_int32)(*pLocalSrc8++); + *pLocalDst++ = max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_Sub_S16_U8S16_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc16_xmm, *pLocalSrc8_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc8; + vx_int16 *pLocalSrc16, *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc8_xmm = (__m128i*) pSrcImage1; + pLocalSrc16_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_load_si128(pLocalSrc8_xmm++); + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_load_si128(pLocalSrc16_xmm++); + pixels2H = _mm_load_si128(pLocalSrc16_xmm++);; + pixels1L = _mm_sub_epi16(pixels1L, pixels2L); + pixels1H = _mm_sub_epi16(pixels1H, pixels2H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + _mm_store_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (vx_int16)(*pLocalSrc8++) - *pLocalSrc16++; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc8_xmm = (__m128i*) pSrcImage1; + pLocalSrc16_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_loadu_si128(pLocalSrc8_xmm++); + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_loadu_si128(pLocalSrc16_xmm++); + pixels2H = _mm_loadu_si128(pLocalSrc16_xmm++);; + pixels1L = _mm_sub_epi16(pixels1L, pixels2L); + pixels1H = _mm_sub_epi16(pixels1H, pixels2H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + _mm_storeu_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (vx_int16)(*pLocalSrc8++) - *pLocalSrc16++; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_Sub_S16_U8S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc16_xmm, *pLocalSrc8_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc8; + vx_int16 *pLocalSrc16, *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc8_xmm = (__m128i*) pSrcImage1; + pLocalSrc16_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_load_si128(pLocalSrc8_xmm++); + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_load_si128(pLocalSrc16_xmm++); + pixels2H = _mm_load_si128(pLocalSrc16_xmm++);; + pixels1L = _mm_subs_epi16(pixels1L, pixels2L); + pixels1H = _mm_subs_epi16(pixels1H, pixels2H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + _mm_store_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc8++) - (vx_int32)(*pLocalSrc16++); + *pLocalDst++ = (vx_int16)max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc8_xmm = (__m128i*) pSrcImage1; + pLocalSrc16_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_loadu_si128(pLocalSrc8_xmm++); + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_loadu_si128(pLocalSrc16_xmm++); + pixels2H = _mm_loadu_si128(pLocalSrc16_xmm++);; + pixels1L = _mm_subs_epi16(pixels1L, pixels2L); + pixels1H = _mm_subs_epi16(pixels1H, pixels2H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + _mm_storeu_si128(pLocalDst_xmm++, pixels1H); + } + + pLocalSrc16 = (vx_int16 *)pLocalSrc16_xmm; + pLocalSrc8 = (vx_uint8 *)pLocalSrc8_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc8++) - (vx_int32)(*pLocalSrc16++); + *pLocalDst++ = (vx_int16)max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_Add_S16_S16S16_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + + __m128i pixels1, pixels2, pixels3, pixels4; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc1_xmm++); + pixels3 = _mm_load_si128(pLocalSrc2_xmm++); + pixels4 = _mm_load_si128(pLocalSrc2_xmm++); + + pixels1 = _mm_add_epi16(pixels1, pixels3); + pixels2 = _mm_add_epi16(pixels2, pixels4); + + _mm_store_si128(pLocalDst_xmm++, pixels1); + _mm_store_si128(pLocalDst_xmm++, pixels2); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc1++) + (vx_int32)(*pLocalSrc2++); + *pLocalDst++ = (vx_int16)temp; + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels3 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels4 = _mm_loadu_si128(pLocalSrc2_xmm++); + + pixels1 = _mm_add_epi16(pixels1, pixels3); + pixels2 = _mm_add_epi16(pixels2, pixels4); + + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + _mm_storeu_si128(pLocalDst_xmm++, pixels2); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc1++) + (vx_int32)(*pLocalSrc2++); + *pLocalDst++ = (vx_int16)temp; + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + + return AGO_SUCCESS; +} + +int HafCpu_Add_S16_S16S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + + __m128i pixels1, pixels2, pixels3, pixels4; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc1_xmm++); + pixels3 = _mm_load_si128(pLocalSrc2_xmm++); + pixels4 = _mm_load_si128(pLocalSrc2_xmm++); + + pixels1 = _mm_adds_epi16(pixels1, pixels3); + pixels2 = _mm_adds_epi16(pixels2, pixels4); + + _mm_store_si128(pLocalDst_xmm++, pixels1); + _mm_store_si128(pLocalDst_xmm++, pixels2); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc1++) + (vx_int32)(*pLocalSrc2++); + *pLocalDst++ = (vx_int16)max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels3 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels4 = _mm_loadu_si128(pLocalSrc2_xmm++); + + pixels1 = _mm_adds_epi16(pixels1, pixels3); + pixels2 = _mm_adds_epi16(pixels2, pixels4); + + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + _mm_storeu_si128(pLocalDst_xmm++, pixels2); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc1++) + (vx_int32)(*pLocalSrc2++); + *pLocalDst++ = (vx_int16)max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + + return AGO_SUCCESS; +} + +int HafCpu_Sub_S16_S16S16_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + + __m128i pixels1, pixels2, pixels3, pixels4; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc1_xmm++); + pixels3 = _mm_load_si128(pLocalSrc2_xmm++); + pixels4 = _mm_load_si128(pLocalSrc2_xmm++); + + pixels1 = _mm_sub_epi16(pixels1, pixels3); + pixels2 = _mm_sub_epi16(pixels2, pixels4); + + _mm_store_si128(pLocalDst_xmm++, pixels1); + _mm_store_si128(pLocalDst_xmm++, pixels2); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc1++) - (vx_int32)(*pLocalSrc2++); + *pLocalDst++ = (vx_int16)temp; + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels3 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels4 = _mm_loadu_si128(pLocalSrc2_xmm++); + + pixels1 = _mm_sub_epi16(pixels1, pixels3); + pixels2 = _mm_sub_epi16(pixels2, pixels4); + + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + _mm_storeu_si128(pLocalDst_xmm++, pixels2); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc1++) - (vx_int32)(*pLocalSrc2++); + *pLocalDst++ = (vx_int16)temp; + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_Sub_S16_S16S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + + __m128i pixels1, pixels2, pixels3, pixels4; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc1_xmm++); + pixels3 = _mm_load_si128(pLocalSrc2_xmm++); + pixels4 = _mm_load_si128(pLocalSrc2_xmm++); + + pixels1 = _mm_subs_epi16(pixels1, pixels3); + pixels2 = _mm_subs_epi16(pixels2, pixels4); + + _mm_store_si128(pLocalDst_xmm++, pixels1); + _mm_store_si128(pLocalDst_xmm++, pixels2); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc1++) - (vx_int32)(*pLocalSrc2++); + *pLocalDst++ = (vx_int16) max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels3 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels4 = _mm_loadu_si128(pLocalSrc2_xmm++); + + pixels1 = _mm_subs_epi16(pixels1, pixels3); + pixels2 = _mm_subs_epi16(pixels2, pixels4); + + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + _mm_storeu_si128(pLocalDst_xmm++, pixels2); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_int32 temp = (vx_int32)(*pLocalSrc1++) - (vx_int32)(*pLocalSrc2++); + *pLocalDst++ = (vx_int16) max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_AbsDiff_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_load_si128(pLocalSrc1_xmm++); + pixels2L = _mm_load_si128(pLocalSrc2_xmm++); + + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + + pixels1H = _mm_sub_epi16(pixels1H, pixels2H); + pixels1L = _mm_sub_epi16(pixels1L, pixels2L); + pixels1H = _mm_abs_epi16(pixels1H); + pixels1L = _mm_abs_epi16(pixels1L); + + pixels1L = _mm_packus_epi16(pixels1L, pixels1H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (vx_uint8)abs((vx_int16)(*pLocalSrc1++) - (vx_int16)(*pLocalSrc2++)); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1L = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2L = _mm_loadu_si128(pLocalSrc2_xmm++); + + pixels1H = _mm_unpackhi_epi8(pixels1L, zeromask); + pixels2H = _mm_unpackhi_epi8(pixels2L, zeromask); + pixels1L = _mm_cvtepu8_epi16(pixels1L); + pixels2L = _mm_cvtepu8_epi16(pixels2L); + + pixels1H = _mm_sub_epi16(pixels1H, pixels2H); + pixels1L = _mm_sub_epi16(pixels1L, pixels2L); + pixels1H = _mm_abs_epi16(pixels1H); + pixels1L = _mm_abs_epi16(pixels1L); + + pixels1L = _mm_packus_epi16(pixels1L, pixels1H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (vx_uint8)abs((vx_int16)(*pLocalSrc1++) - (vx_int16)(*pLocalSrc2++)); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + +int HafCpu_AbsDiff_S16_S16S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + + __m128i pixels1H, pixels1L, pixels2H, pixels2L; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~7; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 8) + { + pixels1L = _mm_load_si128(pLocalSrc1_xmm++); + pixels2L = _mm_load_si128(pLocalSrc2_xmm++); + + pixels1H = _mm_srli_si128(pixels1L, 8); + pixels1H = _mm_cvtepi16_epi32(pixels1H); + pixels1L = _mm_cvtepi16_epi32(pixels1L); + pixels2H = _mm_srli_si128(pixels2L, 8); + pixels2H = _mm_cvtepi16_epi32(pixels2H); + pixels2L = _mm_cvtepi16_epi32(pixels2L); + + pixels1H = _mm_sub_epi32(pixels1H, pixels2H); + pixels1L = _mm_sub_epi32(pixels1L, pixels2L); + pixels1H = _mm_abs_epi32(pixels1H); + pixels1L = _mm_abs_epi32(pixels1L); + + pixels1L = _mm_packs_epi32(pixels1L, pixels1H); + _mm_store_si128(pLocalDst_xmm++, pixels1L); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (vx_int16)abs((vx_int32)(*pLocalSrc1++) - (vx_int32)(*pLocalSrc2++)); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 8) + { + pixels1L = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2L = _mm_loadu_si128(pLocalSrc2_xmm++); + + pixels1H = _mm_srli_si128(pixels1L, 8); + pixels1H = _mm_cvtepi16_epi32(pixels1H); + pixels1L = _mm_cvtepi16_epi32(pixels1L); + pixels2H = _mm_srli_si128(pixels2L, 8); + pixels2H = _mm_cvtepi16_epi32(pixels2H); + pixels2L = _mm_cvtepi16_epi32(pixels2L); + + pixels1H = _mm_sub_epi32(pixels1H, pixels2H); + pixels1L = _mm_sub_epi32(pixels1L, pixels2L); + pixels1H = _mm_abs_epi32(pixels1H); + pixels1L = _mm_abs_epi32(pixels1L); + + pixels1L = _mm_packs_epi32(pixels1L, pixels1H); + _mm_storeu_si128(pLocalDst_xmm++, pixels1L); + } + + pLocalSrc1 = (vx_int16 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_int16 *)pLocalSrc2_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = (vx_int16)abs((vx_int32)(*pLocalSrc1++) - (vx_int32)(*pLocalSrc2++)); + } + + pSrcImage1 += (srcImage1StrideInBytes >> 1); + pSrcImage2 += (srcImage2StrideInBytes >> 1); + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_AccumulateSquared_S16_S16U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint32 shift + ) +{ + bool useAligned = ((((intptr_t)pSrcImage | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc; + vx_int16 *pLocalDst; + __m128i zeromask = _mm_setzero_si128(); + __m128i resultHH, resultHL, resultLH, resultLL, pixelsHH, pixelsHL, pixelsLH, pixelsLL; + + int height = (int)dstHeight; + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + while (height) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + int width = alignedWidth >> 4; // 16 pixels at a time + while (width) + { + pixelsLL = _mm_load_si128(pLocalSrc_xmm++); + resultLL = _mm_load_si128(pLocalDst_xmm); + resultHL = _mm_load_si128(pLocalDst_xmm + 1); + + // Convert input to 32 bit + pixelsHL = _mm_unpackhi_epi8(pixelsLL, zeromask); + pixelsHH = _mm_unpackhi_epi16(pixelsHL, zeromask); + pixelsHL = _mm_cvtepu16_epi32(pixelsHL); + pixelsLL = _mm_unpacklo_epi8(pixelsLL, zeromask); + pixelsLH = _mm_unpackhi_epi16(pixelsLL, zeromask); + pixelsLL = _mm_cvtepu16_epi32(pixelsLL); + + // Convert result to 32 bit + resultHH = _mm_srli_si128(resultHL, 8); + resultHH = _mm_cvtepi16_epi32(resultHH); + resultHL = _mm_cvtepi16_epi32(resultHL); + resultLH = _mm_srli_si128(resultLL, 8); + resultLH = _mm_cvtepi16_epi32(resultLH); + resultLL = _mm_cvtepi16_epi32(resultLL); + + // Multiply + pixelsHH = _mm_mullo_epi32(pixelsHH, pixelsHH); + pixelsHL = _mm_mullo_epi32(pixelsHL, pixelsHL); + pixelsLH = _mm_mullo_epi32(pixelsLH, pixelsLH); + pixelsLL = _mm_mullo_epi32(pixelsLL, pixelsLL); + + pixelsHH = _mm_srai_epi32(pixelsHH, shift); + pixelsHL = _mm_srai_epi32(pixelsHL, shift); + pixelsLH = _mm_srai_epi32(pixelsLH, shift); + pixelsLL = _mm_srai_epi32(pixelsLL, shift); + + resultHH = _mm_add_epi32(resultHH, pixelsHH); + resultHL = _mm_add_epi32(resultHL, pixelsHL); + resultLH = _mm_add_epi32(resultLH, pixelsLH); + resultLL = _mm_add_epi32(resultLL, pixelsLL); + + resultHL = _mm_packs_epi32(resultHL, resultHH); + resultLL = _mm_packs_epi32(resultLL, resultLH); + + _mm_store_si128(pLocalDst_xmm++, resultLL); + _mm_store_si128(pLocalDst_xmm++, resultHL); + + width--; + } + + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++, pLocalSrc++) + { + vx_int32 temp = ((vx_int32)*pLocalSrc * (vx_int32)*pLocalSrc) >> shift; + temp += (vx_int32)*pLocalDst; + temp = max(min(temp, (vx_int32)INT16_MAX), (vx_int32)INT16_MIN); + *pLocalDst++ = (vx_int16)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + height--; + } + } + else + { + while (height) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + int width = alignedWidth >> 4; // 16 pixels at a time + while (width) + { + pixelsLL = _mm_loadu_si128(pLocalSrc_xmm++); + resultLL = _mm_loadu_si128(pLocalDst_xmm); + resultHL = _mm_loadu_si128(pLocalDst_xmm + 1); + + // Convert input to 32 bit + pixelsHL = _mm_unpackhi_epi8(pixelsLL, zeromask); + pixelsHH = _mm_unpackhi_epi16(pixelsHL, zeromask); + pixelsHL = _mm_cvtepu16_epi32(pixelsHL); + pixelsLL = _mm_unpacklo_epi8(pixelsLL, zeromask); + pixelsLH = _mm_unpackhi_epi16(pixelsLL, zeromask); + pixelsLL = _mm_cvtepu16_epi32(pixelsLL); + + // Convert result to 32 bit + resultHH = _mm_srli_si128(resultHL, 8); + resultHH = _mm_cvtepi16_epi32(resultHH); + resultHL = _mm_cvtepi16_epi32(resultHL); + resultLH = _mm_srli_si128(resultLL, 8); + resultLH = _mm_cvtepi16_epi32(resultLH); + resultLL = _mm_cvtepi16_epi32(resultLL); + + // Multiply 
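+ // Square each zero-extended pixel in 32-bit lanes, shift right by 'shift', then add to the
+ // sign-extended S16 accumulator values before packing back down with signed saturation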
+ pixelsHH = _mm_mullo_epi32(pixelsHH, pixelsHH); + pixelsHL = _mm_mullo_epi32(pixelsHL, pixelsHL); + pixelsLH = _mm_mullo_epi32(pixelsLH, pixelsLH); + pixelsLL = _mm_mullo_epi32(pixelsLL, pixelsLL); + + pixelsHH = _mm_srai_epi32(pixelsHH, shift); + pixelsHL = _mm_srai_epi32(pixelsHL, shift); + pixelsLH = _mm_srai_epi32(pixelsLH, shift); + pixelsLL = _mm_srai_epi32(pixelsLL, shift); + + resultHH = _mm_add_epi32(resultHH, pixelsHH); + resultHL = _mm_add_epi32(resultHL, pixelsHL); + resultLH = _mm_add_epi32(resultLH, pixelsLH); + resultLL = _mm_add_epi32(resultLL, pixelsLL); + + resultHL = _mm_packs_epi32(resultHL, resultHH); + resultLL = _mm_packs_epi32(resultLL, resultLH); + + _mm_storeu_si128(pLocalDst_xmm++, resultLL); + _mm_storeu_si128(pLocalDst_xmm++, resultHL); + + width--; + } + + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++, pLocalSrc++) + { + vx_int32 temp = ((vx_int32)*pLocalSrc * (vx_int32)*pLocalSrc) >> shift; + temp += (vx_int32)*pLocalDst; + temp = max(min(temp, (vx_int32)INT16_MAX), (vx_int32)INT16_MIN); + *pLocalDst++ = (vx_int16)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + height--; + } + } + return AGO_SUCCESS; +} + + +int HafCpu_Accumulate_S16_S16U8_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage | (intptr_t)pDstImage) & 0xF) == 0) ? true : false; + + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc; + vx_int16 *pLocalDst; + + __m128i resultL, resultH, pixelsL, pixelsH; + __m128i zeromask = _mm_setzero_si128(); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + resultL = _mm_load_si128(pLocalDst_xmm); + resultH = _mm_load_si128(pLocalDst_xmm + 1); + pixelsL = _mm_load_si128(pLocalSrc_xmm++); + pixelsH = _mm_unpackhi_epi8(pixelsL, zeromask); + pixelsL = _mm_cvtepu8_epi16(pixelsL); + resultL = _mm_adds_epi16(resultL, pixelsL); + resultH = _mm_adds_epi16(resultH, pixelsH); + _mm_store_si128(pLocalDst_xmm++, resultL); + _mm_store_si128(pLocalDst_xmm++, resultH); + } + + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++, pLocalSrc++) + { + vx_int32 temp = (vx_int32)*pLocalDst + (vx_int32)*pLocalSrc; + *pLocalDst++ = (vx_int16)max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + resultL = _mm_loadu_si128(pLocalDst_xmm); + resultH = _mm_loadu_si128(pLocalDst_xmm + 1); + pixelsL = _mm_loadu_si128(pLocalSrc_xmm++); + pixelsH = _mm_unpackhi_epi8(pixelsL, zeromask); + pixelsL = _mm_cvtepu8_epi16(pixelsL); + resultL = _mm_adds_epi16(resultL, pixelsL); + resultH = _mm_adds_epi16(resultH, pixelsH); + _mm_storeu_si128(pLocalDst_xmm++, resultL); + _mm_storeu_si128(pLocalDst_xmm++, resultH); + } + + pLocalSrc = (vx_uint8 
*)pLocalSrc_xmm; + pLocalDst = (vx_int16 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++, pLocalSrc++) + { + vx_int32 temp = (vx_int32)*pLocalDst + (vx_int32)*pLocalSrc; + *pLocalDst++ = (vx_int16)max(min(temp, INT16_MAX), INT16_MIN); + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + } + return AGO_SUCCESS; +} + +int HafCpu_ColorDepth_U8_S16_Wrap + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int32 shift + ) +{ + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i maskL = _mm_set_epi8((char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, (char)0x0E, (char)0x0C, (char)0x0A, (char)0x08, (char)0x06, (char)0x04, (char)0x02, (char)0x0); + __m128i maskH = _mm_set_epi8((char)0x0E, (char)0x0C, (char)0x0A, (char)0x08, (char)0x06, (char)0x04, (char)0x02, (char)0x0, (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF); + __m128i pixels1, pixels2; + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_int16 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < prefixWidth; width++) + { + int pix = (int) (*pLocalSrc++); + *pLocalDst++ = (vx_uint8)((pix >> shift) & 0xFF); + } + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 8)); + pixels1 = _mm_srai_epi16(pixels1, (int) shift); + pixels2 = _mm_srai_epi16(pixels2, (int) shift); + pixels1 = _mm_shuffle_epi8(pixels1, maskL); + pixels2 = _mm_shuffle_epi8(pixels2, maskH); + pixels1 = _mm_or_si128(pixels1, pixels2); + _mm_store_si128((__m128i *)pLocalDst, pixels1); + + pLocalSrc += 16; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + int pix = *pLocalSrc++; + *pLocalDst++ = (vx_uint8)((pix >> shift) & 0xFF); + } + + pSrcImage += (srcImageStrideInBytes >> 1); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorDepth_U8_S16_Sat + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int32 shift + ) +{ + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i pixels1, pixels2; + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_int16 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < prefixWidth; width++) + { + int pix = (int) (*pLocalSrc++); + pix >>= shift; + pix = min(max(pix, 0), 255); + *pLocalDst++ = (vx_uint8)(pix); + } + + for (int width = 0; width < (int)alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 8)); + pixels1 = _mm_srai_epi16(pixels1, (int)shift); + pixels2 = _mm_srai_epi16(pixels2, (int)shift); + pixels1 = _mm_packus_epi16(pixels1, pixels2); + _mm_store_si128((__m128i *)pLocalDst, pixels1); + + pLocalSrc += 16; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + int pix = *pLocalSrc++; + pix >>= shift; + pix = min(max(pix, 0), 255); + *pLocalDst++ = (vx_uint8)(pix); + } + + pSrcImage += (srcImageStrideInBytes >> 1); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorDepth_S16_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int32 shift + ) +{ + int prefixWidth = intptr_t(pDstImage) & 7; // Two bytes in output = 1 pixel + prefixWidth = (prefixWidth == 0) ? 0 : (8 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i zeromask = _mm_setzero_si128(); + __m128i pixelsL, pixelsH; + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_int16 * pLocalDst = pDstImage; + + for (int width = 0; width < prefixWidth; width++) + { + int pix = (int) (*pLocalSrc++); + *pLocalDst++ = (vx_int16) (pix << shift); + } + + for (int width = 0; width < alignedWidth; width += 16) + { + pixelsL = _mm_loadu_si128((__m128i *)pLocalSrc); + pixelsH = _mm_unpackhi_epi8(pixelsL, zeromask); + pixelsL = _mm_cvtepu8_epi16(pixelsL); + pixelsL = _mm_slli_epi16(pixelsL, (int)shift); + pixelsH = _mm_slli_epi16(pixelsH, (int)shift); + _mm_store_si128((__m128i *)pLocalDst, pixelsL); + _mm_store_si128((__m128i *)(pLocalDst + 8), pixelsH); + + pLocalSrc += 16; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + int pix = (int)(*pLocalSrc++); + *pLocalDst++ = (vx_int16)(pix << shift); + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + } + return AGO_SUCCESS; +} + +int HafCpu_Threshold_U8_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ) +{ + bool useAligned = ((((intptr_t)pSrcImage | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + __m128i pixels; + __m128i offset = _mm_set1_epi8((char) 0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i thresh = _mm_set1_epi8((char) threshold); + thresh = _mm_xor_si128(thresh, offset); // Convert the threshold to the new range + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int) dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_load_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + pixels = _mm_cmpgt_epi8(pixels, thresh); + _mm_store_si128(pLocalDst_xmm++, pixels); + } + + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_uint8 pix = *pLocalSrc++; + *pLocalDst++ = (pix > threshold) ? (vx_uint8)255 : 0; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int) dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + pixels = _mm_cmpgt_epi8(pixels, thresh); + _mm_storeu_si128(pLocalDst_xmm++, pixels); + } + + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_uint8 pix = *pLocalSrc++; + *pLocalDst++ = (pix > threshold) ? (vx_uint8)255 : 0; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + +int HafCpu_Threshold_U8_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ) +{ + bool useAligned = ((((intptr_t)pSrcImage | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + __m128i pixels; + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i threshU = _mm_set1_epi8((char)upper); + __m128i threshL = _mm_set1_epi8((char)lower); + __m128i ones = _mm_set1_epi8((char)0xFF); + __m128i temp; + + threshU = _mm_xor_si128(threshU, offset); // Convert the upper threshold to the new range + threshL = _mm_xor_si128(threshL, offset); // Convert the lower threshold to the new range + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_load_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + temp = _mm_cmpgt_epi8(pixels, threshU); // pixels > upper gives 255 + pixels = _mm_cmplt_epi8(pixels, threshL); // pixels < lower gives 255 + pixels = _mm_or_si128(pixels, temp); + pixels = _mm_andnot_si128(pixels, ones); + _mm_store_si128(pLocalDst_xmm++, pixels); + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_uint8 pix = *pLocalSrc++; + *pLocalDst++ = ((pix > upper) && (pix < lower)) ? 0 : (vx_uint8)255; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + temp = _mm_cmpgt_epi8(pixels, threshU); // pixels > upper gives 255 + pixels = _mm_cmplt_epi8(pixels, threshL); // pixels < lower gives 255 + pixels = _mm_or_si128(pixels, temp); + pixels = _mm_andnot_si128(pixels, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels); + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_uint8 pix = *pLocalSrc++; + *pLocalDst++ = ((pix > upper) && (pix < lower)) ? 
0 : (vx_uint8)255; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +#if USE_BMI2 +/* The function assumes that the source image pointer is 16 byte aligned, and the source stride as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Threshold_U1_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ) +{ + __m128i * src = (__m128i*)pSrcImage; + + __m128i pixels; + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i thresh = _mm_set1_epi8((char)threshold); + thresh = _mm_xor_si128(thresh, offset); // Convert the threshold to the new range + + uint64_t maskConv = 0x0101010101010101; + uint64_t result[2]; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels = _mm_load_si128(&src[width >> 4]); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + pixels = _mm_cmpgt_epi8(pixels, thresh); + + // Convert U8 to U1 +#ifdef _WIN64 + result[0] = _pext_u64(pixels.m128i_u64[0], maskConv); + result[1] = _pext_u64(pixels.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((result[1] & 0xFF) << 8) | (result[0] & 0xFF)); + } + src += (srcImageStrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the source image pointer is 16 byte aligned, and the source stride as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ + int HafCpu_Threshold_U1_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ) +{ + __m128i * src = (__m128i*)pSrcImage; + __m128i pixels; + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i threshU = _mm_set1_epi8((char)upper); + __m128i threshL = _mm_set1_epi8((char)lower); + __m128i ones = _mm_set1_epi8((char)1); + __m128i temp; + + threshU = _mm_xor_si128(threshU, offset); // Convert the upper threshold to the new range + threshL = _mm_xor_si128(threshL, offset); // Convert the lower threshold to the new range + + uint64_t maskConv = 0x0101010101010101; + uint64_t result[2]; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels = _mm_load_si128(&src[width >> 4]); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + temp = _mm_cmpgt_epi8(pixels, threshU); + temp = _mm_andnot_si128(temp, ones); // This gives 255 if pixels <= threshU, a way to implement less than or equal to + pixels = _mm_cmplt_epi8(pixels, threshL); + pixels = _mm_andnot_si128(pixels, temp); // 255 if pixels >= threshL and AND with temp + + // Convert U8 to U1 +#ifdef _WIN64 + result[0] = _pext_u64(pixels.m128i_u64[0], maskConv); + result[1] = _pext_u64(pixels.m128i_u64[1], maskConv); 
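+ // _pext_u64 with the 0x0101...01 mask gathers the low bit of each comparison byte, so each
+ // call packs eight 0x00/0xFF compare results into eight consecutive U1 bits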
+#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((result[1] & 0xFF) << 8) | (result[0] & 0xFF)); + } + src += (srcImageStrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else + +int HafCpu_Threshold_U1_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ) +{ + __m128i * pLocalSrc_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + + __m128i pixels; + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i thresh = _mm_set1_epi8((char)threshold); + thresh = _mm_xor_si128(thresh, offset); // Convert the threshold to the new range + + int pixelmask; + int height = (int)dstHeight; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + while (height) + { + pLocalSrc_xmm = (__m128i*) pSrcImage; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + int width = (int)(alignedWidth >> 4); // 16 pixels (bits) are processed at a time in the inner loop + while (width) + { + pixels = _mm_loadu_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + pixels = _mm_cmpgt_epi8(pixels, thresh); + + pixelmask = _mm_movemask_epi8(pixels); // Convert U8 to U1 + *pLocalDst_16++ = (vx_int16)(pixelmask & 0xFFFF); + width--; + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_16; + + width = 0; + while (width < postfixWidth) + { + pixelmask = 0; + for (int i = 0; i < 8; i++, width++) + { + if (*pLocalSrc++ > threshold) + pixelmask |= 1; + pixelmask <<= 1; + } + *pLocalDst++ = (vx_uint8)(pixelmask & 0xFF); + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Threshold_U1_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ) +{ + __m128i * pLocalSrc_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + + __m128i pixels, temp; + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i threshU = _mm_set1_epi8((char)upper); + __m128i threshL = _mm_set1_epi8((char)lower); + __m128i ones = _mm_set1_epi8((char)0xFF); + + threshU = _mm_xor_si128(threshU, offset); // Convert the upper threshold to the new range + threshL = _mm_xor_si128(threshL, offset); // Convert the lower threshold to the new range + + int pixelmask; + int height = (int)dstHeight; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + while (height) + { + pLocalSrc_xmm = (__m128i*) pSrcImage; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + int width = (int)(dstWidth >> 4); // 16 pixels (bits) are processed at a time in the inner loop + + while (width) + { + pixels = _mm_loadu_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + temp = _mm_cmpgt_epi8(pixels, threshU); // pixels > upper gives 255 + pixels = _mm_cmplt_epi8(pixels, threshL); // pixels < lower gives 255 + pixels = _mm_or_si128(pixels, temp); + pixels = 
_mm_andnot_si128(pixels, ones); + + pixelmask = _mm_movemask_epi8(pixels); // Convert U8 to U1 + *pLocalDst_16++ = (short)(pixelmask & 0xFFFF); + width--; + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_16; + + width = 0; + while (width < postfixWidth) + { + pixelmask = 0; + vx_uint8 pix = *pLocalSrc++; + for (int i = 0; i < 8; i++, width++) + { + if ((pix >= lower) && (pix <= upper)) + pixelmask |= 1; + pixelmask <<= 1; + } + *pLocalDst++ = (vx_uint8)(pixelmask & 0xFF); + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} +#endif + +int HafCpu_ThresholdNot_U8_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ) +{ + bool useAligned = ((((intptr_t)pSrcImage | (intptr_t)pDstImage) & 0xF) == 0) ? true : false; + + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + __m128i pixels; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i thresh = _mm_set1_epi8((char)threshold); + thresh = _mm_xor_si128(thresh, offset); // Convert the threshold to the new range + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_load_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + pixels = _mm_cmpgt_epi8(pixels, thresh); + pixels = _mm_andnot_si128(pixels, ones); // NOT + _mm_store_si128(pLocalDst_xmm++, pixels); + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_uint8 pix = *pLocalSrc++; + *pLocalDst++ = (pix > threshold) ? 0 : (vx_uint8)255; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + pixels = _mm_cmpgt_epi8(pixels, thresh); + pixels = _mm_andnot_si128(pixels, ones); // NOT + _mm_storeu_si128(pLocalDst_xmm++, pixels); + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_uint8 pix = *pLocalSrc++; + *pLocalDst++ = (pix > threshold) ? 0 : (vx_uint8)255; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +int HafCpu_ThresholdNot_U8_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ) +{ + bool useAligned = ((((intptr_t)pSrcImage | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + __m128i pixels; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i threshU = _mm_set1_epi8((char)upper); + __m128i threshL = _mm_set1_epi8((char)lower); + __m128i temp; + + threshU = _mm_xor_si128(threshU, offset); // Convert the upper threshold to the new range + threshL = _mm_xor_si128(threshL, offset); // Convert the lower threshold to the new range + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_load_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + temp = _mm_cmpgt_epi8(pixels, threshU); // pixels > upper gives 255 + pixels = _mm_cmplt_epi8(pixels, threshL); // pixels < lower gives 255 + pixels = _mm_or_si128(pixels, temp); + _mm_store_si128(pLocalDst_xmm++, pixels); + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_uint8 pix = *pLocalSrc++; + *pLocalDst++ = ((pix > upper) && (pix < lower)) ? (vx_uint8)255 : 0; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + temp = _mm_cmpgt_epi8(pixels, threshU); // pixels > upper gives 255 + pixels = _mm_cmplt_epi8(pixels, threshL); // pixels < lower gives 255 + pixels = _mm_or_si128(pixels, temp); + _mm_storeu_si128(pLocalDst_xmm++, pixels); + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + vx_uint8 pix = *pLocalSrc++; + *pLocalDst++ = ((pix > upper) && (pix < lower)) ? 
(vx_uint8)255 : 0; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + +#if USE_BMI2 +/* The function assumes that the source image pointer is 16 byte aligned, and the source stride as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_ThresholdNot_U1_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ) +{ + __m128i * src = (__m128i*)pSrcImage; + + __m128i pixels; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i thresh = _mm_set1_epi8((char)threshold); + thresh = _mm_xor_si128(thresh, offset); // Convert the threshold to the new range + + uint64_t maskConv = 0x0101010101010101; + uint64_t result[2]; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels = _mm_load_si128(&src[width >> 4]); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + pixels = _mm_cmpgt_epi8(pixels, thresh); + pixels = _mm_andnot_si128(pixels, ones); // NOT + + // Convert U8 to U1 +#ifdef _WIN64 + result[0] = _pext_u64(pixels.m128i_u64[0], maskConv); + result[1] = _pext_u64(pixels.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((result[1] & 0xFF) << 8) | (result[0] & 0xFF)); + } + src += (srcImageStrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the source image pointer is 16 byte aligned, and the source stride as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_ThresholdNot_U1_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ) +{ + __m128i * src = (__m128i*)pSrcImage; + __m128i pixels; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i threshU = _mm_set1_epi8((char)upper); + __m128i threshL = _mm_set1_epi8((char)lower); + __m128i temp; + + threshU = _mm_xor_si128(threshU, offset); // Convert the upper threshold to the new range + threshL = _mm_xor_si128(threshL, offset); // Convert the lower threshold to the new range + + uint64_t maskConv = 0x0101010101010101; + uint64_t result[2]; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels = _mm_load_si128(&src[width >> 4]); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + temp = _mm_cmpgt_epi8(pixels, threshU); + temp = _mm_andnot_si128(temp, ones); // This gives 255 if pixels <= threshU, a way to implement less than or equal to + pixels = _mm_cmplt_epi8(pixels, threshL); + pixels = _mm_andnot_si128(pixels, temp); // 255 if pixels >= threshL and AND with temp + pixels = _mm_andnot_si128(pixels, ones); 
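+	/* Descriptive note: at this point each byte of 'pixels' is 0xFF when the source pixel lies
+	   outside [lower, upper] (i.e. pix < lower || pix > upper) and 0x00 when it lies inside the
+	   range; the _pext_u64 below then keeps bit 0 of every byte to pack the U8 mask into U1. */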
// NOT + + // Convert U8 to U1 +#ifdef _WIN64 + result[0] = _pext_u64(pixels.m128i_u64[0], maskConv); + result[1] = _pext_u64(pixels.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((result[1] & 0xFF) << 8) | (result[0] & 0xFF)); + } + src += (srcImageStrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else + +int HafCpu_ThresholdNot_U1_U8_Binary + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 threshold + ) +{ + __m128i * pLocalSrc_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + + __m128i pixels; + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i thresh = _mm_set1_epi8((char)threshold); + thresh = _mm_xor_si128(thresh, offset); // Convert the threshold to the new range + + int pixelmask; + int height = (int)dstHeight; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + while (height) + { + pLocalSrc_xmm = (__m128i*) pSrcImage; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + int width = (int)(dstWidth >> 4); // 16 pixels (bits) are processed at a time in the inner loop + while (width) + { + pixels = _mm_load_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + pixels = _mm_cmpgt_epi8(pixels, thresh); + + pixelmask = _mm_movemask_epi8(pixels); // Convert U8 to U1 + *pLocalDst_16++ = (vx_int16)(~pixelmask & 0xFFFF); + width--; + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_16; + + width = 0; + while (width < postfixWidth) + { + pixelmask = 0; + for (int i = 0; i < 8; i++, width++) + { + if (*pLocalSrc++ <= threshold) + pixelmask |= 1; + pixelmask <<= 1; + } + *pLocalDst++ = (vx_uint8)(pixelmask & 0xFF); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_ThresholdNot_U1_U8_Range + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 lower, + vx_uint8 upper + ) +{ + __m128i * pLocalSrc_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + + __m128i pixels, temp; + __m128i offset = _mm_set1_epi8((char)0x80); // To convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + __m128i threshU = _mm_set1_epi8((char)upper); + __m128i threshL = _mm_set1_epi8((char)lower); + + threshU = _mm_xor_si128(threshU, offset); // Convert the upper threshold to the new range + threshL = _mm_xor_si128(threshL, offset); // Convert the lower threshold to the new range + + int pixelmask; + int height = (int)dstHeight; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + while (height) + { + pLocalSrc_xmm = (__m128i*) pSrcImage; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + int width = (int)(dstWidth >> 4); // 16 pixels (bits) are processed at a time in the inner loop + + while (width) + { + pixels = _mm_load_si128(pLocalSrc_xmm++); + pixels = _mm_xor_si128(pixels, offset); // Convert the pixels to the new range + temp = _mm_cmpgt_epi8(pixels, threshU); // pixels > upper gives 255 + pixels 
= _mm_cmplt_epi8(pixels, threshL); // pixels < lower gives 255 + pixels = _mm_or_si128(pixels, temp); + + pixelmask = _mm_movemask_epi8(pixels); // Convert U8 to U1 + *pLocalDst_16++ = (vx_int16)(pixelmask & 0xFFFF); + width--; + } + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_16; + + width = 0; + while (width < postfixWidth) + { + pixelmask = 0; + vx_uint8 pix = *pLocalSrc++; + for (int i = 0; i < 8; i++, width++) + { + if ((pix < lower) && (pix > upper)) + pixelmask |= 1; + pixelmask <<= 1; + } + *pLocalDst++ = (vx_uint8)(pixelmask & 0xFF); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} +#endif + +// compute the dstImage values from the LUT of srcImage +int HafCpu_Lut_U8_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLut + ) +{ + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // Check for multiple of 16 + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i pixels1, pixels2; + int p0, p1, p2, p3; + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + + for (int height = 0; height < (int)dstHeight; height++) + { + unsigned char * pLocalDst = (unsigned char*)pDstImage; + unsigned char * pLocalSrc = (unsigned char*)pSrcImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++, pLocalDst++) + { + *pLocalDst = pLut[*pLocalSrc]; + } + + for (int x = 0; x < (alignedWidth >> 4); x++) + { + pixels1 = _mm_loadu_si128((__m128i *) pLocalSrc); + p0 = _mm_cvtsi128_si32(pixels1); + p1 = _mm_extract_epi32(pixels1, 1); + p2 = _mm_extract_epi32(pixels1, 2); + p3 = _mm_extract_epi32(pixels1, 3); + p0 = pLut[p0 & 0xff] | (pLut[(p0 >> 8) & 0xFF] << 8) | (pLut[(p0 >> 16) & 0xFF] << 16) | (pLut[(p0 >> 24) & 0xFF] << 24); + p1 = pLut[p1 & 0xff] | (pLut[(p1 >> 8) & 0xFF] << 8) | (pLut[(p1 >> 16) & 0xFF] << 16) | (pLut[(p1 >> 24) & 0xFF] << 24); + p2 = pLut[p2 & 0xff] | (pLut[(p2 >> 8) & 0xFF] << 8) | (pLut[(p2 >> 16) & 0xFF] << 16) | (pLut[(p2 >> 24) & 0xFF] << 24); + p3 = pLut[p3 & 0xff] | (pLut[(p3 >> 8) & 0xFF] << 8) | (pLut[(p3 >> 16) & 0xFF] << 16) | (pLut[(p3 >> 24) & 0xFF] << 24); + M128I(pixels2).m128i_u32[0] = p0; + M128I(pixels2).m128i_u32[1] = p1; + M128I(pixels2).m128i_u32[2] = p2; + M128I(pixels2).m128i_u32[3] = p3; + _mm_store_si128((__m128i *) pLocalDst, pixels2); + + pLocalSrc += 16; + pLocalDst += 16; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++, pLocalDst++) + { + *pLocalDst = pLut[*pLocalSrc]; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +int HafCpu_Magnitude_S16_S16S16 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pMagImage, + vx_uint32 magImageStrideInBytes, + vx_int16 * pGxImage, + vx_uint32 gxImageStrideInBytes, + vx_int16 * pGyImage, + vx_uint32 gyImageStrideInBytes + ) +{ + short *pLocalGx, *pLocalGy, *pLocalDst; + + int prefixWidth = intptr_t(pMagImage) & 15; // check for 16 byte aligned + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; + int postfixWidth = ((int)dstWidth - prefixWidth) & 7; // Check for multiple of 8 + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i pixelsGxH, pixelsGxL, pixelsGyH, pixelsGyL; + __m128d pixels0, pixels1, pixels2, pixels3, temp; + + for (unsigned int height = 0; height < dstHeight; height++) + { + pLocalGx = (short *)pGxImage; + pLocalGy = (short *)pGyImage; + pLocalDst = (short *)pMagImage; + + for (int x = 0; x < prefixWidth; x++, pLocalGx++, pLocalGy++) + { + float temp = (float)(*pLocalGx * *pLocalGx) + (float)(*pLocalGy * *pLocalGy); + temp = sqrtf(temp); + *pLocalDst++ = (vx_int16)temp; + } + + for (int width = 0; width < (alignedWidth >> 3); width++) + { + pixelsGxH = _mm_loadu_si128((__m128i *) pLocalGx); + pixelsGyH = _mm_loadu_si128((__m128i *) pLocalGy); + + pixelsGxL = _mm_cvtepi16_epi32(pixelsGxH); // Convert lower 4 words to dwords + pixelsGyL = _mm_cvtepi16_epi32(pixelsGyH); // Convert lower 4 words to dwords + pixelsGxH = _mm_srli_si128(pixelsGxH, 8); + pixelsGyH = _mm_srli_si128(pixelsGyH, 8); + pixelsGxH = _mm_cvtepi16_epi32(pixelsGxH); // Convert upper 4 words to dwords + pixelsGyH = _mm_cvtepi16_epi32(pixelsGyH); // Convert upper 4 words to dwords + + pixelsGxL = _mm_mullo_epi32(pixelsGxL, pixelsGxL); // square + pixelsGxH = _mm_mullo_epi32(pixelsGxH, pixelsGxH); + pixelsGyL = _mm_mullo_epi32(pixelsGyL, pixelsGyL); + pixelsGyH = _mm_mullo_epi32(pixelsGyH, pixelsGyH); + + // Convert to double precision values + pixels0 = _mm_cvtepi32_pd(pixelsGxL); + temp = _mm_cvtepi32_pd(pixelsGyL); + pixels0 = _mm_add_pd(pixels0, temp); // Lower two values a^2 + b^2 + + pixelsGxL = _mm_srli_si128(pixelsGxL, 8); + pixelsGyL = _mm_srli_si128(pixelsGyL, 8); + pixels1 = _mm_cvtepi32_pd(pixelsGxL); + temp = _mm_cvtepi32_pd(pixelsGyL); + pixels1 = _mm_add_pd(pixels1, temp); // Next two values a^2 + b^2 + + pixels2 = _mm_cvtepi32_pd(pixelsGxH); + temp = _mm_cvtepi32_pd(pixelsGyH); + pixels2 = _mm_add_pd(pixels2, temp); // Next two values a^2 + b^2 + + pixelsGxH = _mm_srli_si128(pixelsGxH, 8); + pixelsGyH = _mm_srli_si128(pixelsGyH, 8); + pixels3 = _mm_cvtepi32_pd(pixelsGxH); + temp = _mm_cvtepi32_pd(pixelsGyH); + pixels3 = _mm_add_pd(pixels3, temp); // Upper two values a^2 + b^2 + + pixels0 = _mm_sqrt_pd(pixels0); // square root + pixels1 = _mm_sqrt_pd(pixels1); // square root + pixels2 = _mm_sqrt_pd(pixels2); // square root + pixels3 = _mm_sqrt_pd(pixels3); // square root + + pixelsGxL = _mm_cvtpd_epi32(pixels0); // Convert double to lower 2 dwords + pixelsGyL = _mm_cvtpd_epi32(pixels1); // Convert double to next 2 dwords + pixelsGxH = _mm_cvtpd_epi32(pixels2); // Convert double to next 2 dwords + pixelsGyH = _mm_cvtpd_epi32(pixels3); // Convert double to upper 2 dwords + + pixelsGyL = _mm_slli_si128(pixelsGyL, 8); + pixelsGyH = _mm_slli_si128(pixelsGyH, 8); + pixelsGxL = _mm_or_si128(pixelsGxL, pixelsGyL); + pixelsGxH = _mm_or_si128(pixelsGxH, pixelsGyH); + + pixelsGxL = _mm_packs_epi32(pixelsGxL, pixelsGxH); + _mm_store_si128((__m128i *) pLocalDst, pixelsGxL); + + pLocalGx += 8; + pLocalGy += 8; + pLocalDst += 8; + } + + for (int x = 0; x < postfixWidth; x++, pLocalGx++, pLocalGy++) + { + float temp = (float)(*pLocalGx * *pLocalGx) + (float)(*pLocalGy * *pLocalGy); + temp = sqrtf(temp); + *pLocalDst++ = (vx_int16)temp; + } + + pGxImage += (gxImageStrideInBytes >> 1); + pGyImage += (gyImageStrideInBytes >> 1); + pMagImage += (magImageStrideInBytes >> 1); + } + return AGO_SUCCESS; +} + +int 
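+	/* Descriptive note: weighted accumulation updates every destination pixel as
+	   dst = (1 - alpha) * dst + alpha * src, computed in single precision and truncated
+	   back to U8 (alpha is expected to lie in [0, 1]). */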
HafCpu_AccumulateWeighted_U8_U8U8 +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_float32 alpha +) +{ + bool useAligned = ((((intptr_t)pSrcImage | (intptr_t)pDstImage) & 0xF) == 0) ? true : false; + + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + + __m128i pixelsI0, pixelsI1, tempI; + __m128 a, aprime, pixelsF0, pixelsF1, pixelsF2, pixelsF3, temp; + a = _mm_set_ps1((float) alpha); + aprime = _mm_set_ps1((float) (1.0 - alpha)); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + // For the input pixels + pixelsI0 = _mm_load_si128(pLocalSrc_xmm++); + + pixelsI1 = _mm_cvtepu8_epi32(pixelsI0); // Convert to int32 + pixelsF0 = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + pixelsI1 = _mm_srli_si128(pixelsI0, 4); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + pixelsF1 = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + pixelsI1 = _mm_srli_si128(pixelsI0, 8); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + pixelsF2 = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + pixelsI1 = _mm_srli_si128(pixelsI0, 12); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + pixelsF3 = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + + pixelsF0 = _mm_mul_ps(pixelsF0, a); // alpha * input + pixelsF1 = _mm_mul_ps(pixelsF1, a); // alpha * input + pixelsF2 = _mm_mul_ps(pixelsF2, a); // alpha * input + pixelsF3 = _mm_mul_ps(pixelsF3, a); // alpha * input + + // For the output pixels + pixelsI0 = _mm_load_si128(pLocalDst_xmm); + + pixelsI1 = _mm_cvtepu8_epi32(pixelsI0); // Convert to int32 + temp = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + temp = _mm_mul_ps(temp, aprime); // (1 - alpha) * output + pixelsF0 = _mm_add_ps(pixelsF0, temp); // (1 - alpha) * output + alpha * input + + pixelsI1 = _mm_srli_si128(pixelsI0, 4); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + temp = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + temp = _mm_mul_ps(temp, aprime); // (1 - alpha) * output + pixelsF1 = _mm_add_ps(pixelsF1, temp); // (1 - alpha) * output + alpha * input + + pixelsI1 = _mm_srli_si128(pixelsI0, 8); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + temp = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + temp = _mm_mul_ps(temp, aprime); // (1 - alpha) * output + pixelsF2 = _mm_add_ps(pixelsF2, temp); // (1 - alpha) * output + alpha * input + + pixelsI1 = _mm_srli_si128(pixelsI0, 12); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + temp = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + temp = _mm_mul_ps(temp, aprime); // (1 - alpha) * output + pixelsF3 = _mm_add_ps(pixelsF3, temp); // (1 - alpha) * output + alpha * input + + pixelsI0 = _mm_cvttps_epi32(pixelsF0); + pixelsI1 = _mm_cvttps_epi32(pixelsF1); + pixelsI0 = _mm_packus_epi32(pixelsI0, pixelsI1); // lower 8 values (word) + pixelsI1 = _mm_cvttps_epi32(pixelsF2); + tempI = _mm_cvttps_epi32(pixelsF3); + pixelsI1 = _mm_packus_epi32(pixelsI1, tempI); // upper 8 values (word) + + pixelsI0 = _mm_packus_epi16(pixelsI0, pixelsI1); + _mm_store_si128(pLocalDst_xmm++, pixelsI0); + } + + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 
*)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++, pLocalSrc++) + { + vx_float32 temp = ((1 - alpha) * (vx_float32)*pLocalDst) + (alpha * (vx_float32)*pLocalSrc); + *pLocalDst++ = (vx_uint8)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i *) pSrcImage; + pLocalDst_xmm = (__m128i *) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + // For the input pixels + pixelsI0 = _mm_loadu_si128(pLocalSrc_xmm++); + + pixelsI1 = _mm_cvtepu8_epi32(pixelsI0); // Convert to int32 + pixelsF0 = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + pixelsI1 = _mm_srli_si128(pixelsI0, 4); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + pixelsF1 = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + pixelsI1 = _mm_srli_si128(pixelsI0, 8); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + pixelsF2 = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + pixelsI1 = _mm_srli_si128(pixelsI0, 12); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + pixelsF3 = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + + pixelsF0 = _mm_mul_ps(pixelsF0, a); // alpha * input + pixelsF1 = _mm_mul_ps(pixelsF1, a); // alpha * input + pixelsF2 = _mm_mul_ps(pixelsF2, a); // alpha * input + pixelsF3 = _mm_mul_ps(pixelsF3, a); // alpha * input + + // For the output pixels + pixelsI0 = _mm_loadu_si128(pLocalDst_xmm); + + pixelsI1 = _mm_cvtepu8_epi32(pixelsI0); // Convert to int32 + temp = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + temp = _mm_mul_ps(temp, aprime); // (1 - alpha) * output + pixelsF0 = _mm_add_ps(pixelsF0, temp); // (1 - alpha) * output + alpha * input + + pixelsI1 = _mm_srli_si128(pixelsI0, 4); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + temp = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + temp = _mm_mul_ps(temp, aprime); // (1 - alpha) * output + pixelsF1 = _mm_add_ps(pixelsF1, temp); // (1 - alpha) * output + alpha * input + + pixelsI1 = _mm_srli_si128(pixelsI0, 8); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + temp = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + temp = _mm_mul_ps(temp, aprime); // (1 - alpha) * output + pixelsF2 = _mm_add_ps(pixelsF2, temp); // (1 - alpha) * output + alpha * input + + pixelsI1 = _mm_srli_si128(pixelsI0, 12); + pixelsI1 = _mm_cvtepu8_epi32(pixelsI1); // Convert to int32 + temp = _mm_cvtepi32_ps(pixelsI1); // Convert to float32 + temp = _mm_mul_ps(temp, aprime); // (1 - alpha) * output + pixelsF3 = _mm_add_ps(pixelsF3, temp); // (1 - alpha) * output + alpha * input + + pixelsI0 = _mm_cvttps_epi32(pixelsF0); + pixelsI1 = _mm_cvttps_epi32(pixelsF1); + pixelsI0 = _mm_packus_epi32(pixelsI0, pixelsI1); // lower 8 values (word) + pixelsI1 = _mm_cvttps_epi32(pixelsF2); + tempI = _mm_cvttps_epi32(pixelsF3); + pixelsI1 = _mm_packus_epi32(pixelsI1, tempI); // upper 8 values (word) + + pixelsI0 = _mm_packus_epi16(pixelsI0, pixelsI1); + _mm_storeu_si128(pLocalDst_xmm++, pixelsI0); + } + + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++, pLocalSrc++) + { + vx_float32 temp = ((1 - alpha) * (vx_float32)*pLocalDst) + (alpha * (vx_float32)*pLocalSrc); + *pLocalDst++ = (vx_uint8)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + +/* The following are hand optimized CPU 
based kernels for point-multiply functions */ +int HafCpu_Mul_U8_U8U8_Wrap_Trunc +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi16((short)0x00FF); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 4); + + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); + pixels2 = _mm_load_si128(src2++); + pixels3 = _mm_unpackhi_epi8(pixels1, zeros); + pixels1 = _mm_cvtepu8_epi16(pixels1); + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + pixels3 = _mm_mullo_epi16(pixels3, pixels4); // src1*src2 for (8-15) + pixels1 = _mm_mullo_epi16(pixels1, pixels2); // src1*src2 for (0-7) + pixels4 = pixels3; + pixels2 = pixels1; + + // convert to 32 bit0 + pixels2 = _mm_unpackhi_epi16(pixels2, zeros); // src1*src2 (4-7) + pixels1 = _mm_cvtepu16_epi32(pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(pixels4, zeros); // src1*src2 (12-15) + pixels3 = _mm_cvtepu16_epi32(pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvttps_epi32(fpels1); + pixels2 = _mm_cvttps_epi32(fpels2); + pixels3 = _mm_cvttps_epi32(fpels3); + pixels4 = _mm_cvttps_epi32(fpels4); + + // pack to unsigned words + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); // wrap to U8 + pixels3 = _mm_and_si128(pixels3, mask); // wrap to U8 + // pack to unsigned bytes + pixels1 = _mm_packus_epi16(pixels1, pixels3); + // copy to dest + _mm_store_si128(dst++, pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +// +int HafCpu_Mul_U8_U8U8_Wrap_Round +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi16((short)0x00FF); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + + 
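+	/* Descriptive note: wrap + round-to-nearest policy. Each product src1*src2 is scaled in
+	   float, converted with _mm_cvtps_epi32 (which rounds according to the current MXCSR mode,
+	   round-to-nearest-even by default), and the 0x00FF mask then wraps the result into U8
+	   range instead of saturating it. */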
while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 4); + + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); + pixels2 = _mm_load_si128(src2++); + pixels3 = _mm_unpackhi_epi8(pixels1, zeros); + pixels1 = _mm_cvtepu8_epi16(pixels1); + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + pixels3 = _mm_mullo_epi16(pixels3, pixels4); // src1*src2 for (8-15) + pixels1 = _mm_mullo_epi16(pixels1, pixels2); // src1*src2 for (0-7) + pixels4 = pixels3; + pixels2 = pixels1; + + // convert to 32 bit0 + pixels2 = _mm_unpackhi_epi16(pixels2, zeros); // src1*src2 (4-7) + pixels1 = _mm_cvtepu16_epi32(pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(pixels4, zeros); // src1*src2 (12-15) + pixels3 = _mm_cvtepu16_epi32(pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round to nearest even - use convert with rounding: cvtps2dq + pixels1 = _mm_cvtps_epi32(fpels1); + pixels2 = _mm_cvtps_epi32(fpels2); + pixels3 = _mm_cvtps_epi32(fpels3); + pixels4 = _mm_cvtps_epi32(fpels4); + + // pack to unsigned words + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); // wrap to U8 + pixels3 = _mm_and_si128(pixels3, mask); // wrap to U8 + + // pack to unsigned bytes + pixels1 = _mm_packus_epi16(pixels1, pixels3); + // copy to dest + _mm_store_si128(dst++, pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; + +} + +int HafCpu_Mul_U8_U8U8_Sat_Trunc +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi16((short)0x7FFF); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 4); + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); + pixels2 = _mm_load_si128(src2++); + pixels3 = _mm_unpackhi_epi8(pixels1, zeros); + pixels1 = _mm_cvtepu8_epi16(pixels1); + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + pixels3 = _mm_mullo_epi16(pixels3, pixels4); // src1*src2 for (8-15) + pixels1 = _mm_mullo_epi16(pixels1, pixels2); // src1*src2 for (0-7) + pixels4 = pixels3; + pixels2 = pixels1; + + // convert to 32 bit0 + pixels2 = _mm_unpackhi_epi16(pixels2, zeros); // src1*src2 (4-7) + pixels1 = 
_mm_cvtepu16_epi32(pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(pixels4, zeros); // src1*src2 (12-15) + pixels3 = _mm_cvtepu16_epi32(pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvttps_epi32(fpels1); + pixels2 = _mm_cvttps_epi32(fpels2); + pixels3 = _mm_cvttps_epi32(fpels3); + pixels4 = _mm_cvttps_epi32(fpels4); + + // pack to unsigned words + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + pixels1 = _mm_min_epu16(pixels1, mask); // clamp to 0x7fff + pixels3 = _mm_min_epu16(pixels3, mask); // clamp to 0x7fff + + // pack to unsigned bytes through unsigned saturation + pixels1 = _mm_packus_epi16(pixels1, pixels3); + // copy to dest + _mm_store_si128(dst++, pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; + +} + +int HafCpu_Mul_U8_U8U8_Sat_Round +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi16((short)0x7FFF); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 4); + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); + pixels2 = _mm_load_si128(src2++); + pixels3 = _mm_unpackhi_epi8(pixels1, zeros); + pixels1 = _mm_cvtepu8_epi16(pixels1); + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + pixels3 = _mm_mullo_epi16(pixels3, pixels4); // src1*src2 for (8-15) + pixels1 = _mm_mullo_epi16(pixels1, pixels2); // src1*src2 for (0-7) + pixels4 = pixels3; + pixels2 = pixels1; + + // convert to 32 bit0 + pixels2 = _mm_unpackhi_epi16(pixels2, zeros); // src1*src2 (4-7) + pixels1 = _mm_cvtepu16_epi32(pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(pixels4, zeros); // src1*src2 (12-15) + pixels3 = _mm_cvtepu16_epi32(pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round to nearest even - use convert with rounding: cvtps2dq + pixels1 = _mm_cvtps_epi32(fpels1); + pixels2 = _mm_cvtps_epi32(fpels2); + pixels3 = _mm_cvtps_epi32(fpels3); + 
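+	/* Descriptive note: _mm_cvtps_epi32 rounds to the nearest integer (per MXCSR, nearest-even
+	   by default), in contrast to the _mm_cvttps_epi32 truncation used by the *_Trunc variants. */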
pixels4 = _mm_cvtps_epi32(fpels4); + + // pack to unsigned words + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + pixels1 = _mm_min_epu16(pixels1, mask); // clamp to 0x7fff + pixels3 = _mm_min_epu16(pixels3, mask); // clamp to 0x7fff + + // pack to unsigned bytes though unsigned saturation + pixels1 = _mm_packus_epi16(pixels1, pixels3); + // copy to dest + _mm_store_si128(dst++, pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +// the following primitive is tested and working +int HafCpu_Mul_S16_U8U8_Wrap_Trunc +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi32((int)0x0000FFFF); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); + pixels2 = _mm_load_si128(src2++); + pixels3 = _mm_unpackhi_epi8(pixels1, zeros); + pixels1 = _mm_cvtepu8_epi16(pixels1); + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + pixels3 = _mm_mullo_epi16(pixels3, pixels4); // src1*src2 for (8-15) + pixels1 = _mm_mullo_epi16(pixels1, pixels2); // src1*src2 for (0-7) + pixels4 = pixels3; + pixels2 = pixels1; + + // convert to 32 bit0 + pixels2 = _mm_unpackhi_epi16(pixels2, zeros); // src1*src2 (4-7) + pixels1 = _mm_cvtepu16_epi32(pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(pixels4, zeros); // src1*src2 (12-15) + pixels3 = _mm_cvtepu16_epi32(pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvttps_epi32(fpels1); + pixels2 = _mm_cvttps_epi32(fpels2); + pixels3 = _mm_cvttps_epi32(fpels3); + pixels4 = _mm_cvttps_epi32(fpels4); + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); + pixels2 = _mm_and_si128(pixels2, mask); + pixels3 = _mm_and_si128(pixels3, mask); + pixels4 = _mm_and_si128(pixels4, mask); + + // pack with unsigned saturation + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Mul_S16_U8U8_Wrap_Round +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + 
vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi32((int)0x0000FFFF); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); + pixels2 = _mm_load_si128(src2++); + pixels3 = _mm_unpackhi_epi8(pixels1, zeros); + pixels1 = _mm_cvtepu8_epi16(pixels1); + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + pixels3 = _mm_mullo_epi16(pixels3, pixels4); // src1*src2 for (8-15) + pixels1 = _mm_mullo_epi16(pixels1, pixels2); // src1*src2 for (0-7) + pixels4 = pixels3; + pixels2 = pixels1; + // convert to 32 bit0 + pixels2 = _mm_unpackhi_epi16(pixels2, zeros); // src1*src2 (4-7) + pixels1 = _mm_cvtepu16_epi32(pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(pixels4, zeros); // src1*src2 (12-15) + pixels3 = _mm_cvtepu16_epi32(pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round to nearest even: cvtps2dq + pixels1 = _mm_cvtps_epi32(fpels1); + pixels2 = _mm_cvtps_epi32(fpels2); + pixels3 = _mm_cvtps_epi32(fpels3); + pixels4 = _mm_cvtps_epi32(fpels4); + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); + pixels2 = _mm_and_si128(pixels2, mask); + pixels3 = _mm_and_si128(pixels3, mask); + pixels4 = _mm_and_si128(pixels4, mask); + + // pack with unsigned saturation + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + + +int HafCpu_Mul_S16_U8U8_Sat_Trunc +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + __m128i pixels1, pixels2, pixels3, pixels4; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + while (dst < dstlast) + { + pixels1 = 
_mm_load_si128(src1++); + pixels2 = _mm_load_si128(src2++); + pixels3 = _mm_unpackhi_epi8(pixels1, zeros); + pixels1 = _mm_cvtepu8_epi16(pixels1); + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + pixels3 = _mm_mullo_epi16(pixels3, pixels4); // src1*src2 for (8-15) + pixels1 = _mm_mullo_epi16(pixels1, pixels2); // src1*src2 for (0-7) + pixels4 = pixels3; + pixels2 = pixels1; + // convert to 32 bit0 + pixels2 = _mm_unpackhi_epi16(pixels2, zeros); // src1*src2 (4-7) + pixels1 = _mm_cvtepu16_epi32(pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(pixels4, zeros); // src1*src2 (12-15) + pixels3 = _mm_cvtepu16_epi32(pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvttps_epi32(fpels1); + pixels2 = _mm_cvttps_epi32(fpels2); + pixels3 = _mm_cvttps_epi32(fpels3); + pixels4 = _mm_cvttps_epi32(fpels4); + + // pack signed saturation + pixels1 = _mm_packs_epi32(pixels1, pixels2); + pixels3 = _mm_packs_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Mul_S16_U8U8_Sat_Round +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); + pixels2 = _mm_load_si128(src2++); + pixels3 = _mm_unpackhi_epi8(pixels1, zeros); + pixels1 = _mm_cvtepu8_epi16(pixels1); + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + pixels3 = _mm_mullo_epi16(pixels3, pixels4); // src1*src2 for (8-15) + pixels1 = _mm_mullo_epi16(pixels1, pixels2); // src1*src2 for (0-7) + pixels4 = pixels3; + pixels2 = pixels1; + // convert to 32 bit0 + pixels2 = _mm_unpackhi_epi16(pixels2, zeros); // src1*src2 (4-7) + pixels1 = _mm_cvtepu16_epi32(pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(pixels4, zeros); // src1*src2 (12-15) + pixels3 = _mm_cvtepu16_epi32(pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = 
_mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round to nearest even: cvtps2dq + pixels1 = _mm_cvtps_epi32(fpels1); + pixels2 = _mm_cvtps_epi32(fpels2); + pixels3 = _mm_cvtps_epi32(fpels3); + pixels4 = _mm_cvtps_epi32(fpels4); + + // pack signed saturation + pixels1 = _mm_packs_epi32(pixels1, pixels2); + pixels3 = _mm_packs_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Mul_S16_S16U8_Wrap_Trunc +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask, temp1, temp2; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi32((int)0x0000FFFF); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pSrc1 = (unsigned char *)pSrcImage1; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrc1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-15) + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvttps_epi32(fpels1); + pixels2 = _mm_cvttps_epi32(fpels2); + pixels3 = _mm_cvttps_epi32(fpels3); + pixels4 = _mm_cvttps_epi32(fpels4); + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); + pixels2 = _mm_and_si128(pixels2, mask); + pixels3 = _mm_and_si128(pixels3, mask); + pixels4 = _mm_and_si128(pixels4, mask); + + // pack signed saturation + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + 
_mm_store_si128(dst++, pixels3); + } + pSrc1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Mul_S16_S16U8_Wrap_Round +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask, temp1, temp2; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi32((int)0x0000FFFF); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pSrc1 = (unsigned char *)pSrcImage1; + uint32_t fpState = agoControlFpSetRoundEven(); + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrc1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-15) + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round towards nearest even + pixels1 = _mm_cvtps_epi32(fpels1); + pixels2 = _mm_cvtps_epi32(fpels2); + pixels3 = _mm_cvtps_epi32(fpels3); + pixels4 = _mm_cvtps_epi32(fpels4); + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); + pixels2 = _mm_and_si128(pixels2, mask); + pixels3 = _mm_and_si128(pixels3, mask); + pixels4 = _mm_and_si128(pixels4, mask); + + // pack to words + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + pSrc1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + agoControlFpReset(fpState); + + return AGO_SUCCESS; +} + + +int HafCpu_Mul_S16_S16U8_Sat_Trunc +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + 
vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, temp1, temp2; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pSrc1 = (unsigned char *)pSrcImage1; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrc1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-15) + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvttps_epi32(fpels1); + pixels2 = _mm_cvttps_epi32(fpels2); + pixels3 = _mm_cvttps_epi32(fpels3); + pixels4 = _mm_cvttps_epi32(fpels4); + + // pack signed saturation + pixels1 = _mm_packs_epi32(pixels1, pixels2); + pixels3 = _mm_packs_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + pSrc1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + + +int HafCpu_Mul_S16_S16U8_Sat_Round +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, temp1, temp2; + __m128 fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + const __m128 fscale = _mm_set1_ps(scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pSrc1 = (unsigned char *)pSrcImage1; + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrc1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + 
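+			/* Descriptive note: the full 32-bit products of the S16 and U8 operands are assembled
+			   below from _mm_mullo_epi16 (low halves) and _mm_mulhi_epi16 (high halves), interleaved
+			   with unpacklo/unpackhi, before scaling, rounding and signed saturation to S16. */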
pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-15) + pixels4 = _mm_unpackhi_epi8(pixels2, zeros); + pixels2 = _mm_cvtepu8_epi16(pixels2); + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + // convert to packed single precision float of src1*src2 + fpels1 = _mm_cvtepi32_ps(pixels1); + fpels2 = _mm_cvtepi32_ps(pixels2); + fpels3 = _mm_cvtepi32_ps(pixels3); + fpels4 = _mm_cvtepi32_ps(pixels4); + + // multiply with scale + fpels1 = _mm_mul_ps(fpels1, fscale); + fpels2 = _mm_mul_ps(fpels2, fscale); + fpels3 = _mm_mul_ps(fpels3, fscale); + fpels4 = _mm_mul_ps(fpels4, fscale); + + // round towards zero - use convert with round + pixels1 = _mm_cvtps_epi32(fpels1); + pixels2 = _mm_cvtps_epi32(fpels2); + pixels3 = _mm_cvtps_epi32(fpels3); + pixels4 = _mm_cvtps_epi32(fpels4); + + // pack to words + pixels1 = _mm_packs_epi32(pixels1, pixels2); + pixels3 = _mm_packs_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + pSrc1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Mul_S16_S16S16_Wrap_Trunc +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask, temp1, temp2; + __m128d fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi32((int)0x0000FFFF); + const __m128d fscale = _mm_set1_pd((double)scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pSrc1 = (unsigned char *)pSrcImage1; + unsigned char *pSrc2 = (unsigned char *)pSrcImage2; + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrc1; + __m128i * src2 = (__m128i*)pSrc2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + if (scale == 1.0f){ + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-7) + pixels4 = _mm_load_si128(src2++); // src2 (8-15) + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = 
_mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); + pixels2 = _mm_and_si128(pixels2, mask); + pixels3 = _mm_and_si128(pixels3, mask); + pixels4 = _mm_and_si128(pixels4, mask); + + // pack to words + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + } + else + { + int x = 0; + while (dst < dstlast) + { + __m128d fpels5, fpels6, fpels7, fpels8; + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-7) + pixels4 = _mm_load_si128(src2++); // src2 (8-15) + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + + // convert to packed double precision float of src1*src2 + fpels1 = _mm_cvtepi32_pd(pixels1); + fpels2 = _mm_cvtepi32_pd(pixels2); + fpels3 = _mm_cvtepi32_pd(pixels3); + fpels4 = _mm_cvtepi32_pd(pixels4); + + + fpels5 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels1, 0x4e)); + fpels6 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels2, 0x4e)); + fpels7 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels3, 0x4e)); + fpels8 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels4, 0x4e)); + + // multiply with scale + fpels1 = _mm_mul_pd(fpels1, fscale); + fpels2 = _mm_mul_pd(fpels2, fscale); + fpels3 = _mm_mul_pd(fpels3, fscale); + fpels4 = _mm_mul_pd(fpels4, fscale); + fpels5 = _mm_mul_pd(fpels5, fscale); + fpels6 = _mm_mul_pd(fpels6, fscale); + fpels7 = _mm_mul_pd(fpels7, fscale); + fpels8 = _mm_mul_pd(fpels8, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvttpd_epi32(fpels1); + pixels2 = _mm_cvttpd_epi32(fpels2); + pixels3 = _mm_cvttpd_epi32(fpels3); + pixels4 = _mm_cvttpd_epi32(fpels4); + + pixels1 = _mm_unpacklo_epi64(pixels1, _mm_cvttpd_epi32(fpels5)); + pixels2 = _mm_unpacklo_epi64(pixels2, _mm_cvttpd_epi32(fpels6)); + pixels3 = _mm_unpacklo_epi64(pixels3, _mm_cvttpd_epi32(fpels7)); + pixels4 = _mm_unpacklo_epi64(pixels4, _mm_cvttpd_epi32(fpels8)); + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); + pixels2 = _mm_and_si128(pixels2, mask); + pixels3 = _mm_and_si128(pixels3, mask); + pixels4 = _mm_and_si128(pixels4, mask); + + // pack to words + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + x += 16; + } + } + //y++; + pSrc1 += srcImage1StrideInBytes; + pSrc2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Mul_S16_S16S16_Wrap_Round +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 
dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, mask, temp1, temp2; + __m128d fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + mask = _mm_set1_epi32((int)0x0000FFFF); + const __m128d fscale = _mm_set1_pd((double)scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pSrc1 = (unsigned char *)pSrcImage1; + unsigned char *pSrc2 = (unsigned char *)pSrcImage2; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrc1; + __m128i * src2 = (__m128i*)pSrc2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + if (scale == 1.0f){ + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-7) + pixels4 = _mm_load_si128(src2++); // src2 (8-15) + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); + pixels2 = _mm_and_si128(pixels2, mask); + pixels3 = _mm_and_si128(pixels3, mask); + pixels4 = _mm_and_si128(pixels4, mask); + + // pack to words + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + } + else + { + + while (dst < dstlast) + { + __m128d fpels5, fpels6, fpels7, fpels8; + + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-7) + pixels4 = _mm_load_si128(src2++); // src2 (8-15) + + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + + // convert to packed double precision float of src1*src2 + fpels1 = _mm_cvtepi32_pd(pixels1); + fpels2 = _mm_cvtepi32_pd(pixels2); + fpels3 = _mm_cvtepi32_pd(pixels3); + fpels4 = _mm_cvtepi32_pd(pixels4); + fpels5 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels1, 0x4e)); + fpels6 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels2, 0x4e)); + fpels7 = 
_mm_cvtepi32_pd(_mm_shuffle_epi32(pixels3, 0x4e)); + fpels8 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels4, 0x4e)); + + // multiply with scale + fpels1 = _mm_mul_pd(fpels1, fscale); + fpels2 = _mm_mul_pd(fpels2, fscale); + fpels3 = _mm_mul_pd(fpels3, fscale); + fpels4 = _mm_mul_pd(fpels4, fscale); + fpels5 = _mm_mul_pd(fpels5, fscale); + fpels6 = _mm_mul_pd(fpels6, fscale); + fpels7 = _mm_mul_pd(fpels7, fscale); + fpels8 = _mm_mul_pd(fpels8, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvtpd_epi32(fpels1); + pixels2 = _mm_cvtpd_epi32(fpels2); + pixels3 = _mm_cvtpd_epi32(fpels3); + pixels4 = _mm_cvtpd_epi32(fpels4); + pixels1 = _mm_unpacklo_epi64(pixels1, _mm_cvtpd_epi32(fpels5)); + pixels2 = _mm_unpacklo_epi64(pixels2, _mm_cvtpd_epi32(fpels6)); + pixels3 = _mm_unpacklo_epi64(pixels3, _mm_cvtpd_epi32(fpels7)); + pixels4 = _mm_unpacklo_epi64(pixels4, _mm_cvtpd_epi32(fpels8)); + + // mask for wrap/truncation + pixels1 = _mm_and_si128(pixels1, mask); + pixels2 = _mm_and_si128(pixels2, mask); + pixels3 = _mm_and_si128(pixels3, mask); + pixels4 = _mm_and_si128(pixels4, mask); + + // pack signed saturation + pixels1 = _mm_packus_epi32(pixels1, pixels2); + pixels3 = _mm_packus_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + } + pSrc1 += srcImage1StrideInBytes; + pSrc2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Mul_S16_S16S16_Sat_Trunc +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, temp1, temp2; + __m128d fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + const __m128d fscale = _mm_set1_pd((double)scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pSrc1 = (unsigned char *)pSrcImage1; + unsigned char *pSrc2 = (unsigned char *)pSrcImage2; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrc1; + __m128i * src2 = (__m128i*)pSrc2; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + if (scale == 1.0f){ + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-7) + pixels4 = _mm_load_si128(src2++); // src2 (8-15) + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + // pack to words + pixels1 = _mm_packs_epi32(pixels1, pixels2); + pixels3 = _mm_packs_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + 
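+/* The _Trunc and _Round variants differ only in the double-to-int conversion:
+   _mm_cvttpd_epi32 truncates toward zero, while _mm_cvtpd_epi32 rounds to
+   nearest (even) under the default MXCSR rounding mode; the carried-over
+   "cvttps2dq" comment in the _Round paths describes only the truncating form.
+   Scalar sketch of the two policies, assuming <math.h> is available:
+
+   static vx_int32 to_int_trunc(vx_float64 v) { return (vx_int32)v; }              // toward zero
+   static vx_int32 to_int_round(vx_float64 v) { return (vx_int32)nearbyint(v); }   // to nearest
+*/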
_mm_store_si128(dst++, pixels3); + } + } + else + { + while (dst < dstlast) + { + __m128d fpels5, fpels6, fpels7, fpels8; + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-7) + pixels4 = _mm_load_si128(src2++); // src2 (8-15) + + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + + // convert to packed double precision float of src1*src2 + fpels1 = _mm_cvtepi32_pd(pixels1); + fpels2 = _mm_cvtepi32_pd(pixels2); + fpels3 = _mm_cvtepi32_pd(pixels3); + fpels4 = _mm_cvtepi32_pd(pixels4); + fpels5 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels1, 0x4e)); + fpels6 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels2, 0x4e)); + fpels7 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels3, 0x4e)); + fpels8 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels4, 0x4e)); + + // multiply with scale + fpels1 = _mm_mul_pd(fpels1, fscale); + fpels2 = _mm_mul_pd(fpels2, fscale); + fpels3 = _mm_mul_pd(fpels3, fscale); + fpels4 = _mm_mul_pd(fpels4, fscale); + fpels5 = _mm_mul_pd(fpels5, fscale); + fpels6 = _mm_mul_pd(fpels6, fscale); + fpels7 = _mm_mul_pd(fpels7, fscale); + fpels8 = _mm_mul_pd(fpels8, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvttpd_epi32(fpels1); + pixels2 = _mm_cvttpd_epi32(fpels2); + pixels3 = _mm_cvttpd_epi32(fpels3); + pixels4 = _mm_cvttpd_epi32(fpels4); + pixels1 = _mm_unpacklo_epi64(pixels1, _mm_cvttpd_epi32(fpels5)); + pixels2 = _mm_unpacklo_epi64(pixels2, _mm_cvttpd_epi32(fpels6)); + pixels3 = _mm_unpacklo_epi64(pixels3, _mm_cvttpd_epi32(fpels7)); + pixels4 = _mm_unpacklo_epi64(pixels4, _mm_cvttpd_epi32(fpels8)); + + // pack signed saturation + pixels1 = _mm_packs_epi32(pixels1, pixels2); + pixels3 = _mm_packs_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + } + pSrc1 += srcImage1StrideInBytes; + pSrc2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Mul_S16_S16S16_Sat_Round +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_int16 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_int16 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_float32 scale +) +{ + // do generic floating point calculation + __m128i pixels1, pixels2, pixels3, pixels4, temp1, temp2; + __m128d fpels1, fpels2, fpels3, fpels4; + const __m128i zeros = _mm_setzero_si128(); + const __m128d fscale = _mm_set1_pd((double)scale); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pSrc1 = (unsigned char *)pSrcImage1; + unsigned char *pSrc2 = (unsigned char *)pSrcImage2; + + while (pchDst < pchDstlast) + { + __m128i * src1 = (__m128i*)pSrc1; + __m128i * src2 = (__m128i*)pSrc2; + __m128i * dst = 
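+/* In the _Sat variants the 32-bit scaled products are narrowed with
+   _mm_packs_epi32, i.e. signed saturation to [-32768, 32767], instead of the
+   mask-and-pack wrap used above. Scalar equivalent of the clamp (sketch only):
+
+   static vx_int16 saturate_s16(vx_int32 v)
+   {
+       if (v >  32767) return (vx_int16)32767;
+       if (v < -32768) return (vx_int16)(-32768);
+       return (vx_int16)v;
+   }
+*/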
(__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 3); + if (scale == 1.0f){ + while (dst < dstlast) + { + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-7) + pixels4 = _mm_load_si128(src2++); // src2 (8-15) + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + // pack to words + pixels1 = _mm_packs_epi32(pixels1, pixels2); + pixels3 = _mm_packs_epi32(pixels3, pixels4); + + // copy to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + } + else + { + while (dst < dstlast) + { + __m128d fpels5, fpels6, fpels7, fpels8; + pixels1 = _mm_load_si128(src1++); // src1 (0-7) + pixels3 = _mm_load_si128(src1++); // src1 (8-15) + pixels2 = _mm_load_si128(src2++); // src2 (0-7) + pixels4 = _mm_load_si128(src2++); // src2 (8-15) + + temp1 = _mm_mullo_epi16(pixels3, pixels4); // low for src1*src2 for (8-15) + temp2 = _mm_mullo_epi16(pixels1, pixels2); // low for src1*src2 for (0-7) + // do mulhi as well since we are multiplying 16x8 + pixels3 = _mm_mulhi_epi16(pixels3, pixels4); // high for src1*src2 for (8-15) + pixels1 = _mm_mulhi_epi16(pixels1, pixels2); // high for src1*src2 for (0-7) + + // unpack to 32 bit result + pixels2 = _mm_unpackhi_epi16(temp2, pixels1); // src1*src2 (4-7) + pixels1 = _mm_unpacklo_epi16(temp2, pixels1); // src1*src2 (0-3) + pixels4 = _mm_unpackhi_epi16(temp1, pixels3); // src1*src2 (12-15) + pixels3 = _mm_unpacklo_epi16(temp1, pixels3); // src1*src2 (8-11) + + // convert to packed double precision float of src1*src2 + fpels1 = _mm_cvtepi32_pd(pixels1); + fpels2 = _mm_cvtepi32_pd(pixels2); + fpels3 = _mm_cvtepi32_pd(pixels3); + fpels4 = _mm_cvtepi32_pd(pixels4); + fpels5 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels1, 0x4e)); + fpels6 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels2, 0x4e)); + fpels7 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels3, 0x4e)); + fpels8 = _mm_cvtepi32_pd(_mm_shuffle_epi32(pixels4, 0x4e)); + + // multiply with scale + fpels1 = _mm_mul_pd(fpels1, fscale); + fpels2 = _mm_mul_pd(fpels2, fscale); + fpels3 = _mm_mul_pd(fpels3, fscale); + fpels4 = _mm_mul_pd(fpels4, fscale); + fpels5 = _mm_mul_pd(fpels5, fscale); + fpels6 = _mm_mul_pd(fpels6, fscale); + fpels7 = _mm_mul_pd(fpels7, fscale); + fpels8 = _mm_mul_pd(fpels8, fscale); + + // round towards zero - use convert with truncation: cvttps2dq + pixels1 = _mm_cvtpd_epi32(fpels1); + pixels2 = _mm_cvtpd_epi32(fpels2); + pixels3 = _mm_cvtpd_epi32(fpels3); + pixels4 = _mm_cvtpd_epi32(fpels4); + pixels1 = _mm_unpacklo_epi64(pixels1, _mm_cvtpd_epi32(fpels5)); + pixels2 = _mm_unpacklo_epi64(pixels2, _mm_cvtpd_epi32(fpels6)); + pixels3 = _mm_unpacklo_epi64(pixels3, _mm_cvtpd_epi32(fpels7)); + pixels4 = _mm_unpacklo_epi64(pixels4, _mm_cvtpd_epi32(fpels8)); + + // pack signed saturation + pixels1 = _mm_packs_epi32(pixels1, pixels2); + pixels3 = _mm_packs_epi32(pixels3, pixels4); + + // copy 
to dest + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + } + } + pSrc1 += srcImage1StrideInBytes; + pSrc2 += srcImage2StrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_MeanStdDev_DATA_U8 + ( + vx_float32 * pSum, + vx_float32 * pSumOfSquared, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + unsigned char * pLocalSrc; + __m128i pixels, pixels_16, pixels_32, pixels_64; + __m128i zeromask = _mm_setzero_si128(); + __m128i sum = _mm_setzero_si128(); + __m128i sum_squared = _mm_setzero_si128(); + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - prefixWidth - postfixWidth; + unsigned int prefixSum = 0, postfixSum = 0; + unsigned long long prefixSumSquared = 0, postfixSumSquared = 0; + + int height = (int) srcHeight; + while (height) + { + pLocalSrc = (unsigned char *) pSrcImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + prefixSum += (unsigned int) *pLocalSrc; + prefixSumSquared += (unsigned long long)*pLocalSrc * (unsigned long long)*pLocalSrc; + } + int width = (int) (alignedWidth >> 4); // 16 pixels processed at a time + while (width) + { + pixels = _mm_load_si128((__m128i *) pLocalSrc); + pixels_16 = _mm_unpackhi_epi8(pixels, zeromask); // 15, 14, 13, 12, 11, 10, 9, 8 + pixels_32 = _mm_unpackhi_epi16(pixels_16, zeromask); // 15, 14, 13, 12 + + sum = _mm_add_epi32(sum, pixels_32); // Pixels 15, 14, 13, 12 + pixels_64 = _mm_unpackhi_epi32(pixels_32, zeromask); // 15, 14 + pixels_32 = _mm_cvtepi32_epi64(pixels_32); // 13, 12 + pixels_64 = _mm_mul_epu32(pixels_64, pixels_64); // square + pixels_32 = _mm_mul_epu32(pixels_32, pixels_32); + sum_squared = _mm_add_epi64(sum_squared, pixels_64); + sum_squared = _mm_add_epi64(sum_squared, pixels_32); + + pixels_32 = _mm_cvtepi16_epi32(pixels_16); + sum = _mm_add_epi32(sum, pixels_32); // Pixels 11, 10, 9, 8 + pixels_64 = _mm_unpackhi_epi32(pixels_32, zeromask); // 11, 10 + pixels_32 = _mm_cvtepi32_epi64(pixels_32); // 9, 8 + pixels_64 = _mm_mul_epu32(pixels_64, pixels_64); // square + pixels_32 = _mm_mul_epu32(pixels_32, pixels_32); + sum_squared = _mm_add_epi64(sum_squared, pixels_64); + sum_squared = _mm_add_epi64(sum_squared, pixels_32); + + pixels_16 = _mm_cvtepu8_epi16(pixels); // 7, 6, 5, 4, 3, 2, 1, 0 + pixels_32 = _mm_unpackhi_epi16(pixels_16, zeromask); // 7, 6, 5, 4 + + sum = _mm_add_epi32(sum, pixels_32); // Pixels 7, 6, 5, 4 + pixels_64 = _mm_unpackhi_epi32(pixels_32, zeromask); // 7, 6 + pixels_32 = _mm_cvtepi32_epi64(pixels_32); // 5, 4 + pixels_64 = _mm_mul_epu32(pixels_64, pixels_64); // square + pixels_32 = _mm_mul_epu32(pixels_32, pixels_32); + sum_squared = _mm_add_epi64(sum_squared, pixels_64); + sum_squared = _mm_add_epi64(sum_squared, pixels_32); + + pixels_32 = _mm_cvtepi16_epi32(pixels_16); + sum = _mm_add_epi32(sum, pixels_32); // Pixels 3, 2, 1, 0 + pixels_64 = _mm_unpackhi_epi32(pixels_32, zeromask); // 3, 2 + pixels_32 = _mm_cvtepi32_epi64(pixels_32); // 1, 0 + pixels_64 = _mm_mul_epu32(pixels_64, pixels_64); // square + pixels_32 = _mm_mul_epu32(pixels_32, pixels_32); + sum_squared = _mm_add_epi64(sum_squared, pixels_64); + sum_squared = _mm_add_epi64(sum_squared, pixels_32); + + pLocalSrc += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + postfixSum += (unsigned int)*pLocalSrc; + postfixSumSquared 
+= (unsigned long long)*pLocalSrc * (unsigned long long)*pLocalSrc; + } + + pSrcImage += srcImageStrideInBytes; + height--; + } + + sum = _mm_hadd_epi32(sum, sum); // Lowest int of sum has sum of last two ints of sum + sum = _mm_hadd_epi32(sum, sum); // Lowest int of sum has the sum of all four ints + pixels = _mm_srli_si128(sum_squared, 8); + sum_squared = _mm_add_epi64(sum_squared, pixels); + + *pSum = (vx_float32)(M128I(sum).m128i_u32[0] + prefixSum + postfixSum); + *pSumOfSquared = (vx_float32)(M128I(sum_squared).m128i_u64[0] + prefixSumSquared + postfixSumSquared); + + return AGO_SUCCESS; +} + +int HafCpu_MeanStdDevMerge_DATA_DATA + ( + vx_float32 * mean, + vx_float32 * stddev, + vx_uint32 totalSampleCount, + vx_uint32 numPartitions, + vx_float32 partSum[], + vx_float32 partSumOfSquared[] + ) +{ + vx_float32 lmean = 0, lstd = 0; + + for (unsigned int i = 0; i < numPartitions; i++) + { + lmean += partSum[i]; + lstd += partSumOfSquared[i]; + } + + lmean /= totalSampleCount; + lstd = sqrtf((lstd / totalSampleCount) - (lmean * lmean)); + + *mean = lmean; + *stddev = lstd; + + return AGO_SUCCESS; +} + +int HafCpu_IntegralImage_U32_U8 +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint32 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes +) +{ + __m128i pixels1, pixels2, pixels3, pixels4; + __m128i zeromask = _mm_setzero_si128(); + // process 16 at a time (shift and add for cur and previous) + unsigned char *pSrcImage1 = pSrcImage; + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + + while (pchDst < pchDstlast) + { + __m128i * src = (__m128i*)pSrcImage1; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstlast = dst + (dstWidth >> 2); + __m128i prevsum = _mm_setzero_si128(); + if (pSrcImage1 == pSrcImage){ + while (dst < dstlast) + { + pixels1 = _mm_loadu_si128(src++); // src (0-15) + pixels2 = _mm_unpackhi_epi8(pixels1, zeromask); + pixels1 = _mm_cvtepu8_epi16(pixels1); + // shift and add + pixels3 = pixels1; + pixels4 = pixels2; + for (int i = 0; i < 7; i++) + { + pixels3 = _mm_slli_si128(pixels3, 2); + pixels4 = _mm_slli_si128(pixels4, 2); + pixels1 = _mm_add_epi16(pixels1, pixels3); + pixels2 = _mm_add_epi16(pixels2, pixels4); + } + // for the second 8 sum, add to the first 8 + pixels3 = _mm_shufflehi_epi16(pixels1, 0xff); + pixels3 = _mm_shuffle_epi32(pixels3, 0xff); + pixels2 = _mm_add_epi16(pixels2, pixels3); + // unpack to dwords and add with prevsum + pixels3 = _mm_unpackhi_epi16(pixels1, zeromask); + pixels4 = _mm_unpackhi_epi16(pixels2, zeromask); + pixels1 = _mm_cvtepu16_epi32(pixels1); + pixels2 = _mm_cvtepu16_epi32(pixels2); + pixels1 = _mm_add_epi32(pixels1, prevsum); + pixels2 = _mm_add_epi32(pixels2, prevsum); + pixels3 = _mm_add_epi32(pixels3, prevsum); + pixels4 = _mm_add_epi32(pixels4, prevsum); + + // copy to dst (sum in words) + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + _mm_store_si128(dst++, pixels2); + _mm_store_si128(dst++, pixels4); + prevsum = _mm_shuffle_epi32(pixels4, 0xff); + + } + } + else + { + unsigned int prev_dword = 0; + __m128i prevdword = _mm_setzero_si128(); + __m128i prevsum1 = _mm_setzero_si128(); + __m128i * prevdst = (__m128i*)(pchDst - dstImageStrideInBytes); + while (dst < dstlast) + { + __m128i prev1, prev2, prev3, prev4, temp, temp1, temp2, temp3; + pixels1 = _mm_loadu_si128(src++); // src (0-15) + pixels2 = _mm_unpackhi_epi8(pixels1, zeromask); + pixels1 = 
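+/* HafCpu_MeanStdDev_DATA_U8 only returns the per-partition sum and sum of
+   squares; HafCpu_MeanStdDevMerge_DATA_DATA then combines the partitions as
+   mean = (sum of partial sums) / N and stddev = sqrt(sumSq / N - mean^2).
+   Scalar reference for one partition (illustrative sketch only):
+
+   static void mean_stddev_partial_ref(const vx_uint8 *pSrc, vx_uint32 width,
+                                       vx_uint32 height, vx_uint32 strideInBytes,
+                                       vx_float32 *pSum, vx_float32 *pSumOfSquared)
+   {
+       vx_float64 s = 0.0, s2 = 0.0;
+       for (vx_uint32 y = 0; y < height; y++) {
+           const vx_uint8 *row = pSrc + y * strideInBytes;
+           for (vx_uint32 x = 0; x < width; x++) {
+               vx_float64 v = (vx_float64)row[x];
+               s += v;  s2 += v * v;
+           }
+       }
+       *pSum = (vx_float32)s;  *pSumOfSquared = (vx_float32)s2;
+   }
+*/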
_mm_cvtepu8_epi16(pixels1); + // shift and add + pixels3 = pixels1; + pixels4 = pixels2; + for (int i = 0; i < 7; i++) + { + pixels3 = _mm_slli_si128(pixels3, 2); + pixels4 = _mm_slli_si128(pixels4, 2); + pixels1 = _mm_add_epi16(pixels1, pixels3); + pixels2 = _mm_add_epi16(pixels2, pixels4); + } + // for the second 8 sum, add to the first 8 + pixels3 = _mm_shufflehi_epi16(pixels1, 0xff); + pixels3 = _mm_shuffle_epi32(pixels3, 0xff); + pixels2 = _mm_add_epi16(pixels2, pixels3); + // unpack to dwords and add with prevsum + pixels3 = _mm_unpackhi_epi16(pixels1, zeromask); + pixels4 = _mm_unpackhi_epi16(pixels2, zeromask); + pixels1 = _mm_cvtepu16_epi32(pixels1); + pixels2 = _mm_cvtepu16_epi32(pixels2); + + // calculate with prevsum(x) - prevsum(x-1) + prev1 = _mm_load_si128(prevdst++); + + // subtract sum(x-1, y-1) + temp = _mm_srli_si128(prev1, 12); + temp1 = _mm_slli_si128(prev1, 4); + prev2 = _mm_load_si128(prevdst++); + temp1 = _mm_or_si128(temp1, prevdword); + prev1 = _mm_sub_epi32(prev1, temp1); + + prevdword = _mm_srli_si128(prev2, 12); + temp1 = _mm_slli_si128(prev2, 4); + prev3 = _mm_load_si128(prevdst++); + temp1 = _mm_or_si128(temp1, temp); + prev2 = _mm_sub_epi32(prev2, temp1); + + temp = _mm_srli_si128(prev3, 12); + temp1 = _mm_slli_si128(prev3, 4); + prev4 = _mm_load_si128(prevdst++); + temp1 = _mm_or_si128(temp1, prevdword); + prev3 = _mm_sub_epi32(prev3, temp1); + + prevdword = _mm_srli_si128(prev4, 12); + temp1 = _mm_slli_si128(prev4, 4); + temp1 = _mm_or_si128(temp1, temp); + prev4 = _mm_sub_epi32(prev4, temp1); + temp = prev1; + temp1 = prev2; + temp2 = prev3; + temp3 = prev4; + + for (int i = 0; i < 3; i++) + { + temp = _mm_slli_si128(temp, 4); + temp1 = _mm_slli_si128(temp1, 4); + temp2 = _mm_slli_si128(temp2, 4); + temp3 = _mm_slli_si128(temp3, 4); + prev1 = _mm_add_epi32(prev1, temp); + prev2 = _mm_add_epi32(prev2, temp1); + prev3 = _mm_add_epi32(prev3, temp2); + prev4 = _mm_add_epi32(prev4, temp3); + } + // for the second 4 sum, add to the first 4 + temp = _mm_shuffle_epi32(prev1, 0xff); + prev2 = _mm_add_epi32(prev2, temp); + temp1 = _mm_shuffle_epi32(prev2, 0xff); + prev3 = _mm_add_epi32(prev3, temp1); + temp = _mm_shuffle_epi32(prev3, 0xff); + prev4 = _mm_add_epi32(prev4, temp); + + // add to pixels1 to pixels4 + pixels1 = _mm_add_epi32(pixels1, prev1); + pixels3 = _mm_add_epi32(pixels3, prev2); + pixels2 = _mm_add_epi32(pixels2, prev3); + pixels4 = _mm_add_epi32(pixels4, prev4); + prevsum1 = _mm_shuffle_epi32(prev4, 0xff); + + pixels1 = _mm_add_epi32(pixels1, prevsum); + pixels3 = _mm_add_epi32(pixels3, prevsum); + pixels2 = _mm_add_epi32(pixels2, prevsum); + pixels4 = _mm_add_epi32(pixels4, prevsum); + // copy to dst (sum in words) + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dst++, pixels3); + _mm_store_si128(dst++, pixels2); + _mm_store_si128(dst++, pixels4); + prevsum = _mm_shuffle_epi32(pixels4, 0xff); + } + } + pSrcImage1 += srcImageStrideInBytes; + pchDst += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +#if 0 +// keeping the implementation in case we need it in future +int HafCpu_Histogram_DATA_U8 +( + vx_uint32 dstHist[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint32 numBins, + vx_uint32 range, + vx_uint32 offset, + vx_uint32 window_size +) +{ + __m128i pixels1, pixels2; + __m128i * src = (__m128i*)pSrcImage; + __m128i * dst = (__m128i*)pDstImage; + // clear histogram bins + unsigned int *pdst = dstHist; + memset(pdst, 0x0, numBins * sizeof(unsigned int)); + if (!offset) 
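+/* HafCpu_IntegralImage_U32_U8 above evaluates the usual recurrence
+   I(x,y) = src(x,y) + I(x-1,y) + I(x,y-1) - I(x-1,y-1) sixteen pixels at a
+   time, carrying the running row total in prevsum and reusing the previous
+   output row. Scalar reference (illustrative sketch only):
+
+   static void integral_ref(vx_uint32 *pDst, vx_uint32 dstStrideInBytes,
+                            const vx_uint8 *pSrc, vx_uint32 srcStrideInBytes,
+                            vx_uint32 width, vx_uint32 height)
+   {
+       for (vx_uint32 y = 0; y < height; y++) {
+           vx_uint32 *dst = (vx_uint32 *)((vx_uint8 *)pDst + y * dstStrideInBytes);
+           const vx_uint32 *up = (y == 0) ? 0 :
+               (const vx_uint32 *)((vx_uint8 *)pDst + (y - 1) * dstStrideInBytes);
+           const vx_uint8 *src = pSrc + y * srcStrideInBytes;
+           vx_uint32 rowsum = 0;
+           for (vx_uint32 x = 0; x < width; x++) {
+               rowsum += src[x];                   // sum of the current row up to x
+               dst[x] = rowsum + (up ? up[x] : 0); // add the column sums from the row above
+           }
+       }
+   }
+*/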
+ { + if (range == 0xff) + { + for (int y = 0; y < srcHeight; y++) + { + for (int x = 0; x < srcWidth; x += 16) + { + pixels1 = _mm_load_si128(&src[x >> 4]); + if (window_size > 1) + { + __m128 win, pel0, pel1, pel2, pel3; + // read window size + win = _mm_set1_ps((float)window_size); + pixels2 = _mm_cvtepu8_epi32(pixels1); + pel0 = _mm_cvtepi32_ps(pixels2); + pel0 = _mm_div_ps(pel0, win); // divide by window size + _mm_srli_si128(pixels1, 4); + pixels2 = _mm_cvtepu8_epi32(pixels1); + pel1 = _mm_cvtepi32_ps(pixels2); + pel1 = _mm_div_ps(pel1, win); // divide by window size + _mm_srli_si128(pixels1, 4); + pixels2 = _mm_cvtepu8_epi32(pixels1); + pel2 = _mm_cvtepi32_ps(pixels2); + pel2 = _mm_div_ps(pel2, win); // divide by window size + _mm_srli_si128(pixels1, 4); + pixels2 = _mm_cvtepu8_epi32(pixels1); + pel3 = _mm_cvtepi32_ps(pixels2); + pel3 = _mm_div_ps(pel3, win); // divide by window size + + // convert to int and store + pixels1 = _mm_cvtps_epi32(pel0); + pixels2 = _mm_cvtps_epi32(pel1); + _mm_store_si128(&dst[(x >> 2)], pixels1); + _mm_store_si128(&dst[(x >> 2) + 1], pixels2); + pixels1 = _mm_cvtps_epi32(pel2); + pixels2 = _mm_cvtps_epi32(pel3); + _mm_store_si128(&dst[(x >> 2) + 2], pixels1); + _mm_store_si128(&dst[(x >> 2) + 3], pixels2); + } + + } + } + } + } + + return AGO_SUCCESS; +} +#endif + +#define NUM_BINS 256 +// special case histogram primitive : range - 255, offset: 0, NumBins: 255 +int HafCpu_Histogram_DATA_U8 +( + vx_uint32 dstHist[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes +) +{ + unsigned int *pdst = dstHist; + memset(pdst, 0x0, NUM_BINS * sizeof(unsigned int)); + for (unsigned int y = 0; y < srcHeight; y++) + { + unsigned int * src = (unsigned int *)(pSrcImage + y*srcImageStrideInBytes); + unsigned int * srclast = src + (srcWidth >> 2); + while (src < srclast) + { + // do for 16 pixels.. 
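+/* The HafCpu_Histogram_DATA_U8 kernel here handles the common special case
+   (256 bins, offset 0, range covering every byte value), so each byte of a
+   32-bit load indexes a bin directly. For a general vx_distribution the bin
+   mapping would look roughly like this scalar sketch (hedged; not part of the
+   kernel):
+
+   for (vx_uint32 y = 0; y < srcHeight; y++) {
+       const vx_uint8 *row = pSrcImage + y * srcImageStrideInBytes;
+       for (vx_uint32 x = 0; x < srcWidth; x++) {
+           vx_uint32 v = row[x];
+           if (v >= offset && v < offset + range)
+               dstHist[(v - offset) * numBins / range]++;
+       }
+   }
+*/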
+ unsigned int pixel4; + pixel4 = *src++; + pdst[(pixel4 & 0xFF)]++; + pdst[(pixel4 >> 8) & 0xFF]++; + pdst[(pixel4 >> 16) & 0xFF]++; + pdst[(pixel4 >> 24) & 0xFF]++; + + pixel4 = *src++; + pdst[(pixel4 & 0xFF)]++; + pdst[(pixel4 >> 8) & 0xFF]++; + pdst[(pixel4 >> 16) & 0xFF]++; + pdst[(pixel4 >> 24) & 0xFF]++; + + pixel4 = *src++; + pdst[(pixel4 & 0xFF)]++; + pdst[(pixel4 >> 8) & 0xFF]++; + pdst[(pixel4 >> 16) & 0xFF]++; + pdst[(pixel4 >> 24) & 0xFF]++; + + pixel4 = *src++; + pdst[(pixel4 & 0xFF)]++; + pdst[(pixel4 >> 8) & 0xFF]++; + pdst[(pixel4 >> 16) & 0xFF]++; + pdst[(pixel4 >> 24) & 0xFF]++; + } + } + return AGO_SUCCESS; +} + + +int HafCpu_HistogramMerge_DATA_DATA +( + vx_uint32 dstHist[], + vx_uint32 numPartitions, + vx_uint32 * pPartSrcHist[] +) +{ + __m128i pixels1, pixels2; + __m128i * dst = (__m128i*)dstHist; + + for (unsigned int n = 0; n < 256; n+=8) + { + __m128i sum1 = _mm_setzero_si128(); + __m128i sum2 = _mm_setzero_si128(); + for (unsigned int i = 0; i < numPartitions; i++){ + __m128i *phist = (__m128i *)&pPartSrcHist[i]; + pixels1 = _mm_load_si128(&phist[(n >> 2)]); + pixels2 = _mm_load_si128(&phist[(n >> 2)+1]); + sum1 = _mm_add_epi32(sum1, pixels1); + sum2 = _mm_add_epi32(sum2, pixels2); + } + // copy merged + _mm_store_si128(&dst[(n >> 2)], sum1); + _mm_store_si128(&dst[(n >> 2) + 1], sum2); + } + return AGO_SUCCESS; +} + +// Primitive: Histogram equalization +// first do a merge of individual histograms before doing equalization +int HafCpu_Equalize_DATA_DATA +( +vx_uint8 * pLut, +vx_uint32 numPartitions, +vx_uint32 * pPartSrcHist[] +) +{ + unsigned int cdfmin = 0, div; + __m128i pixels1, pixels2, pixels4; + __m128i dst_[NUM_BINS / 4], * dst = dst_; + unsigned int * cdf = M128I(dst_[0]).m128i_u32; + + pixels4 = _mm_setzero_si128(); + + for (unsigned int n = 0; n < NUM_BINS; n += 8) + { + __m128i sum1 = _mm_setzero_si128(); + __m128i sum2 = _mm_setzero_si128(); + for (unsigned int i = 0; i < numPartitions; i++){ + __m128i *phist = (__m128i *)&pPartSrcHist[i][n]; + pixels1 = _mm_load_si128(phist); + pixels2 = _mm_load_si128(phist+1); + sum1 = _mm_add_epi32(sum1, pixels1); + sum2 = _mm_add_epi32(sum2, pixels2); + } + // calculate cdf + // shift and add + pixels1 = sum1; + pixels2 = sum2; + for (int i = 0; i < 4; i++) + { + pixels1 = _mm_slli_si128(pixels1, 4); + pixels2 = _mm_slli_si128(pixels2, 4); + sum1 = _mm_add_epi32(sum1, pixels1); + sum2 = _mm_add_epi32(sum2, pixels2); + } + // for the second sum onwards, add to the first + pixels1 = _mm_shuffle_epi32(sum1, 0xff); + sum1 = _mm_add_epi32(sum1, pixels4); + sum2 = _mm_add_epi32(sum2, pixels4); + sum2 = _mm_add_epi32(sum2, pixels1); + pixels4 = _mm_shuffle_epi32(sum2, 0xff); + + // store cdf + _mm_store_si128(dst++, sum1); + _mm_store_si128(dst++, sum2); + } + // find the cdf[minv] + for (int n = 0; n < NUM_BINS; n++){ + pLut[n] = 0; // initialize + if (cdf[n] || cdfmin){ + if (!cdfmin){ + cdfmin = cdf[n]; + div = cdf[NUM_BINS - 1] - cdfmin; // range + } + // equalize to 0-255 + if (div){ + float p = (float)(cdf[n] - cdfmin) / (float)div; + pLut[n] = (vx_uint8)(p*255.0f + 0.5f); + } + else + pLut[n] = n; // is this correct? 
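+/* The LUT above is the standard histogram-equalization mapping
+   lut[v] = round(255 * (cdf[v] - cdfmin) / (cdf[255] - cdfmin)), with cdfmin
+   the first non-zero cdf value; when the denominator is zero the image has a
+   single grey level and the identity mapping used above is the usual fallback.
+   Scalar sketch of the same per-bin construction (illustrative only):
+
+   for (int n = 0; n < NUM_BINS; n++) {
+       if (cdf[n] < cdfmin) { pLut[n] = 0; continue; }      // empty leading bins
+       vx_uint32 span = cdf[NUM_BINS - 1] - cdfmin;
+       pLut[n] = span ? (vx_uint8)(255.0f * (cdf[n] - cdfmin) / span + 0.5f)
+                      : (vx_uint8)n;                        // constant image: identity
+   }
+*/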
+ } + } + + return AGO_SUCCESS; +} + +int HafCpu_MinMax_DATA_U8 + ( + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * pLocalSrc_xmm; + __m128i pixels; + __m128i maxVal_xmm = _mm_setzero_si128(); + __m128i minVal_xmm = _mm_set1_epi8((char) 0xFF); + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - prefixWidth - postfixWidth; + unsigned char maxVal = 0, minVal = 255; + unsigned char * pLocalSrc; + + int height = (int)srcHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + maxVal = max(maxVal, *pLocalSrc); + minVal = min(minVal, *pLocalSrc); + } + + pLocalSrc_xmm = (__m128i *) pLocalSrc; + int width = (int)(alignedWidth >> 4); // 16 pixels processed at a time + while (width) + { + pixels = _mm_load_si128(pLocalSrc_xmm++); + maxVal_xmm = _mm_max_epu8(maxVal_xmm, pixels); + minVal_xmm = _mm_min_epu8(minVal_xmm, pixels); + + width--; + } + + pLocalSrc = (unsigned char *)pLocalSrc_xmm; + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + maxVal = max(maxVal, *pLocalSrc); + minVal = min(minVal, *pLocalSrc); + } + + pSrcImage += srcImageStrideInBytes; + height--; + } + + // Compute the max value out of the max at 16 individual places + for (int i = 0; i < 16; i++) + { + maxVal = max(maxVal, M128I(maxVal_xmm).m128i_u8[i]); + minVal = min(minVal, M128I(minVal_xmm).m128i_u8[i]); + } + + *pDstMinValue = (vx_int32) minVal; + *pDstMaxValue = (vx_int32) maxVal; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Min + ( + vx_uint32 * pMinLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min values in the source image + __m128i minVal = _mm_set1_epi8((unsigned char)globalMin); + __m128i pixels; + int minCount = 0; + unsigned char * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
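+/* HafCpu_MinMax_DATA_U8 and the MinMaxLoc kernels that follow all split each
+   row into a scalar prefix up to the first 16-byte-aligned address, an aligned
+   middle section processed 16 pixels at a time with _mm_load_si128, and a
+   scalar postfix for the remainder. Sketch of the decomposition (equivalent to
+   the prefixWidth/postfixWidth arithmetic used in these kernels):
+
+   int prefix  = (int)((16 - ((intptr_t)pSrcImage & 15)) & 15); // bytes to 16-byte alignment
+   int postfix = ((int)srcWidth - prefix) & 15;                 // trailing pixels done scalar
+   int aligned = (int)srcWidth - prefix - postfix;              // SSE section, multiple of 16
+*/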
0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask; + + pixels = _mm_load_si128((__m128i *) pLocalSrc); + pixels = _mm_cmpeq_epi8(pixels, minVal); + minMask = _mm_movemask_epi8(pixels); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + minCount++; + minMask >>= 1; + } + } + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + width++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + } + + *pMinLocCount = (vx_int32)minCount; + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Max + ( + vx_uint32 * pMaxLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min values in the source image + __m128i maxVal = _mm_set1_epi8((unsigned char)globalMax); + __m128i pixels; + int maxCount = 0; + unsigned char * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + maxCount++; + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int maxMask; + + pixels = _mm_load_si128((__m128i *) pLocalSrc); + pixels = _mm_cmpeq_epi8(pixels, maxVal); + maxMask = _mm_movemask_epi8(pixels); + + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + maxCount++; + maxMask >>= 1; + } + } + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + maxCount++; + width++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + } + + *pMaxLocCount = (vx_int32)maxCount; + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_None_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi8((unsigned char)globalMin); + __m128i maxVal = _mm_set1_epi8((unsigned char)globalMax); + __m128i pixels; + int minCount = 0, maxCount = 0; + unsigned char * pLocalSrc; + + 
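+/* Matches are counted by comparing 16 pixels at once (_mm_cmpeq_epi8) and
+   scanning the 16-bit _mm_movemask_epi8 result bit by bit. Note that the
+   scalar prefix/postfix loops of the _Count_Max variant above still test
+   *pLocalSrc against globalMin; a reference count of the maxima tests
+   globalMax everywhere, as in this scalar sketch (illustrative only):
+
+   vx_uint32 count = 0;
+   for (vx_uint32 y = 0; y < srcHeight; y++) {
+       const vx_uint8 *row = pSrcImage + y * srcImageStrideInBytes;
+       for (vx_uint32 x = 0; x < srcWidth; x++)
+           if (row[x] == (vx_uint8)globalMax)
+               count++;
+   }
+   *pMaxLocCount = count;
+*/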
int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + if (*pLocalSrc == globalMax) + maxCount++; + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask, maxMask; + + pixels = _mm_load_si128((__m128i *) pLocalSrc); + __m128i temp = _mm_cmpeq_epi8(pixels, minVal); + minMask = _mm_movemask_epi8(temp); + + temp = _mm_cmpeq_epi8(pixels, maxVal); + maxMask = _mm_movemask_epi8(temp); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + minCount++; + minMask >>= 1; + } + } + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + maxCount++; + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + if (*pLocalSrc == globalMax) + maxCount++; + width++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + } + + *pMinLocCount = (vx_int32)minCount; + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_Min + ( + vx_uint32 * pMinLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi8((unsigned char)globalMin); + __m128i pixels; + int minCount = 0; + unsigned char * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool minListNotFull = (minCount < (int)capacityOfMinLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask; + + pixels = _mm_load_si128((__m128i *) pLocalSrc); + pixels = _mm_cmpeq_epi8(pixels, minVal); + minMask = _mm_movemask_epi8(pixels); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + { + if (minListNotFull) + { + loc.y = height; + loc.x = width + i; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + minMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + width++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + } + + *pMinLocCount = (vx_int32)minCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi8((unsigned char)globalMin); + __m128i maxVal = _mm_set1_epi8((unsigned char)globalMax); + __m128i pixels; + int minCount = 0, maxCount = 0; + unsigned char * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
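+/* The _Loc_ variants record coordinates only while the caller-supplied list
+   has room, but keep counting afterwards, so the returned count can exceed
+   capacityOfMinLocList (the list then holds the first matches in raster
+   order). Scalar sketch of the recording pattern (illustrative only):
+
+   vx_uint32 found = 0;
+   for (vx_uint32 y = 0; y < srcHeight; y++) {
+       const vx_uint8 *row = pSrcImage + y * srcImageStrideInBytes;
+       for (vx_uint32 x = 0; x < srcWidth; x++)
+           if (row[x] == (vx_uint8)globalMin) {
+               if (found < capacityOfMinLocList) {
+                   minLocList[found].x = x;
+                   minLocList[found].y = y;
+               }
+               found++;
+           }
+   }
+   *pMinLocCount = found;
+*/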
0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool minListNotFull = (minCount < (int)capacityOfMinLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + if (*pLocalSrc == globalMax) + maxCount++; + + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask, maxMask; + + pixels = _mm_load_si128((__m128i *) pLocalSrc); + __m128i temp = _mm_cmpeq_epi8(pixels, minVal); + minMask = _mm_movemask_epi8(temp); + + temp = _mm_cmpeq_epi8(pixels, maxVal); + maxMask = _mm_movemask_epi8(temp); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + { + if (minListNotFull) + { + loc.y = height; + loc.x = width + i; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + minMask >>= 1; + } + } + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + maxCount++; + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + if (*pLocalSrc == globalMax) + maxCount++; + width++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + } + + *pMinLocCount = (vx_int32)minCount; + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_Max + ( + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i maxVal = _mm_set1_epi8((unsigned char)globalMax); + __m128i pixels; + int maxCount = 0; + unsigned char * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int maxMask; + + pixels = _mm_load_si128((__m128i *) pLocalSrc); + + pixels = _mm_cmpeq_epi8(pixels, maxVal); + maxMask = _mm_movemask_epi8(pixels); + + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + { + if (maxListNotFull) + { + loc.y = height; + loc.x = width + i; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + width++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + } + + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi8((unsigned char)globalMin); + __m128i maxVal = _mm_set1_epi8((unsigned char)globalMax); + __m128i pixels; + int minCount = 0, maxCount = 0; + unsigned char * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask, maxMask; + + pixels = _mm_load_si128((__m128i *) pLocalSrc); + __m128i temp = _mm_cmpeq_epi8(pixels, minVal); + minMask = _mm_movemask_epi8(temp); + + temp = _mm_cmpeq_epi8(pixels, maxVal); + maxMask = _mm_movemask_epi8(temp); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + minCount++; + minMask >>= 1; + } + } + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + { + if (maxListNotFull) + { + loc.y = height; + loc.x = width + i; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + width++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + } + + *pMinLocCount = (vx_int32)minCount; + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_U8DATA_Loc_MinMax_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi8((unsigned char) globalMin); + __m128i maxVal = _mm_set1_epi8((unsigned char) globalMax); + __m128i pixels; + int minCount = 0, maxCount = 0; + unsigned char * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool minListNotFull = (minCount < (int) capacityOfMinLocList); + bool maxListNotFull = (maxCount < (int) capacityOfMaxLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask, maxMask; + + pixels = _mm_load_si128((__m128i *) pLocalSrc); + __m128i temp = _mm_cmpeq_epi8(pixels, minVal); + minMask = _mm_movemask_epi8(temp); + + temp = _mm_cmpeq_epi8(pixels, maxVal); + maxMask = _mm_movemask_epi8(temp); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + { + if (minListNotFull) + { + loc.y = height; + loc.x = width + i; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + minMask >>= 1; + } + } + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + { + if (maxListNotFull) + { + loc.y = height; + loc.x = width + i; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int) srcWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + width++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + } + + *pMinLocCount = (vx_int32)minCount; + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMax_DATA_S16 + ( + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * pLocalSrc_xmm; + __m128i pixels; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - prefixWidth - postfixWidth; + short maxVal = SHRT_MIN, minVal = SHRT_MAX; + short * pLocalSrc; + + __m128i maxVal_xmm = _mm_set1_epi16(maxVal); + __m128i minVal_xmm = _mm_set1_epi16(minVal); + + int height = (int)srcHeight; + while (height) + { + pLocalSrc = (short *)pSrcImage; + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + maxVal = max(maxVal, *pLocalSrc); + minVal = min(minVal, *pLocalSrc); + } + + pLocalSrc_xmm = (__m128i *) pLocalSrc; + int width = (int)(alignedWidth >> 3); // 8 pixels processed at a time + while (width) + { + pixels = _mm_load_si128(pLocalSrc_xmm++); + maxVal_xmm = _mm_max_epi16(maxVal_xmm, pixels); + minVal_xmm = _mm_min_epi16(minVal_xmm, pixels); + + width--; + } + + pLocalSrc = (short *)pLocalSrc_xmm; + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + maxVal = max(maxVal, *pLocalSrc); + minVal = min(minVal, *pLocalSrc); + } + + pSrcImage += (srcImageStrideInBytes >> 1); + height--; + } + + // Compute the max value out of the max at 16 individual places + for (int i = 0; i < 8; i++) + { + maxVal = max(maxVal, M128I(maxVal_xmm).m128i_i16[i]); + minVal = min(minVal, M128I(minVal_xmm).m128i_i16[i]); + } + + *pDstMinValue = (vx_int32) minVal; + *pDstMaxValue = (vx_int32) maxVal; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Min + ( + vx_uint32 * pMinLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min values in the source image + __m128i minVal = _mm_set1_epi16((short)globalMin); + __m128i pixelsH, pixelsL; + int minCount = 0; + short * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
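+/* For S16 images the same prefix/aligned/postfix split is used, except that
+   the byte prefix is halved (two bytes per pixel) and the row stride is
+   applied as srcImageStrideInBytes >> 1 on the vx_int16 pointer. Scalar
+   reference for the min/max reduction, using SHRT_MIN/SHRT_MAX as above
+   (illustrative sketch only):
+
+   vx_int32 mn = SHRT_MAX, mx = SHRT_MIN;
+   for (vx_uint32 y = 0; y < srcHeight; y++) {
+       const vx_int16 *row = (const vx_int16 *)((const vx_uint8 *)pSrcImage +
+                                                y * srcImageStrideInBytes);
+       for (vx_uint32 x = 0; x < srcWidth; x++) {
+           if (row[x] < mn) mn = row[x];
+           if (row[x] > mx) mx = row[x];
+       }
+   }
+   *pDstMinValue = mn;
+   *pDstMaxValue = mx;
+*/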
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (short *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask; + + pixelsL = _mm_load_si128((__m128i *) pLocalSrc); + pixelsH = _mm_load_si128((__m128i *) (pLocalSrc + 8)); + + pixelsH = _mm_cmpeq_epi16(pixelsH, minVal); + pixelsL = _mm_cmpeq_epi16(pixelsL, minVal); + pixelsL = _mm_packs_epi16(pixelsL, pixelsH); + minMask = _mm_movemask_epi8(pixelsL); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + minCount++; + minMask >>= 1; + } + } + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + width++; + pLocalSrc++; + } + + pSrcImage += (srcImageStrideInBytes >> 1); + } + + *pMinLocCount = (vx_int32)minCount; + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Max + ( + vx_uint32 * pMaxLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min values in the source image + __m128i maxVal = _mm_set1_epi16((short)globalMax); + __m128i pixelsH, pixelsL; + int maxCount = 0; + short * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (short *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + maxCount++; + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int maxMask; + + pixelsL = _mm_load_si128((__m128i *) pLocalSrc); + pixelsH = _mm_load_si128((__m128i *) (pLocalSrc + 8)); + + pixelsH = _mm_cmpeq_epi16(pixelsH, maxVal); + pixelsL = _mm_cmpeq_epi16(pixelsL, maxVal); + pixelsL = _mm_packs_epi16(pixelsL, pixelsH); + maxMask = _mm_movemask_epi8(pixelsL); + + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + maxCount++; + maxMask >>= 1; + } + } + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + maxCount++; + width++; + pLocalSrc++; + } + + pSrcImage += (srcImageStrideInBytes >> 1); + } + + *pMaxLocCount = (vx_int32)maxCount; + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_None_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi16((short)globalMin); + __m128i maxVal = _mm_set1_epi16((short)globalMax); + __m128i pixelsL, pixelsH; + int minCount = 0, maxCount = 0; + short * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
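+/* For S16 data the two 8-pixel compare results are narrowed with
+   _mm_packs_epi16 so that _mm_movemask_epi8 yields one bit per pixel. As in
+   the U8 version, the scalar prefix/postfix loops of the S16 _Count_Max
+   variant above test globalMin where a count of the maxima would test
+   globalMax. Sketch of the per-iteration mask, with target standing for the
+   broadcast minVal or maxVal register (illustrative only):
+
+   __m128i lo   = _mm_cmpeq_epi16(pixelsL, target);            // pixels 0..7
+   __m128i hi   = _mm_cmpeq_epi16(pixelsH, target);            // pixels 8..15
+   int     mask = _mm_movemask_epi8(_mm_packs_epi16(lo, hi));  // bit i set when pixel i matches
+*/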
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (short *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + if (*pLocalSrc == globalMax) + maxCount++; + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask, maxMask; + + pixelsL = _mm_load_si128((__m128i *) pLocalSrc); + pixelsH = _mm_load_si128((__m128i *) (pLocalSrc + 8)); + + __m128i temp1 = _mm_cmpeq_epi16(pixelsH, minVal); + __m128i temp0 = _mm_cmpeq_epi16(pixelsL, minVal); + temp0 = _mm_packs_epi16(temp0, temp1); + minMask = _mm_movemask_epi8(temp0); + + pixelsH = _mm_cmpeq_epi16(pixelsH, maxVal); + pixelsL = _mm_cmpeq_epi16(pixelsL, maxVal); + temp1 = _mm_packs_epi16(pixelsL, pixelsH); + maxMask = _mm_movemask_epi8(temp1); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + minCount++; + minMask >>= 1; + } + } + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + maxCount++; + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + if (*pLocalSrc == globalMax) + maxCount++; + width++; + pLocalSrc++; + } + + pSrcImage += (srcImageStrideInBytes >> 1); + } + + *pMinLocCount = (vx_int32)minCount; + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_Min + ( + vx_uint32 * pMinLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi16((short)globalMin); + __m128i pixelsL, pixelsH; + int minCount = 0; + short * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool minListNotFull = (minCount < (int)capacityOfMinLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (short *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask; + + pixelsL = _mm_load_si128((__m128i *) pLocalSrc); + pixelsH = _mm_load_si128((__m128i *) (pLocalSrc + 8)); + + pixelsH = _mm_cmpeq_epi16(pixelsH, minVal); + pixelsL = _mm_cmpeq_epi16(pixelsL, minVal); + pixelsL = _mm_packs_epi16(pixelsL, pixelsH); + minMask = _mm_movemask_epi8(pixelsL); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + { + if (minListNotFull) + { + loc.y = height; + loc.x = width + i; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + minMask >>= 1; + } + } + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + width++; + pLocalSrc++; + } + + pSrcImage += (srcImageStrideInBytes >> 1); + } + + *pMinLocCount = (vx_int32)minCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi16((short)globalMin); + __m128i maxVal = _mm_set1_epi16((short)globalMax); + __m128i pixelsL, pixelsH; + int minCount = 0, maxCount = 0; + short * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool minListNotFull = (minCount < (int)capacityOfMinLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (short *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + if (*pLocalSrc == globalMax) + maxCount++; + + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask, maxMask; + + pixelsL = _mm_load_si128((__m128i *) pLocalSrc); + pixelsH = _mm_load_si128((__m128i *) (pLocalSrc + 8)); + + __m128i temp1 = _mm_cmpeq_epi16(pixelsH, minVal); + __m128i temp0 = _mm_cmpeq_epi16(pixelsL, minVal); + temp0 = _mm_packs_epi16(temp0, temp1); + minMask = _mm_movemask_epi8(temp0); + + pixelsH = _mm_cmpeq_epi16(pixelsH, maxVal); + pixelsL = _mm_cmpeq_epi16(pixelsL, maxVal); + temp1 = _mm_packs_epi16(pixelsL, pixelsH); + maxMask = _mm_movemask_epi8(temp1); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + { + if (minListNotFull) + { + loc.y = height; + loc.x = width + i; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + minMask >>= 1; + } + } + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + maxCount++; + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + if (*pLocalSrc == globalMax) + maxCount++; + + width++; + pLocalSrc++; + } + + pSrcImage += (srcImageStrideInBytes >> 1); + } + + *pMinLocCount = (vx_int32)minCount; + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_Max + ( + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i maxVal = _mm_set1_epi16((short)globalMax); + __m128i pixelsL, pixelsH; + int minCount = 0, maxCount = 0; + short * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (short *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int maxMask; + + pixelsL = _mm_load_si128((__m128i *) pLocalSrc); + pixelsH = _mm_load_si128((__m128i *) (pLocalSrc + 8)); + + pixelsH = _mm_cmpeq_epi16(pixelsH, maxVal); + pixelsL = _mm_cmpeq_epi16(pixelsL, maxVal); + pixelsL = _mm_packs_epi16(pixelsL, pixelsH); + maxMask = _mm_movemask_epi8(pixelsL); + + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + { + if (maxListNotFull) + { + loc.y = height; + loc.x = width + i; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + width++; + pLocalSrc++; + } + + pSrcImage += (srcImageStrideInBytes >> 1); + } + + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi16((short)globalMin); + __m128i maxVal = _mm_set1_epi16((short)globalMax); + __m128i pixelsL, pixelsH; + int minCount = 0, maxCount = 0; + short * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (short *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask, maxMask; + + pixelsL = _mm_load_si128((__m128i *) pLocalSrc); + pixelsH = _mm_load_si128((__m128i *) (pLocalSrc + 8)); + + __m128i temp1 = _mm_cmpeq_epi16(pixelsH, minVal); + __m128i temp0 = _mm_cmpeq_epi16(pixelsL, minVal); + temp0 = _mm_packs_epi16(temp0, temp1); + minMask = _mm_movemask_epi8(temp0); + + pixelsH = _mm_cmpeq_epi16(pixelsH, maxVal); + pixelsL = _mm_cmpeq_epi16(pixelsL, maxVal); + temp1 = _mm_packs_epi16(pixelsL, pixelsH); + maxMask = _mm_movemask_epi8(temp1); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + minCount++; + minMask >>= 1; + } + } + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + { + if (maxListNotFull) + { + loc.y = height; + loc.x = width + i; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + minCount++; + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + width++; + pLocalSrc++; + } + + pSrcImage += (srcImageStrideInBytes >> 1); + } + + *pMinLocCount = (vx_int32)minCount; + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxLoc_DATA_S16DATA_Loc_MinMax_Count_MinMax + ( + vx_uint32 * pMinLocCount, + vx_uint32 * pMaxLocCount, + vx_uint32 capacityOfMinLocList, + vx_coordinates2d_t minLocList[], + vx_uint32 capacityOfMaxLocList, + vx_coordinates2d_t maxLocList[], + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[], + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_int16 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Compute the global minima and maxima + vx_int32 globalMin, globalMax; + HafCpu_MinMaxMerge_DATA_DATA(&globalMin, &globalMax, numDataPartitions, srcMinValue, srcMaxValue); + + *pDstMinValue = globalMin; + *pDstMaxValue = globalMax; + + // Search for the min and the max values in the source image + __m128i minVal = _mm_set1_epi16((short)globalMin); + __m128i maxVal = _mm_set1_epi16((short)globalMax); + __m128i pixelsL, pixelsH; + int minCount = 0, maxCount = 0; + short * pLocalSrc; + + int prefixWidth = intptr_t(pSrcImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)srcWidth - prefixWidth) & 15; + int alignedWidth = (int)srcWidth - postfixWidth; + + bool minListNotFull = (minCount < (int)capacityOfMinLocList); + bool maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + vx_coordinates2d_t loc; + + for (int height = 0; height < (int)srcHeight; height++) + { + pLocalSrc = (short *)pSrcImage; + int width = 0; + while (width < prefixWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + + width++; + pLocalSrc++; + } + + while (width < alignedWidth) + { + int minMask, maxMask; + + pixelsL = _mm_load_si128((__m128i *) pLocalSrc); + pixelsH = _mm_load_si128((__m128i *) (pLocalSrc + 8)); + + __m128i temp1 = _mm_cmpeq_epi16(pixelsH, minVal); + __m128i temp0 = _mm_cmpeq_epi16(pixelsL, minVal); + temp0 = _mm_packs_epi16(temp0, temp1); + minMask = _mm_movemask_epi8(temp0); + + pixelsH = _mm_cmpeq_epi16(pixelsH, maxVal); + pixelsL = _mm_cmpeq_epi16(pixelsL, maxVal); + temp1 = _mm_packs_epi16(pixelsL, pixelsH); + maxMask = _mm_movemask_epi8(temp1); + + if (minMask) + { + for (int i = 0; i < 16; i++) + { + if (minMask & 1) + { + if (minListNotFull) + { + loc.y = height; + loc.x = width + i; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + minMask >>= 1; + } + } + if (maxMask) + { + for (int i = 0; i < 16; i++) + { + if (maxMask & 1) + { + if (maxListNotFull) + { + loc.y = height; + loc.x = width + i; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + maxMask >>= 1; + } + } + + width += 16; + pLocalSrc += 16; + } + + while (width < (int)srcWidth) + { + if (*pLocalSrc == globalMin) + { + if (minListNotFull) + { + loc.x = width; + loc.y = height; + minLocList[minCount] = loc; + } + minCount++; + minListNotFull = (minCount < (int)capacityOfMinLocList); + } + if (*pLocalSrc == globalMax) + { + if (maxListNotFull) + { + loc.x = width; + loc.y = height; + maxLocList[maxCount] = loc; + } + maxCount++; + maxListNotFull = (maxCount < (int)capacityOfMaxLocList); + } + width++; + pLocalSrc++; + } + + pSrcImage += (srcImageStrideInBytes >> 1); + } + + *pMinLocCount = (vx_int32)minCount; + *pMaxLocCount = (vx_int32)maxCount; + + return AGO_SUCCESS; +} + +int HafCpu_MinMaxMerge_DATA_DATA + ( + vx_int32 * pDstMinValue, + vx_int32 * pDstMaxValue, + vx_uint32 numDataPartitions, + vx_int32 srcMinValue[], + vx_int32 srcMaxValue[] + ) +{ + vx_int32 minVal, maxVal; + + minVal = srcMinValue[0]; + maxVal = srcMaxValue[0]; + + for (int i = 1; i < (int) numDataPartitions; i++) + { + minVal = min(minVal, srcMinValue[i]); + maxVal = min(minVal, srcMaxValue[i]); + } + + *pDstMinValue = minVal; + *pDstMaxValue = maxVal; + + return AGO_SUCCESS; +} + + +int HafCpu_MinMaxLocMerge_DATA_DATA + ( + vx_uint32 * pDstLocCount, + vx_uint32 capacityOfDstLocList, + vx_coordinates2d_t dstLocList[], + vx_uint32 numDataPartitions, + vx_uint32 partLocCount[], + vx_coordinates2d_t * partLocList[] + ) +{ + int dstCount = 0; + int srcCount; + vx_coordinates2d_t * srcList; + + for (int i = 0; i < (int)numDataPartitions; i++) + { + 
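+		// append this partition's coordinates to the destination list; copying stops once the destination capacity is exceeded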
srcList = partLocList[i]; + srcCount = partLocCount[i]; + + while (srcCount) + { + *dstLocList++ = *srcList++; + dstCount++; + srcCount--; + if (dstCount > (int) capacityOfDstLocList) + { + *pDstLocCount = (vx_uint32)(dstCount - 1); + return AGO_SUCCESS; + } + } + } + return AGO_SUCCESS; +} + +float HafCpu_FastAtan2_rad +( + vx_int16 Gx, + vx_int16 Gy +) +{ + vx_uint16 ax, ay; + ax = std::abs(Gx), ay = std::abs(Gy); + float a, c, c2; + if (ax >= ay) + { + c = (float)ay / ((float)ax + (float)DBL_EPSILON); + c2 = c*c; + a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + else + { + c = (float)ax / ((float)ay + (float)DBL_EPSILON); + c2 = c*c; + a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + if (Gx < 0) + a = 180.f - a; + if (Gy < 0) + a = 360.f - a; + return (a*(PI/180)); +} + +float HafCpu_FastAtan2_deg +( +vx_int16 Gx, +vx_int16 Gy +) +{ + vx_uint16 ax, ay; + ax = std::abs(Gx), ay = std::abs(Gy); + float a, c, c2; + if (ax >= ay) + { + c = (float)ay / ((float)ax + (float)DBL_EPSILON); + c2 = c*c; + a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + else + { + c = (float)ax / ((float)ay + (float)DBL_EPSILON); + c2 = c*c; + a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + if (Gx < 0) + a = 180.f - a; + if (Gy < 0) + a = 360.f - a; + return a; +} + +int HafCpu_Phase_U8_S16S16 +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pPhaseImage, + vx_uint32 phaseImageStrideInBytes, + vx_int16 * pGxImage, + vx_uint32 gxImageStrideInBytes, + vx_int16 * pGyImage, + vx_uint32 gyImageStrideInBytes +) +{ + unsigned int y = 0; + // do the plain vanilla version with atan2 + while (y < dstHeight) + { + vx_uint8 *pdst = pPhaseImage; + vx_int16 *pGx = pGxImage; + vx_int16 *pGy = pGyImage; + + for (unsigned int x = 0; x < dstWidth; x++) + { +#if 0 + float arct = atan2((float)pGy[x], (float)pGx[x]); + if (arct < 0.f) + { + arct += TWOPI; + } + // normalize and copy to dst + *pdst++ = (vx_uint8)((vx_uint32)((float)(arct / PI) * 128 + 0.5) & 0xFF); +#else + float scale = (float)128 / 180.f; + float arct = HafCpu_FastAtan2_deg(pGx[x], pGy[x]); + // normalize and copy to dst + *pdst++ = (vx_uint8)((vx_uint32)(arct*scale + 0.5) & 0xFF); +#endif + } + pPhaseImage += phaseImageStrideInBytes; + pGxImage += (gxImageStrideInBytes>>1); + pGyImage += (gyImageStrideInBytes>>1); + y++; + } + return AGO_SUCCESS; +} + + +int HafCpu_FastAtan2_Canny +( +vx_int16 Gx, +vx_int16 Gy +) +{ + unsigned int ret; + vx_uint16 ax, ay; + ax = std::abs(Gx), ay = std::abs(Gy); // todo:: check if math.h function is faster + float d1 = (float)ax*0.4142135623730950488016887242097f; + float d2 = (float)ax*2.4142135623730950488016887242097f; + ret = (Gx*Gy) < 0 ? 3 : 1; + if (ay <= d1) + ret = 0; + if (ay >= d2) + ret = 2; + return ret; +} diff --git a/openvx/ago/ago_haf_cpu_canny.cpp b/openvx/ago/ago_haf_cpu_canny.cpp new file mode 100644 index 0000000..caa0f24 --- /dev/null +++ b/openvx/ago/ago_haf_cpu_canny.cpp @@ -0,0 +1,1060 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +static const int n_offset[][2][2] = { + { { -1, 0 }, { 1, 0 } }, + { { 1, -1 }, { -1, 1 } }, + { { 0, -1 }, { 0, 1 } }, + { { -1, -1 }, { 1, 1 } }, +}; +static const ago_coord2d_short_t dir_offsets[8] = { + { -1, -1 }, + { 0, -1 }, + { +1, -1 }, + { -1, 0 }, + { +1, 0 }, + { -1, +1 }, + { 0, +1 }, + { +1, +1 }, +}; + + +int HafCpu_CannySobel_U16_U8_3x3_L1NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ) +{ + int x, y; + int prefixWidth = ((intptr_t)(pDstImage)) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + pSrcImage += srcImageStrideInBytes; + vx_uint32 dstride = dstImageStrideInBytes >> 1; + pDstImage += dstride; // don't care about border. 
start processing from row2 + __m128i z = _mm_setzero_si128(), c6 = _mm_set1_epi16(6); + vx_int16 *r0 = (vx_int16*)(pLocalData + 16); + vx_int16 *r1 = r0 + ((dstWidth + 15) & ~15); + + for (y = 1; y < (int)dstHeight - 1; y++) + { + const vx_uint8* srow0 = pSrcImage - srcImageStrideInBytes; + const vx_uint8* srow1 = pSrcImage; + const vx_uint8* srow2 = pSrcImage + srcImageStrideInBytes; + vx_uint16* drow = (vx_uint16*)pDstImage; + + for (x = 0; x < prefixWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 1] - (vx_int16)srow0[x - 1] + (vx_int16)srow2[x + 1] - (vx_int16)srow2[x - 1] + 2 * ((vx_int16)srow1[x + 1] - (vx_int16)srow1[x - 1]); + vx_int16 Gy = (vx_int16)srow2[x - 1] + (vx_int16)srow2[x + 1] - (vx_int16)srow0[x - 1] - (vx_int16)srow0[x + 1] + 2 * ((vx_int16)srow2[x] - (vx_int16)srow0[x]); + vx_int16 tmp = abs(Gx) + abs(Gy); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + + // do vertical convolution - SSE + x = prefixWidth; + for (; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z); + __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z); + __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z); + __m128i t0 = _mm_add_epi16(_mm_add_epi16(s0, s2), _mm_slli_epi16(s1, 1)); + __m128i t1 = _mm_sub_epi16(s2, s0); + _mm_store_si128((__m128i*)(r0 + x), t0); + _mm_store_si128((__m128i*)(r1 + x), t1); + } + + // do horizontal convolution, interleave the results and store them to dst - SSE + x = prefixWidth; + for (; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)(r0 + x - 1)); + __m128i s1 = _mm_loadu_si128((const __m128i*)(r0 + x + 1)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(r1 + x - 1)); + __m128i s3 = _mm_loadu_si128((const __m128i*)(r1 + x)); + __m128i s4 = _mm_loadu_si128((const __m128i*)(r1 + x + 1)); + + __m128i t0 = _mm_sub_epi16(s1, s0); + __m128i t1 = _mm_add_epi16(_mm_add_epi16(s2, s4), _mm_slli_epi16(s3, 1)); + t1 = _mm_sub_epi16(z, t1); + for (int i = 0; i < 8; i++){ + M128I(s1).m128i_i16[i] = HafCpu_FastAtan2_Canny(M128I(t0).m128i_i16[i], M128I(t1).m128i_i16[i]); + } + t0 = _mm_add_epi16(_mm_abs_epi16(t0), _mm_abs_epi16(t1)); + // pack with signed saturation + t0 = _mm_or_si128(_mm_slli_epi16(t0, 2), s1); + // store magnitude and angle to destination + _mm_store_si128((__m128i*)(drow + x), t0); + } + + for (x = alignedWidth + prefixWidth - 1; x < (int)dstWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 1] - (vx_int16)srow0[x - 1] + (vx_int16)srow2[x + 1] - (vx_int16)srow2[x - 1] + 2 * ((vx_int16)srow1[x + 1] - (vx_int16)srow1[x - 1]); + vx_int16 Gy = (vx_int16)srow2[x - 1] + (vx_int16)srow2[x + 1] - (vx_int16)srow0[x - 1] - (vx_int16)srow0[x + 1] + 2 * ((vx_int16)srow2[x] - (vx_int16)srow0[x]); + vx_int16 tmp = abs(Gx) + abs(Gy); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstride; + } + return AGO_SUCCESS; +} + +// Using separable filter +// -1 -2 0 2 1 1 +// 4 +// Gx = 6 +// 4 +// 1 + +int HafCpu_CannySobel_U16_U8_5x5_L1NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ) +{ + int x, y; + int prefixWidth = ((intptr_t)(pDstImage)) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i z = _mm_setzero_si128(), c6 = _mm_set1_epi16(6); + vx_uint32 dstride = dstImageStrideInBytes >> 1; + pDstImage += 2 * dstride; // don't care about border. start processing from row2 + pSrcImage += 2 * srcImageStrideInBytes; + vx_int16 *r0 = (vx_int16*)(pLocalData + 16); + vx_int16 *r1 = r0 + ((dstWidth + 15) & ~15); + + for (y = 2; y < (int)dstHeight - 2; y++) + { + const vx_uint8* srow0 = pSrcImage - 2 * srcImageStrideInBytes; + const vx_uint8* srow1 = pSrcImage - srcImageStrideInBytes; + const vx_uint8* srow2 = pSrcImage; + const vx_uint8* srow3 = pSrcImage + srcImageStrideInBytes; + const vx_uint8* srow4 = pSrcImage + 2 * srcImageStrideInBytes; + + vx_uint16* drow = (vx_uint16*)pDstImage; + + for (x = 0; x < prefixWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 2] + (2 * ((vx_int16)srow0[x + 1])) - (2 * ((vx_int16)srow0[x - 1])) - (vx_int16)srow0[x - 2] + + 4 * ((vx_int16)srow1[x + 2] + (2 * ((vx_int16)srow1[x + 1])) - (2 * ((vx_int16)srow1[x - 1])) - (vx_int16)srow1[x - 2]) + + 6 * ((vx_int16)srow2[x + 2] + (2 * ((vx_int16)srow2[x + 1])) - (2 * ((vx_int16)srow2[x - 1])) - (vx_int16)srow2[x - 2]) + + 4 * ((vx_int16)srow3[x + 2] + (2 * ((vx_int16)srow3[x + 1])) - (2 * ((vx_int16)srow3[x - 1])) - (vx_int16)srow3[x - 2]) + + (vx_int16)srow4[x + 2] + (2 * ((vx_int16)srow4[x + 1])) - (2 * ((vx_int16)srow4[x - 1])) - (vx_int16)srow4[x - 2]; + vx_int16 Gy = (vx_int16)srow4[x - 2] + (4 * (vx_int16)srow4[x - 1]) + (6 * (vx_int16)srow4[x]) + (4 * (vx_int16)srow4[x + 1]) + (vx_int16)srow4[x + 2] + + 2 * ((vx_int16)srow3[x - 2] + (4 * (vx_int16)srow3[x - 1]) + (6 * (vx_int16)srow3[x]) + (4 * (vx_int16)srow3[x + 1]) + (vx_int16)srow3[x + 2]) + - 2 * ((vx_int16)srow1[x - 2] + (4 * (vx_int16)srow1[x - 1]) + (6 * (vx_int16)srow1[x]) + (4 * (vx_int16)srow1[x + 1]) + (vx_int16)srow1[x + 2]) + - (vx_int16)srow0[x - 2] + (4 * (vx_int16)srow0[x - 1]) + (6 * (vx_int16)srow0[x]) + (4 * (vx_int16)srow0[x + 1]) + (vx_int16)srow0[x + 2]; + vx_int16 tmp = abs(Gx) + abs(Gy); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + + // do vertical convolution - SSE + for (x = prefixWidth; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z); + __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z); + __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z); + __m128i s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow3 + x)), z); + __m128i s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow4 + x)), z); + + __m128i t0 = _mm_add_epi16(_mm_slli_epi16(_mm_add_epi16(s1, s3), 2), _mm_mullo_epi16(s2, c6)); + t0 = _mm_add_epi16(t0, _mm_add_epi16(s0, s4)); + + __m128i t1 = _mm_slli_epi16(_mm_sub_epi16(s3, s1), 1); + t1 = _mm_add_epi16(t1, _mm_sub_epi16(s4, s0)); + _mm_store_si128((__m128i*)(r0 + x), t0); + _mm_store_si128((__m128i*)(r1 + x), t1); + } + + // do horizontal convolution, interleave the results and store them to dst - SSE + x = prefixWidth; + for (; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)(r0 + x - 2)); + __m128i s1 = _mm_loadu_si128((const __m128i*)(r0 + x - 1)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(r0 + x + 1)); + __m128i s3 = _mm_loadu_si128((const __m128i*)(r0 + x + 2)); + + __m128i s4 = _mm_loadu_si128((const __m128i*)(r1 + x - 2)); + __m128i 
s5 = _mm_loadu_si128((const __m128i*)(r1 + x - 1)); + __m128i s6 = _mm_loadu_si128((const __m128i*)(r1 + x)); + __m128i s7 = _mm_loadu_si128((const __m128i*)(r1 + x + 1)); + __m128i s8 = _mm_loadu_si128((const __m128i*)(r1 + x + 2)); + + __m128i t0 = _mm_slli_epi16(_mm_sub_epi16(s2, s1), 1); + t0 = _mm_adds_epi16(t0, _mm_sub_epi16(s3, s0)); + __m128i t1 = _mm_slli_epi16(_mm_add_epi16(s5, s7), 2); + s0 = _mm_mullo_epi16(s6, c6); + t1 = _mm_add_epi16(t1, _mm_add_epi16(s4, s8)); + t1 = _mm_add_epi16(t1, s0); + t1 = _mm_sub_epi16(z, t1); + // find magnitude + s0 = _mm_add_epi16(_mm_abs_epi16(t0), _mm_abs_epi16(t1)); + //s0 = _mm_min_epi16(s0, clamp); + for (int i = 0; i < 8; i++){ + M128I(t0).m128i_i16[i] = HafCpu_FastAtan2_Canny(M128I(t0).m128i_i16[i], M128I(t1).m128i_i16[i]); + } + s0 = _mm_or_si128(_mm_slli_epi16(s0, 2), t0); + // store magnitude and angle to destination + _mm_store_si128((__m128i*)(drow + x), s0); + } + + for (x = alignedWidth + prefixWidth - 1; x < (int)dstWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 2] + (2 * ((vx_int16)srow0[x + 1])) - (2 * ((vx_int16)srow0[x - 1])) - (vx_int16)srow0[x - 2] + + 4 * ((vx_int16)srow1[x + 2] + (2 * ((vx_int16)srow1[x + 1])) - (2 * ((vx_int16)srow1[x - 1])) - (vx_int16)srow1[x - 2]) + + 6 * ((vx_int16)srow2[x + 2] + (2 * ((vx_int16)srow2[x + 1])) - (2 * ((vx_int16)srow2[x - 1])) - (vx_int16)srow2[x - 2]) + + 4 * ((vx_int16)srow3[x + 2] + (2 * ((vx_int16)srow3[x + 1])) - (2 * ((vx_int16)srow3[x - 1])) - (vx_int16)srow3[x - 2]) + + (vx_int16)srow4[x + 2] + (2 * ((vx_int16)srow4[x + 1])) - (2 * ((vx_int16)srow4[x - 1])) - (vx_int16)srow4[x - 2]; + vx_int16 Gy = (vx_int16)srow4[x - 2] + (4 * (vx_int16)srow4[x - 1]) + (6 * (vx_int16)srow4[x]) + (4 * (vx_int16)srow4[x + 1]) + (vx_int16)srow4[x + 2] + + 2 * ((vx_int16)srow3[x - 2] + (4 * (vx_int16)srow3[x - 1]) + (6 * (vx_int16)srow3[x]) + (4 * (vx_int16)srow3[x + 1]) + (vx_int16)srow3[x + 2]) + - 2 * ((vx_int16)srow1[x - 2] + (4 * (vx_int16)srow1[x - 1]) + (6 * (vx_int16)srow1[x]) + (4 * (vx_int16)srow1[x + 1]) + (vx_int16)srow1[x + 2]) + - (vx_int16)srow0[x - 2] + (4 * (vx_int16)srow0[x - 1]) + (6 * (vx_int16)srow0[x]) + (4 * (vx_int16)srow0[x + 1]) + (vx_int16)srow0[x + 2]; + vx_int16 tmp = abs(Gx) + abs(Gy); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstride; + } + + return AGO_SUCCESS; +} + +int HafCpu_CannySobel_U16_U8_7x7_L1NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ) +{ + int x, y; + int prefixWidth = ((intptr_t)(pDstImage)) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i z = _mm_setzero_si128(), c5 = _mm_set1_epi16(5), c6 = _mm_set1_epi16(6); + __m128i c15 = _mm_set1_epi16(15), c20 = _mm_set1_epi16(20); + __m128i clamp = _mm_set1_epi16(0x3FFF); + + vx_uint32 dstride = dstImageStrideInBytes >> 1; + pDstImage += 3 * dstride; // don't care about border. 
start processing from row2 + pSrcImage += 3 * srcImageStrideInBytes; + vx_int16 *r0 = (vx_int16*)(pLocalData + 16); + vx_int16 *r1 = r0 + ((dstWidth + 15) & ~15); + + for (y = 3; y < (int)dstHeight - 3; y++) + { + const vx_uint8* srow0 = pSrcImage - 3 * srcImageStrideInBytes; + const vx_uint8* srow1 = pSrcImage - 2 * srcImageStrideInBytes; + const vx_uint8* srow2 = pSrcImage - srcImageStrideInBytes; + const vx_uint8* srow3 = pSrcImage; + const vx_uint8* srow4 = pSrcImage + srcImageStrideInBytes; + const vx_uint8* srow5 = pSrcImage + 2 * srcImageStrideInBytes; + const vx_uint8* srow6 = pSrcImage + 3 * srcImageStrideInBytes; + + vx_uint16* drow = (vx_uint16*)pDstImage; + + for (x = 0; x < prefixWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 3] + (4 * (vx_int16)srow0[x + 2]) + (5 * (vx_int16)srow0[x + 1]) - (5 * (vx_int16)srow0[x - 1]) - (4 * (vx_int16)srow0[x - 2]) - (vx_int16)srow0[x - 3] + + 6 * ((vx_int16)srow1[x + 3] + (4 * (vx_int16)srow1[x + 2]) + (5 * (vx_int16)srow1[x + 1]) - (5 * (vx_int16)srow1[x - 1]) - (4 * (vx_int16)srow1[x - 2]) - (vx_int16)srow1[x - 3]) + + 15 * ((vx_int16)srow2[x + 3] + (4 * (vx_int16)srow2[x + 2]) + (5 * (vx_int16)srow2[x + 1]) - (5 * (vx_int16)srow2[x - 1]) - (4 * (vx_int16)srow2[x - 2]) - (vx_int16)srow2[x - 3]) + + 20 * ((vx_int16)srow3[x + 3] + (4 * (vx_int16)srow3[x + 2]) + (5 * (vx_int16)srow3[x + 1]) - (5 * (vx_int16)srow3[x - 1]) - (4 * (vx_int16)srow3[x - 2]) - (vx_int16)srow3[x - 3]) + + 15 * ((vx_int16)srow4[x + 3] + (4 * (vx_int16)srow4[x + 2]) + (5 * (vx_int16)srow4[x + 1]) - (5 * (vx_int16)srow4[x - 1]) - (4 * (vx_int16)srow4[x - 2]) - (vx_int16)srow4[x - 3]) + + 6 * ((vx_int16)srow5[x + 3] + (4 * (vx_int16)srow5[x + 2]) + (5 * (vx_int16)srow5[x + 1]) - (5 * (vx_int16)srow5[x - 1]) - (4 * (vx_int16)srow5[x - 2]) - (vx_int16)srow5[x - 3]) + + (vx_int16)srow6[x + 3] + (4 * (vx_int16)srow6[x + 2]) + (5 * (vx_int16)srow6[x + 1]) - (5 * (vx_int16)srow6[x - 1]) - (4 * (vx_int16)srow6[x - 2]) - (vx_int16)srow6[x - 3]; + vx_int16 Gy = (vx_int16)srow6[x - 3] + (vx_int16)srow6[x + 3] + (6 * ((vx_int16)srow6[x - 2] + (vx_int16)srow6[x + 2])) + (15 * ((vx_int16)srow6[x - 1] + (vx_int16)srow6[x + 1])) + (20 * (vx_int16)srow6[x]) + + 4 * ((vx_int16)srow5[x - 3] + (vx_int16)srow5[x + 3] + (6 * ((vx_int16)srow5[x - 2] + (vx_int16)srow5[x + 2])) + (15 * ((vx_int16)srow5[x - 1] + (vx_int16)srow5[x + 1])) + (20 * (vx_int16)srow5[x])) + + 5 * ((vx_int16)srow4[x - 3] + (vx_int16)srow4[x + 3] + (6 * ((vx_int16)srow4[x - 2] + (vx_int16)srow4[x + 2])) + (15 * ((vx_int16)srow4[x - 1] + (vx_int16)srow4[x + 1])) + (20 * (vx_int16)srow4[x])) + - 5 * ((vx_int16)srow2[x - 3] + (vx_int16)srow2[x + 3] + (6 * ((vx_int16)srow2[x - 2] + (vx_int16)srow2[x + 2])) + (15 * ((vx_int16)srow2[x - 1] + (vx_int16)srow2[x + 1])) + (20 * (vx_int16)srow2[x])) + - 4 * ((vx_int16)srow1[x - 3] + (vx_int16)srow1[x + 3] + (6 * ((vx_int16)srow1[x - 2] + (vx_int16)srow1[x + 2])) + (15 * ((vx_int16)srow1[x - 1] + (vx_int16)srow1[x + 1])) + (20 * (vx_int16)srow1[x])) + - ((vx_int16)srow0[x - 3] + (vx_int16)srow0[x + 3] + (6 * ((vx_int16)srow0[x - 2] + (vx_int16)srow0[x + 2])) + (15 * ((vx_int16)srow0[x - 1] + (vx_int16)srow0[x + 1])) + (20 * (vx_int16)srow0[x])); + vx_int16 tmp = abs(Gx) + abs(Gy); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + + // do vertical convolution - SSE + for (x = prefixWidth; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z); + __m128i s1 = 
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z); + __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z); + __m128i s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow3 + x)), z); + __m128i s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow4 + x)), z); + __m128i s5 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow5 + x)), z); + __m128i s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow6 + x)), z); + + __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s1, s5), c6), _mm_mullo_epi16(s3, c20)); + __m128i t2 = _mm_mullo_epi16(_mm_add_epi16(s2, s4), c15); + t0 = _mm_add_epi16(t0, _mm_add_epi16(s0, s6)); + __m128i t1 = _mm_slli_epi16(_mm_sub_epi16(s5, s1), 2); + t0 = _mm_add_epi16(t0, t2); + + t2 = _mm_mullo_epi16(_mm_sub_epi16(s4, s2), c5); + t0 = _mm_srai_epi16(t0, 2); + t1 = _mm_add_epi16(t1, _mm_sub_epi16(s6, s0)); + t1 = _mm_add_epi16(t1, t2); + t1 = _mm_srai_epi16(t1, 2); + + _mm_store_si128((__m128i*)(r0 + x), t0); + _mm_store_si128((__m128i*)(r1 + x), t1); + } + + // do horizontal convolution, interleave the results and store them to dst - SSE + x = prefixWidth; + for (; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)(r0 + x - 3)); + __m128i s1 = _mm_loadu_si128((const __m128i*)(r0 + x - 2)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(r0 + x - 1)); + __m128i s3 = _mm_loadu_si128((const __m128i*)(r0 + x + 1)); + __m128i s4 = _mm_loadu_si128((const __m128i*)(r0 + x + 2)); + __m128i s5 = _mm_loadu_si128((const __m128i*)(r0 + x + 3)); + + + __m128i t0 = _mm_slli_epi16(_mm_subs_epi16(s4, s1), 2); + __m128i t1 = _mm_mullo_epi16(_mm_subs_epi16(s3, s2), c5); + t0 = _mm_adds_epi16(t0, _mm_subs_epi16(s5, s0)); + t0 = _mm_adds_epi16(t0, t1); + + s0 = _mm_loadu_si128((const __m128i*)(r1 + x - 3)); + s1 = _mm_loadu_si128((const __m128i*)(r1 + x - 2)); + s2 = _mm_loadu_si128((const __m128i*)(r1 + x - 1)); + s3 = _mm_loadu_si128((const __m128i*)(r1 + x)); + s4 = _mm_loadu_si128((const __m128i*)(r1 + x + 1)); + s5 = _mm_loadu_si128((const __m128i*)(r1 + x + 2)); + __m128i s6 = _mm_loadu_si128((const __m128i*)(r1 + x + 3)); + + + t1 = _mm_adds_epi16(_mm_mullo_epi16(_mm_add_epi16(s1, s5), c6), _mm_mullo_epi16(s3, c20)); + __m128i t2 = _mm_mullo_epi16(_mm_add_epi16(s2, s4), c15); + t1 = _mm_adds_epi16(t1, _mm_adds_epi16(s0, s6)); + t1 = _mm_adds_epi16(t1, t2); + t1 = _mm_subs_epi16(z, t1); + // find magnitude + s0 = _mm_add_epi16(_mm_abs_epi16(t0), _mm_abs_epi16(t1)); + s0 = _mm_min_epi16(s0, clamp); + for (int i = 0; i < 8; i++){ + M128I(t0).m128i_i16[i] = HafCpu_FastAtan2_Canny(M128I(t0).m128i_i16[i], M128I(t1).m128i_i16[i]); + } + s0 = _mm_or_si128(_mm_slli_epi16(s0, 2), t0); + // store magnitude and angle to destination + _mm_store_si128((__m128i*)(drow + x), s0); + } + + for (x = alignedWidth + prefixWidth - 1; x < (int)dstWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 3] + (4 * (vx_int16)srow0[x + 2]) + (5 * (vx_int16)srow0[x + 1]) - (5 * (vx_int16)srow0[x - 1]) - (4 * (vx_int16)srow0[x - 2]) - (vx_int16)srow0[x - 3] + + 6 * ((vx_int16)srow1[x + 3] + (4 * (vx_int16)srow1[x + 2]) + (5 * (vx_int16)srow1[x + 1]) - (5 * (vx_int16)srow1[x - 1]) - (4 * (vx_int16)srow1[x - 2]) - (vx_int16)srow1[x - 3]) + + 15 * ((vx_int16)srow2[x + 3] + (4 * (vx_int16)srow2[x + 2]) + (5 * (vx_int16)srow2[x + 1]) - (5 * (vx_int16)srow2[x - 1]) - (4 * (vx_int16)srow2[x - 2]) - (vx_int16)srow2[x - 3]) + + 20 * ((vx_int16)srow3[x + 3] + (4 * (vx_int16)srow3[x + 2]) + (5 * (vx_int16)srow3[x + 1]) - 
(5 * (vx_int16)srow3[x - 1]) - (4 * (vx_int16)srow3[x - 2]) - (vx_int16)srow3[x - 3]) + + 15 * ((vx_int16)srow4[x + 3] + (4 * (vx_int16)srow4[x + 2]) + (5 * (vx_int16)srow4[x + 1]) - (5 * (vx_int16)srow4[x - 1]) - (4 * (vx_int16)srow4[x - 2]) - (vx_int16)srow4[x - 3]) + + 6 * ((vx_int16)srow5[x + 3] + (4 * (vx_int16)srow5[x + 2]) + (5 * (vx_int16)srow5[x + 1]) - (5 * (vx_int16)srow5[x - 1]) - (4 * (vx_int16)srow5[x - 2]) - (vx_int16)srow5[x - 3]) + + (vx_int16)srow6[x + 3] + (4 * (vx_int16)srow6[x + 2]) + (5 * (vx_int16)srow6[x + 1]) - (5 * (vx_int16)srow6[x - 1]) - (4 * (vx_int16)srow6[x - 2]) - (vx_int16)srow6[x - 3]; + vx_int16 Gy = (vx_int16)srow6[x - 3] + (vx_int16)srow6[x + 3] + (6 * ((vx_int16)srow6[x - 2] + (vx_int16)srow6[x + 2])) + (15 * ((vx_int16)srow6[x - 1] + (vx_int16)srow6[x + 1])) + (20 * (vx_int16)srow6[x]) + + 4 * ((vx_int16)srow5[x - 3] + (vx_int16)srow5[x + 3] + (6 * ((vx_int16)srow5[x - 2] + (vx_int16)srow5[x + 2])) + (15 * ((vx_int16)srow5[x - 1] + (vx_int16)srow5[x + 1])) + (20 * (vx_int16)srow5[x])) + + 5 * ((vx_int16)srow4[x - 3] + (vx_int16)srow4[x + 3] + (6 * ((vx_int16)srow4[x - 2] + (vx_int16)srow4[x + 2])) + (15 * ((vx_int16)srow4[x - 1] + (vx_int16)srow4[x + 1])) + (20 * (vx_int16)srow4[x])) + - 5 * ((vx_int16)srow2[x - 3] + (vx_int16)srow2[x + 3] + (6 * ((vx_int16)srow2[x - 2] + (vx_int16)srow2[x + 2])) + (15 * ((vx_int16)srow2[x - 1] + (vx_int16)srow2[x + 1])) + (20 * (vx_int16)srow2[x])) + - 4 * ((vx_int16)srow1[x - 3] + (vx_int16)srow1[x + 3] + (6 * ((vx_int16)srow1[x - 2] + (vx_int16)srow1[x + 2])) + (15 * ((vx_int16)srow1[x - 1] + (vx_int16)srow1[x + 1])) + (20 * (vx_int16)srow1[x])) + - ((vx_int16)srow0[x - 3] + (vx_int16)srow0[x + 3] + (6 * ((vx_int16)srow0[x - 2] + (vx_int16)srow0[x + 2])) + (15 * ((vx_int16)srow0[x - 1] + (vx_int16)srow0[x + 1])) + (20 * (vx_int16)srow0[x])); + vx_int16 tmp = abs(Gx) + abs(Gy); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstride; + } + return AGO_SUCCESS; +} + + +int HafCpu_CannySobelSuppThreshold_U8XY_U8_3x3_L1NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper, + vx_uint8 * pScratch + ) +{ + vx_int16 *Gx, *Gy; + vx_uint8 * pTemp; + vx_uint32 dstride = ((dstWidth + 15)&~15); + Gx = (vx_int16 *)pScratch; + Gy = (vx_int16 *)(pScratch + dstride*sizeof(vx_int16)); + pTemp = pScratch + 2*dstride*sizeof(vx_int16); + + // compute Sobel gradients + HafCpu_Sobel_S16S16_U8_3x3_GXY(dstWidth, dstHeight - 2, Gx + dstride, dstride * 2, Gy + dstride, dstride * 2, pSrcImage + srcImageStrideInBytes, srcImageStrideInBytes, pTemp); + + // compute L1 norm and phase + unsigned int y = 1; + vx_int16 *pGx = Gx + dstride; + vx_int16 *pGy = Gy + dstride; + vx_int16 *pMag = Gx; + while (y < dstHeight) + { + vx_uint16 *pdst = (vx_uint16*)pMag; // to store the result + + for (unsigned int x = 1; x < dstWidth; x++) + { + vx_uint8 orn; // orientation + + float scale = (float)128 / 180.f; + float arct = HafCpu_FastAtan2_deg(pGx[x], pGy[x]); + // normalize and convert to degrees 0-180 + orn = (((int)(arct*scale) + 16) >> 5)&7; // quantize to 8 (22.5 degrees) + if (orn >= 4)orn -= 4; + vx_int16 val = (vx_int16)(abs(pGx[x]) + abs(pGy[x])); + pdst[x] = (vx_uint16)((val << 2) | orn); // store both mag and orientation + } 
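+		// advance to the next row; the packed (magnitude << 2) | orientation results overwrite the Gx buffer in place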
+ pGx += dstride; + pGy += dstride; + pMag += dstride; + y++; + } + + // do minmax suppression: from Gx + ago_coord2d_ushort_t *pxyStack = xyStack; + for (y = 1; y < dstHeight - 1; y++) + { + vx_uint8* pOut = pDst + y*dstStrideInBytes; + vx_int16 *pSrc = (vx_int16 *)(Gx + y * dstride); // we are processing from 2nd row + for (unsigned int x = 1; x < dstWidth - 1; x++, pSrc++) + { + vx_int32 edge; + // get the Mag and angle + int mag = (pSrc[0] >> 2); + int ang = pSrc[0] & 3; + int offset0 = n_offset[ang][0][1] * dstride + n_offset[ang][0][0]; + int offset1 = n_offset[ang][1][1] * dstride + n_offset[ang][1][0]; + edge = ((mag >(pSrc[offset0] >> 2)) && (mag >(pSrc[offset1] >> 2))) ? mag : 0; + if (edge > hyst_upper){ + pOut[x] = (vx_int8)255; + // add the cordinates to stacktop + pxyStack->x = x; // store x and y co-ordinates + pxyStack->y = y; // store x and y co-ordinates + pxyStack++; + } + else if (edge <= hyst_lower){ + pOut[x] = 0; + } + else pOut[x] = 127; + } + } + *pxyStackTop = (vx_uint32)(pxyStack - xyStack); + + return AGO_SUCCESS; +} + +int HafCpu_CannySobelSuppThreshold_U8XY_U8_5x5_L1NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_CannySobelSuppThreshold_U8XY_U8_7x7_L1NORM + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ) +{ + return AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; +} + +int HafCpu_CannySuppThreshold_U8XY_U16_3x3 + ( + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 * pxyStackTop, + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDst, + vx_uint32 dstStrideInBytes, + vx_uint16 * pSrc, + vx_uint32 srcStrideInBytes, + vx_uint16 hyst_lower, + vx_uint16 hyst_upper + ) +{ + // do minmax suppression: from Gx + vx_uint32 sstride = srcStrideInBytes>>1; + ago_coord2d_ushort_t *pxyStack = xyStack; + for (unsigned int y = 1; y < dstHeight - 1; y++) + { + vx_uint8* pOut = pDst + y*dstStrideInBytes; + vx_uint16 *pLocSrc = pSrc + y * sstride + 1; // we are processing from 2nd row + for (unsigned int x = 1; x < dstWidth - 1; x++, pLocSrc++) + { + vx_int32 edge; + // get the Mag and angle + int mag = (pLocSrc[0] >> 2); + int ang = pLocSrc[0] & 3; + int offset0 = n_offset[ang][0][1] * sstride + n_offset[ang][0][0]; + int offset1 = n_offset[ang][1][1] * sstride + n_offset[ang][1][0]; + edge = ((mag >(pLocSrc[offset0] >> 2)) && (mag >(pLocSrc[offset1] >> 2))) ? 
mag : 0; + if (edge > hyst_upper){ + pOut[x] = (vx_int8)255; + // add the cordinates to stacktop + pxyStack->x = x; // store x and y co-ordinates + pxyStack->y = y; // store x and y co-ordinates + pxyStack++; + + } + else if (edge <= hyst_lower){ + pOut[x] = 0; + } + else pOut[x] = 127; + } + } + *pxyStackTop = (vx_uint32)(pxyStack - xyStack); +// printf("Number of points in XY stack: %d\n", *pxyStackTop); +#if 0 + FILE *fp = fopen("c:\\temp\\norm.yuv", "wb"); + if (fp) fwrite(pDst, 1, 640 * 480, fp); + fclose(fp); +#endif + return AGO_SUCCESS; +} + +int HafCpu_CannyEdgeTrace_U8_U8XY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 capacityOfXY, + ago_coord2d_ushort_t xyStack[], + vx_uint32 xyStackTop + ) +{ + ago_coord2d_ushort_t *pxyStack = xyStack + xyStackTop; + while (pxyStack != xyStack){ + pxyStack--; + vx_uint16 x = pxyStack->x; + vx_uint16 y = pxyStack->y; + // look at all the neighbors for strong edge value + for (int i = 0; i < 8; i++){ + const ago_coord2d_short_t offs = dir_offsets[i]; + vx_int16 x1 = x + offs.x; + vx_int16 y1 = y + offs.y; + vx_uint8 *pDst = pDstImage + y1*dstImageStrideInBytes + x1; + if (*pDst == 127) + { + *pDst |= 0x80; // *pDst = 255 + *((unsigned *)pxyStack) = (y1<<16)|x1; + pxyStack++; + } + } + } + // go through the entire destination and convert all 127 to 0 + const __m128i mm127 = _mm_set1_epi8((char)127); + for (unsigned int y = 0; y < dstHeight; y++) { + __m128i * src = (__m128i *)pDstImage; + vx_uint32 width = (dstWidth + 15) >> 4; + + for (unsigned int x = 0; x < width; x++) { + __m128i mask; + __m128i pixels = _mm_load_si128(src); + mask = _mm_cmpeq_epi8(pixels, mm127); + pixels = _mm_andnot_si128(mask, pixels); + _mm_store_si128(src++, pixels); + } + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_CannySobel_U16_U8_3x3_L2NORM +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData +) +{ + int x, y; + int prefixWidth = ((intptr_t)(pDstImage)) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + pSrcImage += srcImageStrideInBytes; + vx_uint32 dstride = dstImageStrideInBytes >> 1; + pDstImage += dstride; // don't care about border. 
start processing from row2 + __m128i z = _mm_setzero_si128(), c6 = _mm_set1_epi16(6); + vx_int16 *r0 = (vx_int16*)(pLocalData + 16); + vx_int16 *r1 = r0 + ((dstWidth + 15) & ~15); + + for (y = 1; y < (int)dstHeight - 1; y++) + { + const vx_uint8* srow0 = pSrcImage - srcImageStrideInBytes; + const vx_uint8* srow1 = pSrcImage; + const vx_uint8* srow2 = pSrcImage + srcImageStrideInBytes; + vx_uint16* drow = (vx_uint16*)pDstImage; + + for (x = 0; x < prefixWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 1] - (vx_int16)srow0[x - 1] + (vx_int16)srow2[x + 1] - (vx_int16)srow2[x - 1] + 2 * ((vx_int16)srow1[x + 1] - (vx_int16)srow1[x - 1]); + vx_int16 Gy = (vx_int16)srow2[x - 1] + (vx_int16)srow2[x + 1] - (vx_int16)srow0[x - 1] - (vx_int16)srow0[x + 1] + 2 * ((vx_int16)srow2[x] - (vx_int16)srow0[x]); + vx_int16 tmp = (vx_int16)sqrt((Gx*Gx) + (Gy*Gy)); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + + // do vertical convolution - SSE + x = prefixWidth; + for (; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z); + __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z); + __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z); + __m128i t0 = _mm_add_epi16(_mm_add_epi16(s0, s2), _mm_slli_epi16(s1, 1)); + __m128i t1 = _mm_sub_epi16(s2, s0); + _mm_store_si128((__m128i*)(r0 + x), t0); + _mm_store_si128((__m128i*)(r1 + x), t1); + } + + // do horizontal convolution, interleave the results and store them to dst - SSE + x = prefixWidth; + for (; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)(r0 + x - 1)); + __m128i s1 = _mm_loadu_si128((const __m128i*)(r0 + x + 1)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(r1 + x - 1)); + __m128i s3 = _mm_loadu_si128((const __m128i*)(r1 + x)); + __m128i s4 = _mm_loadu_si128((const __m128i*)(r1 + x + 1)); + + __m128i t0 = _mm_sub_epi16(s1, s0); + __m128i t1 = _mm_add_epi16(_mm_add_epi16(s2, s4), _mm_slli_epi16(s3, 1)); + t1 = _mm_sub_epi16(z, t1); + s0 = _mm_mullo_epi16(t0, t0); + s1 = _mm_mullo_epi16(t1, t1); + // unpack to dwords for multiplication + s2 = _mm_unpackhi_epi16(s0, z); + s0 = _mm_unpacklo_epi16(s0, z); + s3 = _mm_unpackhi_epi16(s1, z); + s1 = _mm_unpacklo_epi16(s1, z); + __m128 f0 = _mm_cvtepi32_ps(s0); + __m128 f1 = _mm_cvtepi32_ps(s2); + __m128 f2 = _mm_cvtepi32_ps(s1); + __m128 f3 = _mm_cvtepi32_ps(s3); + f0 = _mm_add_ps(f0, f2); + f1 = _mm_add_ps(f1, f3); + f0 = _mm_sqrt_ps(f0); + f1 = _mm_sqrt_ps(f1); + + for (int i = 0; i < 8; i++){ + M128I(s1).m128i_i16[i] = HafCpu_FastAtan2_Canny(M128I(t0).m128i_i16[i], M128I(t1).m128i_i16[i]); + } + t0 = _mm_cvtps_epi32(f0); + t1 = _mm_cvtps_epi32(f1); + // pack with signed saturation + t0 = _mm_packus_epi32(t0, t1); + t0 = _mm_or_si128(_mm_slli_epi16(t0, 2), s1); + // store magnitude and angle to destination + _mm_store_si128((__m128i*)(drow + x), t0); + } + + for (x = alignedWidth + prefixWidth - 1; x < (int)dstWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 1] - (vx_int16)srow0[x - 1] + (vx_int16)srow2[x + 1] - (vx_int16)srow2[x - 1] + 2 * ((vx_int16)srow1[x + 1] - (vx_int16)srow1[x - 1]); + vx_int16 Gy = (vx_int16)srow2[x - 1] + (vx_int16)srow2[x + 1] - (vx_int16)srow0[x - 1] - (vx_int16)srow0[x + 1] + 2 * ((vx_int16)srow2[x] - (vx_int16)srow0[x]); + vx_int16 tmp = (vx_int16)sqrt((Gx*Gx) + (Gy*Gy)); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + pSrcImage += srcImageStrideInBytes; 
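+		// dstride is in 16-bit pixels (dstImageStrideInBytes >> 1), so this advances the destination by exactly one row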
+ pDstImage += dstride; + } + return AGO_SUCCESS; +} + +int HafCpu_CannySobel_U16_U8_5x5_L2NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ) +{ + int x, y; + int prefixWidth = ((intptr_t)(pDstImage)) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i z = _mm_setzero_si128(), c6 = _mm_set1_epi16(6); + vx_uint32 dstride = dstImageStrideInBytes >> 1; + pDstImage += 2 * dstride; // don't care about border. start processing from row2 + pSrcImage += 2 * srcImageStrideInBytes; + vx_int16 *r0 = (vx_int16*)(pLocalData + 16); + vx_int16 *r1 = r0 + ((dstWidth + 15) & ~15); + + for (y = 2; y < (int)dstHeight - 2; y++) + { + const vx_uint8* srow0 = pSrcImage - 2 * srcImageStrideInBytes; + const vx_uint8* srow1 = pSrcImage - srcImageStrideInBytes; + const vx_uint8* srow2 = pSrcImage; + const vx_uint8* srow3 = pSrcImage + srcImageStrideInBytes; + const vx_uint8* srow4 = pSrcImage + 2 * srcImageStrideInBytes; + + vx_uint16* drow = (vx_uint16*)pDstImage; + + for (x = 0; x < prefixWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 2] + (2 * ((vx_int16)srow0[x + 1])) - (2 * ((vx_int16)srow0[x - 1])) - (vx_int16)srow0[x - 2] + + 4 * ((vx_int16)srow1[x + 2] + (2 * ((vx_int16)srow1[x + 1])) - (2 * ((vx_int16)srow1[x - 1])) - (vx_int16)srow1[x - 2]) + + 6 * ((vx_int16)srow2[x + 2] + (2 * ((vx_int16)srow2[x + 1])) - (2 * ((vx_int16)srow2[x - 1])) - (vx_int16)srow2[x - 2]) + + 4 * ((vx_int16)srow3[x + 2] + (2 * ((vx_int16)srow3[x + 1])) - (2 * ((vx_int16)srow3[x - 1])) - (vx_int16)srow3[x - 2]) + + (vx_int16)srow4[x + 2] + (2 * ((vx_int16)srow4[x + 1])) - (2 * ((vx_int16)srow4[x - 1])) - (vx_int16)srow4[x - 2]; + vx_int16 Gy = (vx_int16)srow4[x - 2] + (4 * (vx_int16)srow4[x - 1]) + (6 * (vx_int16)srow4[x]) + (4 * (vx_int16)srow4[x + 1]) + (vx_int16)srow4[x + 2] + + 2 * ((vx_int16)srow3[x - 2] + (4 * (vx_int16)srow3[x - 1]) + (6 * (vx_int16)srow3[x]) + (4 * (vx_int16)srow3[x + 1]) + (vx_int16)srow3[x + 2]) + - 2 * ((vx_int16)srow1[x - 2] + (4 * (vx_int16)srow1[x - 1]) + (6 * (vx_int16)srow1[x]) + (4 * (vx_int16)srow1[x + 1]) + (vx_int16)srow1[x + 2]) + - (vx_int16)srow0[x - 2] + (4 * (vx_int16)srow0[x - 1]) + (6 * (vx_int16)srow0[x]) + (4 * (vx_int16)srow0[x + 1]) + (vx_int16)srow0[x + 2]; + vx_int16 tmp = (vx_int16)sqrt((Gx*Gx) + (Gy*Gy)); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + + // do vertical convolution + for (x = prefixWidth; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z); + __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z); + __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z); + __m128i s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow3 + x)), z); + __m128i s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow4 + x)), z); + + __m128i t0 = _mm_add_epi16(_mm_slli_epi16(_mm_add_epi16(s1, s3), 2), _mm_mullo_epi16(s2, c6)); + t0 = _mm_add_epi16(t0, _mm_add_epi16(s0, s4)); + + __m128i t1 = _mm_slli_epi16(_mm_sub_epi16(s3, s1), 1); + t1 = _mm_add_epi16(t1, _mm_sub_epi16(s4, s0)); + _mm_store_si128((__m128i*)(r0 + x), t0); + _mm_store_si128((__m128i*)(r1 + x), t1); + } + + // do horizontal convolution, interleave the results and 
store them to dst + x = prefixWidth; + for (; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)(r0 + x - 2)); + __m128i s1 = _mm_loadu_si128((const __m128i*)(r0 + x - 1)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(r0 + x + 1)); + __m128i s3 = _mm_loadu_si128((const __m128i*)(r0 + x + 2)); + + __m128i s4 = _mm_loadu_si128((const __m128i*)(r1 + x - 2)); + __m128i s5 = _mm_loadu_si128((const __m128i*)(r1 + x - 1)); + __m128i s6 = _mm_loadu_si128((const __m128i*)(r1 + x)); + __m128i s7 = _mm_loadu_si128((const __m128i*)(r1 + x + 1)); + __m128i s8 = _mm_loadu_si128((const __m128i*)(r1 + x + 2)); + + __m128i t0 = _mm_slli_epi16(_mm_sub_epi16(s2, s1), 1); + t0 = _mm_adds_epi16(t0, _mm_sub_epi16(s3, s0)); + __m128i t1 = _mm_slli_epi16(_mm_add_epi16(s5, s7), 2); + s0 = _mm_mullo_epi16(s6, c6); + t1 = _mm_add_epi16(t1, _mm_add_epi16(s4, s8)); + t1 = _mm_adds_epi16(t1, s0); + t1 = _mm_sub_epi16(z, t1); + // unpack for multiplication + s0 = _mm_unpacklo_epi16(t0, t1); + s2 = _mm_unpackhi_epi16(t0, t1); + s0 = _mm_madd_epi16(s0, s0); + s2 = _mm_madd_epi16(s2, s2); + + __m128 f0 = _mm_cvtepi32_ps(s0); + __m128 f1 = _mm_cvtepi32_ps(s2); + f0 = _mm_sqrt_ps(f0); + f1 = _mm_sqrt_ps(f1); + + for (int i = 0; i < 8; i++){ + M128I(s1).m128i_i16[i] = HafCpu_FastAtan2_Canny(M128I(t0).m128i_i16[i], M128I(t1).m128i_i16[i]); + } + t0 = _mm_cvtps_epi32(f0); + t1 = _mm_cvtps_epi32(f1); + // pack with signed saturation + t0 = _mm_packus_epi32(t0, t1); + t0 = _mm_or_si128(_mm_slli_epi16(t0, 2), s1); + // store magnitude and angle to destination + _mm_store_si128((__m128i*)(drow + x), t0); + } + + for (x = alignedWidth + prefixWidth - 1; x < (int)dstWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 2] + (2 * ((vx_int16)srow0[x + 1])) - (2 * ((vx_int16)srow0[x - 1])) - (vx_int16)srow0[x - 2] + + 4 * ((vx_int16)srow1[x + 2] + (2 * ((vx_int16)srow1[x + 1])) - (2 * ((vx_int16)srow1[x - 1])) - (vx_int16)srow1[x - 2]) + + 6 * ((vx_int16)srow2[x + 2] + (2 * ((vx_int16)srow2[x + 1])) - (2 * ((vx_int16)srow2[x - 1])) - (vx_int16)srow2[x - 2]) + + 4 * ((vx_int16)srow3[x + 2] + (2 * ((vx_int16)srow3[x + 1])) - (2 * ((vx_int16)srow3[x - 1])) - (vx_int16)srow3[x - 2]) + + (vx_int16)srow4[x + 2] + (2 * ((vx_int16)srow4[x + 1])) - (2 * ((vx_int16)srow4[x - 1])) - (vx_int16)srow4[x - 2]; + vx_int16 Gy = (vx_int16)srow4[x - 2] + (4 * (vx_int16)srow4[x - 1]) + (6 * (vx_int16)srow4[x]) + (4 * (vx_int16)srow4[x + 1]) + (vx_int16)srow4[x + 2] + + 2 * ((vx_int16)srow3[x - 2] + (4 * (vx_int16)srow3[x - 1]) + (6 * (vx_int16)srow3[x]) + (4 * (vx_int16)srow3[x + 1]) + (vx_int16)srow3[x + 2]) + - 2 * ((vx_int16)srow1[x - 2] + (4 * (vx_int16)srow1[x - 1]) + (6 * (vx_int16)srow1[x]) + (4 * (vx_int16)srow1[x + 1]) + (vx_int16)srow1[x + 2]) + - (vx_int16)srow0[x - 2] + (4 * (vx_int16)srow0[x - 1]) + (6 * (vx_int16)srow0[x]) + (4 * (vx_int16)srow0[x + 1]) + (vx_int16)srow0[x + 2]; + vx_int16 tmp = (vx_int16)sqrt((Gx*Gx) + (Gy*Gy)); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstride; + } + return AGO_SUCCESS; +} + +int HafCpu_CannySobel_U16_U8_7x7_L2NORM + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData + ) +{ + int x, y; + int prefixWidth = ((intptr_t)(pDstImage)) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i z = _mm_setzero_si128(), c5 = _mm_set1_epi16(5), c6 = _mm_set1_epi16(6); + __m128i c15 = _mm_set1_epi16(15), c20 = _mm_set1_epi16(20); + __m128i clamp = _mm_set1_epi16(0x3FFF); + + vx_uint32 dstride = dstImageStrideInBytes >> 1; + pDstImage += 3 * dstride; // don't care about border. start processing from row2 + pSrcImage += 3 * srcImageStrideInBytes; + vx_int16 *r0 = (vx_int16*)(pLocalData + 16); + vx_int16 *r1 = r0 + ((dstWidth + 15) & ~15); + + for (y = 3; y < (int)dstHeight - 3; y++) + { + const vx_uint8* srow0 = pSrcImage - 3 * srcImageStrideInBytes; + const vx_uint8* srow1 = pSrcImage - 2 * srcImageStrideInBytes; + const vx_uint8* srow2 = pSrcImage - srcImageStrideInBytes; + const vx_uint8* srow3 = pSrcImage; + const vx_uint8* srow4 = pSrcImage + srcImageStrideInBytes; + const vx_uint8* srow5 = pSrcImage + 2 * srcImageStrideInBytes; + const vx_uint8* srow6 = pSrcImage + 3 * srcImageStrideInBytes; + + vx_uint16* drow = (vx_uint16*)pDstImage; + + for (x = 0; x < prefixWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 3] + (4 * (vx_int16)srow0[x + 2]) + (5 * (vx_int16)srow0[x + 1]) - (5 * (vx_int16)srow0[x - 1]) - (4 * (vx_int16)srow0[x - 2]) - (vx_int16)srow0[x - 3] + + 6 * ((vx_int16)srow1[x + 3] + (4 * (vx_int16)srow1[x + 2]) + (5 * (vx_int16)srow1[x + 1]) - (5 * (vx_int16)srow1[x - 1]) - (4 * (vx_int16)srow1[x - 2]) - (vx_int16)srow1[x - 3]) + + 15 * ((vx_int16)srow2[x + 3] + (4 * (vx_int16)srow2[x + 2]) + (5 * (vx_int16)srow2[x + 1]) - (5 * (vx_int16)srow2[x - 1]) - (4 * (vx_int16)srow2[x - 2]) - (vx_int16)srow2[x - 3]) + + 20 * ((vx_int16)srow3[x + 3] + (4 * (vx_int16)srow3[x + 2]) + (5 * (vx_int16)srow3[x + 1]) - (5 * (vx_int16)srow3[x - 1]) - (4 * (vx_int16)srow3[x - 2]) - (vx_int16)srow3[x - 3]) + + 15 * ((vx_int16)srow4[x + 3] + (4 * (vx_int16)srow4[x + 2]) + (5 * (vx_int16)srow4[x + 1]) - (5 * (vx_int16)srow4[x - 1]) - (4 * (vx_int16)srow4[x - 2]) - (vx_int16)srow4[x - 3]) + + 6 * ((vx_int16)srow5[x + 3] + (4 * (vx_int16)srow5[x + 2]) + (5 * (vx_int16)srow5[x + 1]) - (5 * (vx_int16)srow5[x - 1]) - (4 * (vx_int16)srow5[x - 2]) - (vx_int16)srow5[x - 3]) + + (vx_int16)srow6[x + 3] + (4 * (vx_int16)srow6[x + 2]) + (5 * (vx_int16)srow6[x + 1]) - (5 * (vx_int16)srow6[x - 1]) - (4 * (vx_int16)srow6[x - 2]) - (vx_int16)srow6[x - 3]; + vx_int16 Gy = (vx_int16)srow6[x - 3] + (vx_int16)srow6[x + 3] + (6 * ((vx_int16)srow6[x - 2] + (vx_int16)srow6[x + 2])) + (15 * ((vx_int16)srow6[x - 1] + (vx_int16)srow6[x + 1])) + (20 * (vx_int16)srow6[x]) + + 4 * ((vx_int16)srow5[x - 3] + (vx_int16)srow5[x + 3] + (6 * ((vx_int16)srow5[x - 2] + (vx_int16)srow5[x + 2])) + (15 * ((vx_int16)srow5[x - 1] + (vx_int16)srow5[x + 1])) + (20 * (vx_int16)srow5[x])) + + 5 * ((vx_int16)srow4[x - 3] + (vx_int16)srow4[x + 3] + (6 * ((vx_int16)srow4[x - 2] + (vx_int16)srow4[x + 2])) + (15 * ((vx_int16)srow4[x - 1] + (vx_int16)srow4[x + 1])) + (20 * (vx_int16)srow4[x])) + - 5 * ((vx_int16)srow2[x - 3] + (vx_int16)srow2[x + 3] + (6 * ((vx_int16)srow2[x - 2] + (vx_int16)srow2[x + 2])) + (15 * ((vx_int16)srow2[x - 1] + (vx_int16)srow2[x + 1])) + (20 * (vx_int16)srow2[x])) + - 4 * ((vx_int16)srow1[x - 3] + (vx_int16)srow1[x + 3] + (6 * ((vx_int16)srow1[x - 2] + (vx_int16)srow1[x + 2])) + (15 * ((vx_int16)srow1[x - 1] + (vx_int16)srow1[x + 1])) + (20 * (vx_int16)srow1[x])) + - ((vx_int16)srow0[x - 3] + (vx_int16)srow0[x + 3] + (6 * ((vx_int16)srow0[x - 2] + 
(vx_int16)srow0[x + 2])) + (15 * ((vx_int16)srow0[x - 1] + (vx_int16)srow0[x + 1])) + (20 * (vx_int16)srow0[x])); + vx_int16 tmp = (vx_int16)sqrt((Gx*Gx) + (Gy*Gy)); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + + // do vertical convolution + for (x = prefixWidth; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z); + __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z); + __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z); + __m128i s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow3 + x)), z); + __m128i s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow4 + x)), z); + __m128i s5 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow5 + x)), z); + __m128i s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow6 + x)), z); + + __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s1, s5), c6), _mm_mullo_epi16(s3, c20)); + __m128i t2 = _mm_mullo_epi16(_mm_add_epi16(s2, s4), c15); + t0 = _mm_add_epi16(t0, _mm_add_epi16(s0, s6)); + __m128i t1 = _mm_slli_epi16(_mm_sub_epi16(s5, s1), 2); + t0 = _mm_add_epi16(t0, t2); + + t2 = _mm_mullo_epi16(_mm_sub_epi16(s4, s2), c5); + t1 = _mm_add_epi16(t1, _mm_sub_epi16(s6, s0)); + t0 = _mm_srai_epi16(t0, 2); + t1 = _mm_add_epi16(t1, t2); + t1 = _mm_srai_epi16(t1, 2); + + _mm_store_si128((__m128i*)(r0 + x), t0); + _mm_store_si128((__m128i*)(r1 + x), t1); + } + + // do horizontal convolution, interleave the results and store them to dst + x = prefixWidth; + for (; x <= alignedWidth - 8; x += 8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)(r0 + x - 3)); + __m128i s1 = _mm_loadu_si128((const __m128i*)(r0 + x - 2)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(r0 + x - 1)); + __m128i s3 = _mm_loadu_si128((const __m128i*)(r0 + x + 1)); + __m128i s4 = _mm_loadu_si128((const __m128i*)(r0 + x + 2)); + __m128i s5 = _mm_loadu_si128((const __m128i*)(r0 + x + 3)); + + + __m128i t0 = _mm_slli_epi16(_mm_subs_epi16(s4, s1), 2); + __m128i t1 = _mm_mullo_epi16(_mm_subs_epi16(s3, s2), c5); + t0 = _mm_adds_epi16(t0, _mm_subs_epi16(s5, s0)); + t0 = _mm_adds_epi16(t0, t1); + + s0 = _mm_loadu_si128((const __m128i*)(r1 + x - 3)); + s1 = _mm_loadu_si128((const __m128i*)(r1 + x - 2)); + s2 = _mm_loadu_si128((const __m128i*)(r1 + x - 1)); + s3 = _mm_loadu_si128((const __m128i*)(r1 + x)); + s4 = _mm_loadu_si128((const __m128i*)(r1 + x + 1)); + s5 = _mm_loadu_si128((const __m128i*)(r1 + x + 2)); + __m128i s6 = _mm_loadu_si128((const __m128i*)(r1 + x + 3)); + + + t1 = _mm_adds_epi16(_mm_mullo_epi16(_mm_add_epi16(s1, s5), c6), _mm_mullo_epi16(s3, c20)); + __m128i t2 = _mm_mullo_epi16(_mm_add_epi16(s2, s4), c15); + t1 = _mm_adds_epi16(t1, _mm_adds_epi16(s0, s6)); + t1 = _mm_adds_epi16(t1, t2); + t1 = _mm_subs_epi16(z, t1); + // unpack for multiplication + s0 = _mm_unpacklo_epi16(t0, t1); + s2 = _mm_unpackhi_epi16(t0, t1); + s0 = _mm_madd_epi16(s0, s0); + s2 = _mm_madd_epi16(s2, s2); + + __m128 f0 = _mm_cvtepi32_ps(s0); + __m128 f1 = _mm_cvtepi32_ps(s2); + f0 = _mm_sqrt_ps(f0); + f1 = _mm_sqrt_ps(f1); + for (int i = 0; i < 8; i++){ + M128I(s1).m128i_i16[i] = HafCpu_FastAtan2_Canny(M128I(t0).m128i_i16[i], M128I(t1).m128i_i16[i]); + } + t0 = _mm_cvtps_epi32(f0); + t1 = _mm_cvtps_epi32(f1); + // pack with signed saturation + t0 = _mm_packus_epi32(t0, t1); + t0 = _mm_or_si128(_mm_slli_epi16(t0, 2), s1); + // store magnitude and angle to destination + _mm_store_si128((__m128i*)(drow + x), t0); + } + 
+ for (x = alignedWidth + prefixWidth - 1; x < (int)dstWidth; x++) + { + vx_int16 Gx = (vx_int16)srow0[x + 3] + (4 * (vx_int16)srow0[x + 2]) + (5 * (vx_int16)srow0[x + 1]) - (5 * (vx_int16)srow0[x - 1]) - (4 * (vx_int16)srow0[x - 2]) - (vx_int16)srow0[x - 3] + + 6 * ((vx_int16)srow1[x + 3] + (4 * (vx_int16)srow1[x + 2]) + (5 * (vx_int16)srow1[x + 1]) - (5 * (vx_int16)srow1[x - 1]) - (4 * (vx_int16)srow1[x - 2]) - (vx_int16)srow1[x - 3]) + + 15 * ((vx_int16)srow2[x + 3] + (4 * (vx_int16)srow2[x + 2]) + (5 * (vx_int16)srow2[x + 1]) - (5 * (vx_int16)srow2[x - 1]) - (4 * (vx_int16)srow2[x - 2]) - (vx_int16)srow2[x - 3]) + + 20 * ((vx_int16)srow3[x + 3] + (4 * (vx_int16)srow3[x + 2]) + (5 * (vx_int16)srow3[x + 1]) - (5 * (vx_int16)srow3[x - 1]) - (4 * (vx_int16)srow3[x - 2]) - (vx_int16)srow3[x - 3]) + + 15 * ((vx_int16)srow4[x + 3] + (4 * (vx_int16)srow4[x + 2]) + (5 * (vx_int16)srow4[x + 1]) - (5 * (vx_int16)srow4[x - 1]) - (4 * (vx_int16)srow4[x - 2]) - (vx_int16)srow4[x - 3]) + + 6 * ((vx_int16)srow5[x + 3] + (4 * (vx_int16)srow5[x + 2]) + (5 * (vx_int16)srow5[x + 1]) - (5 * (vx_int16)srow5[x - 1]) - (4 * (vx_int16)srow5[x - 2]) - (vx_int16)srow5[x - 3]) + + (vx_int16)srow6[x + 3] + (4 * (vx_int16)srow6[x + 2]) + (5 * (vx_int16)srow6[x + 1]) - (5 * (vx_int16)srow6[x - 1]) - (4 * (vx_int16)srow6[x - 2]) - (vx_int16)srow6[x - 3]; + vx_int16 Gy = (vx_int16)srow6[x - 3] + (vx_int16)srow6[x + 3] + (6 * ((vx_int16)srow6[x - 2] + (vx_int16)srow6[x + 2])) + (15 * ((vx_int16)srow6[x - 1] + (vx_int16)srow6[x + 1])) + (20 * (vx_int16)srow6[x]) + + 4 * ((vx_int16)srow5[x - 3] + (vx_int16)srow5[x + 3] + (6 * ((vx_int16)srow5[x - 2] + (vx_int16)srow5[x + 2])) + (15 * ((vx_int16)srow5[x - 1] + (vx_int16)srow5[x + 1])) + (20 * (vx_int16)srow5[x])) + + 5 * ((vx_int16)srow4[x - 3] + (vx_int16)srow4[x + 3] + (6 * ((vx_int16)srow4[x - 2] + (vx_int16)srow4[x + 2])) + (15 * ((vx_int16)srow4[x - 1] + (vx_int16)srow4[x + 1])) + (20 * (vx_int16)srow4[x])) + - 5 * ((vx_int16)srow2[x - 3] + (vx_int16)srow2[x + 3] + (6 * ((vx_int16)srow2[x - 2] + (vx_int16)srow2[x + 2])) + (15 * ((vx_int16)srow2[x - 1] + (vx_int16)srow2[x + 1])) + (20 * (vx_int16)srow2[x])) + - 4 * ((vx_int16)srow1[x - 3] + (vx_int16)srow1[x + 3] + (6 * ((vx_int16)srow1[x - 2] + (vx_int16)srow1[x + 2])) + (15 * ((vx_int16)srow1[x - 1] + (vx_int16)srow1[x + 1])) + (20 * (vx_int16)srow1[x])) + - ((vx_int16)srow0[x - 3] + (vx_int16)srow0[x + 3] + (6 * ((vx_int16)srow0[x - 2] + (vx_int16)srow0[x + 2])) + (15 * ((vx_int16)srow0[x - 1] + (vx_int16)srow0[x + 1])) + (20 * (vx_int16)srow0[x])); + vx_int16 tmp = (vx_int16)sqrt((Gx*Gx) + (Gy*Gy)); + tmp <<= 2; + tmp |= (HafCpu_FastAtan2_Canny(Gx, Gy) & 3); + drow[x] = tmp; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstride; + } + return AGO_SUCCESS; +} diff --git a/openvx/ago/ago_haf_cpu_ch_extract_combine.cpp b/openvx/ago/ago_haf_cpu_ch_extract_combine.cpp new file mode 100644 index 0000000..027af9a --- /dev/null +++ b/openvx/ago/ago_haf_cpu_ch_extract_combine.cpp @@ -0,0 +1,2137 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +DECL_ALIGN(16) unsigned char dataChannelExtract[16 * 29] ATTR_ALIGN(16) = { + 0, 2, 4, 6, 8, 10, 12, 14, 255, 255, 255, 255, 255, 255, 255, 255, // Lower 8 bytes pos0 for U8_U16 + 255, 255, 255, 255, 255, 255, 255, 255, 0, 2, 4, 6, 8, 10, 12, 14, // Upper 8 bytes pos0 for U8_U16 + 1, 3, 5, 7, 9, 11, 13, 15, 255, 255, 255, 255, 255, 255, 255, 255, // Lower 8 bytes pos1 for U8_U16 + 255, 255, 255, 255, 255, 255, 255, 255, 1, 3, 5, 7, 9, 11, 13, 15, // Upper 8 bytes pos1 for U8_U16 + 0, 3, 6, 9, 12, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // Lower 6 bytes pos0 for U8_U24 + 255, 255, 255, 255, 255, 255, 2, 5, 8, 11, 14, 255, 255, 255, 255, 255, // Mid 5 bytes pos0 for U8_U24 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 1, 4, 7, 10, 13, // Upper 5 bytes pos0 for U8_U24 + 1, 4, 7, 10, 13, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // Lower 5 bytes pos1 for U8_U24 + 255, 255, 255, 255, 255, 0, 3, 6, 9, 12, 15, 255, 255, 255, 255, 255, // Mid 6 bytes pos1 for U8_U24 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 5, 8, 11, 14, // Upper 5 bytes pos1 for U8_U24 + 2, 5, 8, 11, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // Lower 5 bytes pos2 for U8_U24 + 255, 255, 255, 255, 255, 1, 4, 7, 10, 13, 255, 255, 255, 255, 255, 255, // Mid 5 bytes pos2 for U8_U24 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 3, 6, 9, 12, 15, // Upper 6 bytes pos2 for U8_U24 + 0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // Low 4 bytes pos0 for U8_U32 + 255, 255, 255, 255, 0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, // Next 4 bytes pos0 for U8_U32 + 255, 255, 255, 255, 255, 255, 255, 255, 0, 4, 8, 12, 255, 255, 255, 255, // Next 4 bytes pos0 for U8_U32 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 4, 8, 12, // Upper 4 bytes pos0 for U8_U32 + 1, 5, 9, 13, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // Low 4 bytes pos1 for U8_U32 + 255, 255, 255, 255, 1, 5, 9, 13, 255, 255, 255, 255, 255, 255, 255, 255, // Next 4 bytes pos1 for U8_U32 + 255, 255, 255, 255, 255, 255, 255, 255, 1, 5, 9, 13, 255, 255, 255, 255, // Next 4 bytes pos1 for U8_U32 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 1, 5, 9, 13, // Upper 4 bytes pos1 for U8_U32 + 2, 6, 10, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // Low 4 bytes pos2 for U8_U32 + 255, 255, 255, 255, 2, 6, 10, 14, 255, 255, 255, 255, 255, 255, 255, 255, 
// Next 4 bytes pos2 for U8_U32 + 255, 255, 255, 255, 255, 255, 255, 255, 2, 6, 10, 14, 255, 255, 255, 255, // Next 4 bytes pos2 for U8_U32 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 6, 10, 14, // Upper 4 bytes pos2 for U8_U32 + 3, 7, 11, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // Low 4 bytes pos3 for U8_U32 + 255, 255, 255, 255, 3, 7, 11, 15, 255, 255, 255, 255, 255, 255, 255, 255, // Next 4 bytes pos3 for U8_U32 + 255, 255, 255, 255, 255, 255, 255, 255, 3, 7, 11, 15, 255, 255, 255, 255, // Next 4 bytes pos3 for U8_U32 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 3, 7, 11, 15 // Upper 4 bytes pos3 for U8_U32 +}; + +DECL_ALIGN(16) unsigned char dataChannelCombine[16 * 15] ATTR_ALIGN(16) = { + 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5, // R into first 16 bytes for RGB + 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10, 255, // R into second 16 bytes for RGB + 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255, 255, // R into third 16 bytes for RGB + 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, // G into first 16 bytes for RGB + 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10, // G into second 16 bytes for RGB + 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255, // G into third 16 bytes for RGB + 255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, // B into first 16 bytes for RGB + 255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, // B into second 16 bytes for RGB + 10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, // B into third 16 bytes for RGB + 255, 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, // Y into UYVY + 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, // U into UYVY + 255, 255, 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, // V into UYVY + 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, // Y into YUYV + 255, 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, // U into YUYV + 255, 255, 255, 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, // V into YUYV +}; + +extern vx_uint32 dataConvertU1ToU8_4bytes[16]; + +/* This function assumes that the pixelSizeinBytes is equal to the srcStrideX*/ +int HafCpu_BufferCopyDisperseInDst + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint32 pixelSizeInBytes, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideYInBytes, + vx_uint32 dstImageStrideXInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideYInBytes + ) +{ + if (pixelSizeInBytes == 1) // 8 bits per pixel + { + vx_uint8 *pLocalSrc, *pLocalDst; + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = pSrcImage; + pLocalDst = pDstImage; + + for (int width = 0; width < (int)dstWidth; width++) + { + *pLocalDst = *pLocalSrc++; + pLocalDst += dstImageStrideXInBytes; + } + pSrcImage += srcImageStrideYInBytes; + pDstImage += dstImageStrideYInBytes; + } + } + else if (pixelSizeInBytes == 2) // 16 bits per pixel + { + vx_int16 *pLocalSrc, *pLocalDst; + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = (vx_int16 *) pSrcImage; + pLocalDst = (vx_int16 *) pDstImage; + + int xStride = dstImageStrideXInBytes >> 1; + for (int width = 0; width < (int)dstWidth; width++) + { + *pLocalDst = *pLocalSrc++; + pLocalDst += xStride; + } + pSrcImage += srcImageStrideYInBytes; + pDstImage += dstImageStrideYInBytes; + } + } + 
else if (pixelSizeInBytes == 4) // 32 bits per pixel + { + vx_int32 *pLocalSrc, *pLocalDst; + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = (vx_int32 *)pSrcImage; + pLocalDst = (vx_int32 *)pDstImage; + + int xStride = dstImageStrideXInBytes >> 2; + for (int width = 0; width < (int)dstWidth; width++) + { + *pLocalDst = *pLocalSrc++; + pLocalDst += xStride; + } + pSrcImage += srcImageStrideYInBytes; + pDstImage += dstImageStrideYInBytes; + } + } + else // General case + { + vx_uint8 *pLocalSrc, *pLocalDst; + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = pSrcImage; + pLocalDst = pDstImage; + + for (int width = 0; width < (int)dstWidth; width++) + { + for (int byte = 0; byte < (int)pixelSizeInBytes; byte++) + *pLocalDst++ = *pLocalSrc++; + pLocalDst += dstImageStrideXInBytes; + } + pSrcImage += srcImageStrideYInBytes; + pDstImage += dstImageStrideYInBytes; + } + } + + return AGO_SUCCESS; +} + +/* This function assumes that the pixelSizeinBytes is equal to the dstStrideX*/ +int HafCpu_BufferCopyDisperseInSrc + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint32 pixelSizeInBytes, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideYInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideYInBytes, + vx_uint32 srcImageStrideXInBytes + ) +{ + if (pixelSizeInBytes == 1) // 8 bits per pixel + { + vx_uint8 *pLocalSrc, *pLocalDst; + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = pSrcImage; + pLocalDst = pDstImage; + + for (int width = 0; width < (int)dstWidth; width++) + { + *pLocalDst++ = *pLocalSrc; + pLocalSrc += srcImageStrideXInBytes; + } + pSrcImage += srcImageStrideYInBytes; + pDstImage += dstImageStrideYInBytes; + } + } + else if (pixelSizeInBytes == 2) // 16 bits per pixel + { + vx_int16 *pLocalSrc, *pLocalDst; + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = (vx_int16 *)pSrcImage; + pLocalDst = (vx_int16 *)pDstImage; + + int xStride = srcImageStrideXInBytes >> 1; + for (int width = 0; width < (int)dstWidth; width++) + { + *pLocalDst++ = *pLocalSrc; + pLocalSrc += xStride; + } + pSrcImage += srcImageStrideYInBytes; + pDstImage += dstImageStrideYInBytes; + } + } + else if (pixelSizeInBytes == 4) // 32 bits per pixel + { + vx_int32 *pLocalSrc, *pLocalDst; + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = (vx_int32 *)pSrcImage; + pLocalDst = (vx_int32 *)pDstImage; + + int xStride = srcImageStrideXInBytes >> 2; + for (int width = 0; width < (int)dstWidth; width++) + { + *pLocalDst++ = *pLocalSrc; + pLocalSrc += xStride; + } + pSrcImage += srcImageStrideYInBytes; + pDstImage += dstImageStrideYInBytes; + } + } + else // General case + { + vx_uint8 *pLocalSrc, *pLocalDst; + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = pSrcImage; + pLocalDst = pDstImage; + + for (int width = 0; width < (int)dstWidth; width++) + { + for (int byte = 0; byte < (int)pixelSizeInBytes; byte++) + *pLocalDst++ = *pLocalSrc++; + pLocalSrc += srcImageStrideXInBytes; + } + pSrcImage += srcImageStrideYInBytes; + pDstImage += dstImageStrideYInBytes; + } + } + + return AGO_SUCCESS; +} + +int HafCpu_BinaryCopy_U8_U8 + ( + vx_size size, + vx_uint8 * pDstBuf, + vx_uint8 * pSrcBuf + ) +{ + if ((intptr_t(pSrcBuf) & 15) | (intptr_t(pDstBuf) & 15)) + memcpy(pDstBuf, pSrcBuf, size); + else + { + __m128i * src = (__m128i*) pSrcBuf; + __m128i * dst = (__m128i*) pDstBuf; + __m128i r0, r1, r2, r3; + + vx_size prefixBytes = intptr_t(pDstBuf) & 15; + vx_size sizeAligned = 
size & ~63; + + for (unsigned int i = 0; i < sizeAligned; i += 64) + { + r0 = _mm_loadu_si128(src++); + r1 = _mm_loadu_si128(src++); + r2 = _mm_loadu_si128(src++); + r3 = _mm_loadu_si128(src++); + _mm_store_si128(dst++, r0); + _mm_store_si128(dst++, r1); + _mm_store_si128(dst++, r2); + _mm_store_si128(dst++, r3); + } + for (vx_size i = sizeAligned; i < size; i++) { + pDstBuf[i] = pSrcBuf[i]; + } + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelCopy_U8_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + if ((srcImageStrideInBytes | dstImageStrideInBytes) & 15) + { + int height = (int)dstHeight; + while (height) + { + unsigned char * pLocalSrc = (unsigned char *)pSrcImage; + unsigned char * pLocalDst = (unsigned char *)pDstImage; + int width = (int)dstWidth; + while (width) + { + *pLocalDst++ = *pLocalSrc++; + width--; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + } + else + { + __m128i r0, r1; + unsigned char *pLocalSrc, *pLocalDst; + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 31; // 32 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (unsigned char *)pDstImage; + + for (int x = 0; x < prefixWidth; x++) + *pLocalDst++ = *pLocalSrc++; + + int width = alignedWidth >> 5; // 32 pixels copied at a time + pLocalSrc_xmm = (__m128i *) pLocalSrc; + pLocalDst_xmm = (__m128i *) pLocalDst; + while (width) + { + r0 = _mm_loadu_si128(pLocalSrc_xmm++); + _mm_store_si128(pLocalDst_xmm++, r0); + r1 = _mm_loadu_si128(pLocalSrc_xmm++); + _mm_store_si128(pLocalDst_xmm++, r1); + + width--; + } + + pLocalSrc = (unsigned char *)pLocalSrc_xmm; + pLocalDst = (unsigned char *)pLocalDst_xmm; + for (int x = 0; x < postfixWidth; x++) + *pLocalDst++ = *pLocalSrc++; + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + } + return AGO_SUCCESS; +} + +#if USE_BMI2 +/*The function assumes that the destination pointer is 16 byte aligned and the destination stride as well. +Also, the width is a multiple of 32, if not then number of pixels copies would be the next largest multiple of 32 after dstWidth. 
+The LSB of every byte is copied, therefore 0 -> 0 and non-zero -> 1*/ +int HafCpu_ChannelCopy_U8_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * dst = (__m128i*) pDstImage; + __m128i r0, r1; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 32) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage + (width >> 3) + 2)); + pixels_u64[3] = (uint64_t)(*(pSrcImage + (width >> 3) + 3)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + r0 = _mm_load_si128((__m128i*) pixels_u64); + r1 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + // Convert U1 to U8 - Thresholded + r0 = _mm_cmpgt_epi8(r0, zeromask); + r1 = _mm_cmpgt_epi8(r1, zeromask); + + _mm_store_si128(&dst[width >> 4], r0); + _mm_store_si128(&dst[(width >> 4) + 1], r1); + } + pSrcImage += srcImageStrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +/*The function assumes that the source pointer is 16 byte aligned and the source stride as well. +Also, the width is a multiple of 16; if not, the number of pixels copied would be the next largest multiple of 16 after dstWidth.
+The LSB of every byte is copied, therefore 0 -> 0 and non-zero -> 1*/ +int HafCpu_ChannelCopy_U1_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * src = (__m128i*) pSrcImage; + __m128i r0; + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + r0 = _mm_load_si128(&src[width >> 4]); + + // Convert U8 to U1 - Extract LSB +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(r0.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(r0.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src += (srcImageStrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else + +int HafCpu_ChannelCopy_U8_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + short * pLocalSrc; + int * pLocalDst; + + int height = (int) dstHeight; + short inputPixels; + int outputPixels[4]; + + int postfixWidth = dstWidth & 15; // The input has to be a multiple of 16 or 8 pixels (U1 = 8 pixels in a byte) + int alignedWidth = dstWidth >> 4; + while (height > 0) + { + pLocalSrc = (short *) pSrcImage; + pLocalDst = (int *) pDstImage; + int width = alignedWidth; // Each inner loop processes 4 output ints = 4*4 = 16 bytes + + while (width > 0) + { + inputPixels = *pLocalSrc++; + outputPixels[0] = dataConvertU1ToU8_4bytes[inputPixels & 15]; + inputPixels >>= 4; + outputPixels[1] = dataConvertU1ToU8_4bytes[inputPixels & 15]; + inputPixels >>= 4; + outputPixels[2] = dataConvertU1ToU8_4bytes[inputPixels & 15]; + inputPixels >>= 4; + outputPixels[3] = dataConvertU1ToU8_4bytes[inputPixels & 15]; + *pLocalDst++ = outputPixels[0]; + *pLocalDst++ = outputPixels[1]; + *pLocalDst++ = outputPixels[2]; + *pLocalDst++ = outputPixels[3]; + + width--; + } + + width = postfixWidth; + while (width > 0) + { + inputPixels = *((vx_uint8 *)pLocalSrc - 1); + outputPixels[0] = dataConvertU1ToU8_4bytes[inputPixels & 15]; + inputPixels >>= 4; + outputPixels[1] = dataConvertU1ToU8_4bytes[inputPixels & 15]; + *pLocalDst++ = outputPixels[0]; + *pLocalDst++ = outputPixels[1]; + width = 0; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + + height--; + } + return AGO_SUCCESS; +} + +/*The function assumes that the source pointer is 16 byte aligned and the source stride as well. +Also, the width is a multiple of 16; if not, the number of pixels copied would be the next largest multiple of 16 after dstWidth. +The function also assumes that the input is either 0x00 or 0xFF. 
Only the MSB of the pixelvalues of input is used*/ +int HafCpu_ChannelCopy_U1_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * pLocalSrc; + short * pLocalDst; + + __m128i pixels; + int pixelmask; + int height = (int) dstHeight; + + while (height > 0) + { + pLocalSrc = (__m128i*) pSrcImage; + pLocalDst = (short *) pDstImage; + int width = (int) (dstWidth >> 4); // 16 pixels (bits) are processed at a time in the inner loop + while (width > 0) + { + pixels = _mm_load_si128(pLocalSrc++); + pixelmask = _mm_movemask_epi8(pixels); + *pLocalDst++ = (short) (pixelmask & 0xFFFF); + width--; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + + return AGO_SUCCESS; +} +#endif + +/* This function assumes that the width is a multiple of 8, if not, then the number of pixels copied is the next highest multiple of 8 after dstWidth*/ +int HafCpu_ChannelCopy_U1_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + for (unsigned int y = 0; y < dstHeight; y++) + { + for (unsigned int x = 0; x < (dstWidth >> 3); x++) + { + pDstImage[x] = pSrcImage[x]; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/*The function assumes that the data pointers are 16 byte aligned, and size is a multiple of 16, otherwise it is taken to be the multiple of 16 largest after size*/ +int HafCpu_MemSet_U8 + ( + vx_size count, + vx_uint8 * pDstBuf, + vx_uint8 value + ) +{ + __m128i val = _mm_set1_epi8((char)value); + __m128i * buf = (__m128i *) pDstBuf; + __m128i * buf_end = buf + (count >> 4); + for (; buf != buf_end; buf++) + _mm_store_si128(buf, val); + return AGO_SUCCESS; +} + +/*The function assumes that the data pointers are 16 byte aligned, and size is a multiple of 16, otherwise it is taken to be the multiple of 16 largest after size*/ +int HafCpu_MemSet_U16 + ( + vx_size count, + vx_uint16 * pDstBuf, + vx_uint16 value + ) +{ + __m128i val = _mm_set1_epi16((short)value); + __m128i * buf = (__m128i *) pDstBuf; + __m128i * buf_end = buf + (count >> 3); + for (; buf != buf_end; buf++) + _mm_store_si128(buf, val); + return AGO_SUCCESS; +} + +/*The function assumes that the data pointers are 16 byte aligned, and size is a multiple of 48, otherwise it is taken to be the multiple of 48 largest after size*/ +int HafCpu_MemSet_U24 + ( + vx_size count, + vx_uint8 * pDstBuf, + vx_uint32 value + ) +{ + char val_R = (char)(value & 0xFF); + char val_G = (char)((value >> 8) & 0xFF); + char val_B = (char)((value >> 16) & 0xFF); + __m128i val1 = _mm_set_epi8(val_R, val_B, val_G, val_R, val_B, val_G, val_R, val_B, val_G, val_R, val_B, val_G, val_R, val_B, val_G, val_R); + __m128i val2 = _mm_set_epi8(val_G, val_R, val_B, val_G, val_R, val_B, val_G, val_R, val_B, val_G, val_R, val_B, val_G, val_R, val_B, val_G); + __m128i val3 = _mm_set_epi8(val_B, val_G, val_R, val_B, val_G, val_R, val_B, val_G, val_R, val_B, val_G, val_R, val_B, val_G, val_R, val_B); + __m128i * buf = (__m128i *) pDstBuf; + __m128i * buf_end = buf + ((count*3) >> 4); + for (; buf < buf_end;) { + _mm_store_si128(buf++, val1); + _mm_store_si128(buf++, val2); + _mm_store_si128(buf++, val3); + } + return AGO_SUCCESS; +} + +/*The function assumes that the data pointers are 16 byte aligned, and size is a multiple of 
16, otherwise it is taken to be the multiple of 16 largest after size*/ +int HafCpu_MemSet_U32 + ( + vx_size count, + vx_uint32 * pDstBuf, + vx_uint32 value + ) +{ + __m128i val = _mm_set1_epi32((int)value); + __m128i * buf = (__m128i *) pDstBuf; + __m128i * buf_end = buf + (count >> 2); + for (; buf != buf_end; buf++) + _mm_store_si128(buf, val); + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8_U16_Pos0 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelExtract; + __m128i r0, r1; + __m128i mask1 = _mm_load_si128(tbl); + __m128i mask2 = _mm_load_si128(tbl + 1); + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + r0 = _mm_loadu_si128((__m128i *)pLocalSrc); + r1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + r0 = _mm_shuffle_epi8(r0, mask1); + r1 = _mm_shuffle_epi8(r1, mask2); + r0 = _mm_or_si128(r0, r1); + _mm_storeu_si128((__m128i *) pLocalDst, r0); + + pLocalSrc += 32; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8_U16_Pos1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelExtract; + __m128i r0, r1; + __m128i mask1 = _mm_load_si128(tbl + 2); + __m128i mask2 = _mm_load_si128(tbl + 3); + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + r0 = _mm_loadu_si128((__m128i *)pLocalSrc); + r1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + r0 = _mm_shuffle_epi8(r0, mask1); + r1 = _mm_shuffle_epi8(r1, mask2); + r0 = _mm_or_si128(r0, r1); + _mm_storeu_si128((__m128i *) pLocalDst, r0); + + pLocalSrc += 32; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + pLocalSrc++; + *pLocalDst++ = *pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8_U24_Pos0 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelExtract; + __m128i r0, r1, r2; + __m128i mask1 = _mm_load_si128(tbl + 4); + __m128i mask2 = _mm_load_si128(tbl + 5); + __m128i mask3 = _mm_load_si128(tbl + 6); + + int height = (int)dstHeight; + while (height) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + int width = alignedWidth >> 4; + + while (width) + { + r0 = _mm_loadu_si128((__m128i *)pLocalSrc); + r1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + r2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + + r0 = 
_mm_shuffle_epi8(r0, mask1); + r1 = _mm_shuffle_epi8(r1, mask2); + r2 = _mm_shuffle_epi8(r2, mask3); + r0 = _mm_or_si128(r0, r1); + r0 = _mm_or_si128(r0, r2); + + _mm_storeu_si128((__m128i *) pLocalDst, r0); + width--; + pLocalSrc += 48; + pLocalDst += 16; + } + + width = postfixWidth; + while (width) + { + *pLocalDst++ = *pLocalSrc; + pLocalSrc += 3; + width--; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8_U24_Pos1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelExtract; + __m128i r0, r1, r2; + __m128i mask1 = _mm_load_si128(tbl + 7); + __m128i mask2 = _mm_load_si128(tbl + 8); + __m128i mask3 = _mm_load_si128(tbl + 9); + + int height = (int)dstHeight; + while (height) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + int width = alignedWidth >> 4; + + while (width) + { + r0 = _mm_loadu_si128((__m128i *)pLocalSrc); + r1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + r2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + + r0 = _mm_shuffle_epi8(r0, mask1); + r1 = _mm_shuffle_epi8(r1, mask2); + r2 = _mm_shuffle_epi8(r2, mask3); + r0 = _mm_or_si128(r0, r1); + r0 = _mm_or_si128(r0, r2); + + _mm_storeu_si128((__m128i *) pLocalDst, r0); + width--; + pLocalSrc += 48; + pLocalDst += 16; + } + + width = postfixWidth; + while (width) + { + *pLocalDst++ = *++pLocalSrc; + pLocalSrc += 2; + width--; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8_U24_Pos2 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelExtract; + __m128i r0, r1, r2; + __m128i mask1 = _mm_load_si128(tbl + 10); + __m128i mask2 = _mm_load_si128(tbl + 11); + __m128i mask3 = _mm_load_si128(tbl + 12); + + int height = (int)dstHeight; + while (height) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + int width = alignedWidth >> 4; + + while (width) + { + r0 = _mm_loadu_si128((__m128i *)pLocalSrc); + r1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + r2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + + r0 = _mm_shuffle_epi8(r0, mask1); + r1 = _mm_shuffle_epi8(r1, mask2); + r2 = _mm_shuffle_epi8(r2, mask3); + r0 = _mm_or_si128(r0, r1); + r0 = _mm_or_si128(r0, r2); + + _mm_storeu_si128((__m128i *) pLocalDst, r0); + width--; + pLocalSrc += 48; + pLocalDst += 16; + } + + width = postfixWidth; + while (width) + { + pLocalSrc += 2; + *pLocalDst++ = *pLocalSrc++; + width--; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8_U32_Pos0 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelExtract; + __m128i r0, r1, r2, r3; + __m128i 
mask1 = _mm_load_si128(tbl + 13); + __m128i mask2 = _mm_load_si128(tbl + 14); + __m128i mask3 = _mm_load_si128(tbl + 15); + __m128i mask4 = _mm_load_si128(tbl + 16); + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + r0 = _mm_loadu_si128((__m128i *)pLocalSrc); + r1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + r2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + r3 = _mm_loadu_si128((__m128i *)(pLocalSrc + 48)); + r0 = _mm_shuffle_epi8(r0, mask1); + r1 = _mm_shuffle_epi8(r1, mask2); + r2 = _mm_shuffle_epi8(r2, mask3); + r3 = _mm_shuffle_epi8(r3, mask4); + r0 = _mm_or_si128(r0, r1); + r0 = _mm_or_si128(r0, r2); + r0 = _mm_or_si128(r0, r3); + _mm_storeu_si128((__m128i *)pLocalDst, r0); + + pLocalSrc += 64; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc; + pLocalSrc += 4; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8_U32_Pos1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelExtract; + __m128i r0, r1, r2, r3; + __m128i mask1 = _mm_load_si128(tbl + 17); + __m128i mask2 = _mm_load_si128(tbl + 18); + __m128i mask3 = _mm_load_si128(tbl + 19); + __m128i mask4 = _mm_load_si128(tbl + 20); + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + r0 = _mm_loadu_si128((__m128i *)pLocalSrc); + r1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + r2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + r3 = _mm_loadu_si128((__m128i *)(pLocalSrc + 48)); + r0 = _mm_shuffle_epi8(r0, mask1); + r1 = _mm_shuffle_epi8(r1, mask2); + r2 = _mm_shuffle_epi8(r2, mask3); + r3 = _mm_shuffle_epi8(r3, mask4); + r0 = _mm_or_si128(r0, r1); + r0 = _mm_or_si128(r0, r2); + r0 = _mm_or_si128(r0, r3); + _mm_storeu_si128((__m128i *)pLocalDst, r0); + + pLocalSrc += 64; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *++pLocalSrc; + pLocalSrc += 3; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8_U32_Pos2 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelExtract; + __m128i r0, r1, r2, r3; + __m128i mask1 = _mm_load_si128(tbl + 21); + __m128i mask2 = _mm_load_si128(tbl + 22); + __m128i mask3 = _mm_load_si128(tbl + 23); + __m128i mask4 = _mm_load_si128(tbl + 24); + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + r0 = _mm_loadu_si128((__m128i *)pLocalSrc); + r1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + r2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + r3 = _mm_loadu_si128((__m128i 
*)(pLocalSrc + 48)); + r0 = _mm_shuffle_epi8(r0, mask1); + r1 = _mm_shuffle_epi8(r1, mask2); + r2 = _mm_shuffle_epi8(r2, mask3); + r3 = _mm_shuffle_epi8(r3, mask4); + r0 = _mm_or_si128(r0, r1); + r0 = _mm_or_si128(r0, r2); + r0 = _mm_or_si128(r0, r3); + _mm_storeu_si128((__m128i *)pLocalDst, r0); + + pLocalSrc += 64; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + pLocalSrc += 2; + *pLocalDst++ = *pLocalSrc; + pLocalSrc += 2; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8_U32_Pos3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelExtract; + __m128i r0, r1, r2, r3; + __m128i mask1 = _mm_load_si128(tbl + 25); + __m128i mask2 = _mm_load_si128(tbl + 26); + __m128i mask3 = _mm_load_si128(tbl + 27); + __m128i mask4 = _mm_load_si128(tbl + 28); + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + r0 = _mm_loadu_si128((__m128i *)pLocalSrc); + r1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + r2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + r3 = _mm_loadu_si128((__m128i *)(pLocalSrc + 48)); + r0 = _mm_shuffle_epi8(r0, mask1); + r1 = _mm_shuffle_epi8(r1, mask2); + r2 = _mm_shuffle_epi8(r2, mask3); + r3 = _mm_shuffle_epi8(r3, mask4); + r0 = _mm_or_si128(r0, r1); + r0 = _mm_or_si128(r0, r2); + r0 = _mm_or_si128(r0, r3); + _mm_storeu_si128((__m128i *)pLocalDst, r0); + + pLocalSrc += 64; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + pLocalSrc += 3; + *pLocalDst++ = *pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelCombine_U16_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes + ) +{ + unsigned char *pLocalSrc0, *pLocalSrc1, *pLocalDst; + __m128i r0, r1, resultL, resultH; + __m128i *pLocalSrc0_xmm, *pLocalSrc1_xmm, *pLocalDst_xmm; + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 31; // 32 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc0 = (unsigned char *) pSrcImage0; + pLocalSrc1 = (unsigned char *) pSrcImage1; + pLocalDst = (unsigned char *) pDstImage; + + for (int x = 0; x < prefixWidth; x++) + { + *pLocalDst++ = *pLocalSrc0++; + *pLocalDst++ = *pLocalSrc1++; + } + + + int width = (int)(dstWidth >> 4); // 16 byte pairs copied into dst at once + pLocalSrc0_xmm = (__m128i *) pLocalSrc0; + pLocalSrc1_xmm = (__m128i *) pLocalSrc1; + pLocalDst_xmm = (__m128i *) pLocalDst; + while (width) + { + r0 = _mm_load_si128(pLocalSrc0_xmm++); + r1 = _mm_load_si128(pLocalSrc1_xmm++); + resultL = _mm_unpacklo_epi8(r0, r1); + resultH = _mm_unpackhi_epi8(r0, r1); + _mm_store_si128(pLocalDst_xmm++, resultL); + _mm_store_si128(pLocalDst_xmm++, resultH); + width--; + } + + pLocalSrc0 = (unsigned char *) pLocalSrc0_xmm; + pLocalSrc1 = (unsigned char *) pLocalSrc1_xmm; + pLocalDst = (unsigned char *) pLocalDst_xmm; + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDst++ = *pLocalSrc0++; + *pLocalDst++ = *pLocalSrc1++; + } + + pSrcImage0 += srcImage0StrideInBytes; + pSrcImage1 += srcImage1StrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + + return AGO_SUCCESS; +} + +int HafCpu_ChannelCombine_U24_U8U8U8_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelCombine; + __m128i r, g, b, result1, result2, result3; + __m128i maskR1 = _mm_load_si128(tbl); + __m128i maskR2 = _mm_load_si128(tbl + 1); + __m128i maskR3 = _mm_load_si128(tbl + 2); + __m128i maskG1 = _mm_load_si128(tbl + 3); + __m128i maskG2 = _mm_load_si128(tbl + 4); + __m128i maskG3 = _mm_load_si128(tbl + 5); + __m128i maskB1 = _mm_load_si128(tbl + 6); + __m128i maskB2 = _mm_load_si128(tbl + 7); + __m128i maskB3 = _mm_load_si128(tbl + 8); + + int height = (int) dstHeight; + while (height) + { + vx_uint8 * pLocalSrc0 = pSrcImage0; + vx_uint8 * pLocalSrc1 = pSrcImage1; + vx_uint8 * pLocalSrc2 = pSrcImage2; + vx_uint8 * pLocalDst = pDstImage; + + int width = (int) (dstWidth >> 4); + while (width) + { + r = _mm_loadu_si128((__m128i *) pLocalSrc0); + g = _mm_loadu_si128((__m128i *) pLocalSrc1); + b = _mm_loadu_si128((__m128i *) pLocalSrc2); + + + result1 = _mm_shuffle_epi8(r, maskR1); // Extract and place R in first 16 bytes + result2 = _mm_shuffle_epi8(g, maskG1); // Extract and place G in first 16 bytes + result3 = _mm_shuffle_epi8(b, maskB1); // Extract and place B in first 16 bytes + result1 = _mm_or_si128(result1, result2); + result1 = _mm_or_si128(result1, result3); + + result2 = _mm_shuffle_epi8(r, maskR2); // Extract and place R in second 16 bytes + result3 = _mm_shuffle_epi8(g, maskG2); // Extract and place G in second 16 bytes + result2 = _mm_or_si128(result2, result3); + result3 = _mm_shuffle_epi8(b, maskB2); // Extract and place B in second 16 bytes + result2 = _mm_or_si128(result2, result3); + + result3 = _mm_shuffle_epi8(r, maskR3); // Extract and place R in third 16 bytes + r = _mm_shuffle_epi8(g, maskG3); // Extract and place G in 
third 16 bytes + g = _mm_shuffle_epi8(b, maskB3); // Extract and place B in third 16 bytes + result3 = _mm_or_si128(result3, r); + result3 = _mm_or_si128(result3, g); + + _mm_storeu_si128((__m128i *) pLocalDst, result1); + _mm_storeu_si128((__m128i *) (pLocalDst + 16), result2); + _mm_storeu_si128((__m128i *) (pLocalDst + 32), result3); + + width--; + pLocalSrc0 += 16; + pLocalSrc1 += 16; + pLocalSrc2 += 16; + pLocalDst += 48; + } + + for (width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc0++; + *pLocalDst++ = *pLocalSrc1++; + *pLocalDst++ = *pLocalSrc2++; + } + + pSrcImage0 += srcImage0StrideInBytes; + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelCombine_U32_U8U8U8_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + int alignedWidth = dstWidth & ~31; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelCombine; + __m128i Y0, Y1, U, V; + __m128i maskY = _mm_load_si128(tbl + 9); + __m128i maskU = _mm_load_si128(tbl + 10); + __m128i maskV = _mm_load_si128(tbl + 11); + __m128i result1, result2; + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc0 = pSrcImage0; + vx_uint8 * pLocalSrc1 = pSrcImage1; + vx_uint8 * pLocalSrc2 = pSrcImage2; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 5); width++) + { + Y0 = _mm_loadu_si128((__m128i *) pLocalSrc0); + Y1 = _mm_loadu_si128((__m128i *) (pLocalSrc0 + 16)); + U = _mm_loadu_si128((__m128i *) pLocalSrc1); + V = _mm_loadu_si128((__m128i *) pLocalSrc2); + + result1 = _mm_shuffle_epi8(Y0, maskY); // Y + result2 = _mm_shuffle_epi8(U, maskU); // U + result1 = _mm_or_si128(result1, result2); // U Y _ Y + result2 = _mm_shuffle_epi8(V, maskV); // V + result1 = _mm_or_si128(result1, result2); // U Y V Y - first 16 bytes + + Y0 = _mm_srli_si128(Y0, 8); + U = _mm_srli_si128(U, 4); + V = _mm_srli_si128(V, 4); + result2 = _mm_shuffle_epi8(Y0, maskY); // Y + Y0 = _mm_shuffle_epi8(U, maskU); // U + result2 = _mm_or_si128(result2, Y0); // U Y _ Y + Y0 = _mm_shuffle_epi8(V, maskV); // V + result2 = _mm_or_si128(result2, Y0); // U Y V Y - next 16 bytes + + _mm_storeu_si128((__m128i *)pLocalDst, result1); + _mm_storeu_si128((__m128i *)(pLocalDst + 16), result2); + + U = _mm_srli_si128(U, 4); + V = _mm_srli_si128(V, 4); + result1 = _mm_shuffle_epi8(Y1, maskY); // Y + result2 = _mm_shuffle_epi8(U, maskU); // U + result1 = _mm_or_si128(result1, result2); // U Y _ Y + result2 = _mm_shuffle_epi8(V, maskV); // V + result1 = _mm_or_si128(result1, result2); // U Y V Y - next 16 bytes + + Y1 = _mm_srli_si128(Y1, 8); + U = _mm_srli_si128(U, 4); + V = _mm_srli_si128(V, 4); + result2 = _mm_shuffle_epi8(Y1, maskY); // Y + Y1 = _mm_shuffle_epi8(U, maskU); // U + result2 = _mm_or_si128(result2, Y1); // U Y _ Y + Y1 = _mm_shuffle_epi8(V, maskV); // V + result2 = _mm_or_si128(result2, Y1); // U Y V Y - last 16 bytes + + _mm_storeu_si128((__m128i *)(pLocalDst + 32), result1); + _mm_storeu_si128((__m128i *)(pLocalDst + 48), result2); + + pLocalSrc0 += 32; + pLocalSrc1 += 16; + pLocalSrc2 += 16; + pLocalDst += 64; + } + + for (int width = 0; width < postfixWidth; width++) + { + 
*pLocalDst++ = *pLocalSrc1++; // U + *pLocalDst++ = *pLocalSrc0++; // Y + *pLocalDst++ = *pLocalSrc2++; // V + *pLocalDst++ = *pLocalSrc0++; // Y + } + + pSrcImage0 += srcImage0StrideInBytes; + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelCombine_U32_U8U8U8_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + int alignedWidth = dstWidth & ~31; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataChannelCombine; + __m128i Y0, Y1, U, V; + __m128i maskY = _mm_load_si128(tbl + 12); + __m128i maskU = _mm_load_si128(tbl + 13); + __m128i maskV = _mm_load_si128(tbl + 14); + __m128i result1, result2; + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc0 = pSrcImage0; + vx_uint8 * pLocalSrc1 = pSrcImage1; + vx_uint8 * pLocalSrc2 = pSrcImage2; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 5); width++) + { + Y0 = _mm_loadu_si128((__m128i *) pLocalSrc0); + Y1 = _mm_loadu_si128((__m128i *) (pLocalSrc0 + 16)); + U = _mm_loadu_si128((__m128i *) pLocalSrc1); + V = _mm_loadu_si128((__m128i *) pLocalSrc2); + + result1 = _mm_shuffle_epi8(Y0, maskY); // Y + result2 = _mm_shuffle_epi8(U, maskU); // U + result1 = _mm_or_si128(result1, result2); // Y U Y _ + result2 = _mm_shuffle_epi8(V, maskV); // V + result1 = _mm_or_si128(result1, result2); // Y U Y V - first 16 bytes + + Y0 = _mm_srli_si128(Y0, 8); + U = _mm_srli_si128(U, 4); + V = _mm_srli_si128(V, 4); + result2 = _mm_shuffle_epi8(Y0, maskY); // Y + Y0 = _mm_shuffle_epi8(U, maskU); // U + result2 = _mm_or_si128(result2, Y0); // Y U Y _ + Y0 = _mm_shuffle_epi8(V, maskV); // V + result2 = _mm_or_si128(result2, Y0); // Y U Y V - next 16 bytes + + _mm_storeu_si128((__m128i *)pLocalDst, result1); + _mm_storeu_si128((__m128i *)(pLocalDst + 16), result2); + + U = _mm_srli_si128(U, 4); + V = _mm_srli_si128(V, 4); + result1 = _mm_shuffle_epi8(Y1, maskY); // Y + result2 = _mm_shuffle_epi8(U, maskU); // U + result1 = _mm_or_si128(result1, result2); // Y U Y _ + result2 = _mm_shuffle_epi8(V, maskV); // V + result1 = _mm_or_si128(result1, result2); // Y U Y V - next 16 bytes + + Y1 = _mm_srli_si128(Y1, 8); + U = _mm_srli_si128(U, 4); + V = _mm_srli_si128(V, 4); + result2 = _mm_shuffle_epi8(Y1, maskY); // Y + Y1 = _mm_shuffle_epi8(U, maskU); // U + result2 = _mm_or_si128(result2, Y1); // Y U Y _ + Y1 = _mm_shuffle_epi8(V, maskV); // V + result2 = _mm_or_si128(result2, Y1); // Y U Y V - last 16 bytes + + _mm_storeu_si128((__m128i *)(pLocalDst + 32), result1); + _mm_storeu_si128((__m128i *)(pLocalDst + 48), result2); + + pLocalSrc0 += 32; + pLocalSrc1 += 16; + pLocalSrc2 += 16; + pLocalDst += 64; + } + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc0++; // Y + *pLocalDst++ = *pLocalSrc1++; // U + *pLocalDst++ = *pLocalSrc0++; // Y + *pLocalDst++ = *pLocalSrc2++; // V + } + + pSrcImage0 += srcImage0StrideInBytes; + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelCombine_U32_U8U8U8U8_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * 
pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage0, + vx_uint32 srcImage0StrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes, + vx_uint8 * pSrcImage3, + vx_uint32 srcImage3StrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i r, g, b, x, pixels0, pixels1, pixels2; + + int height = (int) dstHeight; + + while (height) + { + vx_uint8 * pLocalSrc0 = pSrcImage0; + vx_uint8 * pLocalSrc1 = pSrcImage1; + vx_uint8 * pLocalSrc2 = pSrcImage2; + vx_uint8 * pLocalSrc3 = pSrcImage3; + vx_uint8 * pLocalDst = pDstImage; + + int width = (int)(dstWidth >> 4); // Inner loop processess 16 pixels at a time + while (width) + { + r = _mm_loadu_si128((__m128i *) pLocalSrc0); + g = _mm_loadu_si128((__m128i *) pLocalSrc1); + b = _mm_loadu_si128((__m128i *) pLocalSrc2); + x = _mm_loadu_si128((__m128i *) pLocalSrc3); + + pixels0 = _mm_unpacklo_epi8(r, g); // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 + pixels1 = _mm_unpacklo_epi8(b, x); // b0 x0 b1 x1 b2 x2 b3 x3 b4 x4 b5 x5 b6 x6 b7 x7 + pixels2 = _mm_unpacklo_epi16(pixels0, pixels1); // r0 g0 b0 x0 r1 g1 b1 x1 r2 g2 b2 x2 r3 g3 b3 x3 + _mm_storeu_si128((__m128i *)pLocalDst, pixels2); + pLocalDst += 16; + + pixels2 = _mm_unpackhi_epi16(pixels0, pixels1); // r4 g4 b4 x4 r5 g5 b5 x5 r6 g6 b6 x6 r7 g7 b7 x7 + _mm_storeu_si128((__m128i *)pLocalDst, pixels2); + pLocalDst += 16; + + pixels0 = _mm_unpackhi_epi8(r, g); // r8 g8 r9 g9 r10 g10 r11 g11 r12 g12 r13 g13 r14 g14 r15 g15 + pixels1 = _mm_unpackhi_epi8(b, x); // b8 x8 b9 x9 b10 x10 b11 x11 b12 x12 b13 x13 b14 x14 b15 x15 + pixels2 = _mm_unpacklo_epi16(pixels0, pixels1); // r8 g8 b8 x8 r9 g9 b9 x9 r10 g10 b10 x10 r11 g11 b11 x11 + _mm_storeu_si128((__m128i *)pLocalDst, pixels2); + pLocalDst += 16; + + pixels2 = _mm_unpackhi_epi16(pixels0, pixels1); // r12 g12 b12 x12 r13 g13 b13 x13 r14 g14 b14 x14 r15 g15 b15 x15 + _mm_storeu_si128((__m128i *)pLocalDst, pixels2); + pLocalDst += 16; + + width--; + pLocalSrc0 += 16; + pLocalSrc1 += 16; + pLocalSrc2 += 16; + pLocalSrc3 += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc0++; + *pLocalDst++ = *pLocalSrc1++; + *pLocalDst++ = *pLocalSrc2++; + *pLocalDst++ = *pLocalSrc3++; + } + + pSrcImage0 += srcImage0StrideInBytes; + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pSrcImage3 += srcImage3StrideInBytes; + pDstImage += dstImageStrideInBytes; + + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8U8U8_U24 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage0, + vx_uint8 * pDstImage1, + vx_uint8 * pDstImage2, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Check for output buffer alignment + intptr_t prealignBytes = (intptr_t(pDstImage0) & intptr_t(pDstImage1) & intptr_t(pDstImage2)) & 15; + bool isAligned = (prealignBytes == ((intptr_t(pDstImage0) | intptr_t(pDstImage1) | intptr_t(pDstImage2)) & 15)); // True if all three buffers have the same alignment + + + unsigned char *pLocalSrc, *pLocalDst0, *pLocalDst1, *pLocalDst2; + __m128i * tbl = (__m128i *) dataChannelExtract; + __m128i pixels0, pixels1, pixels2, pixels_R, pixels_G; + + __m128i mask_r0 = _mm_load_si128(tbl + 4); + __m128i mask_r1 = _mm_load_si128(tbl + 5); + __m128i mask_r2 = _mm_load_si128(tbl + 6); + __m128i mask_g0 = _mm_load_si128(tbl + 7); + __m128i mask_g1 = 
_mm_load_si128(tbl + 8); + __m128i mask_g2 = _mm_load_si128(tbl + 9); + __m128i mask_b0 = _mm_load_si128(tbl + 10); + __m128i mask_b1 = _mm_load_si128(tbl + 11); + __m128i mask_b2 = _mm_load_si128(tbl + 12); + + if (isAligned) + { + int prefixWidth = (int)((prealignBytes == 0) ? 0 : (16 - prealignBytes)); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *) pSrcImage; + pLocalDst0 = (unsigned char *) pDstImage0; + pLocalDst1 = (unsigned char *) pDstImage1; + pLocalDst2 = (unsigned char *) pDstImage2; + + for (int x = 0; x < prefixWidth; x++) + { + *pLocalDst0++ = *pSrcImage++; + *pLocalDst1++ = *pSrcImage++; + *pLocalDst2++ = *pSrcImage++; + } + + int width = (int)(alignedWidth >> 4); // 16 bytes at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *) (pLocalSrc + 32)); + + pixels_R = _mm_shuffle_epi8(pixels0, mask_r0); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels1, mask_r1)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels2, mask_r2)); + _mm_store_si128((__m128i *)pLocalDst0, pixels_R); + + pixels_G = _mm_shuffle_epi8(pixels0, mask_g0); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels1, mask_g1)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels2, mask_g2)); + _mm_store_si128((__m128i *)pLocalDst1, pixels_G); + + pixels0 = _mm_shuffle_epi8(pixels0, mask_b0); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels1, mask_b1)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels2, mask_b2)); + _mm_store_si128((__m128i *)pLocalDst2, pixels0); + + pLocalSrc += 48; + pLocalDst0 += 16; + pLocalDst1 += 16; + pLocalDst2 += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDst0++ = *pSrcImage++; + *pLocalDst1++ = *pSrcImage++; + *pLocalDst2++ = *pSrcImage++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage0 += dstImageStrideInBytes; + pDstImage1 += dstImageStrideInBytes; + pDstImage2 += dstImageStrideInBytes; + height--; + } + } + else + { + int postfixWidth = dstWidth & 15; + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst0 = (unsigned char *)pDstImage0; + pLocalDst1 = (unsigned char *)pDstImage1; + pLocalDst2 = (unsigned char *)pDstImage2; + + int width = (int)(dstWidth >> 4); // 16 bytes at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *) (pLocalSrc + 32)); + + pixels_R = _mm_shuffle_epi8(pixels0, mask_r0); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels1, mask_r1)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels2, mask_r2)); + _mm_storeu_si128((__m128i *)pLocalDst0, pixels_R); + + pixels_G = _mm_shuffle_epi8(pixels0, mask_g0); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels1, mask_g1)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels2, mask_g2)); + _mm_storeu_si128((__m128i *)pLocalDst1, pixels_G); + + pixels0 = _mm_shuffle_epi8(pixels0, mask_b0); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels1, mask_b1)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels2, mask_b2)); + _mm_storeu_si128((__m128i *)pLocalDst2, pixels0); + + pLocalSrc += 48; + pLocalDst0 += 16; + pLocalDst1 += 16; + 
pLocalDst2 += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDst0++ = *pSrcImage++; + *pLocalDst1++ = *pSrcImage++; + *pLocalDst2++ = *pSrcImage++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage0 += dstImageStrideInBytes; + pDstImage1 += dstImageStrideInBytes; + pDstImage2 += dstImageStrideInBytes; + height--; + } + } + + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8U8U8_U32 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage0, + vx_uint8 * pDstImage1, + vx_uint8 * pDstImage2, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Check for output buffer alignment + intptr_t prealignBytes = (intptr_t(pDstImage0) & intptr_t(pDstImage1) & intptr_t(pDstImage2)) & 15; + bool isAligned = (prealignBytes == ((intptr_t(pDstImage0) | intptr_t(pDstImage1) | intptr_t(pDstImage2)) & 15)); // True if all three buffers have the same alignment + unsigned char *pLocalSrc, *pLocalDst0, *pLocalDst1, *pLocalDst2; + __m128i * tbl = (__m128i *) dataChannelExtract; + __m128i pixels0, pixels1, pixels2, pixels3, pixels_R, pixels_G; + + __m128i mask0 = _mm_load_si128(tbl + 13); + __m128i mask1 = _mm_load_si128(tbl + 14); + __m128i mask2 = _mm_load_si128(tbl + 15); + __m128i mask3 = _mm_load_si128(tbl + 16); + + if (isAligned) + { + int prefixWidth = (int)((prealignBytes == 0) ? 0 : (16 - prealignBytes)); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst0 = (unsigned char *)pDstImage0; + pLocalDst1 = (unsigned char *)pDstImage1; + pLocalDst2 = (unsigned char *)pDstImage2; + + for (int x = 0; x < prefixWidth; x++) + { + *pLocalDst0++ = *pLocalSrc++; + *pLocalDst1++ = *pLocalSrc++; + *pLocalDst2++ = *pLocalSrc++; + pLocalSrc++;; + } + + int width = (int)(alignedWidth >> 4); + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *) (pLocalSrc + 32)); + pixels3 = _mm_loadu_si128((__m128i *) (pLocalSrc + 48)); + + pixels_R = _mm_shuffle_epi8(pixels0, mask0); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels1, mask1)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels2, mask2)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels3, mask3)); + _mm_store_si128((__m128i *)pLocalDst0, pixels_R); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels_G = _mm_shuffle_epi8(pixels0, mask0); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels1, mask1)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels2, mask2)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels3, mask3)); + _mm_store_si128((__m128i *)pLocalDst1, pixels_G); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels0 = _mm_shuffle_epi8(pixels0, mask0); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels1, mask1)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels2, mask2)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels3, mask3)); + _mm_store_si128((__m128i *)pLocalDst2, pixels0); + + pLocalSrc += 64; + pLocalDst0 += 16; + pLocalDst1 += 16; + pLocalDst2 += 
16; + + width--; + } + + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDst0++ = *pLocalSrc++; + *pLocalDst1++ = *pLocalSrc++; + *pLocalDst2++ = *pLocalSrc++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage0 += dstImageStrideInBytes; + pDstImage1 += dstImageStrideInBytes; + pDstImage2 += dstImageStrideInBytes; + height--; + } + } + else + { + int postfixWidth = dstWidth & 15; + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst0 = (unsigned char *)pDstImage0; + pLocalDst1 = (unsigned char *)pDstImage1; + pLocalDst2 = (unsigned char *)pDstImage2; + + int width = (int)(dstWidth >> 4); // 16 bytes at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *) (pLocalSrc + 32)); + pixels3 = _mm_loadu_si128((__m128i *) (pLocalSrc + 48)); + + pixels_R = _mm_shuffle_epi8(pixels0, mask0); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels1, mask1)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels2, mask2)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels3, mask3)); + _mm_storeu_si128((__m128i *)pLocalDst0, pixels_R); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels_G = _mm_shuffle_epi8(pixels0, mask0); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels1, mask1)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels2, mask2)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels3, mask3)); + _mm_storeu_si128((__m128i *)pLocalDst1, pixels_G); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels0 = _mm_shuffle_epi8(pixels0, mask0); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels1, mask1)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels2, mask2)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels3, mask3)); + _mm_storeu_si128((__m128i *)pLocalDst2, pixels0); + + pLocalSrc += 64; + pLocalDst0 += 16; + pLocalDst1 += 16; + pLocalDst2 += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDst0++ = *pLocalSrc++; + *pLocalDst1++ = *pLocalSrc++; + *pLocalDst2++ = *pLocalSrc++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage0 += dstImageStrideInBytes; + pDstImage1 += dstImageStrideInBytes; + pDstImage2 += dstImageStrideInBytes; + height--; + } + } + return AGO_SUCCESS; +} + +int HafCpu_ChannelExtract_U8U8U8U8_U32 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage0, + vx_uint8 * pDstImage1, + vx_uint8 * pDstImage2, + vx_uint8 * pDstImage3, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // Check for output buffer alignment + intptr_t prealignBytes = (intptr_t(pDstImage0) & intptr_t(pDstImage1) & intptr_t(pDstImage2) & intptr_t(pDstImage3)) & 15; + bool isAligned = (prealignBytes == ((intptr_t(pDstImage0) | intptr_t(pDstImage1) | intptr_t(pDstImage2) | intptr_t(pDstImage3)) & 15)); // True if all three buffers have the same alignment + + unsigned char *pLocalSrc, *pLocalDst0, *pLocalDst1, *pLocalDst2, *pLocalDst3; + __m128i * tbl = (__m128i *) dataChannelExtract; + __m128i pixels0, pixels1, pixels2, pixels3, pixels_R, pixels_G, pixels_B; + + __m128i mask0 = 
_mm_load_si128(tbl + 13); + __m128i mask1 = _mm_load_si128(tbl + 14); + __m128i mask2 = _mm_load_si128(tbl + 15); + __m128i mask3 = _mm_load_si128(tbl + 16); + + if (isAligned) + { + int prefixWidth = (int)((prealignBytes == 0) ? 0 : (16 - prealignBytes)); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst0 = (unsigned char *)pDstImage0; + pLocalDst1 = (unsigned char *)pDstImage1; + pLocalDst2 = (unsigned char *)pDstImage2; + pLocalDst3 = (unsigned char *)pDstImage3; + + for (int x = 0; x < prefixWidth; x++) + { + *pLocalDst0++ = *pSrcImage++; + *pLocalDst1++ = *pSrcImage++; + *pLocalDst2++ = *pSrcImage++; + *pLocalDst3++ = *pSrcImage++; + } + + int width = (int)(alignedWidth >> 4); + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *) (pLocalSrc + 32)); + pixels3 = _mm_loadu_si128((__m128i *) (pLocalSrc + 48)); + + pixels_R = _mm_shuffle_epi8(pixels0, mask0); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels1, mask1)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels2, mask2)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels3, mask3)); + _mm_store_si128((__m128i *)pLocalDst0, pixels_R); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels_G = _mm_shuffle_epi8(pixels0, mask0); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels1, mask1)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels2, mask2)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels3, mask3)); + _mm_store_si128((__m128i *)pLocalDst1, pixels_G); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels_B = _mm_shuffle_epi8(pixels0, mask0); + pixels_B = _mm_or_si128(pixels_B, _mm_shuffle_epi8(pixels1, mask1)); + pixels_B = _mm_or_si128(pixels_B, _mm_shuffle_epi8(pixels2, mask2)); + pixels_B = _mm_or_si128(pixels_B, _mm_shuffle_epi8(pixels3, mask3)); + _mm_store_si128((__m128i *)pLocalDst2, pixels_B); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels0 = _mm_shuffle_epi8(pixels0, mask0); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels1, mask1)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels2, mask2)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels3, mask3)); + _mm_store_si128((__m128i *)pLocalDst3, pixels0); + + pLocalSrc += 64; + pLocalDst0 += 16; + pLocalDst1 += 16; + pLocalDst2 += 16; + pLocalDst3 += 16; + + width--; + } + + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDst0++ = *pSrcImage++; + *pLocalDst1++ = *pSrcImage++; + *pLocalDst2++ = *pSrcImage++; + *pLocalDst3++ = *pSrcImage++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage0 += dstImageStrideInBytes; + pDstImage1 += dstImageStrideInBytes; + pDstImage2 += dstImageStrideInBytes; + pDstImage3 += dstImageStrideInBytes; + height--; + } + } + else + { + int postfixWidth = dstWidth & 15; + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst0 = (unsigned char *)pDstImage0; + 
pLocalDst1 = (unsigned char *)pDstImage1; + pLocalDst2 = (unsigned char *)pDstImage2; + pLocalDst3 = (unsigned char *)pDstImage3; + + int width = (int)(dstWidth >> 4); // 16 bytes at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *) (pLocalSrc + 32)); + pixels3 = _mm_loadu_si128((__m128i *) (pLocalSrc + 48)); + + pixels_R = _mm_shuffle_epi8(pixels0, mask0); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels1, mask1)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels2, mask2)); + pixels_R = _mm_or_si128(pixels_R, _mm_shuffle_epi8(pixels3, mask3)); + _mm_storeu_si128((__m128i *)pLocalDst0, pixels_R); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels_G = _mm_shuffle_epi8(pixels0, mask0); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels1, mask1)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels2, mask2)); + pixels_G = _mm_or_si128(pixels_G, _mm_shuffle_epi8(pixels3, mask3)); + _mm_storeu_si128((__m128i *)pLocalDst1, pixels_G); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels_B = _mm_shuffle_epi8(pixels0, mask0); + pixels_B = _mm_or_si128(pixels_B, _mm_shuffle_epi8(pixels1, mask1)); + pixels_B = _mm_or_si128(pixels_B, _mm_shuffle_epi8(pixels2, mask2)); + pixels_B = _mm_or_si128(pixels_B, _mm_shuffle_epi8(pixels3, mask3)); + _mm_storeu_si128((__m128i *)pLocalDst2, pixels_B); + + pixels0 = _mm_srli_si128(pixels0, 1); + pixels1 = _mm_srli_si128(pixels1, 1); + pixels2 = _mm_srli_si128(pixels2, 1); + pixels3 = _mm_srli_si128(pixels3, 1); + + pixels0 = _mm_shuffle_epi8(pixels0, mask0); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels1, mask1)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels2, mask2)); + pixels0 = _mm_or_si128(pixels0, _mm_shuffle_epi8(pixels3, mask3)); + _mm_storeu_si128((__m128i *)pLocalDst3, pixels0); + + pLocalSrc += 64; + pLocalDst0 += 16; + pLocalDst1 += 16; + pLocalDst2 += 16; + pLocalDst3 += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDst0++ = *pSrcImage++; + *pLocalDst1++ = *pSrcImage++; + *pLocalDst2++ = *pSrcImage++; + *pLocalDst3++ = *pSrcImage++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage0 += dstImageStrideInBytes; + pDstImage1 += dstImageStrideInBytes; + pDstImage2 += dstImageStrideInBytes; + pDstImage3 += dstImageStrideInBytes; + height--; + } + } + return AGO_SUCCESS; +} diff --git a/openvx/ago/ago_haf_cpu_color_convert.cpp b/openvx/ago/ago_haf_cpu_color_convert.cpp new file mode 100644 index 0000000..0774665 --- /dev/null +++ b/openvx/ago/ago_haf_cpu_color_convert.cpp @@ -0,0 +1,4771 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +DECL_ALIGN(16) unsigned char dataColorConvert[16 * 26] ATTR_ALIGN(16) = { + 1, 3, 5, 7, 9, 11, 13, 15, 255, 255, 255, 255, 255, 255, 255, 255, // UYVY to IYUV - Y; UV12 to IUV - V (lower); NV21 to IYUV - U; UYVY to NV12 - Y; YUYV to NV12 - UV + 0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // UYVY to IYUV - U + 2, 6, 10, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // UYVY to IYUV - V + 0, 2, 4, 6, 8, 10, 12, 14, 255, 255, 255, 255, 255, 255, 255, 255, // YUYV to IYUV - Y; UV12 to IUV - U (lower); NV21 to IYUV - V; UYVY to NV12 - UV; YUYV to NV12 - Y + 1, 5, 9, 13, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // YUYV to IYUV - U + 3, 7, 11, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // YUYV to IYUV - V + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, // UV12 to UV - U; NV21 to YUV4 - V + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, // VV12 to UV - V; NV21 to YUV4 - U + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 255, 255, 255, 255, // RGBX to RGB - First 16 bytes of RGBX to first 16 bytes of RGB + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 4, // RGBX to RGB - Second 16 bytes of RGBX to first 16 bytes of RGB + 5, 6, 8, 9, 10, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255, 255, // RGBX to RGB - Second 16 bytes of RGBX to second 16 bytes of RGB + 255, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 4, 5, 6, 8, 9, // RGBX to RGB - Third 16 bytes of RGBX to second 16 bytes of RGB + 10, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // RGBX to RGB - Third 16 bytes of RGBX to third 16 bytes of RGB + 255, 255, 255, 255, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, // RGBX to RGB - Fourth 16 bytes of RGBX to third 16 bytes of RGB + 0, 1, 2, 255, 3, 4, 5, 255, 6, 7, 8, 255, 9, 10, 11, 255, // RGB to RGBX - First 16 bytes of RGB to first 16 bytes of RGBX + 12, 13, 14, 255, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // RGB to RGBX - First 16 bytes of RGB to second 16 bytes of RGBX + 255, 255, 255, 255, 255, 0, 1, 255, 2, 3, 4, 255, 5, 6, 7, 255, // RGB to RGBX - Second 16 bytes of RGB to second 16 bytes of RGBX + 8, 9, 10, 255, 11, 12, 13, 255, 14, 15, 255, 255, 255, 255, 255, 255, // RGB to RGBX - Second 16 bytes of RGB to third 16 bytes of RGBX + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 1, 2, 3, 255, // RGB to RGBX - Third 16 bytes of RGB to third 16 bytes of RGBX + 4, 5, 6, 
255, 7, 8, 9, 255, 10, 11, 12, 255, 13, 14, 15, 255, // RGB to RGBX - Third 16 bytes of RGB to fourth 16 bytes of RGBX + 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, // RGB to RGBX - Mask to fill in 255 for X positions + 0, 3, 6, 9, 12, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // RGB to single plane extraction + 2, 5, 8, 11, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // RGB to single plane extraction + 1, 4, 7, 10, 13, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // RGB to single plane extraction + 255, 255, 255, 255, 255, 255, 255, 255, 1, 3, 5, 7, 9, 11, 13, 15, // UV12 to IUV - V (upper) + 255, 255, 255, 255, 255, 255, 255, 255, 0, 2, 4, 6, 8, 10, 12, 14 // UV12 to IUV - U (upper) +}; + +int HafCpu_FormatConvert_IYUV_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + unsigned char *pLocalSrc, *pLocalDstY, *pLocalDstU, *pLocalDstV; + unsigned char *pLocalSrcNextRow, *pLocalDstYNextRow; + + __m128i * tbl = (__m128i*) dataColorConvert; + __m128i maskY = _mm_load_si128(tbl); + __m128i maskU = _mm_load_si128(tbl + 1); + __m128i maskV = _mm_load_si128(tbl + 2); + __m128i pixels0, pixels1, pixels0_NextRow, pixels1_NextRow, temp0, temp1; + + bool isAligned = (((intptr_t(pDstYImage) & intptr_t(pDstUImage) & intptr_t(pDstVImage)) & 7) == ((intptr_t(pDstYImage) | intptr_t(pDstUImage) | intptr_t(pDstVImage)) & 7)); // Check for 8 byte alignment + isAligned = isAligned & ((intptr_t(pDstYImage) & 8) == 0); // Y image should be 16 byte aligned or have same alignment as the Chroma planes + + if (isAligned) + { + int prefixWidth = intptr_t(pDstYImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalSrcNextRow = (unsigned char *)pSrcImage + srcImageStrideInBytes; + pLocalDstY = (unsigned char *)pDstYImage; + pLocalDstYNextRow = (unsigned char *)pDstYImage + dstYImageStrideInBytes; + pLocalDstU = (unsigned char *)pDstUImage; + pLocalDstV = (unsigned char *)pDstVImage; + + for (int x = 0; x < prefixWidth; x++) + { + *pLocalDstU++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstV++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + } + + int width = alignedWidth >> 4; // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels0_NextRow = _mm_loadu_si128((__m128i *) pLocalSrcNextRow); + pixels1_NextRow = _mm_loadu_si128((__m128i *) (pLocalSrcNextRow + 16)); + + temp0 = _mm_shuffle_epi8(pixels0, maskY); // Y plane, bytes 0..7 + temp1 = _mm_shuffle_epi8(pixels1, maskY); // Y plane, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_or_si128(temp0, temp1); + _mm_store_si128((__m128i *) pLocalDstY, temp0); + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskY); // Y plane - next row, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_shuffle_epi8(pixels0_NextRow, maskY); // Y plane - next row, bytes 0..7 + temp0 = _mm_or_si128(temp0, temp1); + _mm_store_si128((__m128i *) pLocalDstYNextRow, temp0); + + temp1 = _mm_shuffle_epi8(pixels1, maskU); // U plane, intermideate bytes 4..7 + pixels1 = _mm_shuffle_epi8(pixels1, maskV); // V plane, intermideate bytes 4..7 + temp1 = _mm_slli_si128(temp1, 4); + pixels1 = _mm_slli_si128(pixels1, 4); + + temp0 = _mm_shuffle_epi8(pixels0, maskU); // U plane, intermideate bytes 0..3 + pixels0 = _mm_shuffle_epi8(pixels0, maskV); // V plane, intermideate bytes 0..3 + temp0 = _mm_or_si128(temp0, temp1); // U plane, intermideate bytes 0..7 + pixels0 = _mm_or_si128(pixels0, pixels1); // V plane, intermideate bytes 0..7 + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskU); // U plane - next row, intermideate bytes 4..7 + pixels1_NextRow = _mm_shuffle_epi8(pixels1_NextRow, maskV); // V plane - next row, intermideate bytes 4..7 + temp1 = _mm_slli_si128(temp1, 4); + pixels1_NextRow = _mm_slli_si128(pixels1_NextRow, 4); + + pixels1 = _mm_shuffle_epi8(pixels0_NextRow, maskU); // U plane - next row, intermideate bytes 0..3 + pixels0_NextRow = _mm_shuffle_epi8(pixels0_NextRow, maskV); // V plane - next row, intermideate bytes 0..3 + temp1 = _mm_or_si128(temp1, pixels1); // U plane - next row, intermideate bytes 0..7 + pixels0_NextRow = _mm_or_si128(pixels0_NextRow, pixels1_NextRow); // V plane - next row, intermideate bytes 0..7 + + temp0 = _mm_avg_epu8(temp0, temp1); // U plane, bytes 0..7 + *((int64_t *)pLocalDstU) = M128I(temp0).m128i_i64[0]; + pixels0 = _mm_avg_epu8(pixels0, pixels0_NextRow); // V plane, bytes 0..7 + *((int64_t *)pLocalDstV) = M128I(pixels0).m128i_i64[0]; + + pLocalSrc += 32; + pLocalSrcNextRow += 32; + pLocalDstY += 16; + pLocalDstYNextRow += 16; + pLocalDstU += 8; + pLocalDstV += 8; + width--; + } + + for (int x = 0; x < postfixWidth; 
x++) + { + *pLocalDstU++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstV++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); // Advance by 2 rows + pDstYImage += (dstYImageStrideInBytes + dstYImageStrideInBytes); // Advance by 2 rows + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + + height -= 2; + } + } + else + { + int postfixWidth = (int)dstWidth & 15; + int alignedWidth = (int)dstWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalSrcNextRow = (unsigned char *)pSrcImage + srcImageStrideInBytes; + pLocalDstY = (unsigned char *)pDstYImage; + pLocalDstYNextRow = (unsigned char *)pDstYImage + dstYImageStrideInBytes; + pLocalDstU = (unsigned char *)pDstUImage; + pLocalDstV = (unsigned char *)pDstVImage; + + int width = alignedWidth >> 4; // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels0_NextRow = _mm_loadu_si128((__m128i *) pLocalSrcNextRow); + pixels1_NextRow = _mm_loadu_si128((__m128i *) (pLocalSrcNextRow + 16)); + + temp0 = _mm_shuffle_epi8(pixels0, maskY); // Y plane, bytes 0..7 + temp1 = _mm_shuffle_epi8(pixels1, maskY); // Y plane, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_or_si128(temp0, temp1); + _mm_storeu_si128((__m128i *) pLocalDstY, temp0); + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskY); // Y plane - next row, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_shuffle_epi8(pixels0_NextRow, maskY); // Y plane - next row, bytes 0..7 + temp0 = _mm_or_si128(temp0, temp1); + _mm_storeu_si128((__m128i *) pLocalDstYNextRow, temp0); + + temp1 = _mm_shuffle_epi8(pixels1, maskU); // U plane, intermideate bytes 4..7 + pixels1 = _mm_shuffle_epi8(pixels1, maskV); // V plane, intermideate bytes 4..7 + temp1 = _mm_slli_si128(temp1, 4); + pixels1 = _mm_slli_si128(pixels1, 4); + + temp0 = _mm_shuffle_epi8(pixels0, maskU); // U plane, intermideate bytes 0..3 + pixels0 = _mm_shuffle_epi8(pixels0, maskV); // V plane, intermideate bytes 0..3 + temp0 = _mm_or_si128(temp0, temp1); // U plane, intermideate bytes 0..7 + pixels0 = _mm_or_si128(pixels0, pixels1); // V plane, intermideate bytes 0..7 + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskU); // U plane - next row, intermideate bytes 4..7 + pixels1_NextRow = _mm_shuffle_epi8(pixels1_NextRow, maskV); // V plane - next row, intermideate bytes 4..7 + temp1 = _mm_slli_si128(temp1, 4); + pixels1_NextRow = _mm_slli_si128(pixels1_NextRow, 4); + + pixels1 = _mm_shuffle_epi8(pixels0_NextRow, maskU); // U plane - next row, intermideate bytes 0..3 + pixels0_NextRow = _mm_shuffle_epi8(pixels0_NextRow, maskV); // V plane - next row, intermideate bytes 0..3 + temp1 = _mm_or_si128(temp1, pixels1); // U plane - next row, intermideate bytes 0..7 + pixels0_NextRow = _mm_or_si128(pixels0_NextRow, pixels1_NextRow); // V plane - next row, intermideate bytes 0..7 + + temp0 = _mm_avg_epu8(temp0, temp1); // U plane, bytes 0..7 + *((int64_t *)pLocalDstU) = M128I(temp0).m128i_i64[0]; + pixels0 = _mm_avg_epu8(pixels0, pixels0_NextRow); // V plane, bytes 0..7 + *((int64_t *)pLocalDstV) = M128I(pixels0).m128i_i64[0]; + + pLocalSrc += 
32; + pLocalSrcNextRow += 32; + pLocalDstY += 16; + pLocalDstYNextRow += 16; + pLocalDstU += 8; + pLocalDstV += 8; + width--; + } + + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDstU++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstV++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); // Advance by 2 rows + pDstYImage += (dstYImageStrideInBytes + dstYImageStrideInBytes); // Advance by 2 rows + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + + height -= 2; + } + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGB_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~7; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i shufMask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 5, 4, 2, 1, 0); + __m128i tempI, row; + __m128 Y0, Y1, U, V; + + // BT709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = _mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + __m128 const128 = _mm_set1_ps(128.0f); + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < alignedWidth; width += 8) + { + row = _mm_loadu_si128((__m128i *)pLocalSrc); + + for (int i = 0; i < 4; i++) + { + tempI = _mm_shuffle_epi8(row, _mm_set1_epi32((int)0xFFFFFF00)); + U = _mm_cvtepi32_ps(tempI); // U U U U + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, _mm_set1_epi32((int)0xFFFFFF00)); + Y0 = _mm_cvtepi32_ps(tempI); // Y0 Y0 Y0 Y0 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, _mm_set1_epi32((int)0xFFFFFF00)); + V = _mm_cvtepi32_ps(tempI); // V V V V + V = _mm_sub_ps(V, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, _mm_set1_epi32((int)0xFFFFFF00)); + Y1 = _mm_cvtepi32_ps(tempI); // Y1 Y1 Y1 Y1 + row = _mm_srli_si128(row, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 0 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 1 + + // Convert RGB01 to U8 + tempI = _mm_packus_epi32(_mm_cvttps_epi32(Y0), _mm_cvttps_epi32(Y1)); + tempI = _mm_packus_epi16(tempI, tempI); + tempI = _mm_shuffle_epi8(tempI, shufMask); + _mm_storeu_si128((__m128i *)(pLocalDst + 6 * i), tempI); + } + + pLocalSrc += 16; + pLocalDst += 24; + } + + for (int width = 0; width < postfixWidth; width += 2) + { + float Ypix1, Ypix2, Upix, Vpix, Rpix, Gpix, Bpix; + Upix = (float)(*pLocalSrc++) - 128.0f; + Ypix1 = (float)(*pLocalSrc++); + Vpix = (float)(*pLocalSrc++) - 128.0f; + Ypix2 = (float)(*pLocalSrc++); + + Rpix = fminf(fmaxf(Ypix1 + (Vpix * 1.5748f), 0.0f), 255.0f); + Gpix = fminf(fmaxf(Ypix1 - (Upix * 0.1873f) - (Vpix * 0.4681f), 0.0f), 255.0f); + Bpix = fminf(fmaxf(Ypix1 + (Upix * 1.8556f), 0.0f), 255.0f); + + *pLocalDst++ = (vx_uint8)Rpix; + *pLocalDst++ = (vx_uint8)Gpix; + *pLocalDst++ = 
(vx_uint8)Bpix; + + Rpix = fminf(fmaxf(Ypix2 + (Vpix * 1.5748f), 0.0f), 255.0f); + Gpix = fminf(fmaxf(Ypix2 - (Upix * 0.1873f) - (Vpix * 0.4681f), 0.0f), 255.0f); + Bpix = fminf(fmaxf(Ypix2 + (Upix * 1.8556f), 0.0f), 255.0f); + + *pLocalDst++ = (vx_uint8)Rpix; + *pLocalDst++ = (vx_uint8)Gpix; + *pLocalDst++ = (vx_uint8)Bpix; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGB_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~7; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i shufMask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 5, 4, 2, 1, 0); + __m128i tempI, row; + __m128 Y0, Y1, U, V; + + // BT709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = _mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + __m128 const128 = _mm_set1_ps(128.0f); + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < alignedWidth; width += 8) + { + row = _mm_loadu_si128((__m128i *)pLocalSrc); + + for (int i = 0; i < 4; i++) + { + tempI = _mm_shuffle_epi8(row, _mm_set1_epi32((int)0xFFFFFF00)); + Y0 = _mm_cvtepi32_ps(tempI); // Y0 Y0 Y0 Y0 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, _mm_set1_epi32((int)0xFFFFFF00)); + U = _mm_cvtepi32_ps(tempI); // U U U U + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, _mm_set1_epi32((int)0xFFFFFF00)); + Y1 = _mm_cvtepi32_ps(tempI); // Y1 Y1 Y1 Y1 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, _mm_set1_epi32((int)0xFFFFFF00)); + V = _mm_cvtepi32_ps(tempI); // V V V V + V = _mm_sub_ps(V, const128); + row = _mm_srli_si128(row, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 0 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 1 + + // Convert RGB01 to U8 + tempI = _mm_packus_epi32(_mm_cvttps_epi32(Y0), _mm_cvttps_epi32(Y1)); + tempI = _mm_packus_epi16(tempI, tempI); + tempI = _mm_shuffle_epi8(tempI, shufMask); + _mm_storeu_si128((__m128i *)(pLocalDst + 6 * i), tempI); + } + + pLocalSrc += 16; + pLocalDst += 24; + } + + for (int width = 0; width < postfixWidth; width += 2) + { + float Ypix1, Ypix2, Upix, Vpix, Rpix, Gpix, Bpix; + Ypix1 = (float)(*pLocalSrc++); + Upix = (float)(*pLocalSrc++) - 128.0f; + Ypix2 = (float)(*pLocalSrc++); + Vpix = (float)(*pLocalSrc++) - 128.0f; + + Rpix = fminf(fmaxf(Ypix1 + (Vpix * 1.5748f), 0.0f), 255.0f); + Gpix = fminf(fmaxf(Ypix1 - (Upix * 0.1873f) - (Vpix * 0.4681f), 0.0f), 255.0f); + Bpix = fminf(fmaxf(Ypix1 + (Upix * 1.8556f), 0.0f), 255.0f); + + *pLocalDst++ = (vx_uint8)Rpix; + *pLocalDst++ = (vx_uint8)Gpix; + *pLocalDst++ = (vx_uint8)Bpix; + + Rpix = fminf(fmaxf(Ypix2 + (Vpix * 1.5748f), 0.0f), 255.0f); + Gpix = fminf(fmaxf(Ypix2 - (Upix * 0.1873f) - (Vpix * 0.4681f), 0.0f), 255.0f); + Bpix = fminf(fmaxf(Ypix2 + (Upix * 1.8556f), 0.0f), 255.0f); + + *pLocalDst++ = (vx_uint8)Rpix; + *pLocalDst++ = (vx_uint8)Gpix; + *pLocalDst++ = (vx_uint8)Bpix; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += 
dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGB_IYUV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcYImage, + vx_uint32 srcYImageStrideInBytes, + vx_uint8 * pSrcUImage, + vx_uint32 srcUImageStrideInBytes, + vx_uint8 * pSrcVImage, + vx_uint32 srcVImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + alignedWidth -= 16; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128 Y00, Y01, Y10, Y11, U, V; + __m128i Y0pix, Y1pix; + __m128i shufMask = _mm_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0); + + // BT 709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = _mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + + vx_uint8 Upixels[8], Vpixels[8]; + + for (int height = 0; height < (int)dstHeight; height += 2) + { + vx_uint8 * pLocalSrcY = pSrcYImage; + vx_uint8 * pLocalSrcU = pSrcUImage; + vx_uint8 * pLocalSrcV = pSrcVImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) // Process 16 pixels at a time + { + Y0pix = _mm_loadu_si128((__m128i *) pLocalSrcY); + Y1pix = _mm_loadu_si128((__m128i *) (pLocalSrcY + srcYImageStrideInBytes)); + *((int64_t *)Upixels) = *((int64_t *)pLocalSrcU); + *((int64_t *)Vpixels) = *((int64_t *)pLocalSrcV); + + for (int i = 0; i < 4; i++) + { + // For pixels 00, 01 + // 10, 11 + U = _mm_cvtepi32_ps(_mm_set1_epi32((int)Upixels[2*i])); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + V = _mm_cvtepi32_ps(_mm_set1_epi32((int)Vpixels[2*i])); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 00 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 01 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 10 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 11 + + __m128i tempI0 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB00, RGB01 to U8 + __m128i tempI1 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB10, RGB11 to U8 + + // For pixels 02, 03 + // 12, 13 + U = _mm_cvtepi32_ps(_mm_set1_epi32((int)Upixels[2*i + 1])); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + V = _mm_cvtepi32_ps(_mm_set1_epi32((int)Vpixels[2*i + 1])); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + + U = _mm_mul_ps(U, 
weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 02 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 03 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 12 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 13 + + __m128i tempI2 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB02, RGB03 to U8 + tempI0 = _mm_packus_epi16(tempI0, tempI2); + tempI0 = _mm_shuffle_epi8(tempI0, shufMask); + _mm_storeu_si128((__m128i *)pLocalDst, tempI0); + + __m128i tempI3 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB12, RGB13 to U8 + tempI1 = _mm_packus_epi16(tempI1, tempI3); + tempI1 = _mm_shuffle_epi8(tempI1, shufMask); + _mm_storeu_si128((__m128i *)(pLocalDst + dstImageStrideInBytes), tempI1); + pLocalDst += 12; + } + + pLocalSrcY += 16; + pLocalSrcU += 8; + pLocalSrcV += 8; + } + + for (int width = 0; width < (postfixWidth >> 1); width++) // Processing two pixels at a time in a row + { + float Ypix, Rpix, Gpix, Bpix; + + Ypix = (float)(*pLocalSrcY); + Rpix = (float)(*pLocalSrcV++) - 128.0f; + Bpix = (float)(*pLocalSrcU++) - 128.0f; + + Gpix = (Bpix * 0.1873f) + (Rpix * 0.4681f); + Rpix *= 1.5748f; + Bpix *= 1.8556f; + + *pLocalDst = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + Ypix = (float)(*(pLocalSrcY + 1)); + *(pLocalDst + 3) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 4) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 5) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + Ypix = (float)(*(pLocalSrcY + srcYImageStrideInBytes)); + *(pLocalDst + dstImageStrideInBytes + 0) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + Ypix = (float)(*(pLocalSrcY + srcYImageStrideInBytes + 1)); + *(pLocalDst + dstImageStrideInBytes + 3) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 4) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 5) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + pLocalSrcY += 2; + pLocalDst += 6; + } + pSrcYImage += (srcYImageStrideInBytes + srcYImageStrideInBytes); + pSrcUImage += srcUImageStrideInBytes; + pSrcVImage += srcVImageStrideInBytes; + pDstImage += (dstImageStrideInBytes + dstImageStrideInBytes); + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGB_NV12 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcLumaImage, + vx_uint32 srcLumaImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + alignedWidth -= 16; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128 Y00, Y01, Y10, Y11, U, V; + __m128i Y0pix, Y1pix, UVpix; + __m128i shufMask = _mm_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0); + + // BT 709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = _mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + + 
for (int height = 0; height < (int)dstHeight; height += 2) + { + vx_uint8 * pLocalSrcLuma = pSrcLumaImage; + vx_uint8 * pLocalSrcChroma = pSrcChromaImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) // Process 16 pixels at a time + { + Y0pix = _mm_loadu_si128((__m128i *) pLocalSrcLuma); + Y1pix = _mm_loadu_si128((__m128i *) (pLocalSrcLuma + srcLumaImageStrideInBytes)); + UVpix = _mm_loadu_si128((__m128i *) pLocalSrcChroma); + + for (int i = 0; i < 4; i++) + { + // For pixels 00, 01 + // 10, 11 + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + U = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + V = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 00 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 01 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 10 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 11 + + __m128i tempI0 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB00, RGB01 to U8 + __m128i tempI1 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB10, RGB11 to U8 + + // For pixels 02, 03 + // 12, 13 + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + U = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + V = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 02 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 03 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 12 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 13 + + __m128i tempI2 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB02, RGB03 to U8 + tempI0 = _mm_packus_epi16(tempI0, tempI2); + tempI0 = _mm_shuffle_epi8(tempI0, shufMask); + _mm_storeu_si128((__m128i *)pLocalDst, tempI0); + + __m128i tempI3 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB12, RGB13 to U8 + tempI1 = _mm_packus_epi16(tempI1, tempI3); + tempI1 = _mm_shuffle_epi8(tempI1, shufMask); + _mm_storeu_si128((__m128i *)(pLocalDst + 
dstImageStrideInBytes), tempI1); + pLocalDst += 12; + } + pLocalSrcLuma += 16; + pLocalSrcChroma += 16; + } + + for (int width = 0; width < (postfixWidth >> 1); width++) // Processing two pixels at a time in a row + { + float Ypix, Rpix, Gpix, Bpix; + + Ypix = (float)(*pLocalSrcLuma); + Bpix = (float)(*pLocalSrcChroma++) - 128.0f; + Rpix = (float)(*pLocalSrcChroma++) - 128.0f; + + Gpix = (Bpix * 0.1873f) + (Rpix * 0.4681f); + Rpix *= 1.5748f; + Bpix *= 1.8556f; + + *pLocalDst = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + Ypix = (float)(*(pLocalSrcLuma + 1)); + *(pLocalDst + 3) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 4) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 5) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + Ypix = (float)(*(pLocalSrcLuma + srcLumaImageStrideInBytes)); + *(pLocalDst + dstImageStrideInBytes + 0) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + Ypix = (float)(*(pLocalSrcLuma + srcLumaImageStrideInBytes + 1)); + *(pLocalDst + dstImageStrideInBytes + 3) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 4) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 5) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + pLocalSrcLuma += 2; + pLocalDst += 6; + } + pSrcLumaImage += (srcLumaImageStrideInBytes + srcLumaImageStrideInBytes); + pSrcChromaImage += srcChromaImageStrideInBytes; + pDstImage += (dstImageStrideInBytes + dstImageStrideInBytes); + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGB_NV21 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcLumaImage, + vx_uint32 srcLumaImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + alignedWidth -= 16; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128 Y00, Y01, Y10, Y11, U, V; + __m128i Y0pix, Y1pix, UVpix; + __m128i shufMask = _mm_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0); + + // BT 709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = _mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + + for (int height = 0; height < (int)dstHeight; height += 2) + { + vx_uint8 * pLocalSrcLuma = pSrcLumaImage; + vx_uint8 * pLocalSrcChroma = pSrcChromaImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) // Process 16 pixels at a time + { + Y0pix = _mm_loadu_si128((__m128i *) pLocalSrcLuma); + Y1pix = _mm_loadu_si128((__m128i *) (pLocalSrcLuma + srcLumaImageStrideInBytes)); + UVpix = _mm_loadu_si128((__m128i *) pLocalSrcChroma); + + for (int i = 0; i < 4; i++) + { + // For pixels 00, 01 + // 10, 11 + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = 
_mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + V = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 00 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 01 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 10 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 11 + + __m128i tempI0 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB00, RGB01 to U8 + __m128i tempI1 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB10, RGB11 to U8 + + // For pixels 02, 03 + // 12, 13 + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + V = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 02 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 03 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 12 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 13 + + __m128i tempI2 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB02, RGB03 to U8 + tempI0 = _mm_packus_epi16(tempI0, tempI2); + tempI0 = _mm_shuffle_epi8(tempI0, shufMask); + _mm_storeu_si128((__m128i *)pLocalDst, tempI0); + + __m128i tempI3 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB12, RGB13 to U8 + tempI1 = _mm_packus_epi16(tempI1, tempI3); + tempI1 = _mm_shuffle_epi8(tempI1, shufMask); + _mm_storeu_si128((__m128i *)(pLocalDst + dstImageStrideInBytes), tempI1); + pLocalDst += 12; + } + pLocalSrcLuma += 16; + pLocalSrcChroma += 16; + } + + for (int width = 0; width < (postfixWidth >> 1); width++) // Processing two pixels at a time in a row + { + float Ypix, Rpix, Gpix, Bpix; + + Ypix = (float)(*pLocalSrcLuma); + Rpix = (float)(*pLocalSrcChroma++) - 128.0f; + Bpix = (float)(*pLocalSrcChroma++) - 128.0f; + + Gpix = (Bpix * 0.1873f) + (Rpix * 0.4681f); + Rpix *= 1.5748f; + Bpix *= 1.8556f; + + *pLocalDst = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + Ypix = (float)(*(pLocalSrcLuma + 1)); + *(pLocalDst + 3) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst 
+ 4) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 5) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + Ypix = (float)(*(pLocalSrcLuma + srcLumaImageStrideInBytes)); + *(pLocalDst + dstImageStrideInBytes + 0) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + Ypix = (float)(*(pLocalSrcLuma + srcLumaImageStrideInBytes + 1)); + *(pLocalDst + dstImageStrideInBytes + 3) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 4) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 5) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + + pLocalSrcLuma += 2; + pLocalDst += 6; + } + pSrcLumaImage += (srcLumaImageStrideInBytes + srcLumaImageStrideInBytes); + pSrcChromaImage += srcChromaImageStrideInBytes; + pDstImage += (dstImageStrideInBytes + dstImageStrideInBytes); + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGBX_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~7; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i mask = _mm_set1_epi32((int)0xFFFFFF00); // 255 255 255 0 255 255 255 0 255 255 255 0 255 255 255 0 + __m128i tempI, row, RGB0, RGB1; + __m128 Y0, Y1, U, V; + + // BT 709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = _mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + __m128 const128 = _mm_set1_ps(128.0f); + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < alignedWidth; width += 8) + { + row = _mm_load_si128((__m128i *)pLocalSrc); + + // Pixels 0,1 + tempI = _mm_shuffle_epi8(row, mask); + U = _mm_cvtepi32_ps(tempI); // U0 U0 U0 U0 + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y0 = _mm_cvtepi32_ps(tempI); // Y0 Y0 Y0 Y0 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + V = _mm_cvtepi32_ps(tempI); // V0 V0 V0 V0 + V = _mm_sub_ps(V, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y1 = _mm_cvtepi32_ps(tempI); // Y1 Y1 Y1 Y1 + row = _mm_srli_si128(row, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 0 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 1 + + tempI = _mm_cvttps_epi32(Y0); + RGB0 = _mm_cvttps_epi32(Y1); + RGB0 = _mm_packus_epi32(tempI, RGB0); // X1 B1 G1 R1 X0 B0 G0 R0 (words) + + // Pixels 2,3 + tempI = _mm_shuffle_epi8(row, mask); + U = _mm_cvtepi32_ps(tempI); // U1 U1 U1 U1 + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y0 = _mm_cvtepi32_ps(tempI); // Y2 Y2 Y2 Y2 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + V = _mm_cvtepi32_ps(tempI); // V1 V1 V1 V1 + V = _mm_sub_ps(V, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y1 = _mm_cvtepi32_ps(tempI); // Y3 Y3 Y3 Y3 + 
row = _mm_srli_si128(row, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 2 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 3 + + tempI = _mm_cvttps_epi32(Y0); + RGB1 = _mm_cvttps_epi32(Y1); + RGB1 = _mm_packus_epi32(tempI, RGB1); // X3 B3 G3 R3 X2 B2 G2 R2 (words) + RGB0 = _mm_packus_epi16(RGB0, RGB1); // X3 B3 G3 R3 X2 B2 G2 R2 X1 B1 G1 R1 X0 B0 G0 R0 (bytes) + + // Pixels 4,5 + tempI = _mm_shuffle_epi8(row, mask); + U = _mm_cvtepi32_ps(tempI); // U2 U2 U2 U2 + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y0 = _mm_cvtepi32_ps(tempI); // Y4 Y4 Y4 Y4 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + V = _mm_cvtepi32_ps(tempI); // V2 V2 V2 V2 + V = _mm_sub_ps(V, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y1 = _mm_cvtepi32_ps(tempI); // Y5 Y5 Y5 Y5 + row = _mm_srli_si128(row, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 4 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 5 + + tempI = _mm_cvttps_epi32(Y0); + RGB1 = _mm_cvttps_epi32(Y1); + RGB1 = _mm_packus_epi32(tempI, RGB1); // X5 B5 G5 R5 X4 B4 G4 R4 (words) + + // Pixels 6,7 + tempI = _mm_shuffle_epi8(row, mask); + U = _mm_cvtepi32_ps(tempI); // U3 U3 U3 U3 + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y0 = _mm_cvtepi32_ps(tempI); // Y6 Y6 Y6 Y6 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + V = _mm_cvtepi32_ps(tempI); // V3 V3 V3 V3 + V = _mm_sub_ps(V, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y1 = _mm_cvtepi32_ps(tempI); // Y7 Y7 Y7 Y7 + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 6 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 7 + + tempI = _mm_cvttps_epi32(Y0); + row = _mm_cvttps_epi32(Y1); + row = _mm_packus_epi32(tempI, row); // X7 B7 G7 R7 X6 B6 G6 R6 (words) + RGB1 = _mm_packus_epi16(RGB1, row); // X7 B7 G7 R7 X6 B6 G6 R6 X5 B5 G5 R5 X4 B4 G4 R4 (bytes) + + // Make the X component value 255 + RGB0 = _mm_or_si128(RGB0, _mm_set1_epi32((int)0xFF000000)); + RGB1 = _mm_or_si128(RGB1, _mm_set1_epi32((int)0xFF000000)); + + _mm_storeu_si128((__m128i *)pLocalDst, RGB0); + _mm_storeu_si128((__m128i *)(pLocalDst + 16), RGB1); + + pLocalSrc += 16; + pLocalDst += 32; + } + + for (int width = 0; width < postfixWidth; width += 2) + { + float Ypix1, Ypix2, Upix, Vpix, Rpix, Gpix, Bpix; + Upix = (float)(*pLocalSrc++) - 128.0f; + Ypix1 = (float)(*pLocalSrc++); + Vpix = (float)(*pLocalSrc++) - 128.0f; + Ypix2 = (float)(*pLocalSrc++); + + Rpix = fminf(fmaxf(Ypix1 + (Vpix * 1.5748f), 0.0f), 255.0f); + Gpix = fminf(fmaxf(Ypix1 - (Upix * 0.1873f) - (Vpix * 0.4681f), 0.0f), 255.0f); + Bpix = fminf(fmaxf(Ypix1 + (Upix * 1.8556f), 0.0f), 255.0f); + + *pLocalDst++ = (vx_uint8)Rpix; + *pLocalDst++ = (vx_uint8)Gpix; + *pLocalDst++ = (vx_uint8)Bpix; + *pLocalDst++ = (vx_uint8)255; + + Rpix = fminf(fmaxf(Ypix2 + (Vpix * 1.5748f), 0.0f), 255.0f); + Gpix = fminf(fmaxf(Ypix2 - (Upix * 0.1873f) - (Vpix * 0.4681f), 0.0f), 255.0f); + Bpix = fminf(fmaxf(Ypix2 + (Upix * 1.8556f), 0.0f), 255.0f); + + *pLocalDst++ = (vx_uint8)Rpix; + *pLocalDst++ = (vx_uint8)Gpix; + 
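+ // Finish the second RGBX pixel of this UYVY pair: B, then the X byte set to 255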
*pLocalDst++ = (vx_uint8)Bpix; + *pLocalDst++ = (vx_uint8)255; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGBX_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~7; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i mask = _mm_set1_epi32((int)0xFFFFFF00); // 255 255 255 0 255 255 255 0 255 255 255 0 255 255 255 0 + __m128i tempI, row, RGB0, RGB1; + __m128 Y0, Y1, U, V; + + // BT 709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = _mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + __m128 const128 = _mm_set1_ps(128.0f); + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < alignedWidth; width += 8) + { + row = _mm_load_si128((__m128i *)pLocalSrc); + + // Pixels 0,1 + tempI = _mm_shuffle_epi8(row, mask); + Y0 = _mm_cvtepi32_ps(tempI); // Y0 Y0 Y0 Y0 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + U = _mm_cvtepi32_ps(tempI); // U0 U0 U0 U0 + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y1 = _mm_cvtepi32_ps(tempI); // Y1 Y1 Y1 Y1 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + V = _mm_cvtepi32_ps(tempI); // V0 V0 V0 V0 + V = _mm_sub_ps(V, const128); + row = _mm_srli_si128(row, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 0 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 1 + + tempI = _mm_cvttps_epi32(Y0); + RGB0 = _mm_cvttps_epi32(Y1); + RGB0 = _mm_packus_epi32(tempI, RGB0); // X1 B1 G1 R1 X0 B0 G0 R0 (words) + + // Pixels 2,3 + tempI = _mm_shuffle_epi8(row, mask); + Y0 = _mm_cvtepi32_ps(tempI); // Y2 Y2 Y2 Y2 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + U = _mm_cvtepi32_ps(tempI); // U1 U1 U1 U1 + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y1 = _mm_cvtepi32_ps(tempI); // Y3 Y3 Y3 Y3 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + V = _mm_cvtepi32_ps(tempI); // V1 V1 V1 V1 + V = _mm_sub_ps(V, const128); + row = _mm_srli_si128(row, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 2 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 3 + + tempI = _mm_cvttps_epi32(Y0); + RGB1 = _mm_cvttps_epi32(Y1); + RGB1 = _mm_packus_epi32(tempI, RGB1); // X3 B3 G3 R3 X2 B2 G2 R2 (words) + RGB0 = _mm_packus_epi16(RGB0, RGB1); // X3 B3 G3 R3 X2 B2 G2 R2 X1 B1 G1 R1 X0 B0 G0 R0 (bytes) + + // Pixels 4,5 + tempI = _mm_shuffle_epi8(row, mask); + Y0 = _mm_cvtepi32_ps(tempI); // Y4 Y4 Y4 Y4 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + U = _mm_cvtepi32_ps(tempI); // U2 U2 U2 U2 + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y1 = _mm_cvtepi32_ps(tempI); // Y5 Y5 Y5 Y5 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + V = 
_mm_cvtepi32_ps(tempI); // V2 V2 V2 V2 + V = _mm_sub_ps(V, const128); + row = _mm_srli_si128(row, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 4 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 5 + + tempI = _mm_cvttps_epi32(Y0); + RGB1 = _mm_cvttps_epi32(Y1); + RGB1 = _mm_packus_epi32(tempI, RGB1); // X5 B5 G5 R5 X4 B4 G4 R4 (words) + + // Pixels 6,7 + tempI = _mm_shuffle_epi8(row, mask); + Y0 = _mm_cvtepi32_ps(tempI); // Y6 Y6 Y6 Y6 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + U = _mm_cvtepi32_ps(tempI); // U3 U3 U3 U3 + U = _mm_sub_ps(U, const128); + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + Y1 = _mm_cvtepi32_ps(tempI); // Y7 Y7 Y7 Y7 + row = _mm_srli_si128(row, 1); + tempI = _mm_shuffle_epi8(row, mask); + V = _mm_cvtepi32_ps(tempI); // V0 V0 V0 V3 + V = _mm_sub_ps(V, const128); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y0 = _mm_add_ps(Y0, U); // RGB for pixel 6 + Y1 = _mm_add_ps(Y1, U); // RGB for pixel 7 + + tempI = _mm_cvttps_epi32(Y0); + row = _mm_cvttps_epi32(Y1); + row = _mm_packus_epi32(tempI, row); // X7 B7 G7 R7 X6 B6 G6 R6 (words) + RGB1 = _mm_packus_epi16(RGB1, row); // X7 B7 G7 R7 X6 B6 G6 R6 X5 B5 G5 R5 X4 B4 G4 R4 (bytes) + + // Make the X component value 255 + RGB0 = _mm_or_si128(RGB0, _mm_set1_epi32((int)0xFF000000)); + RGB1 = _mm_or_si128(RGB1, _mm_set1_epi32((int)0xFF000000)); + + _mm_storeu_si128((__m128i *)pLocalDst, RGB0); + _mm_storeu_si128((__m128i *)(pLocalDst + 16), RGB1); + + pLocalSrc += 16; + pLocalDst += 32; + } + + for (int width = 0; width < postfixWidth; width += 2) + { + float Ypix1, Ypix2, Upix, Vpix, Rpix, Gpix, Bpix; + Ypix1 = (float)(*pLocalSrc++); + Upix = (float)(*pLocalSrc++) - 128.0f; + Ypix2 = (float)(*pLocalSrc++); + Vpix = (float)(*pLocalSrc++) - 128.0f; + + Rpix = fminf(fmaxf(Ypix1 + (Vpix * 1.5748f), 0.0f), 255.0f); + Gpix = fminf(fmaxf(Ypix1 - (Upix * 0.1873f) - (Vpix * 0.4681f), 0.0f), 255.0f); + Bpix = fminf(fmaxf(Ypix1 + (Upix * 1.8556f), 0.0f), 255.0f); + + *pLocalDst++ = (vx_uint8)Rpix; + *pLocalDst++ = (vx_uint8)Gpix; + *pLocalDst++ = (vx_uint8)Bpix; + *pLocalDst++ = (vx_uint8)255; + + Rpix = fminf(fmaxf(Ypix2 + (Vpix * 1.5748f), 0.0f), 255.0f); + Gpix = fminf(fmaxf(Ypix2 - (Upix * 0.1873f) - (Vpix * 0.4681f), 0.0f), 255.0f); + Bpix = fminf(fmaxf(Ypix2 + (Upix * 1.8556f), 0.0f), 255.0f); + + *pLocalDst++ = (vx_uint8)Rpix; + *pLocalDst++ = (vx_uint8)Gpix; + *pLocalDst++ = (vx_uint8)Bpix; + *pLocalDst++ = (vx_uint8)255; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGBX_IYUV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcYImage, + vx_uint32 srcYImageStrideInBytes, + vx_uint8 * pSrcUImage, + vx_uint32 srcUImageStrideInBytes, + vx_uint8 * pSrcVImage, + vx_uint32 srcVImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128 Y00, Y01, Y10, Y11, U, V; + __m128i Y0pix, Y1pix; + __m128i shufMask = _mm_set_epi8(-1, 14, 13, 12, -1, 10, 9, 8, -1, 6, 5, 4, -1, 2, 1, 0); + + // BT 709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = 
_mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + + vx_uint8 Upixels[8], Vpixels[8]; + + for (int height = 0; height < (int)dstHeight; height += 2) + { + vx_uint8 * pLocalSrcY = pSrcYImage; + vx_uint8 * pLocalSrcU = pSrcUImage; + vx_uint8 * pLocalSrcV = pSrcVImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) // Process 16 pixels at a time + { + Y0pix = _mm_loadu_si128((__m128i *) pLocalSrcY); + Y1pix = _mm_loadu_si128((__m128i *) (pLocalSrcY + srcYImageStrideInBytes)); + *((int64_t *)Upixels) = *((int64_t *)pLocalSrcU); + *((int64_t *)Vpixels) = *((int64_t *)pLocalSrcV); + + for (int i = 0; i < 4; i++) + { + // For pixels 00, 01 + // 10, 11 + U = _mm_cvtepi32_ps(_mm_set1_epi32((int)Upixels[2 * i])); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + V = _mm_cvtepi32_ps(_mm_set1_epi32((int)Vpixels[2 * i])); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 00 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 01 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 10 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 11 + + __m128i tempI0 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB00, RGB01 to U8 + __m128i tempI1 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB10, RGB11 to U8 + + // For pixels 02, 03 + // 12, 13 + U = _mm_cvtepi32_ps(_mm_set1_epi32((int)Upixels[2 * i + 1])); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + V = _mm_cvtepi32_ps(_mm_set1_epi32((int)Vpixels[2 * i + 1])); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 02 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 03 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 12 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 13 + + __m128i tempI2 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB02, RGB03 to U8 + tempI0 = _mm_packus_epi16(tempI0, tempI2); + tempI0 = _mm_shuffle_epi8(tempI0, shufMask); + tempI0 = _mm_or_si128(tempI0, _mm_set1_epi32((int)0xFF000000)); + _mm_storeu_si128((__m128i *)pLocalDst, tempI0); + + __m128i tempI3 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB12, RGB13 to U8 + tempI1 = _mm_packus_epi16(tempI1, tempI3); + tempI1 = _mm_shuffle_epi8(tempI1, shufMask); + 
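+ // Force the X byte of each RGBX pixel to 255, then store the four pixels of the lower row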
tempI1 = _mm_or_si128(tempI1, _mm_set1_epi32((int)0xFF000000)); + _mm_storeu_si128((__m128i *)(pLocalDst + dstImageStrideInBytes), tempI1); + + pLocalDst += 16; + } + pLocalSrcY += 16; + pLocalSrcU += 8; + pLocalSrcV += 8; + } + + for (int width = 0; width < (postfixWidth >> 1); width += 2) // Processing two pixels at a time in a row + { + float Ypix, Rpix, Gpix, Bpix; + + Ypix = (float)(*pLocalSrcY); + Rpix = (float)(*pLocalSrcV++) - 128.0f; + Bpix = (float)(*pLocalSrcU++) - 128.0f; + + Gpix = (Bpix * 0.1873f) + (Rpix * 0.4681f); + Rpix *= 1.5748f; + Bpix *= 1.8556f; + + *pLocalDst = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + 3) = (vx_uint8)255; + + Ypix = (float)(*(pLocalSrcY + 1)); + *(pLocalDst + 4) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 5) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 6) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + 7) = (vx_uint8)255; + + Ypix = (float)(*(pLocalSrcY + srcYImageStrideInBytes)); + *(pLocalDst + dstImageStrideInBytes + 0) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 3) = (vx_uint8)255; + + Ypix = (float)(*(pLocalSrcY + srcYImageStrideInBytes + 1)); + *(pLocalDst + dstImageStrideInBytes + 4) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 5) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 6) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 7) = (vx_uint8)255; + + pLocalSrcY += 2; + pLocalDst += 8; + } + pSrcYImage += (srcYImageStrideInBytes + srcYImageStrideInBytes); + pSrcUImage += srcUImageStrideInBytes; + pSrcVImage += srcVImageStrideInBytes; + pDstImage += (dstImageStrideInBytes + dstImageStrideInBytes); + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGBX_NV12 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcLumaImage, + vx_uint32 srcLumaImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128 Y00, Y01, Y10, Y11, U, V; + __m128i Y0pix, Y1pix, UVpix; + __m128i shufMask = _mm_set_epi8(-1, 14, 13, 12, -1, 10, 9, 8, -1, 6, 5, 4, -1, 2, 1, 0); + + // BT 709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = _mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + + for (int height = 0; height < (int)dstHeight; height += 2) + { + vx_uint8 * pLocalSrcLuma = pSrcLumaImage; + vx_uint8 * pLocalSrcChroma = pSrcChromaImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) // Process 16 pixels at a time + { + Y0pix = _mm_loadu_si128((__m128i *) pLocalSrcLuma); + Y1pix = _mm_loadu_si128((__m128i *) (pLocalSrcLuma + srcLumaImageStrideInBytes)); + UVpix = _mm_loadu_si128((__m128i *) pLocalSrcChroma); + + for (int i = 0; i < 4; i++) + { + // 
For pixels 00, 01 + // 10, 11 + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + U = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + V = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 00 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 01 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 10 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 11 + + __m128i tempI0 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB00, RGB01 to U8 + __m128i tempI1 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB10, RGB11 to U8 + + // For pixels 02, 03 + // 12, 13 + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + U = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + V = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 02 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 03 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 12 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 13 + + __m128i tempI2 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB02, RGB03 to U8 + tempI0 = _mm_packus_epi16(tempI0, tempI2); + tempI0 = _mm_shuffle_epi8(tempI0, shufMask); + tempI0 = _mm_or_si128(tempI0, _mm_set1_epi32(0xFF000000)); + _mm_storeu_si128((__m128i *)pLocalDst, tempI0); + + __m128i tempI3 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB12, RGB13 to U8 + tempI1 = _mm_packus_epi16(tempI1, tempI3); + tempI1 = _mm_shuffle_epi8(tempI1, shufMask); + tempI1 = _mm_or_si128(tempI1, _mm_set1_epi32(0xFF000000)); + _mm_storeu_si128((__m128i *)(pLocalDst + dstImageStrideInBytes), tempI1); + + pLocalDst += 16; + } + pLocalSrcLuma += 16; + pLocalSrcChroma += 16; + } + + for (int width = 0; width < (postfixWidth >> 1); width += 2) // Processing two pixels at a time in a row + { + float Ypix, Rpix, Gpix, Bpix; + + Ypix = (float)(*pLocalSrcLuma); + Bpix = (float)(*pLocalSrcChroma++) - 128.0f; + Rpix = (float)(*pLocalSrcChroma++) - 128.0f; + + Gpix = (Bpix * 
0.1873f) + (Rpix * 0.4681f); + Rpix *= 1.5748f; + Bpix *= 1.8556f; + + *pLocalDst = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + 3) = (vx_uint8)255; + + Ypix = (float)(*(pLocalSrcLuma + 1)); + *(pLocalDst + 4) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 5) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 6) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + 7) = (vx_uint8)255; + + Ypix = (float)(*(pLocalSrcLuma + srcLumaImageStrideInBytes)); + *(pLocalDst + dstImageStrideInBytes + 0) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 3) = (vx_uint8)255; + + Ypix = (float)(*(pLocalSrcLuma + srcLumaImageStrideInBytes + 1)); + *(pLocalDst + dstImageStrideInBytes + 4) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 5) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 6) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 7) = (vx_uint8)255; + + pLocalSrcLuma += 2; + pLocalDst += 8; + } + pSrcLumaImage += (srcLumaImageStrideInBytes + srcLumaImageStrideInBytes); + pSrcChromaImage += srcChromaImageStrideInBytes; + pDstImage += (dstImageStrideInBytes + dstImageStrideInBytes); + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGBX_NV21 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcLumaImage, + vx_uint32 srcLumaImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128 Y00, Y01, Y10, Y11, U, V; + __m128i Y0pix, Y1pix, UVpix; + __m128i shufMask = _mm_set_epi8(-1, 14, 13, 12, -1, 10, 9, 8, -1, 6, 5, 4, -1, 2, 1, 0); + + // BT 709 conversion factors + __m128 weights_U2RGB = _mm_set_ps(0.0f, 1.8556f, -0.1873f, 0.0f); // x R G B, The most significant float is don't care + __m128 weights_V2RGB = _mm_set_ps(0.0f, 0.0f, -0.4681f, 1.5748f); // x R G B, The most significant float is don't care + + for (int height = 0; height < (int)dstHeight; height += 2) + { + vx_uint8 * pLocalSrcLuma = pSrcLumaImage; + vx_uint8 * pLocalSrcChroma = pSrcChromaImage; + vx_uint8 * pLocalDst = pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) // Process 16 pixels at a time + { + Y0pix = _mm_loadu_si128((__m128i *) pLocalSrcLuma); + Y1pix = _mm_loadu_si128((__m128i *) (pLocalSrcLuma + srcLumaImageStrideInBytes)); + UVpix = _mm_loadu_si128((__m128i *) pLocalSrcChroma); + + for (int i = 0; i < 4; i++) + { + // For pixels 00, 01 + // 10, 11 + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + V 
= _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 00 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 01 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 10 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 11 + + __m128i tempI0 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB00, RGB01 to U8 + __m128i tempI1 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB10, RGB11 to U8 + + // For pixels 02, 03 + // 12, 13 + Y00 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y01 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y0pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y0pix = _mm_srli_si128(Y0pix, 1); + Y10 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + Y11 = _mm_cvtepi32_ps(_mm_shuffle_epi8(Y1pix, _mm_set1_epi32((int)0xFFFFFF00))); + Y1pix = _mm_srli_si128(Y1pix, 1); + V = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + V = _mm_sub_ps(V, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_cvtepi32_ps(_mm_shuffle_epi8(UVpix, _mm_set1_epi32((int)0xFFFFFF00))); + U = _mm_sub_ps(U, _mm_set1_ps(128.0f)); + UVpix = _mm_srli_si128(UVpix, 1); + U = _mm_mul_ps(U, weights_U2RGB); + V = _mm_mul_ps(V, weights_V2RGB); + U = _mm_add_ps(U, V); // weights_U*U + weights_V*V + Y00 = _mm_add_ps(Y00, U); // RGB for pixel 02 + Y01 = _mm_add_ps(Y01, U); // RGB for pixel 03 + Y10 = _mm_add_ps(Y10, U); // RGB for pixel 12 + Y11 = _mm_add_ps(Y11, U); // RGB for pixel 13 + + __m128i tempI2 = _mm_packus_epi32(_mm_cvttps_epi32(Y00), _mm_cvttps_epi32(Y01)); // Convert RGB02, RGB03 to U8 + tempI0 = _mm_packus_epi16(tempI0, tempI2); + tempI0 = _mm_shuffle_epi8(tempI0, shufMask); + tempI0 = _mm_or_si128(tempI0, _mm_set1_epi32(0xFF000000)); + _mm_storeu_si128((__m128i *)pLocalDst, tempI0); + + __m128i tempI3 = _mm_packus_epi32(_mm_cvttps_epi32(Y10), _mm_cvttps_epi32(Y11)); // Convert RGB12, RGB13 to U8 + tempI1 = _mm_packus_epi16(tempI1, tempI3); + tempI1 = _mm_shuffle_epi8(tempI1, shufMask); + tempI1 = _mm_or_si128(tempI1, _mm_set1_epi32(0xFF000000)); + _mm_storeu_si128((__m128i *)(pLocalDst + dstImageStrideInBytes), tempI1); + + pLocalDst += 16; + } + pLocalSrcLuma += 16; + pLocalSrcChroma += 16; + } + + for (int width = 0; width < (postfixWidth >> 1); width += 2) // Processing two pixels at a time in a row + { + float Ypix, Rpix, Gpix, Bpix; + + Ypix = (float)(*pLocalSrcLuma); + Rpix = (float)(*pLocalSrcChroma++) - 128.0f; + Bpix = (float)(*pLocalSrcChroma++) - 128.0f; + + Gpix = (Bpix * 0.1873f) + (Rpix * 0.4681f); + Rpix *= 1.5748f; + Bpix *= 1.8556f; + + *pLocalDst = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + 3) = (vx_uint8)255; + + Ypix = (float)(*(pLocalSrcLuma + 1)); + *(pLocalDst + 4) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + 5) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst 
+ 6) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + 7) = (vx_uint8)255; + + Ypix = (float)(*(pLocalSrcLuma + srcLumaImageStrideInBytes)); + *(pLocalDst + dstImageStrideInBytes + 0) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 1) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 2) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 3) = (vx_uint8)255; + + Ypix = (float)(*(pLocalSrcLuma + srcLumaImageStrideInBytes + 1)); + *(pLocalDst + dstImageStrideInBytes + 4) = (vx_uint8)fminf(fmaxf(Ypix + Rpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 5) = (vx_uint8)fminf(fmaxf(Ypix - Gpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 6) = (vx_uint8)fminf(fmaxf(Ypix + Bpix, 0.0f), 255.0f); + *(pLocalDst + dstImageStrideInBytes + 7) = (vx_uint8)255; + + pLocalSrcLuma += 2; + pLocalDst += 8; + } + pSrcLumaImage += (srcLumaImageStrideInBytes + srcLumaImageStrideInBytes); + pSrcChromaImage += srcChromaImageStrideInBytes; + pDstImage += (dstImageStrideInBytes + dstImageStrideInBytes); + } + return AGO_SUCCESS; +} + +int HafCpu_FormatConvert_IYUV_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + unsigned char *pLocalSrc, *pLocalDstY, *pLocalDstU, *pLocalDstV; + unsigned char *pLocalSrcNextRow, *pLocalDstYNextRow; + + __m128i * tbl = (__m128i*) dataColorConvert; + __m128i maskY = _mm_load_si128(tbl + 3); + __m128i maskU = _mm_load_si128(tbl + 4); + __m128i maskV = _mm_load_si128(tbl + 5); + __m128i pixels0, pixels1, pixels0_NextRow, pixels1_NextRow, temp0, temp1; + + bool isAligned = (((intptr_t(pDstYImage) & intptr_t(pDstUImage) & intptr_t(pDstVImage)) & 7) == ((intptr_t(pDstYImage) | intptr_t(pDstUImage) | intptr_t(pDstVImage)) & 7)); // Check for 8 byte alignment + isAligned = isAligned & ((intptr_t(pDstYImage) & 8) == 0); // Y image should be 16 byte aligned or have same alignment as the Chroma planes + + if (isAligned) + { + int prefixWidth = intptr_t(pDstYImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalSrcNextRow = (unsigned char *)pSrcImage + srcImageStrideInBytes; + pLocalDstY = (unsigned char *)pDstYImage; + pLocalDstYNextRow = (unsigned char *)pDstYImage + dstYImageStrideInBytes; + pLocalDstU = (unsigned char *)pDstUImage; + pLocalDstV = (unsigned char *)pDstVImage; + + for (int x = 0; x < prefixWidth; x++) + { + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstU++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstV++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + } + + int width = alignedWidth >> 4; // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels0_NextRow = _mm_loadu_si128((__m128i *) pLocalSrcNextRow); + pixels1_NextRow = _mm_loadu_si128((__m128i *) (pLocalSrcNextRow + 16)); + + temp0 = _mm_shuffle_epi8(pixels0, maskY); // Y plane, bytes 0..7 + temp1 = _mm_shuffle_epi8(pixels1, maskY); // Y plane, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_or_si128(temp0, temp1); + _mm_store_si128((__m128i *) pLocalDstY, temp0); + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskY); // Y plane - next row, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_shuffle_epi8(pixels0_NextRow, maskY); // Y plane - next row, bytes 0..7 + temp0 = _mm_or_si128(temp0, temp1); + _mm_store_si128((__m128i *) pLocalDstYNextRow, temp0); + + temp1 = _mm_shuffle_epi8(pixels1, maskU); // U plane, intermideate bytes 4..7 + pixels1 = _mm_shuffle_epi8(pixels1, maskV); // V plane, intermideate bytes 4..7 + temp1 = _mm_slli_si128(temp1, 4); + pixels1 = _mm_slli_si128(pixels1, 4); + + temp0 = _mm_shuffle_epi8(pixels0, maskU); // U plane, intermideate bytes 0..3 + pixels0 = _mm_shuffle_epi8(pixels0, maskV); // V plane, intermideate bytes 0..3 + temp0 = _mm_or_si128(temp0, temp1); // U plane, intermideate bytes 0..7 + pixels0 = _mm_or_si128(pixels0, pixels1); // V plane, intermideate bytes 0..7 + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskU); // U plane - next row, intermideate bytes 4..7 + pixels1_NextRow = _mm_shuffle_epi8(pixels1_NextRow, maskV); // V plane - next row, intermideate bytes 4..7 + temp1 = _mm_slli_si128(temp1, 4); + pixels1_NextRow = _mm_slli_si128(pixels1_NextRow, 4); + + pixels1 = _mm_shuffle_epi8(pixels0_NextRow, maskU); // U plane - next row, intermideate bytes 0..3 + pixels0_NextRow = _mm_shuffle_epi8(pixels0_NextRow, maskV); // V plane - next row, intermideate bytes 0..3 + temp1 = _mm_or_si128(temp1, pixels1); // U plane - next row, intermideate bytes 0..7 + pixels0_NextRow = _mm_or_si128(pixels0_NextRow, pixels1_NextRow); // V plane - next row, intermideate bytes 0..7 + + temp0 = _mm_avg_epu8(temp0, temp1); // U plane, bytes 0..7 + *((int64_t *)pLocalDstU) = M128I(temp0).m128i_i64[0]; + pixels0 = _mm_avg_epu8(pixels0, pixels0_NextRow); // V plane, bytes 0..7 + *((int64_t *)pLocalDstV) = M128I(pixels0).m128i_i64[0]; + + pLocalSrc += 32; + pLocalSrcNextRow += 32; + pLocalDstY += 16; + pLocalDstYNextRow += 16; + pLocalDstU += 8; + pLocalDstV += 8; + width--; + } + + for (int x = 0; x < postfixWidth; 
x++) + { + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstU++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstV++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); // Advance by 2 rows + pDstYImage += (dstYImageStrideInBytes + dstYImageStrideInBytes); // Advance by 2 rows + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + + height -= 2; + } + } + else + { + int postfixWidth = (int)dstWidth & 15; + int alignedWidth = (int)dstWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalSrcNextRow = (unsigned char *)pSrcImage + srcImageStrideInBytes; + pLocalDstY = (unsigned char *)pDstYImage; + pLocalDstYNextRow = (unsigned char *)pDstYImage + dstYImageStrideInBytes; + pLocalDstU = (unsigned char *)pDstUImage; + pLocalDstV = (unsigned char *)pDstVImage; + + int width = alignedWidth >> 4; // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels0_NextRow = _mm_loadu_si128((__m128i *) pLocalSrcNextRow); + pixels1_NextRow = _mm_loadu_si128((__m128i *) (pLocalSrcNextRow + 16)); + + temp0 = _mm_shuffle_epi8(pixels0, maskY); // Y plane, bytes 0..7 + temp1 = _mm_shuffle_epi8(pixels1, maskY); // Y plane, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_or_si128(temp0, temp1); + _mm_storeu_si128((__m128i *) pLocalDstY, temp0); + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskY); // Y plane - next row, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_shuffle_epi8(pixels0_NextRow, maskY); // Y plane - next row, bytes 0..7 + temp0 = _mm_or_si128(temp0, temp1); + _mm_storeu_si128((__m128i *) pLocalDstYNextRow, temp0); + + temp1 = _mm_shuffle_epi8(pixels1, maskU); // U plane, intermideate bytes 4..7 + pixels1 = _mm_shuffle_epi8(pixels1, maskV); // V plane, intermideate bytes 4..7 + temp1 = _mm_slli_si128(temp1, 4); + pixels1 = _mm_slli_si128(pixels1, 4); + + temp0 = _mm_shuffle_epi8(pixels0, maskU); // U plane, intermideate bytes 0..3 + pixels0 = _mm_shuffle_epi8(pixels0, maskV); // V plane, intermideate bytes 0..3 + temp0 = _mm_or_si128(temp0, temp1); // U plane, intermideate bytes 0..7 + pixels0 = _mm_or_si128(pixels0, pixels1); // V plane, intermideate bytes 0..7 + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskU); // U plane - next row, intermideate bytes 4..7 + pixels1_NextRow = _mm_shuffle_epi8(pixels1_NextRow, maskV); // V plane - next row, intermideate bytes 4..7 + temp1 = _mm_slli_si128(temp1, 4); + pixels1_NextRow = _mm_slli_si128(pixels1_NextRow, 4); + + pixels1 = _mm_shuffle_epi8(pixels0_NextRow, maskU); // U plane - next row, intermideate bytes 0..3 + pixels0_NextRow = _mm_shuffle_epi8(pixels0_NextRow, maskV); // V plane - next row, intermideate bytes 0..3 + temp1 = _mm_or_si128(temp1, pixels1); // U plane - next row, intermideate bytes 0..7 + pixels0_NextRow = _mm_or_si128(pixels0_NextRow, pixels1_NextRow); // V plane - next row, intermideate bytes 0..7 + + temp0 = _mm_avg_epu8(temp0, temp1); // U plane, bytes 0..7 + _mm_storeu_si128((__m128i *) pLocalDstU, temp0); // Only lower 8 bytes valid + pixels0 = _mm_avg_epu8(pixels0, pixels0_NextRow); // V plane, bytes 0..7 + _mm_storeu_si128((__m128i *) pLocalDstV, 
pixels0); // Only lower 8 bytes valid + + + pLocalSrc += 32; + pLocalSrcNextRow += 32; + pLocalDstY += 16; + pLocalDstYNextRow += 16; + pLocalDstU += 8; + pLocalDstV += 8; + width--; + } + + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstU++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstY++ = *pLocalSrc++; // Y + *pLocalDstYNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstV++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); // Advance by 2 rows + pDstYImage += (dstYImageStrideInBytes + dstYImageStrideInBytes); // Advance by 2 rows + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + + height -= 2; + } + } + return AGO_SUCCESS; +} + +int HafCpu_FormatConvert_NV12_UYVY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstLumaImage, + vx_uint32 dstLumaImageStrideInBytes, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + unsigned char *pLocalSrc, *pLocalDstLuma, *pLocalDstChroma; + unsigned char *pLocalSrcNextRow, *pLocalDstLumaNextRow; + + __m128i * tbl = (__m128i*) dataColorConvert; + __m128i maskLuma = _mm_load_si128(tbl); + __m128i maskChroma = _mm_load_si128(tbl + 3); + __m128i pixels0, pixels1, pixels0_NextRow, pixels1_NextRow, temp0, temp1; + + bool isAligned = ((intptr_t(pDstLumaImage) & 15) == (intptr_t(pDstChromaImage) & 15)); + + if (isAligned) // Optimized routine for both dst images at same alignment + { + int prefixWidth = intptr_t(pDstLumaImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int) dstWidth - prefixWidth) & 15; + int alignedWidth = (int) dstWidth - prefixWidth - postfixWidth; + + int height = (int) dstHeight; + while (height > 0) + { + pLocalSrc = (unsigned char *) pSrcImage; + pLocalDstLuma = (unsigned char *) pDstLumaImage; + pLocalDstChroma = (unsigned char *) pDstChromaImage; + pLocalSrcNextRow = (unsigned char *) pSrcImage + srcImageStrideInBytes; + pLocalDstLumaNextRow = (unsigned char *) pDstLumaImage + dstLumaImageStrideInBytes; + + for (int x = 0; x < prefixWidth; x += 2) + { + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + } + + int width = alignedWidth >> 4; // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels0_NextRow = _mm_loadu_si128((__m128i *) pLocalSrcNextRow); + pixels1_NextRow = _mm_loadu_si128((__m128i *) (pLocalSrcNextRow + 16)); + + temp0 = _mm_shuffle_epi8(pixels0, maskLuma); // Y plane, bytes 0..7 + temp1 = _mm_shuffle_epi8(pixels1, maskLuma); // Y plane, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_or_si128(temp0, temp1); + _mm_store_si128((__m128i *) pLocalDstLuma, temp0); + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskLuma); // Y plane - next row, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_shuffle_epi8(pixels0_NextRow, maskLuma); // Y plane - next row, bytes 0..7 + temp0 = _mm_or_si128(temp0, temp1); + 
_mm_store_si128((__m128i *) pLocalDstLumaNextRow, temp0); + + pixels0 = _mm_shuffle_epi8(pixels0, maskChroma); // Chroma plane, bytes 0..7 + pixels0_NextRow = _mm_shuffle_epi8(pixels0_NextRow, maskChroma); // Chroma plane - Next row, bytes 0..7 + pixels1 = _mm_shuffle_epi8(pixels1, maskChroma); // Chroma plane, bytes 8..15 + pixels1_NextRow = _mm_shuffle_epi8(pixels1_NextRow, maskChroma); // Chroma plane - Next row, bytes 8..15 + + pixels1 = _mm_slli_si128(pixels1, 8); + pixels1_NextRow = _mm_slli_si128(pixels1_NextRow, 8); + pixels0 = _mm_or_si128(pixels0, pixels1); + pixels0_NextRow = _mm_or_si128(pixels0_NextRow, pixels1_NextRow); + pixels0 = _mm_avg_epu8(pixels0, pixels0_NextRow); + _mm_store_si128((__m128i *) pLocalDstChroma, pixels0); + + pLocalSrc += 32; + pLocalSrcNextRow += 32; + pLocalDstLuma += 16; + pLocalDstLumaNextRow += 16; + pLocalDstChroma += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x += 2) + { + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); // Advance by 2 rows + pDstLumaImage += (dstLumaImageStrideInBytes + dstLumaImageStrideInBytes); // Advance by 2 rows + pDstChromaImage += dstChromaImageStrideInBytes; + + height -= 2; + } + + } + else + { + int postfixWidth = (int)dstWidth & 15; + int alignedWidth = (int)dstWidth - postfixWidth; + + int height = (int)dstHeight; + while (height > 0) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDstLuma = (unsigned char *)pDstLumaImage; + pLocalDstChroma = (unsigned char *)pDstChromaImage; + pLocalSrcNextRow = (unsigned char *)pSrcImage + srcImageStrideInBytes; + pLocalDstLumaNextRow = (unsigned char *)pDstLumaImage + dstLumaImageStrideInBytes; + + int width = alignedWidth >> 4; // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels0_NextRow = _mm_loadu_si128((__m128i *) pLocalSrcNextRow); + pixels1_NextRow = _mm_loadu_si128((__m128i *) (pLocalSrcNextRow + 16)); + + temp0 = _mm_shuffle_epi8(pixels0, maskLuma); // Y plane, bytes 0..7 + temp1 = _mm_shuffle_epi8(pixels1, maskLuma); // Y plane, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_or_si128(temp0, temp1); + _mm_storeu_si128((__m128i *) pLocalDstLuma, temp0); + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskLuma); // Y plane - next row, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_shuffle_epi8(pixels0_NextRow, maskLuma); // Y plane - next row, bytes 0..7 + temp0 = _mm_or_si128(temp0, temp1); + _mm_storeu_si128((__m128i *) pLocalDstLumaNextRow, temp0); + + pixels0 = _mm_shuffle_epi8(pixels0, maskChroma); // Chroma plane, bytes 0..7 + pixels0_NextRow = _mm_shuffle_epi8(pixels0_NextRow, maskChroma); // Chroma plane - Next row, bytes 0..7 + pixels1 = _mm_shuffle_epi8(pixels1, maskChroma); // Chroma plane, bytes 8..15 + pixels1_NextRow = _mm_shuffle_epi8(pixels1_NextRow, maskChroma); // Chroma plane - Next row, bytes 8..15 + + pixels1 = _mm_slli_si128(pixels1, 8); + pixels1_NextRow = _mm_slli_si128(pixels1_NextRow, 8); + pixels0 = _mm_or_si128(pixels0, pixels1); + pixels0_NextRow = _mm_or_si128(pixels0_NextRow, pixels1_NextRow); + pixels0 = 
_mm_avg_epu8(pixels0, pixels0_NextRow); + _mm_storeu_si128((__m128i *) pLocalDstChroma, pixels0); + + pLocalSrc += 32; + pLocalSrcNextRow += 32; + pLocalDstLuma += 16; + pLocalDstLumaNextRow += 16; + pLocalDstChroma += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x += 2) + { + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); // Advance by 2 rows + pDstLumaImage += (dstLumaImageStrideInBytes + dstLumaImageStrideInBytes); // Advance by 2 rows + pDstChromaImage += dstChromaImageStrideInBytes; + + height -= 2; + } + } + return AGO_SUCCESS; +} + +int HafCpu_FormatConvert_NV12_YUYV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstLumaImage, + vx_uint32 dstLumaImageStrideInBytes, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + unsigned char *pLocalSrc, *pLocalDstLuma, *pLocalDstChroma; + unsigned char *pLocalSrcNextRow, *pLocalDstLumaNextRow; + + __m128i * tbl = (__m128i*) dataColorConvert; + __m128i maskLuma = _mm_load_si128(tbl + 3); + __m128i maskChroma = _mm_load_si128(tbl); + __m128i pixels0, pixels1, pixels0_NextRow, pixels1_NextRow, temp0, temp1; + + bool isAligned = ((intptr_t(pDstLumaImage) & 15) == (intptr_t(pDstChromaImage) & 15)); + + if (isAligned) // Optimized routine for both dst images at same alignment + { + int prefixWidth = intptr_t(pDstLumaImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + while (height > 0) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDstLuma = (unsigned char *)pDstLumaImage; + pLocalDstChroma = (unsigned char *)pDstChromaImage; + pLocalSrcNextRow = (unsigned char *)pSrcImage + srcImageStrideInBytes; + pLocalDstLumaNextRow = (unsigned char *)pDstLumaImage + dstLumaImageStrideInBytes; + + for (int x = 0; x < prefixWidth; x += 2) + { + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + } + + int width = alignedWidth >> 4; // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels0_NextRow = _mm_loadu_si128((__m128i *) pLocalSrcNextRow); + pixels1_NextRow = _mm_loadu_si128((__m128i *) (pLocalSrcNextRow + 16)); + + temp0 = _mm_shuffle_epi8(pixels0, maskLuma); // Y plane, bytes 0..7 + temp1 = _mm_shuffle_epi8(pixels1, maskLuma); // Y plane, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_or_si128(temp0, temp1); + _mm_store_si128((__m128i *) pLocalDstLuma, temp0); + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskLuma); // Y plane - next row, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_shuffle_epi8(pixels0_NextRow, maskLuma); // Y plane - next row, bytes 0..7 + temp0 = _mm_or_si128(temp0, temp1); + _mm_store_si128((__m128i *) pLocalDstLumaNextRow, temp0); + + pixels0 = _mm_shuffle_epi8(pixels0, maskChroma); // Chroma plane, bytes 0..7 + pixels0_NextRow = _mm_shuffle_epi8(pixels0_NextRow, maskChroma); // Chroma plane - Next row, bytes 0..7 + pixels1 = _mm_shuffle_epi8(pixels1, maskChroma); // Chroma plane, bytes 8..15 + pixels1_NextRow = _mm_shuffle_epi8(pixels1_NextRow, maskChroma); // Chroma plane - Next row, bytes 8..15 + + pixels1 = _mm_slli_si128(pixels1, 8); + pixels1_NextRow = _mm_slli_si128(pixels1_NextRow, 8); + pixels0 = _mm_or_si128(pixels0, pixels1); + pixels0_NextRow = _mm_or_si128(pixels0_NextRow, pixels1_NextRow); + pixels0 = _mm_avg_epu8(pixels0, pixels0_NextRow); + _mm_store_si128((__m128i *) pLocalDstChroma, pixels0); + + pLocalSrc += 32; + pLocalSrcNextRow += 32; + pLocalDstLuma += 16; + pLocalDstLumaNextRow += 16; + pLocalDstChroma += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x += 2) + { + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); // Advance by 2 rows + pDstLumaImage += (dstLumaImageStrideInBytes + dstLumaImageStrideInBytes); // Advance by 2 rows + pDstChromaImage += dstChromaImageStrideInBytes; + + height -= 2; + } + + } + else + { + int postfixWidth = (int)dstWidth & 15; + int alignedWidth = (int)dstWidth - postfixWidth; + + int height = (int)dstHeight; + while (height > 0) + { + pLocalSrc = (unsigned char *)pSrcImage; + 
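+ // Destination planes do not share the same 16-byte alignment, so this path uses unaligned stores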
pLocalDstLuma = (unsigned char *)pDstLumaImage; + pLocalDstChroma = (unsigned char *)pDstChromaImage; + pLocalSrcNextRow = (unsigned char *)pSrcImage + srcImageStrideInBytes; + pLocalDstLumaNextRow = (unsigned char *)pDstLumaImage + dstLumaImageStrideInBytes; + + int width = alignedWidth >> 4; // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128((__m128i *) pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *) (pLocalSrc + 16)); + pixels0_NextRow = _mm_loadu_si128((__m128i *) pLocalSrcNextRow); + pixels1_NextRow = _mm_loadu_si128((__m128i *) (pLocalSrcNextRow + 16)); + + temp0 = _mm_shuffle_epi8(pixels0, maskLuma); // Y plane, bytes 0..7 + temp1 = _mm_shuffle_epi8(pixels1, maskLuma); // Y plane, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_or_si128(temp0, temp1); + _mm_storeu_si128((__m128i *) pLocalDstLuma, temp0); + + temp1 = _mm_shuffle_epi8(pixels1_NextRow, maskLuma); // Y plane - next row, bytes 8..15 + temp1 = _mm_slli_si128(temp1, 8); + temp0 = _mm_shuffle_epi8(pixels0_NextRow, maskLuma); // Y plane - next row, bytes 0..7 + temp0 = _mm_or_si128(temp0, temp1); + _mm_storeu_si128((__m128i *) pLocalDstLumaNextRow, temp0); + + pixels0 = _mm_shuffle_epi8(pixels0, maskChroma); // Chroma plane, bytes 0..7 + pixels0_NextRow = _mm_shuffle_epi8(pixels0_NextRow, maskChroma); // Chroma plane - Next row, bytes 0..7 + pixels1 = _mm_shuffle_epi8(pixels1, maskChroma); // Chroma plane, bytes 8..15 + pixels1_NextRow = _mm_shuffle_epi8(pixels1_NextRow, maskChroma); // Chroma plane - Next row, bytes 8..15 + + pixels1 = _mm_slli_si128(pixels1, 8); + pixels1_NextRow = _mm_slli_si128(pixels1_NextRow, 8); + pixels0 = _mm_or_si128(pixels0, pixels1); + pixels0_NextRow = _mm_or_si128(pixels0_NextRow, pixels1_NextRow); + pixels0 = _mm_avg_epu8(pixels0, pixels0_NextRow); + _mm_storeu_si128((__m128i *) pLocalDstChroma, pixels0); + + pLocalSrc += 32; + pLocalSrcNextRow += 32; + pLocalDstLuma += 16; + pLocalDstLumaNextRow += 16; + pLocalDstChroma += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x += 2) + { + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // U + *pLocalDstLuma++ = *pLocalSrc++; // Y + *pLocalDstLumaNextRow++ = *pLocalSrcNextRow++; // Y - next row + *pLocalDstChroma++ = (*pLocalSrc++ + *pLocalSrcNextRow++) >> 1; // V + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); // Advance by 2 rows + pDstLumaImage += (dstLumaImageStrideInBytes + dstLumaImageStrideInBytes); // Advance by 2 rows + pDstChromaImage += dstChromaImageStrideInBytes; + + height -= 2; + } + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGB_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataColorConvert; + + __m128i mask_1_1 = _mm_load_si128(tbl + 8); // First 16 bytes -RGBX- to first 16 bytes - RGB + __m128i mask_1_2 = _mm_load_si128(tbl + 9); // Second 16 bytes -RGBX- to first 16 bytes - RGB + __m128i mask_2_2 = _mm_load_si128(tbl + 10); // Second 16 bytes -RGBX- to second 16 bytes - RGB + __m128i mask_2_3 = _mm_load_si128(tbl + 11); // Third 16 bytes -RGBX- to second 16 bytes - RGB + __m128i mask_3_3 = _mm_load_si128(tbl + 12); // Third 16 bytes -RGBX- to third 16 bytes - RGB + __m128i 
mask_3_4 = _mm_load_si128(tbl + 13); // Fourth 16 bytes -RGBX- to third 16 bytes - RGB + __m128i pixels1, pixels2, pixels3, pixels4, temp; + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = (vx_uint8 *)pSrcImage; + vx_uint8 * pLocalDst = (vx_uint8 *)pDstImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + pixels1 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + pixels3 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + pixels4 = _mm_loadu_si128((__m128i *)(pLocalSrc + 48)); + + pixels4 = _mm_shuffle_epi8(pixels4, mask_3_4); + temp = _mm_shuffle_epi8(pixels3, mask_3_3); + pixels4 = _mm_or_si128(pixels4, temp); + + pixels3 = _mm_shuffle_epi8(pixels3, mask_2_3); + temp = _mm_shuffle_epi8(pixels2, mask_2_2); + pixels3 = _mm_or_si128(pixels3, temp); + + pixels2 = _mm_shuffle_epi8(pixels2, mask_1_2); + temp = _mm_shuffle_epi8(pixels1, mask_1_1); + pixels2 = _mm_or_si128(pixels2, temp); + + _mm_storeu_si128((__m128i *)pLocalDst, pixels2); + _mm_storeu_si128((__m128i *)(pLocalDst + 16), pixels3); + _mm_storeu_si128((__m128i *)(pLocalDst + 32), pixels4); + + pLocalDst += 48; + pLocalSrc += 64; + } + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc++; + *pLocalDst++ = *pLocalSrc++; + *pLocalDst++ = *pLocalSrc++; + pLocalSrc++; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_RGBX_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 2; // 4 bytes = 1 pixel + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + unsigned char *pLocalSrc, *pLocalDst; + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + __m128i * tbl = (__m128i*) dataColorConvert; + + __m128i mask_1_1 = _mm_load_si128(tbl + 14); // First 16 bytes -RGB- to first 16 bytes - RGBX + __m128i mask_2_1 = _mm_load_si128(tbl + 15); // First 16 bytes -RGB- to second 16 bytes - RGBX + __m128i mask_2_2 = _mm_load_si128(tbl + 16); // Second 16 bytes -RGB- to second 16 bytes - RGBX + __m128i mask_3_2 = _mm_load_si128(tbl + 17); // Second 16 bytes -RGB- to third 16 bytes - RGBX + __m128i mask_3_3 = _mm_load_si128(tbl + 18); // Third 16 bytes -RGB- to third 16 bytes - RGBX + __m128i mask_4_3 = _mm_load_si128(tbl + 19); // Third 16 bytes -RGB- to fourth 16 bytes - RGBX + __m128i mask_fill = _mm_load_si128(tbl + 20); // Fill in 255 at the X positions + __m128i pixels1, pixels2, pixels3, pixels4, temp; + + int height = (int) dstHeight; + while (height) + { + pLocalSrc = (unsigned char *) pSrcImage; + pLocalDst = (unsigned char *) pDstImage; + for (int x = 0; x < prefixWidth; x++) + { + *pLocalDst++ = *pLocalSrc++; // R + *pLocalDst++ = *pLocalSrc++; // G + *pLocalDst++ = *pLocalSrc++; // B + *pLocalDst++ = (unsigned char)255; + } + + pLocalSrc_xmm = (__m128i *) pLocalSrc; + pLocalDst_xmm = (__m128i *) pLocalDst; + int width = (int)(alignedWidth >> 4); // 16 pixels processed at a time + while (width) + { + pixels1 = _mm_loadu_si128(pLocalSrc_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc_xmm++); + pixels3 = _mm_loadu_si128(pLocalSrc_xmm++); + + pixels4 = _mm_shuffle_epi8(pixels3, mask_4_3); + + pixels3 = _mm_shuffle_epi8(pixels3, mask_3_3); + temp = _mm_shuffle_epi8(pixels2, mask_3_2); + pixels3 = _mm_or_si128(pixels3, temp); + + pixels2 = _mm_shuffle_epi8(pixels2, mask_2_2); + temp = _mm_shuffle_epi8(pixels1, mask_2_1); + pixels2 = _mm_or_si128(pixels2, temp); + + pixels1 = _mm_shuffle_epi8(pixels1, mask_1_1); + + pixels1 = _mm_or_si128(pixels1, mask_fill); + pixels2 = _mm_or_si128(pixels2, mask_fill); + pixels3 = _mm_or_si128(pixels3, mask_fill); + pixels4 = _mm_or_si128(pixels4, mask_fill); + + _mm_store_si128(pLocalDst_xmm++, pixels1); + _mm_store_si128(pLocalDst_xmm++, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels3); + _mm_store_si128(pLocalDst_xmm++, pixels4); + + width--; + } + + pLocalSrc = (unsigned char *) pLocalSrc_xmm; + pLocalDst = (unsigned char *) pLocalDst_xmm; + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDst++ = *pLocalSrc++; // R + *pLocalDst++ = *pLocalSrc++; // G + *pLocalDst++ = *pLocalSrc++; // B + *pLocalDst++ = (unsigned char)255; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_IYUV_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~3; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataColorConvert; + __m128i mask = _mm_load_si128(tbl + 14); // 0 B3 G3 R3 0 B2 G2 R2 0 B1 G1 R1 0 B0 G0 R0 + __m128i cvtmask = _mm_set1_epi32(255); // 0 0 0 FF 0 0 0 FF 0 0 0 FF 0 0 0 FF + __m128i 
row0, row1, tempI; + __m128 Y0, U0, V0, Y1, U1, V1, weights_toY, weights_toU, weights_toV, temp, temp2; + __m128 const128 = _mm_set1_ps(128.0f); + + DECL_ALIGN(16) unsigned int Ybuf[8] ATTR_ALIGN(16); + DECL_ALIGN(16) unsigned short Ubuf[8] ATTR_ALIGN(16); + DECL_ALIGN(16) unsigned short Vbuf[8] ATTR_ALIGN(16); + + for (int height = 0; height < (int) dstHeight; height += 2) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDstY = pDstYImage; + vx_uint8 * pLocalDstU = pDstUImage; + vx_uint8 * pLocalDstV = pDstVImage; + + for (int width = 0; width < (alignedWidth >> 2); width++) + { + row0 = _mm_loadu_si128((__m128i*)(pLocalSrc)); + row1 = _mm_loadu_si128((__m128i*)(pLocalSrc + srcImageStrideInBytes)); + + row0 = _mm_shuffle_epi8(row0, mask); + row1 = _mm_shuffle_epi8(row1, mask); + + // R0..R3 + weights_toY = _mm_set_ps1(0.2126f); + weights_toU = _mm_set_ps1(-0.1146f); + weights_toV = _mm_set_ps1(0.5f); + tempI = _mm_and_si128(row0, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + Y0 = _mm_mul_ps(temp, weights_toY); + U0 = _mm_mul_ps(temp, weights_toU); + V0 = _mm_mul_ps(temp, weights_toV); + + tempI = _mm_and_si128(row1, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + Y1 = _mm_mul_ps(temp, weights_toY); + U1 = _mm_mul_ps(temp, weights_toU); + V1 = _mm_mul_ps(temp, weights_toV); + + // G0..G3 + weights_toY = _mm_set_ps1(0.7152f); + weights_toU = _mm_set_ps1(-0.3854f); + weights_toV = _mm_set_ps1(-0.4542f); + row0 = _mm_srli_si128(row0, 1); + tempI = _mm_and_si128(row0, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y0 = _mm_add_ps(Y0, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U0 = _mm_add_ps(U0, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V0 = _mm_add_ps(V0, temp2); + + row1 = _mm_srli_si128(row1, 1); + tempI = _mm_and_si128(row1, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y1 = _mm_add_ps(Y1, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U1 = _mm_add_ps(U1, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V1 = _mm_add_ps(V1, temp2); + + // B0..B3 + weights_toY = _mm_set_ps1(0.0722f); + weights_toU = _mm_set_ps1(0.5f); + weights_toV = _mm_set_ps1(-0.0458f); + row0 = _mm_srli_si128(row0, 1); + tempI = _mm_and_si128(row0, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y0 = _mm_add_ps(Y0, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U0 = _mm_add_ps(U0, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V0 = _mm_add_ps(V0, temp2); + + row1 = _mm_srli_si128(row1, 1); + tempI = _mm_and_si128(row1, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y1 = _mm_add_ps(Y1, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U1 = _mm_add_ps(U1, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V1 = _mm_add_ps(V1, temp2); + + tempI = _mm_cvttps_epi32(Y0); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + row1 = _mm_cvttps_epi32(Y1); + row1 = _mm_packus_epi32(row1, row1); + row1 = _mm_packus_epi16(row1, row1); + _mm_store_si128((__m128i *)Ybuf, tempI); + _mm_store_si128((__m128i *)(Ybuf + 4), row1); + + // u00 u01 u02 u03 + // u10 u11 u12 u13 + U0 = _mm_add_ps(U0, const128); + U1 = _mm_add_ps(U1, const128); + tempI = _mm_cvttps_epi32(U0); + tempI = _mm_packus_epi32(tempI, tempI); + row1 = _mm_cvttps_epi32(U1); + row1 = _mm_packus_epi32(row1, row1); + tempI = _mm_avg_epu16(tempI, row1); // Average u00, u10; u01, u11 ... 
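+ // Chroma downsample (2x2): the avg above blends the matching U values from the two source rows; the hadd below sums adjacent columns, and the +1 / >>1 that follows rounds that sum, giving one U sample per 2x2 block of pixels.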
+ //tempI = _mm_haddd_epu16(tempI); // TBD: XOP instruction - not supported on all platforms + tempI = _mm_hadd_epi16(tempI,tempI); // Average horizontally + tempI = _mm_cvtepi16_epi32(tempI); + row0 = _mm_set1_epi16(1); + tempI = _mm_add_epi16(tempI, row0); + tempI = _mm_srli_epi16(tempI, 1); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + _mm_store_si128((__m128i *)Ubuf, tempI); + + // v00 v01 v02 v03 + // v10 v11 v12 v13 + V0 = _mm_add_ps(V0, const128); + V1 = _mm_add_ps(V1, const128); + tempI = _mm_cvttps_epi32(V0); + tempI = _mm_packus_epi32(tempI, tempI); + row1 = _mm_cvttps_epi32(V1); + row1 = _mm_packus_epi32(row1, row1); + tempI = _mm_avg_epu16(tempI, row1); // Average u00, u10; u01, u11 ... + //tempI = _mm_haddd_epu16(tempI); // TBD: XOP instruction - not supported on all platforms + tempI = _mm_hadd_epi16(tempI, tempI); // Average horizontally + tempI = _mm_cvtepi16_epi32(tempI); + tempI = _mm_add_epi16(tempI, row0); + tempI = _mm_srli_epi16(tempI, 1); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + _mm_store_si128((__m128i *)Vbuf, tempI); + + *(unsigned int *)(pLocalDstY) = Ybuf[0]; + *(unsigned int *)(pLocalDstY + dstYImageStrideInBytes) = Ybuf[4]; + *(unsigned short *)(pLocalDstU) = Ubuf[0]; + *(unsigned short *)(pLocalDstV) = Vbuf[0]; + + pLocalSrc += 12; + pLocalDstY += 4; + pLocalDstU += 2; + pLocalDstV += 2; + } + + for (int width = 0; width < postfixWidth; width += 2) + { + float R = (float)*(pLocalSrc); + float G = (float)*(pLocalSrc + 1); + float B = (float)*(pLocalSrc + 2); + + *pLocalDstY = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + float U = (R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f; + float V = (R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f; + + R = (float)*(pLocalSrc + 3); + G = (float)*(pLocalSrc + 4); + B = (float)*(pLocalSrc + 5); + + *(pLocalDstY + 1) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + R = (float)*(pLocalSrc + srcImageStrideInBytes); + G = (float)*(pLocalSrc + srcImageStrideInBytes + 1); + B = (float)*(pLocalSrc + srcImageStrideInBytes + 2); + + *(pLocalDstY + dstYImageStrideInBytes) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + R = (float)*(pLocalSrc + srcImageStrideInBytes + 3); + G = (float)*(pLocalSrc + srcImageStrideInBytes + 4); + B = (float)*(pLocalSrc + srcImageStrideInBytes + 5); + + *(pLocalDstY + dstYImageStrideInBytes + 1) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + U /= 4.0f; V /= 4.0f; + + *pLocalDstU++ = (vx_uint8)U; + *pLocalDstY++ = (vx_uint8)V; + + pLocalSrc += 6; + pLocalDstY += 2; + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); + pDstYImage += (dstYImageStrideInBytes + dstYImageStrideInBytes); + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_NV12_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstLumaImage, + vx_uint32 dstLumaImageStrideInBytes, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 
srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~3; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataColorConvert; + __m128i mask = _mm_load_si128(tbl + 14); // 0 B3 G3 R3 0 B2 G2 R2 0 B1 G1 R1 0 B0 G0 R0 + __m128i cvtmask = _mm_set1_epi32(255); // 0 0 0 FF 0 0 0 FF 0 0 0 FF 0 0 0 FF + __m128i row0, row1, tempI; + __m128 Y0, U0, V0, Y1, U1, V1, weights_toY, weights_toU, weights_toV, temp, temp2; + __m128 const128 = _mm_set1_ps(128.0f); + + DECL_ALIGN(16) unsigned int Ybuf[8] ATTR_ALIGN(16); + DECL_ALIGN(16) unsigned short Ubuf[8] ATTR_ALIGN(16); + DECL_ALIGN(16) unsigned short Vbuf[8] ATTR_ALIGN(16); + + for (int height = 0; height < (int)dstHeight; height += 2) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDstLuma = pDstLumaImage; + vx_uint8 * pLocalDstChroma = pDstChromaImage; + + for (int width = 0; width < (alignedWidth >> 2); width++) + { + row0 = _mm_loadu_si128((__m128i*)(pLocalSrc)); + row1 = _mm_loadu_si128((__m128i*)(pLocalSrc + srcImageStrideInBytes)); + + row0 = _mm_shuffle_epi8(row0, mask); + row1 = _mm_shuffle_epi8(row1, mask); + + // R0..R3 + weights_toY = _mm_set_ps1(0.2126f); + weights_toU = _mm_set_ps1(-0.1146f); + weights_toV = _mm_set_ps1(0.5f); + tempI = _mm_and_si128(row0, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + Y0 = _mm_mul_ps(temp, weights_toY); + U0 = _mm_mul_ps(temp, weights_toU); + V0 = _mm_mul_ps(temp, weights_toV); + + tempI = _mm_and_si128(row1, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + Y1 = _mm_mul_ps(temp, weights_toY); + U1 = _mm_mul_ps(temp, weights_toU); + V1 = _mm_mul_ps(temp, weights_toV); + + // G0..G3 + weights_toY = _mm_set_ps1(0.7152f); + weights_toU = _mm_set_ps1(-0.3854f); + weights_toV = _mm_set_ps1(-0.4542f); + row0 = _mm_srli_si128(row0, 1); + tempI = _mm_and_si128(row0, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y0 = _mm_add_ps(Y0, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U0 = _mm_add_ps(U0, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V0 = _mm_add_ps(V0, temp2); + + row1 = _mm_srli_si128(row1, 1); + tempI = _mm_and_si128(row1, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y1 = _mm_add_ps(Y1, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U1 = _mm_add_ps(U1, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V1 = _mm_add_ps(V1, temp2); + + // B0..B3 + weights_toY = _mm_set_ps1(0.0722f); + weights_toU = _mm_set_ps1(0.5f); + weights_toV = _mm_set_ps1(-0.0458f); + row0 = _mm_srli_si128(row0, 1); + tempI = _mm_and_si128(row0, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y0 = _mm_add_ps(Y0, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U0 = _mm_add_ps(U0, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V0 = _mm_add_ps(V0, temp2); + + row1 = _mm_srli_si128(row1, 1); + tempI = _mm_and_si128(row1, cvtmask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y1 = _mm_add_ps(Y1, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U1 = _mm_add_ps(U1, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V1 = _mm_add_ps(V1, temp2); + + tempI = _mm_cvttps_epi32(Y0); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + row1 = _mm_cvttps_epi32(Y1); + row1 = _mm_packus_epi32(row1, row1); + row1 = _mm_packus_epi16(row1, row1); + _mm_store_si128((__m128i *)Ybuf, tempI); + _mm_store_si128((__m128i *)(Ybuf + 4), row1); + + // u00 u01 u02 u03 + // u10 u11 u12 u13 + U0 = _mm_add_ps(U0, 
const128); + U1 = _mm_add_ps(U1, const128); + tempI = _mm_cvttps_epi32(U0); + tempI = _mm_packus_epi32(tempI, tempI); + row1 = _mm_cvttps_epi32(U1); + row1 = _mm_packus_epi32(row1, row1); + tempI = _mm_avg_epu16(tempI, row1); // Average u00, u10; u01, u11 ... + //tempI = _mm_haddd_epu16(tempI); // TBD: XOP instruction - not supported on all platforms + tempI = _mm_hadd_epi16(tempI, tempI); // Average horizontally + tempI = _mm_cvtepi16_epi32(tempI); + row0 = _mm_set1_epi16(1); + tempI = _mm_add_epi16(tempI, row0); + tempI = _mm_srli_epi16(tempI, 1); + tempI = _mm_packus_epi16(tempI, tempI); + _mm_store_si128((__m128i *)Ubuf, tempI); + + // v00 v01 v02 v03 + // v10 v11 v12 v13 + V0 = _mm_add_ps(V0, const128); + V1 = _mm_add_ps(V1, const128); + tempI = _mm_cvttps_epi32(V0); + tempI = _mm_packus_epi32(tempI, tempI); + row1 = _mm_cvttps_epi32(V1); + row1 = _mm_packus_epi32(row1, row1); + tempI = _mm_avg_epu16(tempI, row1); // Average u00, u10; u01, u11 ... + //tempI = _mm_haddd_epu16(tempI); // TBD: XOP instruction - not supported on all platforms + tempI = _mm_hadd_epi16(tempI, tempI); // Average horizontally + tempI = _mm_cvtepi16_epi32(tempI); + tempI = _mm_add_epi16(tempI, row0); + tempI = _mm_srli_epi16(tempI, 1); + tempI = _mm_packus_epi16(tempI, tempI); + _mm_store_si128((__m128i *)Vbuf, tempI); + + *(unsigned int *)(pLocalDstLuma) = Ybuf[0]; + *(unsigned int *)(pLocalDstLuma + dstLumaImageStrideInBytes) = Ybuf[4]; + *(unsigned int *)(pLocalDstChroma) = Ubuf[0] | (Vbuf[0] << 8) | (Ubuf[1] << 16) | (Vbuf[1] << 24); + + pLocalSrc += 12; + pLocalDstLuma += 4; + pLocalDstChroma += 4; + } + + for (int width = 0; width < postfixWidth; width += 2) + { + float R = (float)*(pLocalSrc); + float G = (float)*(pLocalSrc + 1); + float B = (float)*(pLocalSrc + 2); + + *pLocalDstLuma = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + float U = (R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f; + float V = (R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f; + + R = (float)*(pLocalSrc + 3); + G = (float)*(pLocalSrc + 4); + B = (float)*(pLocalSrc + 5); + + *(pLocalDstLuma + 1) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + R = (float)*(pLocalSrc + srcImageStrideInBytes); + G = (float)*(pLocalSrc + srcImageStrideInBytes + 1); + B = (float)*(pLocalSrc + srcImageStrideInBytes + 2); + + *(pLocalDstLuma + dstLumaImageStrideInBytes) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + R = (float)*(pLocalSrc + srcImageStrideInBytes + 3); + G = (float)*(pLocalSrc + srcImageStrideInBytes + 4); + B = (float)*(pLocalSrc + srcImageStrideInBytes + 5); + + *(pLocalDstLuma + dstLumaImageStrideInBytes + 1) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + U /= 4.0f; V /= 4.0f; + + *pLocalDstChroma++ = (vx_uint8)U; + *pLocalDstChroma++ = (vx_uint8)V; + + pLocalSrc += 6; + pLocalDstLuma += 2; + } + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); + pDstLumaImage += (dstLumaImageStrideInBytes + dstLumaImageStrideInBytes); + pDstChromaImage += dstChromaImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_Y_RGB + ( + vx_uint32 dstWidth, + vx_uint32 
dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataColorConvert; + + __m128i pixels0, pixels1, pixels2, R, G, B; + __m128i mask1 = _mm_load_si128(tbl + 21); + __m128i mask2 = _mm_load_si128(tbl + 22); + __m128i mask3 = _mm_load_si128(tbl + 23); + __m128 weights_R = _mm_set_ps1((float) 0.2126); + __m128 weights_G = _mm_set_ps1((float) 0.7152); + __m128 weights_B = _mm_set_ps1((float) 0.0722); + __m128 temp, Y; + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstYImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + pixels0 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + + R = _mm_shuffle_epi8(pixels0, mask1); // 0 0 0 0 0 0 0 0 0 0 R5 R4 R3 R2 R1 R0 + G = _mm_shuffle_epi8(pixels0, mask3); // 0 0 0 0 0 0 0 0 0 0 0 G4 G3 G2 G1 G0 + B = _mm_shuffle_epi8(pixels0, mask2); // 0 0 0 0 0 0 0 0 0 0 0 B4 B3 B2 B1 B0 + + pixels0 = _mm_shuffle_epi8(pixels1, mask2); // 0 0 0 0 0 0 0 0 0 0 0 0 R10 R9 R8 R7 R6 + pixels0 = _mm_slli_si128(pixels0, 6); + R = _mm_or_si128(R, pixels0); // 0 0 0 0 0 R10 R9 R8 R7 R6 R5 R4 R3 R2 R1 R0 + pixels0 = _mm_shuffle_epi8(pixels1, mask1); // 0 0 0 0 0 0 0 0 0 0 G10 G9 G8 G7 G6 G5 + pixels0 = _mm_slli_si128(pixels0, 5); + G = _mm_or_si128(G, pixels0); // 0 0 0 0 0 G10 G9 G8 G7 G6 G5 G4 G3 G2 G1 G0 + pixels0 = _mm_shuffle_epi8(pixels1, mask3); // 0 0 0 0 0 0 0 0 0 0 0 B9 B8 B7 B6 B5 + pixels0 = _mm_slli_si128(pixels0, 5); + B = _mm_or_si128(B, pixels0); // 0 0 0 0 0 0 B9 B8 B7 B6 B5 B4 B3 B2 B1 B0 + + pixels0 = _mm_shuffle_epi8(pixels2, mask3); // 0 0 0 0 0 0 0 0 0 0 0 R15 R14 R13 R12 R11 + pixels0 = _mm_slli_si128(pixels0, 11); + R = _mm_or_si128(R, pixels0); // R15 R14 R13 R12 R11 R10 R9 R8 R7 R6 R5 R4 R3 R2 R1 R0 + pixels0 = _mm_shuffle_epi8(pixels2, mask2); // 0 0 0 0 0 0 0 0 0 0 0 G15 G14 G13 G12 G11 + pixels0 = _mm_slli_si128(pixels0, 11); + G = _mm_or_si128(G, pixels0); // G15 G14 G13 G12 G11 G10 G9 G8 G7 G6 G5 G4 G3 G2 G1 G0 + pixels0 = _mm_shuffle_epi8(pixels2, mask1); // 0 0 0 0 0 0 0 0 0 0 B15 B14 B13 B12 B11 B10 + pixels0 = _mm_slli_si128(pixels0, 10); + B = _mm_or_si128(B, pixels0); // B15 B14 B13 B12 B11 B10 B9 B8 B7 B6 B5 B4 B3 B2 B1 B0 + + // For pixels 0..3 + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R0..R3 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G0..G3 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B0..B3 + Y = _mm_add_ps(Y, temp); + pixels0 = _mm_cvttps_epi32(Y); + + // For pixels 4..7 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R4..R7 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G4..G7 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B4..B7 + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_cvttps_epi32(Y); + pixels0 = 
_mm_packus_epi32(pixels0, pixels1); + + // For pixels 8..11 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R8..R11 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G8..G11 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B8..B11 + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_cvttps_epi32(Y); + + // For pixels 12..15 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R12..R15 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G12..G15 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B12..B15 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvttps_epi32(Y); + pixels1 = _mm_packus_epi32(pixels1, pixels2); + + pixels0 = _mm_packus_epi16(pixels0, pixels1); + _mm_storeu_si128((__m128i *)pLocalDst, pixels0); + + pLocalSrc += 48; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + float R = (float)*pLocalSrc++; + float G = (float)*pLocalSrc++; + float B = (float)*pLocalSrc++; + + *pLocalDst++ = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + } + + pSrcImage += srcImageStrideInBytes; + pDstYImage += dstYImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_U_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataColorConvert; + + __m128i pixels0, pixels1, pixels2, R, G, B; + __m128i mask1 = _mm_load_si128(tbl + 21); + __m128i mask2 = _mm_load_si128(tbl + 22); + __m128i mask3 = _mm_load_si128(tbl + 23); + __m128i offset = _mm_set1_epi32((int) 128); + __m128 weights_R = _mm_set_ps1((float) -0.1146); + __m128 weights_G = _mm_set_ps1((float) -0.3854); + __m128 weights_B = _mm_set_ps1((float) 0.5); + __m128 temp, Y; + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstUImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + pixels0 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + + R = _mm_shuffle_epi8(pixels0, mask1); // 0 0 0 0 0 0 0 0 0 0 R5 R4 R3 R2 R1 R0 + G = _mm_shuffle_epi8(pixels0, mask3); // 0 0 0 0 0 0 0 0 0 0 0 G4 G3 G2 G1 G0 + B = _mm_shuffle_epi8(pixels0, mask2); // 0 0 0 0 0 0 0 0 0 0 0 B4 B3 B2 B1 B0 + + pixels0 = _mm_shuffle_epi8(pixels1, mask2); // 0 0 0 0 0 0 0 0 0 0 0 0 R10 R9 R8 R7 R6 + pixels0 = _mm_slli_si128(pixels0, 6); + R = _mm_or_si128(R, pixels0); // 0 0 0 0 0 R10 R9 R8 R7 R6 R5 R4 R3 R2 R1 R0 + pixels0 = _mm_shuffle_epi8(pixels1, mask1); // 0 0 0 0 0 0 0 0 0 0 G10 G9 G8 G7 G6 G5 + pixels0 = _mm_slli_si128(pixels0, 5); + G = _mm_or_si128(G, pixels0); // 0 0 0 0 0 G10 G9 G8 G7 G6 G5 G4 G3 G2 G1 G0 + pixels0 = _mm_shuffle_epi8(pixels1, mask3); // 0 0 0 0 0 0 0 0 0 0 0 B9 B8 B7 B6 B5 + pixels0 = 
_mm_slli_si128(pixels0, 5); + B = _mm_or_si128(B, pixels0); // 0 0 0 0 0 0 B9 B8 B7 B6 B5 B4 B3 B2 B1 B0 + + pixels0 = _mm_shuffle_epi8(pixels2, mask3); // 0 0 0 0 0 0 0 0 0 0 0 R15 R14 R13 R12 R11 + pixels0 = _mm_slli_si128(pixels0, 11); + R = _mm_or_si128(R, pixels0); // R15 R14 R13 R12 R11 R10 R9 R8 R7 R6 R5 R4 R3 R2 R1 R0 + pixels0 = _mm_shuffle_epi8(pixels2, mask2); // 0 0 0 0 0 0 0 0 0 0 0 G15 G14 G13 G12 G11 + pixels0 = _mm_slli_si128(pixels0, 11); + G = _mm_or_si128(G, pixels0); // G15 G14 G13 G12 G11 G10 G9 G8 G7 G6 G5 G4 G3 G2 G1 G0 + pixels0 = _mm_shuffle_epi8(pixels2, mask1); // 0 0 0 0 0 0 0 0 0 0 B15 B14 B13 B12 B11 B10 + pixels0 = _mm_slli_si128(pixels0, 10); + B = _mm_or_si128(B, pixels0); // B15 B14 B13 B12 B11 B10 B9 B8 B7 B6 B5 B4 B3 B2 B1 B0 + + // For pixels 0..3 + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R0..R3 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G0..G3 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B0..B3 + Y = _mm_add_ps(Y, temp); + pixels0 = _mm_cvttps_epi32(Y); + pixels0 = _mm_add_epi32(pixels0, offset); + + // For pixels 4..7 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R4..R7 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G4..G7 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B4..B7 + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_cvttps_epi32(Y); + pixels1 = _mm_add_epi32(pixels1, offset); + pixels0 = _mm_packus_epi32(pixels0, pixels1); + + // For pixels 8..11 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R8..R11 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G8..G11 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B8..B11 + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_cvttps_epi32(Y); + pixels1 = _mm_add_epi32(pixels1, offset); + + // For pixels 12..15 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R12..R15 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G12..G15 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B12..B15 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvttps_epi32(Y); + pixels2 = _mm_add_epi32(pixels2, offset); + pixels1 = _mm_packus_epi32(pixels1, pixels2); + + pixels0 = _mm_packus_epi16(pixels0, pixels1); + _mm_storeu_si128((__m128i *)pLocalDst, pixels0); + + pLocalSrc += 48; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + float R = (float)*pLocalSrc++; + float G = (float)*pLocalSrc++; + float B = (float)*pLocalSrc++; + + *pLocalDst++ = (vx_uint8)((R * -0.1146f) + (G * -0.3854) + (B * 0.5f) + 128.0f); + } + + pSrcImage += 
srcImageStrideInBytes; + pDstUImage += dstUImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_V_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataColorConvert; + + __m128i pixels0, pixels1, pixels2, R, G, B; + __m128i mask1 = _mm_load_si128(tbl + 21); + __m128i mask2 = _mm_load_si128(tbl + 22); + __m128i mask3 = _mm_load_si128(tbl + 23); + __m128i offset = _mm_set1_epi32((int)128); + __m128 weights_R = _mm_set_ps1((float) 0.5); + __m128 weights_G = _mm_set_ps1((float)-0.4542); + __m128 weights_B = _mm_set_ps1((float)-0.0458); + __m128 temp, Y; + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstVImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + pixels0 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + + R = _mm_shuffle_epi8(pixels0, mask1); // 0 0 0 0 0 0 0 0 0 0 R5 R4 R3 R2 R1 R0 + G = _mm_shuffle_epi8(pixels0, mask3); // 0 0 0 0 0 0 0 0 0 0 0 G4 G3 G2 G1 G0 + B = _mm_shuffle_epi8(pixels0, mask2); // 0 0 0 0 0 0 0 0 0 0 0 B4 B3 B2 B1 B0 + + pixels0 = _mm_shuffle_epi8(pixels1, mask2); // 0 0 0 0 0 0 0 0 0 0 0 0 R10 R9 R8 R7 R6 + pixels0 = _mm_slli_si128(pixels0, 6); + R = _mm_or_si128(R, pixels0); // 0 0 0 0 0 R10 R9 R8 R7 R6 R5 R4 R3 R2 R1 R0 + pixels0 = _mm_shuffle_epi8(pixels1, mask1); // 0 0 0 0 0 0 0 0 0 0 G10 G9 G8 G7 G6 G5 + pixels0 = _mm_slli_si128(pixels0, 5); + G = _mm_or_si128(G, pixels0); // 0 0 0 0 0 G10 G9 G8 G7 G6 G5 G4 G3 G2 G1 G0 + pixels0 = _mm_shuffle_epi8(pixels1, mask3); // 0 0 0 0 0 0 0 0 0 0 0 B9 B8 B7 B6 B5 + pixels0 = _mm_slli_si128(pixels0, 5); + B = _mm_or_si128(B, pixels0); // 0 0 0 0 0 0 B9 B8 B7 B6 B5 B4 B3 B2 B1 B0 + + pixels0 = _mm_shuffle_epi8(pixels2, mask3); // 0 0 0 0 0 0 0 0 0 0 0 R15 R14 R13 R12 R11 + pixels0 = _mm_slli_si128(pixels0, 11); + R = _mm_or_si128(R, pixels0); // R15 R14 R13 R12 R11 R10 R9 R8 R7 R6 R5 R4 R3 R2 R1 R0 + pixels0 = _mm_shuffle_epi8(pixels2, mask2); // 0 0 0 0 0 0 0 0 0 0 0 G15 G14 G13 G12 G11 + pixels0 = _mm_slli_si128(pixels0, 11); + G = _mm_or_si128(G, pixels0); // G15 G14 G13 G12 G11 G10 G9 G8 G7 G6 G5 G4 G3 G2 G1 G0 + pixels0 = _mm_shuffle_epi8(pixels2, mask1); // 0 0 0 0 0 0 0 0 0 0 B15 B14 B13 B12 B11 B10 + pixels0 = _mm_slli_si128(pixels0, 10); + B = _mm_or_si128(B, pixels0); // B15 B14 B13 B12 B11 B10 B9 B8 B7 B6 B5 B4 B3 B2 B1 B0 + + // For pixels 0..3 + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R0..R3 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G0..G3 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B0..B3 + Y = _mm_add_ps(Y, temp); + pixels0 = _mm_cvttps_epi32(Y); + pixels0 = _mm_add_epi32(pixels0, offset); + + // For pixels 4..7 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R4..R7 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, 
weights_G); // G4..G7 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B4..B7 + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_cvttps_epi32(Y); + pixels1 = _mm_add_epi32(pixels1, offset); + pixels0 = _mm_packus_epi32(pixels0, pixels1); + + // For pixels 8..11 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R8..R11 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G8..G11 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B8..B11 + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_cvttps_epi32(Y); + pixels1 = _mm_add_epi32(pixels1, offset); + + // For pixels 12..15 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + pixels2 = _mm_cvtepu8_epi32(R); + temp = _mm_cvtepi32_ps(pixels2); // R12..R15 + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_cvtepu8_epi32(G); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_G); // G12..G15 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvtepu8_epi32(B); + temp = _mm_cvtepi32_ps(pixels2); + temp = _mm_mul_ps(temp, weights_B); // B12..B15 + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvttps_epi32(Y); + pixels2 = _mm_add_epi32(pixels2, offset); + pixels1 = _mm_packus_epi32(pixels1, pixels2); + + pixels0 = _mm_packus_epi16(pixels0, pixels1); + _mm_storeu_si128((__m128i *)pLocalDst, pixels0); + + pLocalSrc += 48; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + float R = (float)*pLocalSrc++; + float G = (float)*pLocalSrc++; + float B = (float)*pLocalSrc++; + + *pLocalDst++ = (vx_uint8)((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + } + + pSrcImage += srcImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_Y_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i pixels0, pixels1, pixels2, pixels3, tempI; + __m128i mask = _mm_set_epi8((char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF); + __m128 weights_R = _mm_set_ps1((float) 0.2126); + __m128 weights_G = _mm_set_ps1((float) 0.7152); + __m128 weights_B = _mm_set_ps1((float) 0.0722); + __m128 temp, Y; + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstYImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + pixels0 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + pixels3 = _mm_loadu_si128((__m128i *)(pLocalSrc + 48)); + + // For pixels 0..3 + tempI = _mm_and_si128(pixels0, mask); // R0..R3 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels0 = _mm_srli_si128(pixels0, 1); + tempI = _mm_and_si128(pixels0, mask); // G0..G3 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels0 = _mm_srli_si128(pixels0, 
1); + tempI = _mm_and_si128(pixels0, mask); // B0..B3 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels0 = _mm_cvttps_epi32(Y); + + // For pixels 4..7 + tempI = _mm_and_si128(pixels1, mask); // R4..R7 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels1 = _mm_srli_si128(pixels1, 1); + tempI = _mm_and_si128(pixels1, mask); // G4..G7 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_srli_si128(pixels1, 1); + tempI = _mm_and_si128(pixels1, mask); // B4..B7 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_cvttps_epi32(Y); + pixels0 = _mm_packus_epi32(pixels0, pixels1); + + // For pixels 8..11 + tempI = _mm_and_si128(pixels2, mask); // R8..R11 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_srli_si128(pixels2, 1); + tempI = _mm_and_si128(pixels2, mask); // G8..G11 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_srli_si128(pixels2, 1); + tempI = _mm_and_si128(pixels2, mask); // B8..B11 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvttps_epi32(Y); + + // For pixels 12..15 + tempI = _mm_and_si128(pixels3, mask); // R12..R15 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels3 = _mm_srli_si128(pixels3, 1); + tempI = _mm_and_si128(pixels3, mask); // G12..G15 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels3 = _mm_srli_si128(pixels3, 1); + tempI = _mm_and_si128(pixels3, mask); // B12..B15 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels3 = _mm_cvttps_epi32(Y); + pixels1 = _mm_packus_epi32(pixels2, pixels3); + + pixels0 = _mm_packus_epi16(pixels0, pixels1); + _mm_storeu_si128((__m128i *)pLocalDst, pixels0); + + pLocalSrc += 64; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + float R = (float)*pLocalSrc++; + float G = (float)*pLocalSrc++; + float B = (float)*pLocalSrc++; + pLocalSrc++; + + *pLocalDst++ = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + } + + pSrcImage += srcImageStrideInBytes; + pDstYImage += dstYImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_U_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i pixels0, pixels1, pixels2, pixels3, tempI; + __m128i mask = _mm_set_epi8((char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF); + __m128i offset = _mm_set1_epi32((int)128); + __m128 weights_R = _mm_set_ps1((float) -0.1146); + __m128 weights_G = _mm_set_ps1((float) -0.3854); + __m128 weights_B = _mm_set_ps1((float) 0.5); + __m128 temp, Y; + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstUImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + pixels0 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i 
*)(pLocalSrc + 32)); + pixels3 = _mm_loadu_si128((__m128i *)(pLocalSrc + 48)); + + // For pixels 0..3 + tempI = _mm_and_si128(pixels0, mask); // R0..R3 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels0 = _mm_srli_si128(pixels0, 1); + tempI = _mm_and_si128(pixels0, mask); // G0..G3 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels0 = _mm_srli_si128(pixels0, 1); + tempI = _mm_and_si128(pixels0, mask); // B0..B3 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels0 = _mm_cvttps_epi32(Y); + pixels0 = _mm_add_epi32(pixels0, offset); + + // For pixels 4..7 + tempI = _mm_and_si128(pixels1, mask); // R4..R7 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels1 = _mm_srli_si128(pixels1, 1); + tempI = _mm_and_si128(pixels1, mask); // G4..G7 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_srli_si128(pixels1, 1); + tempI = _mm_and_si128(pixels1, mask); // B4..B7 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_cvttps_epi32(Y); + pixels1 = _mm_add_epi32(pixels1, offset); + pixels0 = _mm_packus_epi32(pixels0, pixels1); + + // For pixels 8..11 + tempI = _mm_and_si128(pixels2, mask); // R8..R11 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_srli_si128(pixels2, 1); + tempI = _mm_and_si128(pixels2, mask); // G8..G11 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_srli_si128(pixels2, 1); + tempI = _mm_and_si128(pixels2, mask); // B8..B11 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvttps_epi32(Y); + pixels2 = _mm_add_epi32(pixels2, offset); + + // For pixels 12..15 + tempI = _mm_and_si128(pixels3, mask); // R12..R15 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels3 = _mm_srli_si128(pixels3, 1); + tempI = _mm_and_si128(pixels3, mask); // G12..G15 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels3 = _mm_srli_si128(pixels3, 1); + tempI = _mm_and_si128(pixels3, mask); // B12..B15 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels3 = _mm_cvttps_epi32(Y); + pixels3 = _mm_add_epi32(pixels3, offset); + pixels1 = _mm_packus_epi32(pixels2, pixels3); + + pixels0 = _mm_packus_epi16(pixels0, pixels1); + _mm_store_si128((__m128i *)pLocalDst, pixels0); + + pLocalSrc += 64; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + float R = (float)*pLocalSrc++; + float G = (float)*pLocalSrc++; + float B = (float)*pLocalSrc++; + pLocalSrc++; + + *pLocalDst++ = (vx_uint8)((R * -0.1146f) + (G * -0.3854) + (B * 0.5f) + 128.0f); + } + + pSrcImage += srcImageStrideInBytes; + pDstUImage += dstUImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_V_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i pixels0, pixels1, pixels2, pixels3, tempI; + __m128i mask = _mm_set_epi8((char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF, (char)0, 
(char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF); + __m128i offset = _mm_set1_epi32((int)128); + __m128 weights_R = _mm_set_ps1((float) 0.5); + __m128 weights_G = _mm_set_ps1((float)-0.4542); + __m128 weights_B = _mm_set_ps1((float)-0.0458); + __m128 temp, Y; + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDst = pDstVImage; + + for (int width = 0; width < (alignedWidth>>4); width++) + { + pixels0 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + pixels3 = _mm_loadu_si128((__m128i *)(pLocalSrc + 48)); + + // For pixels 0..3 + tempI = _mm_and_si128(pixels0, mask); // R0..R3 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels0 = _mm_srli_si128(pixels0, 1); + tempI = _mm_and_si128(pixels0, mask); // G0..G3 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels0 = _mm_srli_si128(pixels0, 1); + tempI = _mm_and_si128(pixels0, mask); // B0..B3 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels0 = _mm_cvttps_epi32(Y); + pixels0 = _mm_add_epi32(pixels0, offset); + + // For pixels 4..7 + tempI = _mm_and_si128(pixels1, mask); // R4..R7 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels1 = _mm_srli_si128(pixels1, 1); + tempI = _mm_and_si128(pixels1, mask); // G4..G7 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_srli_si128(pixels1, 1); + tempI = _mm_and_si128(pixels1, mask); // B4..B7 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels1 = _mm_cvttps_epi32(Y); + pixels1 = _mm_add_epi32(pixels1, offset); + pixels0 = _mm_packus_epi32(pixels0, pixels1); + + // For pixels 8..11 + tempI = _mm_and_si128(pixels2, mask); // R8..R11 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels2 = _mm_srli_si128(pixels2, 1); + tempI = _mm_and_si128(pixels2, mask); // G8..G11 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_srli_si128(pixels2, 1); + tempI = _mm_and_si128(pixels2, mask); // B8..B11 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels2 = _mm_cvttps_epi32(Y); + pixels2 = _mm_add_epi32(pixels2, offset); + + // For pixels 12..15 + tempI = _mm_and_si128(pixels3, mask); // R12..R15 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_R); + pixels3 = _mm_srli_si128(pixels3, 1); + tempI = _mm_and_si128(pixels3, mask); // G12..G15 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_G); + Y = _mm_add_ps(Y, temp); + pixels3 = _mm_srli_si128(pixels3, 1); + tempI = _mm_and_si128(pixels3, mask); // B12..B15 + temp = _mm_cvtepi32_ps(tempI); + temp = _mm_mul_ps(temp, weights_B); + Y = _mm_add_ps(Y, temp); + pixels3 = _mm_cvttps_epi32(Y); + pixels3 = _mm_add_epi32(pixels3, offset); + pixels1 = _mm_packus_epi32(pixels2, pixels3); + + pixels0 = _mm_packus_epi16(pixels0, pixels1); + _mm_store_si128((__m128i *)pLocalDst, pixels0); + + pLocalSrc += 64; + pLocalDst += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + float R = (float)*pLocalSrc++; + float G = (float)*pLocalSrc++; + float B = (float)*pLocalSrc++; + pLocalSrc++; + + *pLocalDst++ = (vx_uint8)((R * 0.5f) + (G 
* -0.4542f) + (B * -0.0458f) + 128.0f); + } + + pSrcImage += srcImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_YUV4_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i pixels, tempI; + __m128 Y, U, V, weights_toY, weights_toU, weights_toV, temp; + + for (int height = 0; height < (int)dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDstY = pDstYImage; + vx_uint8 * pLocalDstU = pDstUImage; + vx_uint8 * pLocalDstV = pDstVImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + __m128i Yout = _mm_setzero_si128(); + __m128i Uout = _mm_setzero_si128(); + __m128i Vout = _mm_setzero_si128(); + + for (int i = 0; i < 4; i++) + { + pixels = _mm_loadu_si128((__m128i *) pLocalSrc); + + weights_toY = _mm_set_ps1(0.2126f); + weights_toU = _mm_set_ps1(-0.1146f); + weights_toV = _mm_set_ps1(0.5f); + tempI = _mm_and_si128(pixels, _mm_set1_epi32((int)0x000000FF)); // R0..R3 + temp = _mm_cvtepi32_ps(tempI); + Y = _mm_mul_ps(temp, weights_toY); + U = _mm_mul_ps(temp, weights_toU); + V = _mm_mul_ps(temp, weights_toV); + + weights_toY = _mm_set_ps1(0.7152f); + weights_toU = _mm_set_ps1(-0.3854f); + weights_toV = _mm_set_ps1(-0.4542f); + pixels = _mm_srli_si128(pixels, 1); + tempI = _mm_and_si128(pixels, _mm_set1_epi32((int)0x000000FF)); // G0..G3 + temp = _mm_cvtepi32_ps(tempI); + weights_toY = _mm_mul_ps(temp, weights_toY); + Y = _mm_add_ps(Y, weights_toY); + weights_toY = _mm_mul_ps(temp, weights_toU); + U = _mm_add_ps(U, weights_toY); + weights_toY = _mm_mul_ps(temp, weights_toV); + V = _mm_add_ps(V, weights_toY); + + weights_toY = _mm_set_ps1(0.0722f); + weights_toU = _mm_set_ps1(0.5f); + weights_toV = _mm_set_ps1(-0.0458f); + pixels = _mm_srli_si128(pixels, 1); + tempI = _mm_and_si128(pixels, _mm_set1_epi32((int)0x000000FF)); // B0..B3 + temp = _mm_cvtepi32_ps(tempI); + weights_toY = _mm_mul_ps(temp, weights_toY); + Y = _mm_add_ps(Y, weights_toY); + weights_toY = _mm_mul_ps(temp, weights_toU); + U = _mm_add_ps(U, weights_toY); + weights_toY = _mm_mul_ps(temp, weights_toV); + V = _mm_add_ps(V, weights_toY); + + tempI = _mm_cvtps_epi32(Y); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + tempI = _mm_and_si128(tempI, _mm_set_epi32((int)0xFFFFFFFF, 0, 0, 0)); + Yout = _mm_srli_si128(Yout, 4); + Yout = _mm_or_si128(Yout, tempI); + + tempI = _mm_cvtps_epi32(U); + tempI = _mm_add_epi32(tempI, _mm_set1_epi32((int)128)); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + tempI = _mm_and_si128(tempI, _mm_set_epi32((int)0xFFFFFFFF, 0, 0, 0)); + Uout = _mm_srli_si128(Uout, 4); + Uout = _mm_or_si128(Uout, tempI); + + tempI = _mm_cvtps_epi32(V); + tempI = _mm_add_epi32(tempI, _mm_set1_epi32((int)128)); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + tempI = _mm_and_si128(tempI, _mm_set_epi32((int)0xFFFFFFFF, 0, 0, 0)); + Vout = _mm_srli_si128(Vout, 4); + Vout = _mm_or_si128(Vout, tempI); + + pixels = _mm_srli_si128(pixels, 1); + pLocalSrc += 16; + } + + _mm_storeu_si128((__m128i *) pLocalDstY, Yout); + _mm_storeu_si128((__m128i *) pLocalDstU, Uout); + 
_mm_storeu_si128((__m128i *) pLocalDstV, Vout); + + pLocalDstY += 16; + pLocalDstU += 16; + pLocalDstV += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + float R = (float)*pLocalSrc++; + float G = (float)*pLocalSrc++; + float B = (float)*pLocalSrc++; + pLocalSrc++; + + *pLocalDstY++ = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + *pLocalDstU++ = (vx_uint8)((R * -0.1146f) + (G * -0.3854) + (B * 0.5f) + 128.0f); + *pLocalDstV++ = (vx_uint8)((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + } + + pSrcImage += srcImageStrideInBytes; + pDstYImage += dstYImageStrideInBytes; + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_IYUV_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~3; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i row0, row1, tempI; + __m128i addToChroma = _mm_set1_epi32((int)128); + __m128i mask = _mm_set_epi8((char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF); + __m128 Y0, U0, V0, Y1, U1, V1, weights_toY, weights_toU, weights_toV, temp, temp2; + + DECL_ALIGN(16) unsigned int Ybuf[8] ATTR_ALIGN(16); + DECL_ALIGN(16) unsigned short Ubuf[8] ATTR_ALIGN(16); + DECL_ALIGN(16) unsigned short Vbuf[8] ATTR_ALIGN(16); + + for (int height = 0; height < (int) dstHeight; height += 2) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDstY = pDstYImage; + vx_uint8 * pLocalDstU = pDstUImage; + vx_uint8 * pLocalDstV = pDstVImage; + + for (int width = 0; width < (alignedWidth >> 2); width++) + { + row0 = _mm_load_si128((__m128i *)pLocalSrc); + row1 = _mm_load_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes)); + + // R0..R3 + weights_toY = _mm_set_ps1(0.2126f); + weights_toU = _mm_set_ps1(-0.1146f); + weights_toV = _mm_set_ps1(0.5f); + tempI = _mm_and_si128(row0, mask); + temp = _mm_cvtepi32_ps(tempI); + Y0 = _mm_mul_ps(temp, weights_toY); + U0 = _mm_mul_ps(temp, weights_toU); + V0 = _mm_mul_ps(temp, weights_toV); + + tempI = _mm_and_si128(row1, mask); + temp = _mm_cvtepi32_ps(tempI); + Y1 = _mm_mul_ps(temp, weights_toY); + U1 = _mm_mul_ps(temp, weights_toU); + V1 = _mm_mul_ps(temp, weights_toV); + + // G0..G3 + weights_toY = _mm_set_ps1(0.7152f); + weights_toU = _mm_set_ps1(-0.3854f); + weights_toV = _mm_set_ps1(-0.4542f); + row0 = _mm_srli_si128(row0, 1); + tempI = _mm_and_si128(row0, mask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y0 = _mm_add_ps(Y0, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U0 = _mm_add_ps(U0, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V0 = _mm_add_ps(V0, temp2); + + row1 = _mm_srli_si128(row1, 1); + tempI = _mm_and_si128(row1, mask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y1 = _mm_add_ps(Y1, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U1 = _mm_add_ps(U1, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V1 = _mm_add_ps(V1, temp2); + + // G0..G3 + weights_toY = _mm_set_ps1(0.0722f); + weights_toU = _mm_set_ps1(0.5f); + weights_toV = _mm_set_ps1(-0.0458f); + row0 = _mm_srli_si128(row0, 1); + tempI = _mm_and_si128(row0, mask); + temp = 
_mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y0 = _mm_add_ps(Y0, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U0 = _mm_add_ps(U0, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V0 = _mm_add_ps(V0, temp2); + + row1 = _mm_srli_si128(row1, 1); + tempI = _mm_and_si128(row1, mask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y1 = _mm_add_ps(Y1, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U1 = _mm_add_ps(U1, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V1 = _mm_add_ps(V1, temp2); + + tempI = _mm_cvtps_epi32(Y0); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + row1 = _mm_cvtps_epi32(Y1); + row1 = _mm_packus_epi32(row1, row1); + row1 = _mm_packus_epi16(row1, row1); + _mm_store_si128((__m128i *)Ybuf, tempI); + _mm_store_si128((__m128i *)(Ybuf + 4), row1); + + // u00 u01 u02 u03 + // u10 u11 u12 u13 + tempI = _mm_cvtps_epi32(U0); + tempI = _mm_add_epi32(tempI, addToChroma); + tempI = _mm_packus_epi32(tempI, tempI); + row1 = _mm_cvtps_epi32(U1); + row1 = _mm_add_epi32(row1, addToChroma); + row1 = _mm_packus_epi32(row1, row1); + tempI = _mm_avg_epu16(tempI, row1); // Average u00, u10; u01, u11 ... + //tempI = _mm_haddd_epu16(tempI); // TBD: XOP instruction - not supported on all platforms + tempI = _mm_hadd_epi16(tempI, tempI); // Average horizontally + tempI = _mm_cvtepi16_epi32(tempI); + row0 = _mm_set1_epi32(1); + tempI = _mm_add_epi32(tempI, row0); + tempI = _mm_srli_epi32(tempI, 1); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + _mm_store_si128((__m128i *)Ubuf, tempI); + + // v00 v01 v02 v03 + // v10 v11 v12 v13 + tempI = _mm_cvtps_epi32(V0); + tempI = _mm_add_epi32(tempI, addToChroma); + tempI = _mm_packus_epi32(tempI, tempI); + row1 = _mm_cvtps_epi32(V1); + row1 = _mm_add_epi32(row1, addToChroma); + row1 = _mm_packus_epi32(row1, row1); + tempI = _mm_avg_epu16(tempI, row1); // Average u00, u10; u01, u11 ... 
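+ // V gets the same rounded 2x2 average as U; each iteration emits two V bytes into the half-resolution V plane.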
+ //tempI = _mm_haddd_epu16(tempI); // TBD: XOP instruction - not supported on all platforms + tempI = _mm_hadd_epi16(tempI, tempI); // Average horizontally + tempI = _mm_cvtepi16_epi32(tempI); + tempI = _mm_add_epi32(tempI, row0); + tempI = _mm_srli_epi32(tempI, 1); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + _mm_store_si128((__m128i *)Vbuf, tempI); + + *(unsigned int *)(pLocalDstY) = Ybuf[0]; + *(unsigned int *)(pLocalDstY + dstYImageStrideInBytes) = Ybuf[4]; + *(unsigned short *)(pLocalDstU) = Ubuf[0]; + *(unsigned short *)(pLocalDstV) = Vbuf[0]; + + pLocalSrc += 16; + pLocalDstY += 4; + pLocalDstU += 2; + pLocalDstV += 2; + } + + for (int width = 0; width < postfixWidth; width += 2) + { + float R = (float)*(pLocalSrc); + float G = (float)*(pLocalSrc + 1); + float B = (float)*(pLocalSrc + 2); + + *pLocalDstY = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + float U = (R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f; + float V = (R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f; + + R = (float)*(pLocalSrc + 4); + G = (float)*(pLocalSrc + 5); + B = (float)*(pLocalSrc + 6); + + *(pLocalDstY + 1) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + R = (float)*(pLocalSrc + srcImageStrideInBytes); + G = (float)*(pLocalSrc + srcImageStrideInBytes + 1); + B = (float)*(pLocalSrc + srcImageStrideInBytes + 2); + + *(pLocalDstY + dstYImageStrideInBytes) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + R = (float)*(pLocalSrc + srcImageStrideInBytes + 4); + G = (float)*(pLocalSrc + srcImageStrideInBytes + 5); + B = (float)*(pLocalSrc + srcImageStrideInBytes + 6); + + *(pLocalDstY + dstYImageStrideInBytes + 1) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + U /= 4.0f; V /= 4.0f; + + *pLocalDstU++ = (vx_uint8)U; + *pLocalDstY++ = (vx_uint8)V; + + pLocalSrc += 8; + pLocalDstY += 2; + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); + pDstYImage += (dstYImageStrideInBytes + dstYImageStrideInBytes); + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_NV12_RGBX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstLumaImage, + vx_uint32 dstLumaImageStrideInBytes, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~3; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i row0, row1, tempI; + __m128i addToChroma = _mm_set1_epi32((int)128); + __m128i mask = _mm_set_epi8((char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF, (char)0, (char)0, (char)0, (char)0xFF); + __m128 Y0, U0, V0, Y1, U1, V1, weights_toY, weights_toU, weights_toV, temp, temp2; + + DECL_ALIGN(16) unsigned int Ybuf[8] ATTR_ALIGN(16); + DECL_ALIGN(16) unsigned char Ubuf[16] ATTR_ALIGN(16); + DECL_ALIGN(16) unsigned char Vbuf[16] ATTR_ALIGN(16); + + for (int height = 0; height < (int) dstHeight; height += 2) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * 
pLocalDstLuma = pDstLumaImage; + vx_uint8 * pLocalDstChroma = pDstChromaImage; + + for (int width = 0; width < (alignedWidth >> 2); width++) + { + row0 = _mm_load_si128((__m128i *)pLocalSrc); + row1 = _mm_load_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes)); + + // R0..R3 + weights_toY = _mm_set_ps1(0.2126f); + weights_toU = _mm_set_ps1(-0.1146f); + weights_toV = _mm_set_ps1(0.5f); + tempI = _mm_and_si128(row0, mask); + temp = _mm_cvtepi32_ps(tempI); + Y0 = _mm_mul_ps(temp, weights_toY); + U0 = _mm_mul_ps(temp, weights_toU); + V0 = _mm_mul_ps(temp, weights_toV); + + tempI = _mm_and_si128(row1, mask); + temp = _mm_cvtepi32_ps(tempI); + Y1 = _mm_mul_ps(temp, weights_toY); + U1 = _mm_mul_ps(temp, weights_toU); + V1 = _mm_mul_ps(temp, weights_toV); + + // G0..G3 + weights_toY = _mm_set_ps1(0.7152f); + weights_toU = _mm_set_ps1(-0.3854f); + weights_toV = _mm_set_ps1(-0.4542f); + row0 = _mm_srli_si128(row0, 1); + tempI = _mm_and_si128(row0, mask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y0 = _mm_add_ps(Y0, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U0 = _mm_add_ps(U0, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V0 = _mm_add_ps(V0, temp2); + + row1 = _mm_srli_si128(row1, 1); + tempI = _mm_and_si128(row1, mask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y1 = _mm_add_ps(Y1, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U1 = _mm_add_ps(U1, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V1 = _mm_add_ps(V1, temp2); + + // G0..G3 + weights_toY = _mm_set_ps1(0.0722f); + weights_toU = _mm_set_ps1(0.5f); + weights_toV = _mm_set_ps1(-0.0458f); + row0 = _mm_srli_si128(row0, 1); + tempI = _mm_and_si128(row0, mask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y0 = _mm_add_ps(Y0, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U0 = _mm_add_ps(U0, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V0 = _mm_add_ps(V0, temp2); + + row1 = _mm_srli_si128(row1, 1); + tempI = _mm_and_si128(row1, mask); + temp = _mm_cvtepi32_ps(tempI); + temp2 = _mm_mul_ps(temp, weights_toY); + Y1 = _mm_add_ps(Y1, temp2); + temp2 = _mm_mul_ps(temp, weights_toU); + U1 = _mm_add_ps(U1, temp2); + temp2 = _mm_mul_ps(temp, weights_toV); + V1 = _mm_add_ps(V1, temp2); + + tempI = _mm_cvttps_epi32(Y0); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + row1 = _mm_cvttps_epi32(Y1); + row1 = _mm_packus_epi32(row1, row1); + row1 = _mm_packus_epi16(row1, row1); + _mm_store_si128((__m128i *)Ybuf, tempI); + _mm_store_si128((__m128i *)(Ybuf + 4), row1); + + // u00 u01 u02 u03 + // u10 u11 u12 u13 + tempI = _mm_cvttps_epi32(U0); + tempI = _mm_add_epi32(tempI, addToChroma); + tempI = _mm_packus_epi32(tempI, tempI); + row1 = _mm_cvttps_epi32(U1); + row1 = _mm_add_epi32(row1, addToChroma); + row1 = _mm_packus_epi32(row1, row1); + tempI = _mm_avg_epu16(tempI, row1); // Average u00, u10; u01, u11 ... 
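+ // U is averaged over each 2x2 block here; after the matching V average below, the resulting bytes are interleaved (U0 V0 U1 V1) into the single NV12 chroma row.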
+ //tempI = _mm_haddd_epu16(tempI); // TBD: XOP instruction - not supported on all platforms + tempI = _mm_hadd_epi16(tempI, tempI); // Average horizontally + tempI = _mm_cvtepi16_epi32(tempI); + row0 = _mm_set1_epi16(1); + tempI = _mm_add_epi16(tempI, row0); + tempI = _mm_srli_epi16(tempI, 1); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + _mm_store_si128((__m128i *)Ubuf, tempI); + + // v00 v01 v02 v03 + // v10 v11 v12 v13 + tempI = _mm_cvttps_epi32(V0); + tempI = _mm_add_epi32(tempI, addToChroma); + tempI = _mm_packus_epi32(tempI, tempI); + row1 = _mm_cvttps_epi32(V1); + row1 = _mm_add_epi32(row1, addToChroma); + row1 = _mm_packus_epi32(row1, row1); + tempI = _mm_avg_epu16(tempI, row1); // Average u00, u10; u01, u11 ... + //tempI = _mm_haddd_epu16(tempI); // TBD: XOP instruction - not supported on all platforms + tempI = _mm_hadd_epi16(tempI, tempI); // Average horizontally + tempI = _mm_cvtepi16_epi32(tempI); + tempI = _mm_add_epi16(tempI, row0); + tempI = _mm_srli_epi16(tempI, 1); + tempI = _mm_packus_epi32(tempI, tempI); + tempI = _mm_packus_epi16(tempI, tempI); + _mm_store_si128((__m128i *)Vbuf, tempI); + + *(unsigned int *) (pLocalDstLuma) = Ybuf[0]; + *(unsigned int *)(pLocalDstLuma + dstLumaImageStrideInBytes) = Ybuf[4]; + *(unsigned int *) (pLocalDstChroma) = Ubuf[0] | (Vbuf[0] << 8) | (Ubuf[1] << 16) | (Vbuf[1] << 24); + + pLocalSrc += 16; + pLocalDstLuma += 4; + pLocalDstChroma += 4; + } + + for (int width = 0; width < postfixWidth; width += 2) + { + float R = (float)*(pLocalSrc); + float G = (float)*(pLocalSrc + 1); + float B = (float)*(pLocalSrc + 2); + + *pLocalDstLuma = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + float U = (R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f; + float V = (R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f; + + R = (float)*(pLocalSrc + 4); + G = (float)*(pLocalSrc + 5); + B = (float)*(pLocalSrc + 6); + + *(pLocalDstLuma + 1) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + R = (float)*(pLocalSrc + srcImageStrideInBytes); + G = (float)*(pLocalSrc + srcImageStrideInBytes + 1); + B = (float)*(pLocalSrc + srcImageStrideInBytes + 2); + + *(pLocalDstLuma + dstLumaImageStrideInBytes) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + R = (float)*(pLocalSrc + srcImageStrideInBytes + 4); + G = (float)*(pLocalSrc + srcImageStrideInBytes + 5); + B = (float)*(pLocalSrc + srcImageStrideInBytes + 6); + + *(pLocalDstLuma + dstLumaImageStrideInBytes + 1) = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + U += ((R * -0.1146f) + (G * -0.3854f) + (B * 0.5f) + 128.0f); + V += ((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + + U /= 4.0f; V /= 4.0f; + + *pLocalDstChroma++ = (vx_uint8)U; + *pLocalDstChroma++ = (vx_uint8)V; + + pLocalSrc += 8; + pLocalDstLuma += 2; + } + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); + pDstLumaImage += (dstLumaImageStrideInBytes + dstLumaImageStrideInBytes); + pDstChromaImage += dstChromaImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_ColorConvert_YUV4_RGB + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstYImage, + vx_uint32 dstYImageStrideInBytes, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * 
pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i*) dataColorConvert; + + __m128i pixels0, pixels1, pixels2, R, G, B; + __m128i addToChroma = _mm_set1_epi32((int)128); + __m128i mask1 = _mm_load_si128(tbl + 21); + __m128i mask2 = _mm_load_si128(tbl + 22); + __m128i mask3 = _mm_load_si128(tbl + 23); + __m128 weights_R2Y = _mm_set_ps1((float) 0.2126); + __m128 weights_G2Y = _mm_set_ps1((float) 0.7152); + __m128 weights_B2Y = _mm_set_ps1((float) 0.0722); + __m128 weights_R2U = _mm_set_ps1((float) -0.1146); + __m128 weights_G2U = _mm_set_ps1((float) -0.3854); + __m128 weights_B2U = _mm_set_ps1((float) 0.5); + __m128 weights_R2V = _mm_set_ps1((float) 0.5); + __m128 weights_G2V = _mm_set_ps1((float) -0.4542); + __m128 weights_B2V = _mm_set_ps1((float) -0.0458); + __m128 temp0, temp1, Y, U, V; + + for (int height = 0; height < (int) dstHeight; height++) + { + vx_uint8 * pLocalSrc = pSrcImage; + vx_uint8 * pLocalDstY = pDstYImage; + vx_uint8 * pLocalDstU = pDstUImage; + vx_uint8 * pLocalDstV = pDstVImage; + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + pixels0 = _mm_loadu_si128((__m128i *)pLocalSrc); + pixels1 = _mm_loadu_si128((__m128i *)(pLocalSrc + 16)); + pixels2 = _mm_loadu_si128((__m128i *)(pLocalSrc + 32)); + + R = _mm_shuffle_epi8(pixels0, mask1); // 0 0 0 0 0 0 0 0 0 0 R5 R4 R3 R2 R1 R0 + G = _mm_shuffle_epi8(pixels0, mask3); // 0 0 0 0 0 0 0 0 0 0 0 G4 G3 G2 G1 G0 + B = _mm_shuffle_epi8(pixels0, mask2); // 0 0 0 0 0 0 0 0 0 0 0 B4 B3 B2 B1 B0 + + pixels0 = _mm_shuffle_epi8(pixels1, mask2); // 0 0 0 0 0 0 0 0 0 0 0 0 R10 R9 R8 R7 R6 + pixels0 = _mm_slli_si128(pixels0, 6); + R = _mm_or_si128(R, pixels0); // 0 0 0 0 0 R10 R9 R8 R7 R6 R5 R4 R3 R2 R1 R0 + pixels0 = _mm_shuffle_epi8(pixels1, mask1); // 0 0 0 0 0 0 0 0 0 0 G10 G9 G8 G7 G6 G5 + pixels0 = _mm_slli_si128(pixels0, 5); + G = _mm_or_si128(G, pixels0); // 0 0 0 0 0 G10 G9 G8 G7 G6 G5 G4 G3 G2 G1 G0 + pixels0 = _mm_shuffle_epi8(pixels1, mask3); // 0 0 0 0 0 0 0 0 0 0 0 B9 B8 B7 B6 B5 + pixels0 = _mm_slli_si128(pixels0, 5); + B = _mm_or_si128(B, pixels0); // 0 0 0 0 0 0 B9 B8 B7 B6 B5 B4 B3 B2 B1 B0 + + pixels0 = _mm_shuffle_epi8(pixels2, mask3); // 0 0 0 0 0 0 0 0 0 0 0 R15 R14 R13 R12 R11 + pixels0 = _mm_slli_si128(pixels0, 11); + R = _mm_or_si128(R, pixels0); // R15 R14 R13 R12 R11 R10 R9 R8 R7 R6 R5 R4 R3 R2 R1 R0 + pixels0 = _mm_shuffle_epi8(pixels2, mask2); // 0 0 0 0 0 0 0 0 0 0 0 G15 G14 G13 G12 G11 + pixels0 = _mm_slli_si128(pixels0, 11); + G = _mm_or_si128(G, pixels0); // G15 G14 G13 G12 G11 G10 G9 G8 G7 G6 G5 G4 G3 G2 G1 G0 + pixels0 = _mm_shuffle_epi8(pixels2, mask1); // 0 0 0 0 0 0 0 0 0 0 B15 B14 B13 B12 B11 B10 + pixels0 = _mm_slli_si128(pixels0, 10); + B = _mm_or_si128(B, pixels0); // B15 B14 B13 B12 B11 B10 B9 B8 B7 B6 B5 B4 B3 B2 B1 B0 + + // For pixels 0..3 + pixels2 = _mm_cvtepu8_epi32(R); + temp0 = _mm_cvtepi32_ps(pixels2); // R0..R3 + Y = _mm_mul_ps(temp0, weights_R2Y); + U = _mm_mul_ps(temp0, weights_R2U); + V = _mm_mul_ps(temp0, weights_R2V); + + pixels2 = _mm_cvtepu8_epi32(G); + temp0 = _mm_cvtepi32_ps(pixels2); + temp1 = _mm_mul_ps(temp0, weights_G2Y); // G0..G3 + Y = _mm_add_ps(Y, temp1); + temp1 = _mm_mul_ps(temp0, weights_G2U); + U = _mm_add_ps(U, temp1); + temp1 = _mm_mul_ps(temp0, weights_G2V); + V = _mm_add_ps(V, temp1); + + pixels2 = _mm_cvtepu8_epi32(B); + temp0 = _mm_cvtepi32_ps(pixels2); + temp1 = 
_mm_mul_ps(temp0, weights_B2Y); // B0..B3 + Y = _mm_add_ps(Y, temp1); + temp1 = _mm_mul_ps(temp0, weights_B2U); + U = _mm_add_ps(U, temp1); + temp1 = _mm_mul_ps(temp0, weights_B2V); + V = _mm_add_ps(V, temp1); + + __m128i tempI0 = _mm_cvtps_epi32(Y); + __m128i tempI1 = _mm_cvtps_epi32(U); + tempI1 = _mm_add_epi32(tempI1, addToChroma); + __m128i tempI2 = _mm_cvtps_epi32(V); + tempI2 = _mm_add_epi32(tempI2, addToChroma); + + // For pixels 4..7 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + + pixels2 = _mm_cvtepu8_epi32(R); + temp0 = _mm_cvtepi32_ps(pixels2); // R4..R7 + Y = _mm_mul_ps(temp0, weights_R2Y); + U = _mm_mul_ps(temp0, weights_R2U); + V = _mm_mul_ps(temp0, weights_R2V); + + pixels2 = _mm_cvtepu8_epi32(G); + temp0 = _mm_cvtepi32_ps(pixels2); + temp1 = _mm_mul_ps(temp0, weights_G2Y); // G4..G7 + Y = _mm_add_ps(Y, temp1); + temp1 = _mm_mul_ps(temp0, weights_G2U); + U = _mm_add_ps(U, temp1); + temp1 = _mm_mul_ps(temp0, weights_G2V); + V = _mm_add_ps(V, temp1); + + pixels2 = _mm_cvtepu8_epi32(B); + temp0 = _mm_cvtepi32_ps(pixels2); + temp1 = _mm_mul_ps(temp0, weights_B2Y); // B4..B7 + Y = _mm_add_ps(Y, temp1); + temp1 = _mm_mul_ps(temp0, weights_B2U); + U = _mm_add_ps(U, temp1); + temp1 = _mm_mul_ps(temp0, weights_B2V); + V = _mm_add_ps(V, temp1); + + pixels1 = _mm_cvtps_epi32(Y); + tempI0 = _mm_packus_epi32(tempI0, pixels1); + pixels1 = _mm_cvtps_epi32(U); + pixels1 = _mm_add_epi32(pixels1, addToChroma); + tempI1 = _mm_packus_epi32(tempI1, pixels1); + pixels1 = _mm_cvtps_epi32(V); + pixels1 = _mm_add_epi32(pixels1, addToChroma); + tempI2 = _mm_packus_epi32(tempI2, pixels1); + + // For pixels 8..11 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + + pixels2 = _mm_cvtepu8_epi32(R); + temp0 = _mm_cvtepi32_ps(pixels2); // R8..R11 + Y = _mm_mul_ps(temp0, weights_R2Y); + U = _mm_mul_ps(temp0, weights_R2U); + V = _mm_mul_ps(temp0, weights_R2V); + + pixels2 = _mm_cvtepu8_epi32(G); + temp0 = _mm_cvtepi32_ps(pixels2); + temp1 = _mm_mul_ps(temp0, weights_G2Y); // G8..G11 + Y = _mm_add_ps(Y, temp1); + temp1 = _mm_mul_ps(temp0, weights_G2U); + U = _mm_add_ps(U, temp1); + temp1 = _mm_mul_ps(temp0, weights_G2V); + V = _mm_add_ps(V, temp1); + + pixels2 = _mm_cvtepu8_epi32(B); + temp0 = _mm_cvtepi32_ps(pixels2); + temp1 = _mm_mul_ps(temp0, weights_B2Y); // B8..B11 + Y = _mm_add_ps(Y, temp1); + temp1 = _mm_mul_ps(temp0, weights_B2U); + U = _mm_add_ps(U, temp1); + temp1 = _mm_mul_ps(temp0, weights_B2V); + V = _mm_add_ps(V, temp1); + + pixels0 = _mm_cvtps_epi32(Y); + pixels1 = _mm_cvtps_epi32(U); + pixels1 = _mm_add_epi32(pixels1, addToChroma); + pixels2 = _mm_cvtps_epi32(V); + pixels2 = _mm_add_epi32(pixels2, addToChroma); + + // For pixels 12..15 + R = _mm_srli_si128(R, 4); + G = _mm_srli_si128(G, 4); + B = _mm_srli_si128(B, 4); + + R = _mm_cvtepu8_epi32(R); + temp0 = _mm_cvtepi32_ps(R); // R12..R15 + Y = _mm_mul_ps(temp0, weights_R2Y); + U = _mm_mul_ps(temp0, weights_R2U); + V = _mm_mul_ps(temp0, weights_R2V); + + G = _mm_cvtepu8_epi32(G); + temp0 = _mm_cvtepi32_ps(G); + temp1 = _mm_mul_ps(temp0, weights_G2Y); // G12..G15 + Y = _mm_add_ps(Y, temp1); + temp1 = _mm_mul_ps(temp0, weights_G2U); + U = _mm_add_ps(U, temp1); + temp1 = _mm_mul_ps(temp0, weights_G2V); + V = _mm_add_ps(V, temp1); + + B = _mm_cvtepu8_epi32(B); + temp0 = _mm_cvtepi32_ps(B); + temp1 = _mm_mul_ps(temp0, weights_B2Y); // B12..B15 + Y = _mm_add_ps(Y, temp1); + temp1 = _mm_mul_ps(temp0, weights_B2U); + U = _mm_add_ps(U, temp1); + temp1 = _mm_mul_ps(temp0, 
weights_B2V); + V = _mm_add_ps(V, temp1); + + R = _mm_cvtps_epi32(Y); + pixels0 = _mm_packus_epi32(pixels0, R); + G = _mm_cvtps_epi32(U); + G = _mm_add_epi32(G, addToChroma); + pixels1 = _mm_packus_epi32(pixels1, G); + B = _mm_cvtps_epi32(V); + B = _mm_add_epi32(B, addToChroma); + pixels2 = _mm_packus_epi32(pixels2, B); + + tempI0 = _mm_packus_epi16(tempI0, pixels0); + tempI1 = _mm_packus_epi16(tempI1, pixels1); + tempI2 = _mm_packus_epi16(tempI2, pixels2); + + _mm_storeu_si128((__m128i *)pLocalDstY, tempI0); + _mm_storeu_si128((__m128i *)pLocalDstU, tempI1); + _mm_storeu_si128((__m128i *)pLocalDstV, tempI2); + + pLocalSrc += 48; + pLocalDstY += 16; + pLocalDstU += 16; + pLocalDstV += 16; + } + + for (int width = 0; width < postfixWidth; width++) + { + float R = (float)*pLocalSrc++; + float G = (float)*pLocalSrc++; + float B = (float)*pLocalSrc++; + + *pLocalDstY++ = (vx_uint8)((R * 0.2126f) + (G * 0.7152f) + (B * 0.0722)); + *pLocalDstU++ = (vx_uint8)((R * -0.1146f) + (G * -0.3854) + (B * 0.5f) + 128.0f); + *pLocalDstV++ = (vx_uint8)((R * 0.5f) + (G * -0.4542f) + (B * -0.0458f) + 128.0f); + } + + pSrcImage += srcImageStrideInBytes; + pDstYImage += dstYImageStrideInBytes; + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_FormatConvert_IUV_UV12 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ) +{ + __m128i * tbl = (__m128i *) dataColorConvert; + + bool isAligned = ((intptr_t(pDstUImage) & intptr_t(pDstVImage) & 15) == ((intptr_t(pDstUImage) | intptr_t(pDstVImage)) & 15)); + + unsigned char *pLocalSrc, *pLocalDstU, *pLocalDstV; + __m128i *pLocalSrc_xmm, *pLocalDstU_xmm, *pLocalDstV_xmm; + + __m128i pixels0, pixels1, U, temp; + __m128i mask_UV12ToIUV_Ulow = _mm_load_si128(tbl + 3); + __m128i mask_UV12ToIUV_Uhi = _mm_load_si128(tbl + 25); + __m128i mask_UV12ToIUV_Vlow = _mm_load_si128(tbl + 0); + __m128i mask_UV12ToIUV_Vhi = _mm_load_si128(tbl + 24); + + if (isAligned) + { + int prefixWidth = intptr_t(pDstUImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcChromaImage; + pLocalDstU = (unsigned char *)pDstUImage; + pLocalDstV = (unsigned char *)pDstVImage; + + for (int x = 0; x < prefixWidth; x++) + { + *pLocalDstU++ = *pLocalSrc++; + *pLocalDstV++ = *pLocalSrc++; + } + + pLocalSrc_xmm = (__m128i *) pLocalSrc; + pLocalDstU_xmm = (__m128i *) pLocalDstU; + pLocalDstV_xmm = (__m128i *) pLocalDstV; + + int width = (int)(alignedWidth >> 4); // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128(pLocalSrc_xmm++); + pixels1 = _mm_loadu_si128(pLocalSrc_xmm++); + + U = _mm_shuffle_epi8(pixels0, mask_UV12ToIUV_Ulow); + temp = _mm_shuffle_epi8(pixels1, mask_UV12ToIUV_Uhi); + U = _mm_or_si128(U, temp); + + pixels0 = _mm_shuffle_epi8(pixels0, mask_UV12ToIUV_Vlow); + temp = _mm_shuffle_epi8(pixels1, mask_UV12ToIUV_Vhi); + pixels0 = _mm_or_si128(pixels0, temp); + + _mm_store_si128(pLocalDstU_xmm++, U); + _mm_store_si128(pLocalDstV_xmm++, pixels0); + + width--; + } + + pLocalSrc = (unsigned char *) pLocalSrc_xmm; + pLocalDstU = (unsigned char *) pLocalDstU_xmm; + pLocalDstV = (unsigned char *) pLocalDstV_xmm; + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDstU++ = *pLocalSrc++; + *pLocalDstV++ = *pLocalSrc++; + } + + pSrcChromaImage += srcChromaImageStrideInBytes; + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + height--; + } + } + else + { + int postfixWidth = (int)dstWidth & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - postfixWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc_xmm = (__m128i *) pSrcChromaImage; + pLocalDstU_xmm = (__m128i *) pDstUImage; + pLocalDstV_xmm = (__m128i *) pDstVImage; + + int width = (int)(alignedWidth >> 4); // 16 pixels processed at a time + while (width) + { + pixels0 = _mm_loadu_si128(pLocalSrc_xmm++); + pixels1 = _mm_loadu_si128(pLocalSrc_xmm++); + + U = _mm_shuffle_epi8(pixels0, mask_UV12ToIUV_Ulow); + temp = _mm_shuffle_epi8(pixels1, mask_UV12ToIUV_Uhi); + U = _mm_or_si128(U, temp); + + pixels0 = _mm_shuffle_epi8(pixels0, mask_UV12ToIUV_Vlow); + temp = _mm_shuffle_epi8(pixels1, mask_UV12ToIUV_Vhi); + pixels0 = _mm_or_si128(pixels0, temp); + + _mm_storeu_si128(pLocalDstU_xmm++, U); + _mm_storeu_si128(pLocalDstV_xmm++, pixels0); + + width--; + } + + pLocalSrc = (unsigned char *)pLocalSrc_xmm; + pLocalDstU = (unsigned char *)pLocalDstU_xmm; + pLocalDstV = (unsigned char *)pLocalDstV_xmm; + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDstU++ = *pLocalSrc++; + *pLocalDstV++ = *pLocalSrc++; + } + + pSrcChromaImage += srcChromaImageStrideInBytes; + pDstUImage += dstUImageStrideInBytes; + pDstVImage += dstVImageStrideInBytes; + height--; + } + } + return AGO_SUCCESS; +} + +int HafCpu_FormatConvert_UV12_IUV + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstChromaImage, + vx_uint32 dstChromaImageStrideInBytes, + vx_uint8 * pSrcUImage, + vx_uint32 srcUImageStrideInBytes, + vx_uint8 * pSrcVImage, + vx_uint32 srcVImageStrideInBytes + ) +{ + int prefixWidth = intptr_t(pDstChromaImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + unsigned char *pLocalSrcU, *pLocalSrcV, *pLocalDst; + __m128i *pLocalSrcU_xmm, *pLocalSrcV_xmm, *pLocalDst_xmm; + __m128i pixels_U, pixels_V, pixels_out; + + int height = (int) dstHeight; + + while (height) + { + pLocalSrcU = (unsigned char *) pSrcUImage; + pLocalSrcV = (unsigned char *) pSrcVImage; + pLocalDst = (unsigned char *) pDstChromaImage; + + for (int x = 0; x < prefixWidth; x++) + { + *pLocalDst++ = *pLocalSrcU++; + *pLocalDst++ = *pLocalSrcV++; + } + + pLocalSrcU_xmm = (__m128i *) pLocalSrcU; + pLocalSrcV_xmm = (__m128i *) pLocalSrcV; + pLocalDst_xmm = (__m128i *) pLocalDst; + + int width = (int) (dstWidth >> 4); // Each inner loop writes 16 pixels of each chroma plane in destination buffer + while (width) + { + pixels_U = _mm_loadu_si128(pLocalSrcU_xmm++); + pixels_V = _mm_loadu_si128(pLocalSrcV_xmm++); + pixels_out = _mm_unpacklo_epi8(pixels_U, pixels_V); + pixels_U = _mm_unpackhi_epi8(pixels_U, pixels_V); + + _mm_store_si128(pLocalDst_xmm++, pixels_out); + _mm_store_si128(pLocalDst_xmm++, pixels_U); + width--; + } + + pLocalSrcU = (unsigned char *) pLocalSrcU_xmm; + pLocalSrcV = (unsigned char *) pLocalSrcV_xmm; + pLocalDst = (unsigned char *) pLocalDst_xmm; + for (int x = 0; x < postfixWidth; x++) + { + *pLocalDst++ = *pLocalSrcU++; + *pLocalDst++ = *pLocalSrcV++; + } + + pSrcUImage += srcUImageStrideInBytes; + pSrcVImage += srcVImageStrideInBytes; + pDstChromaImage += dstChromaImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_FormatConvert_UV_UV12 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstUImage, + vx_uint32 dstUImageStrideInBytes, + vx_uint8 * pDstVImage, + vx_uint32 dstVImageStrideInBytes, + vx_uint8 * pSrcChromaImage, + vx_uint32 srcChromaImageStrideInBytes + ) +{ + int alignedWidth = dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + __m128i * tbl = (__m128i *) dataColorConvert; + vx_uint8 *pLocalSrc, *pLocalDstUCurrentRow, *pLocalDstUNextRow, *pLocalDstVCurrentRow, *pLocalDstVNextRow; + __m128i pixels, U; + __m128i maskU = _mm_load_si128(tbl + 6); + __m128i maskV = _mm_load_si128(tbl + 7); + + int height = (int) (dstHeight >> 1); // Each inner loop writes out two rows of dst buffer + while (height) + { + pLocalSrc = pSrcChromaImage; + pLocalDstUCurrentRow = pDstUImage; + pLocalDstUNextRow = pDstUImage + dstUImageStrideInBytes; + pLocalDstVCurrentRow = pDstVImage; + pLocalDstVNextRow = pDstVImage + dstVImageStrideInBytes; + + int width = (int) (alignedWidth >> 4); // Each inner loop iteration processess 16 output pixels + while (width) + { + pixels = _mm_loadu_si128((__m128i*) pLocalSrc); + U = _mm_shuffle_epi8(pixels, maskU); + pixels = _mm_shuffle_epi8(pixels, maskV); + + _mm_storeu_si128((__m128i*) pLocalDstUCurrentRow, U); + _mm_storeu_si128((__m128i*) pLocalDstUNextRow, U); + _mm_storeu_si128((__m128i*) pLocalDstVCurrentRow, pixels); + _mm_storeu_si128((__m128i*) pLocalDstVNextRow, pixels); + + pLocalSrc += 16; + pLocalDstUCurrentRow += 16; + pLocalDstUNextRow += 16; + pLocalDstVCurrentRow += 16; + pLocalDstVNextRow += 16; + width--; + } + + for (int w = 0; w < postfixWidth; w += 2) + { + *pLocalDstUCurrentRow++ = *pLocalSrc; + *pLocalDstUCurrentRow++ = *pLocalSrc; + *pLocalDstUNextRow++ = *pLocalSrc; + *pLocalDstUNextRow++ = *pLocalSrc++; + + 
*pLocalDstVCurrentRow++ = *pLocalSrc; + *pLocalDstVCurrentRow++ = *pLocalSrc; + *pLocalDstVNextRow++ = *pLocalSrc; + *pLocalDstVNextRow++ = *pLocalSrc++; + } + + pSrcChromaImage += srcChromaImageStrideInBytes; + pDstUImage += (dstUImageStrideInBytes + dstUImageStrideInBytes); + pDstVImage += (dstVImageStrideInBytes + dstVImageStrideInBytes); + height--; + } + return AGO_SUCCESS; +} \ No newline at end of file diff --git a/openvx/ago/ago_haf_cpu_fast_corners.cpp b/openvx/ago/ago_haf_cpu_fast_corners.cpp new file mode 100644 index 0000000..1e1a0df --- /dev/null +++ b/openvx/ago/ago_haf_cpu_fast_corners.cpp @@ -0,0 +1,826 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +DECL_ALIGN(16) unsigned char dataFastCornersPixelMask[7 * 16] ATTR_ALIGN(16) = { + 1, 2, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, + 255, 255, 4, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, + 255, 255, 255, 6, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, + 255, 255, 255, 255, 6, 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, + 255, 255, 255, 255, 255, 6, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 4, 255, 255, 255, 0, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 2, 1, 0, 255, 255, 255, 255, 255, 255 +}; + +static inline void generateOffset(int srcStride, int * offsets) +{ + offsets[0] = -3 * srcStride; + offsets[15] = offsets[0] - 1; + offsets[1] = offsets[0] + 1; + offsets[2] = -(srcStride << 1) + 2; + offsets[14] = offsets[2] - 4; + offsets[3] = -srcStride + 3; + offsets[13] = offsets[3] - 6; + offsets[4] = 3; + offsets[12] = -3; + offsets[5] = srcStride + 3; + offsets[11] = offsets[5] - 6; + offsets[6] = (srcStride << 1) + 2; + offsets[10] = offsets[6] - 4; + offsets[7] = 3 * srcStride + 1; + offsets[8] = offsets[7] - 1; + offsets[9] = offsets[8] - 1; + + return; +} + +static inline void generateMasks_C(unsigned char * src, int srcStride, int* offsets, short t, int mask[2]) +{ + mask[0] = 0; + mask[1] = 0; + int iterMask = 1; + + // Early exit conditions + if ((abs((short)src[0] - (short)src[offsets[0]]) < t) && (abs((short)src[0] - (short)src[offsets[8]]) < t)) // Pixels 1 and 9 within t of the candidate + return; + if ((abs((short)src[0] - (short)src[offsets[4]]) < t) && (abs((short)src[0] - (short)src[offsets[12]]) < t)) // Pixels 5 and 13 within t of the candidate + return; + + // Check for I_p + t + short cand = 
(short)(*src) + t; + for (int i = 0; i < 16; i++) + { + if ((short)src[offsets[i]] > cand) + mask[0] |= iterMask; + iterMask <<= 1; + } + + // Check for I_p - t + iterMask = 1; + cand = (short)(*src) - t; + for (int i = 0; i < 16; i++) + { + if ((short)src[offsets[i]] < cand) + mask[1] |= iterMask; + iterMask <<= 1; + } +} + +static inline bool isCorner(int mask[2]) +{ + int cornerMask = 0x1FF; // Nine 1's in the LSB + + if (mask[0] || mask[1]) + { + mask[0] = mask[0] | (mask[0] << 16); + mask[1] = mask[1] | (mask[1] << 16); + + for (int i = 0; i < 16; i++) + { + if (((mask[0] & cornerMask) == cornerMask) || ((mask[1] & cornerMask) == cornerMask)) + return true; + mask[0] >>= 1; + mask[1] >>= 1; + } + } + + return false; +} + +static inline bool isCorner(int mask) +{ + int cornerMask = 0x1FF; // Nine 1's in the LSB + + if (mask) + { + mask = mask | (mask << 16); + for (int i = 0; i < 16; i++) + { + if ((mask & cornerMask) == cornerMask) + return true; + mask >>= 1; + } + } + return false; +} + +static inline bool isCornerPlus(short candidate, short * boundary, short t) +{ + // Early exit conditions + if ((abs(candidate - boundary[0]) < t) && (abs(candidate - boundary[8]) < t)) // Pixels 1 and 9 within t of the candidate + return false; + if ((abs(candidate - boundary[4]) < t) && (abs(candidate - boundary[12]) < t)) // Pixels 5 and 13 within t of the candidate + return false; + + candidate += t; + int mask = 0; + int iterMask = 1; + for (int i = 0; i < 16; i++) + { + if (boundary[i] > candidate) + mask |= iterMask; + iterMask <<= 1; + } + + return isCorner(mask); +} + +static inline bool isCornerPlus_SSE(__m128i candidate, __m128i boundary, short t) +{ + __m128i boundaryH = _mm_unpackhi_epi8(boundary, _mm_setzero_si128()); // Boundary 8..15 (words) + __m128i boundaryL = _mm_cvtepu8_epi16(boundary); // Boundary 0..7 (words) + __m128i threshold = _mm_set1_epi16(t); + + short cand = M128I(candidate).m128i_i16[0]; + + // Early exit conditions + if ((abs(cand - M128I(boundaryL).m128i_i16[0]) < t) && (abs(cand - M128I(boundaryH).m128i_i16[0]) < t)) // Pixels 1 and 9 within t of the candidate + return false; + if ((abs(cand - M128I(boundaryL).m128i_i16[4]) < t) && (abs(cand - M128I(boundaryH).m128i_i16[4]) < t)) // Pixels 5 and 13 within t of the candidate + return false; + + candidate = _mm_add_epi16(candidate, threshold); + boundaryH = _mm_cmpgt_epi16(boundaryH, candidate); + boundaryL = _mm_cmpgt_epi16(boundaryL, candidate); + boundaryL = _mm_packs_epi16(boundaryL, boundaryH); // 255 at ith byte if boundary[i] > pixel + t + int mask = _mm_movemask_epi8(boundaryL); + + return isCorner(mask); +} + +static inline bool isCornerMinus(short candidate, short * boundary, short t) +{ + // Early exit conditions + if ((abs(candidate - boundary[0]) < t) && (abs(candidate - boundary[8]) < t)) // Pixels 1 and 9 within t of the candidate + return false; + if ((abs(candidate - boundary[4]) < t) && (abs(candidate - boundary[12]) < t)) // Pixels 5 and 13 within t of the candidate + return false; + + candidate -= t; + int mask = 0; + int iterMask = 1; + for (int i = 0; i < 16; i++) + { + if (boundary[i] < candidate) + mask |= iterMask; + iterMask <<= 1; + } + + return isCorner(mask); +} + +static inline bool isCornerMinus_SSE(__m128i candidate, __m128i boundary, short t) +{ + __m128i boundaryH = _mm_unpackhi_epi8(boundary, _mm_setzero_si128()); // Boundary 8..15 (words) + __m128i boundaryL = _mm_cvtepu8_epi16(boundary); // Boundary 0..7 (words) + __m128i threshold = _mm_set1_epi16(t); + + short cand = 
M128I(candidate).m128i_i16[0]; + + // Early exit conditions + if ((abs(cand - M128I(boundaryL).m128i_i16[0]) < t) && (abs(cand - M128I(boundaryH).m128i_i16[0]) < t)) // Pixels 1 and 9 within t of the candidate + return false; + if ((abs(cand - M128I(boundaryL).m128i_i16[4]) < t) && (abs(cand - M128I(boundaryH).m128i_i16[4]) < t)) // Pixels 5 and 13 within t of the candidate + return false; + + candidate = _mm_sub_epi16(candidate, threshold); + boundaryH = _mm_cmplt_epi16(boundaryH, candidate); + boundaryL = _mm_cmplt_epi16(boundaryL, candidate); + boundaryL = _mm_packs_epi16(boundaryL, boundaryH); // 255 at ith byte if boundary[i] > pixel + t + int mask = _mm_movemask_epi8(boundaryL); + + return isCorner(mask); +} + +static inline bool checkForCornerAndGetStrength(unsigned char * src, int* offsets, short t, short * strength) +{ + // Early exit conditions + if ((abs((short)src[0] - (short)src[offsets[0]]) < t) && (abs((short)src[0] - (short)src[offsets[8]]) < t)) // Pixels 1 and 9 within t of the candidate + return false; + if ((abs((short)src[0] - (short)src[offsets[4]]) < t) && (abs((short)src[0] - (short)src[offsets[12]]) < t)) // Pixels 5 and 13 within t of the candidate + return false; + + // Get boundary + short boundary[16]; + for (int i = 0; i < 16; i++) + boundary[i] = (short)src[offsets[i]]; + + // Check for I_p + t + short cand = (short)(*src) + t; + int maskP = 0; + int iterMask = 1; + for (int i = 0; i < 16; i++) + { + if (boundary[i] > cand) + maskP |= iterMask; + iterMask <<= 1; + } + + // If it is a corner, then compute the threshold + short strength_pos = 0; + cand = src[0]; + if (isCorner(maskP)) + { + short thresh_upper = 255; + short thresh_lower = t; + + while (thresh_upper - thresh_lower > 1) // Binary search + { + strength_pos = (thresh_upper + thresh_lower) >> 1; + if (isCornerPlus(cand, boundary, strength_pos)) + thresh_lower = strength_pos; + else + thresh_upper = strength_pos; + } + strength_pos = thresh_lower; + } + + // Check for I_p - t + cand = (short)(*src) - t; + int maskN = 0; + iterMask = 1; + for (int i = 0; i < 16; i++) + { + if (boundary[i] < cand) + maskN |= iterMask; + iterMask <<= 1; + } + + // If it is a corner, then compute the threshold + short strength_neg = 0; + cand = src[0]; + if (isCorner(maskN)) + { + short thresh_upper = 255; + short thresh_lower = t; + + while (thresh_upper - thresh_lower > 1) // Binary search + { + strength_neg = (thresh_upper + thresh_lower) >> 1; + if (isCornerMinus(cand, boundary, strength_neg)) + thresh_lower = strength_neg; + else + thresh_upper = strength_neg; + } + strength_neg = thresh_lower; + } + + if (maskP || maskN) + { + *strength = max(strength_pos, strength_neg); + return true; + } + return false; +} + +bool isCorner_SSE(unsigned char pixel, __m128i boundary, __m128i t) +{ + // Check for boundary > pixel + t + __m128i cand = _mm_set1_epi16((short)pixel); // The candidate pixel + cand = _mm_add_epi16(cand, t); // Pixel + t + + __m128i temp0 = _mm_unpackhi_epi8(boundary, _mm_setzero_si128()); // Boundary 8..15 (words) + __m128i temp1 = _mm_cvtepu8_epi16(boundary); // Boundary 0..7 (words) + + temp0 = _mm_cmpgt_epi16(temp0, cand); + temp1 = _mm_cmpgt_epi16(temp1, cand); + temp1 = _mm_packs_epi16(temp1, temp0); // 255 at ith byte if boundary[i] > pixel + t + int mask = _mm_movemask_epi8(temp1); + int plusMask = mask | (mask << 16); + + // Check for boundary > pixel - t + cand = _mm_sub_epi16(cand, t); // pixel + t - t = pixel + cand = _mm_sub_epi16(cand, t); // pixel - t + + temp0 = _mm_unpackhi_epi8(boundary, 
_mm_setzero_si128()); // Boundary 8..15 (words) + temp1 = _mm_cvtepu8_epi16(boundary); // Boundary 0..7 (words) + + temp0 = _mm_cmplt_epi16(temp0, cand); + temp1 = _mm_cmplt_epi16(temp1, cand); + temp1 = _mm_packs_epi16(temp1, temp0); // 255 at ith byte if boundary[i] > pixel + t + mask = _mm_movemask_epi8(temp1); + int minusMask = mask | (mask << 16); + + if (plusMask || minusMask) + { + int cornerMask = 0x1FF; // Nine 1's in the LSB + + for (int i = 0; i < 16; i++) + { + if (((plusMask & cornerMask) == cornerMask) || ((minusMask & cornerMask) == cornerMask)) + return true; + plusMask >>= 1; + minusMask >>= 1; + } + } + + return false; +} + +static inline bool checkForCornerAndGetStrength_SSE(unsigned char pixel, __m128i boundary, short threshold, short * strength) +{ + __m128i t = _mm_set1_epi16(threshold); + + // Check for boundary > pixel + t + __m128i cand = _mm_set1_epi16((short)pixel); // The candidate pixel + cand = _mm_add_epi16(cand, t); // Pixel + t + + __m128i temp0 = _mm_unpackhi_epi8(boundary, _mm_setzero_si128()); // Boundary 8..15 (words) + __m128i temp1 = _mm_cvtepu8_epi16(boundary); // Boundary 0..7 (words) + + temp0 = _mm_cmpgt_epi16(temp0, cand); + temp1 = _mm_cmpgt_epi16(temp1, cand); + temp1 = _mm_packs_epi16(temp1, temp0); // 255 at ith byte if boundary[i] > pixel + t + int plusMask = _mm_movemask_epi8(temp1); + + // If it is a corner, then compute the threshold + short strength_pos = 0; + cand = _mm_sub_epi16(cand, t); + if (isCorner(plusMask)) + { + short thresh_upper = 255; + short thresh_lower = threshold; + + while (thresh_upper - thresh_lower > 1) // Binary search + { + strength_pos = (thresh_upper + thresh_lower) >> 1; + if (isCornerPlus_SSE(cand, boundary, strength_pos)) + thresh_lower = strength_pos; + else + thresh_upper = strength_pos; + } + strength_pos = thresh_lower; + } + + // Check for boundary > pixel - t + cand = _mm_sub_epi16(cand, t); // pixel - t + + temp0 = _mm_unpackhi_epi8(boundary, _mm_setzero_si128()); // Boundary 8..15 (words) + temp1 = _mm_cvtepu8_epi16(boundary); // Boundary 0..7 (words) + + temp0 = _mm_cmplt_epi16(temp0, cand); + temp1 = _mm_cmplt_epi16(temp1, cand); + temp1 = _mm_packs_epi16(temp1, temp0); // 255 at ith byte if boundary[i] > pixel + t + int minusMask = _mm_movemask_epi8(temp1); + + // If it is a corner, then compute the threshold + short strength_neg = 0; + cand = _mm_add_epi16(cand, t); + if (isCorner(minusMask)) + { + short thresh_upper = 255; + short thresh_lower = threshold; + + while (thresh_upper - thresh_lower > 1) // Binary search + { + strength_neg = (thresh_upper + thresh_lower) >> 1; + if (isCornerMinus_SSE(cand, boundary, strength_neg)) + thresh_lower = strength_neg; + else + thresh_upper = strength_neg; + } + strength_neg = thresh_lower; + } + + if (plusMask || minusMask) + { + *strength = max(strength_pos, strength_neg); + return true; + } + return false; +} + +int HafCpu_FastCorners_XY_U8_NoSupression + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t dstCorner[], + vx_uint32 * pDstCornerCount, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_float32 strength_threshold + ) +{ + unsigned char * pLocalSrc; + int srcStride = (int)srcImageStrideInBytes; + vx_uint32 cornerCount = 0; + short t = (short)floorf(strength_threshold); + + pSrcImage += (srcStride * 3); // Leave first three rows + + int alignedWidth = (int)srcWidth & ~7; + int postfixWidth = (int)srcWidth & 7; + + // Generate offsets for C code if necessary + int neighbor_offset[16] = { 0 
}; + if (postfixWidth) + generateOffset(srcStride, neighbor_offset); + + __m128i zeromask = _mm_setzero_si128(); + + for (int height = 0; height < (int)(srcHeight - 6); height++) + { + pLocalSrc = (unsigned char *) pSrcImage; + int width = 0; + + for (int x = 0; x < (alignedWidth >> 3); x++) + { + __m128i rowMinus3, rowMinus2, rowMinus1, row, rowPlus1, rowPlus2, rowPlus3; + __m128i thresh = _mm_set1_epi16(t); + + // Check for early escape based on pixels 1 and 9 around the candidate + rowMinus3 = _mm_loadu_si128((__m128i *)(pLocalSrc - 3 * srcStride - 1)); + rowMinus2 = _mm_srli_si128(rowMinus3, 1); // row - 3: Pixels 0..7 in lower 7 bytes + rowMinus2 = _mm_cvtepu8_epi16(rowMinus2); + + row = _mm_loadu_si128((__m128i *)(pLocalSrc - 3)); + rowMinus1 = _mm_srli_si128(row, 3); // row: Pixels 0..7 in lower 7 bytes + rowMinus1 = _mm_cvtepu8_epi16(rowMinus1); + + rowPlus3 = _mm_loadu_si128((__m128i *)(pLocalSrc + 3 * srcStride - 1)); + rowPlus2 = _mm_srli_si128(rowPlus3, 1); // row + 3: Pixels 0..7 in lower 7 bytes + rowPlus2 = _mm_cvtepu8_epi16(rowPlus2); + + rowPlus1 = _mm_loadu_si128((__m128i *)(pLocalSrc + srcStride - 3)); + + rowMinus2 = _mm_sub_epi16(rowMinus2, rowMinus1); + rowMinus2 = _mm_abs_epi16(rowMinus2); + rowPlus2 = _mm_sub_epi16(rowPlus2, rowMinus1); + rowPlus2 = _mm_abs_epi16(rowPlus2); + + rowMinus2 = _mm_cmplt_epi16(rowMinus2, thresh); // Check if pixel 0 is less than 't' different from the candidate + rowPlus2 = _mm_cmplt_epi16(rowPlus2, thresh); // Check if pixel 0 is less than 't' different from the candidate + + int maskSkip = _mm_movemask_epi8(rowMinus2); + maskSkip &= _mm_movemask_epi8(rowPlus2); // 1 if both 0 and 8 are within 't' of the candidate pixel + + // Check for early escape based on pixels 12 and 4 around the candidate + rowMinus2 = _mm_cvtepu8_epi16(row); + rowPlus2 = _mm_srli_si128(row, 6); + rowPlus2 = _mm_cvtepu8_epi16(rowPlus2); + + rowMinus2 = _mm_sub_epi16(rowMinus2, rowMinus1); + rowMinus2 = _mm_abs_epi16(rowMinus2); + rowPlus2 = _mm_sub_epi16(rowPlus2, rowMinus1); + rowPlus2 = _mm_abs_epi16(rowPlus2); + + rowMinus1 = _mm_loadu_si128((__m128i *)(pLocalSrc - srcStride - 3)); + + rowMinus2 = _mm_cmplt_epi16(rowMinus2, thresh); // Check if pixel 0 is less than 't' different from the candidate + rowPlus2 = _mm_cmplt_epi16(rowPlus2, thresh); // Check if pixel 0 is less than 't' different from the candidate + + int maskSkip1 = _mm_movemask_epi8(rowMinus2); + rowMinus2 = _mm_loadu_si128((__m128i *)(pLocalSrc - (srcStride + srcStride) - 2)); + + maskSkip1 &= _mm_movemask_epi8(rowPlus2); // 1 if both 0 and 8 are within 't' of the candidate pixel + rowPlus2 = _mm_loadu_si128((__m128i *)(pLocalSrc + (srcStride + srcStride) - 2)); + + maskSkip |= maskSkip1; + + // Check for corners in the eight pixels + if (maskSkip != 0xFFFF) + { + for (int i = 0; i < 8; i++) + { + __m128i * tbl = (__m128i *) dataFastCornersPixelMask; + + if (!(maskSkip & 1)) + { + // Get the boundary pixels in an XMM register + __m128i mask = _mm_load_si128(tbl++); + __m128i boundary = _mm_shuffle_epi8(rowMinus3, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowMinus2, mask); + boundary = _mm_or_si128(boundary, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowMinus1, mask); + boundary = _mm_or_si128(boundary, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(row, mask); + boundary = _mm_or_si128(boundary, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowPlus1, mask); + boundary = _mm_or_si128(boundary, mask); + + mask 
= _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowPlus2, mask); + boundary = _mm_or_si128(boundary, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowPlus3, mask); + boundary = _mm_or_si128(boundary, mask); + + if (isCorner_SSE(M128I(row).m128i_u8[3], boundary, thresh)) + { + if (cornerCount < capacityOfDstCorner) + { + dstCorner[cornerCount].y = height + 3; + dstCorner[cornerCount].x = width + i; + dstCorner[cornerCount].strength = strength_threshold; // Undefined as per the 1.0.1 spec + dstCorner[cornerCount].scale = 0; + dstCorner[cornerCount].orientation = 0; + dstCorner[cornerCount].error = 0; + dstCorner[cornerCount++].tracking_status = 1; + } + else + cornerCount++; + } + } + maskSkip >>= 2; + rowMinus3 = _mm_srli_si128(rowMinus3, 1); + rowMinus2 = _mm_srli_si128(rowMinus2, 1); + rowMinus1 = _mm_srli_si128(rowMinus1, 1); + row = _mm_srli_si128(row, 1); + rowPlus1 = _mm_srli_si128(rowPlus1, 1); + rowPlus2 = _mm_srli_si128(rowPlus2, 1); + rowPlus3 = _mm_srli_si128(rowPlus3, 1); + } + } + + width += 8; + pLocalSrc += 8; + } + + for (int x = 0; x < postfixWidth; x++) + { + int masks[2]; + generateMasks_C(pLocalSrc, srcStride, neighbor_offset, t, masks); + if (isCorner(masks)) + { + if (cornerCount < capacityOfDstCorner) + { + dstCorner[cornerCount].y = height + 3; + dstCorner[cornerCount].x = width; + dstCorner[cornerCount].strength = strength_threshold; // Undefined as per the 1.0.1 spec + dstCorner[cornerCount].scale = 0; + dstCorner[cornerCount].orientation = 0; + dstCorner[cornerCount].error = 0; + dstCorner[cornerCount++].tracking_status = 1; + } + else + cornerCount++; + } + width++; + pLocalSrc++; + } + pSrcImage += srcStride; + } + + *pDstCornerCount = cornerCount; + return AGO_SUCCESS; +} + +int HafCpu_FastCorners_XY_U8_Supression + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t dstCorner[], + vx_uint32 * pDstCornerCount, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_float32 strength_threshold, + vx_uint8 * pScratch + ) +{ + unsigned char * pLocalSrc; + int srcStride = (int)srcImageStrideInBytes; + vx_uint32 cornerCount = 0; + short t = (short)floorf(strength_threshold); + + pSrcImage += (srcStride * 3) + 3; // Leave first three rows and start from the third pixel + + int alignedWidth = (int)(srcWidth - 6) & ~7; + int postfixWidth = (int)(srcWidth - 6) & 7; + + // Generate offsets for C code if necessary + int neighbor_offset[16] = { 0 }; + if (postfixWidth) + generateOffset(srcStride, neighbor_offset); + + memset(pScratch, 0, sizeof(vx_uint8) * srcWidth * srcHeight); + + for (int height = 0; height < (int)(srcHeight - 6); height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + int width = 3; + + for (int x = 0; x < (alignedWidth >> 3); x++) + { + __m128i rowMinus3, rowMinus2, rowMinus1, row, rowPlus1, rowPlus2, rowPlus3; + __m128i thresh = _mm_set1_epi16(t); + + // Check for early escape based on pixels 1 and 9 around the candidate + rowMinus3 = _mm_loadu_si128((__m128i *)(pLocalSrc - 3 * srcStride - 1)); + rowMinus2 = _mm_srli_si128(rowMinus3, 1); // row - 3: Pixels 0..7 in lower 7 bytes + rowMinus2 = _mm_cvtepu8_epi16(rowMinus2); + + row = _mm_loadu_si128((__m128i *)(pLocalSrc - 3)); + rowMinus1 = _mm_srli_si128(row, 3); // row: Pixels 0..7 in lower 7 bytes + rowMinus1 = _mm_cvtepu8_epi16(rowMinus1); + + rowPlus3 = _mm_loadu_si128((__m128i *)(pLocalSrc + 3 * srcStride - 1)); + rowPlus2 = _mm_srli_si128(rowPlus3, 1); // row + 3: Pixels 0..7 in lower 7 bytes + rowPlus2 = 
_mm_cvtepu8_epi16(rowPlus2); + + rowPlus1 = _mm_loadu_si128((__m128i *)(pLocalSrc + srcStride - 3)); + + rowMinus2 = _mm_sub_epi16(rowMinus2, rowMinus1); + rowMinus2 = _mm_abs_epi16(rowMinus2); + rowPlus2 = _mm_sub_epi16(rowPlus2, rowMinus1); + rowPlus2 = _mm_abs_epi16(rowPlus2); + + rowMinus2 = _mm_cmplt_epi16(rowMinus2, thresh); // Check if pixel 0 is less than 't' different from the candidate + rowPlus2 = _mm_cmplt_epi16(rowPlus2, thresh); // Check if pixel 0 is less than 't' different from the candidate + + int maskSkip = _mm_movemask_epi8(rowMinus2); + maskSkip &= _mm_movemask_epi8(rowPlus2); // 1 if both 0 and 8 are within 't' of the candidate pixel + + // Check for early escape based on pixels 12 and 4 around the candidate + rowMinus2 = _mm_cvtepu8_epi16(row); + rowPlus2 = _mm_srli_si128(row, 6); + rowPlus2 = _mm_cvtepu8_epi16(rowPlus2); + + rowMinus2 = _mm_sub_epi16(rowMinus2, rowMinus1); + rowMinus2 = _mm_abs_epi16(rowMinus2); + rowPlus2 = _mm_sub_epi16(rowPlus2, rowMinus1); + rowPlus2 = _mm_abs_epi16(rowPlus2); + + rowMinus1 = _mm_loadu_si128((__m128i *)(pLocalSrc - srcStride - 3)); + + rowMinus2 = _mm_cmplt_epi16(rowMinus2, thresh); // Check if pixel 0 is less than 't' different from the candidate + rowPlus2 = _mm_cmplt_epi16(rowPlus2, thresh); // Check if pixel 0 is less than 't' different from the candidate + + int maskSkip1 = _mm_movemask_epi8(rowMinus2); + rowMinus2 = _mm_loadu_si128((__m128i *)(pLocalSrc - (srcStride + srcStride) - 2)); + + maskSkip1 &= _mm_movemask_epi8(rowPlus2); // 1 if both 0 and 8 are within 't' of the candidate pixel + rowPlus2 = _mm_loadu_si128((__m128i *)(pLocalSrc + (srcStride + srcStride) - 2)); + + maskSkip |= maskSkip1; + + // Check for corners in the eight pixels + if (maskSkip != 0xFFFF) + { + for (int i = 0; i < 8; i++) + { + __m128i * tbl = (__m128i *) dataFastCornersPixelMask; + + if (!(maskSkip & 1)) + { + // Get the boundary pixels in an XMM register + __m128i mask = _mm_load_si128(tbl++); + __m128i boundary = _mm_shuffle_epi8(rowMinus3, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowMinus2, mask); + boundary = _mm_or_si128(boundary, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowMinus1, mask); + boundary = _mm_or_si128(boundary, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(row, mask); + boundary = _mm_or_si128(boundary, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowPlus1, mask); + boundary = _mm_or_si128(boundary, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowPlus2, mask); + boundary = _mm_or_si128(boundary, mask); + + mask = _mm_load_si128(tbl++); + mask = _mm_shuffle_epi8(rowPlus3, mask); + boundary = _mm_or_si128(boundary, mask); + + short strength = 0; + if (checkForCornerAndGetStrength_SSE(M128I(row).m128i_u8[3], boundary, t, &strength)) + pScratch[(height + 3) * srcWidth + width + i] = (vx_uint8)strength; + } + maskSkip >>= 2; + rowMinus3 = _mm_srli_si128(rowMinus3, 1); + rowMinus2 = _mm_srli_si128(rowMinus2, 1); + rowMinus1 = _mm_srli_si128(rowMinus1, 1); + row = _mm_srli_si128(row, 1); + rowPlus1 = _mm_srli_si128(rowPlus1, 1); + rowPlus2 = _mm_srli_si128(rowPlus2, 1); + rowPlus3 = _mm_srli_si128(rowPlus3, 1); + } + } + + width += 8; + pLocalSrc += 8; + } + + for (int x = 0; x < postfixWidth; x++) + { + short strength = 0; + if (checkForCornerAndGetStrength(pLocalSrc, neighbor_offset, t, &strength)) + pScratch[(height + 3) * srcWidth + width] = (vx_uint8)strength; + + width++; + pLocalSrc++; + } + pSrcImage += 
srcStride; + } + + // Non-max supression + pScratch += (3 * srcWidth + 3); + cornerCount = 0; + for (int height = 0; height < int(srcHeight - 6); height++) + { + for (int width = 0; width < int(srcWidth - 6); width++) + { + vx_uint8 * prev = pScratch - srcWidth; + vx_uint8 * nxt = pScratch + srcWidth; + vx_uint8 cand = *pScratch; + if (cand && (cand >= *(prev - 1)) && (cand >= *prev) && (cand >= *(prev + 1)) + && (cand >= *(pScratch - 1)) && (cand > *(pScratch + 1)) + && (cand > *(nxt - 1)) && (cand > *nxt) && (cand > *(nxt + 1))) + { + if (cornerCount < capacityOfDstCorner) + { + dstCorner[cornerCount].x = (vx_int32)(width + 3); + dstCorner[cornerCount].y = (vx_int32)(height + 3); + dstCorner[cornerCount].strength = (vx_float32)cand; + dstCorner[cornerCount].scale = 0; + dstCorner[cornerCount].orientation = 0; + dstCorner[cornerCount].error = 0; + dstCorner[cornerCount++].tracking_status = 1; + } + else + cornerCount++; + } + pScratch++; + } + pScratch += 6; + } + *pDstCornerCount = cornerCount; + return AGO_SUCCESS; +} + + +int HafCpu_FastCornerMerge_XY_XY + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t dstCorner[], + vx_uint32 * pDstCornerCount, + vx_uint32 numSrcCornerBuffers, + vx_keypoint_t * pSrcCorners[], + vx_uint32 numSrcCorners[] + ) +{ + int dstCount = 0; + int srcCount; + vx_keypoint_t * srcList; + + for (int i = 0; i < (int)numSrcCornerBuffers; i++) + { + srcCount = numSrcCorners[i]; + srcList = pSrcCorners[i]; + + while (srcCount) + { + *dstCorner++ = *srcList++; + dstCount++; + srcCount--; + if (dstCount >(int) capacityOfDstCorner) + { + *pDstCornerCount = (vx_uint32)(dstCount - 1); + return AGO_SUCCESS; + } + } + } + + *pDstCornerCount = (vx_uint32)(dstCount - 1); + return AGO_SUCCESS; +} \ No newline at end of file diff --git a/openvx/ago/ago_haf_cpu_filter.cpp b/openvx/ago/ago_haf_cpu_filter.cpp new file mode 100644 index 0000000..988ee6d --- /dev/null +++ b/openvx/ago/ago_haf_cpu_filter.cpp @@ -0,0 +1,4048 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#include "ago_internal.h" + +extern vx_uint32 dataConvertU1ToU8_4bytes[16]; + +/* The function assumes at least one pixel padding on the top, left, right and bottom +Separable filter + 1 1 1 1 + 1 + 1 +*/ +int HafCpu_Box_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ) +{ + unsigned char *pLocalSrc = (unsigned char *)pSrcImage; + unsigned char *pLocalDst = (unsigned char *)pDstImage; + + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int tmpWidth = (dstWidth + 15) & ~15; + vx_uint16 * pPrevRow = (vx_uint16*) pScratch; + vx_uint16 * pCurrRow = ((vx_uint16*) pScratch) + tmpWidth; + vx_uint16 * pNextRow = ((vx_uint16*)pScratch) + (tmpWidth + tmpWidth); + + __m128i row0, shiftedR, shiftedL, temp0, temp1, resultH, resultL; + __m128i zeromask = _mm_setzero_si128(); + __m128i divFactor = _mm_set1_epi16((short)7282); // ceil((2^16)/9) = 7282 + + vx_uint16 * pLocalPrevRow = pPrevRow; + vx_uint16 * pLocalCurrRow = pCurrRow; + vx_uint16 * pLocalNextRow = pNextRow; + vx_uint16 * pTemp; + + // Process first two rows - Horizontal filtering + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes] + (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + *pLocalCurrRow++ = (vx_uint16)pLocalSrc[-1] + (vx_uint16)pLocalSrc[0] + (vx_uint16)pLocalSrc[1]; + } + + for (int x = 0; x < (alignedWidth >> 4); x++) + { + // row above + row0 = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes)); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: 1 * (-1,-1) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: 1 * (-1,-1) + + shiftedL = _mm_unpackhi_epi8(row0, zeromask); // H: 1 * (-1, 0) + row0 = _mm_cvtepu8_epi16(row0); // L: 1 * (-1, 0) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, row0); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, shiftedR); + + _mm_storeu_si128((__m128i *) pLocalPrevRow, resultL); + _mm_storeu_si128((__m128i *) (pLocalPrevRow + 8), resultH); + + // current row + row0 = _mm_loadu_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: 1 * (-1,-1) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: 1 * (-1,-1) + + shiftedL = _mm_unpackhi_epi8(row0, zeromask); // H: 1 * (0,-1) + row0 = _mm_cvtepu8_epi16(row0); // L: 2 * (0,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, row0); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, shiftedR); + + _mm_storeu_si128((__m128i *) pLocalCurrRow, resultL); + _mm_storeu_si128((__m128i *) 
(pLocalCurrRow + 8), resultH); + + pLocalSrc += 16; + pLocalPrevRow += 16; + pLocalCurrRow += 16; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes] + (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + *pLocalCurrRow++ = (vx_uint16)pLocalSrc[-1] + (vx_uint16)pLocalSrc[0] + (vx_uint16)pLocalSrc[1]; + } + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + // Process rows 3 till the end + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)(pSrcImage + srcImageStrideInBytes); // Pointing to the row below + pLocalDst = (unsigned char *) pDstImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + vx_uint16 temp = (vx_uint16)pLocalSrc[-1] + (vx_uint16)pLocalSrc[0] + (vx_uint16)pLocalSrc[1]; + *pLocalNextRow++ = temp; // Save the next row temp pixels + *pLocalDst++ = (char)((float)(temp + *pLocalPrevRow++ + *pLocalCurrRow++) / 9.0f); + } + + int width = (int)(alignedWidth >> 4); + while (width) + { + // Horizontal Filtering + // current row + row0 = _mm_loadu_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: 1 * (-1,-1) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: 1 * (-1,-1) + + shiftedL = _mm_unpackhi_epi8(row0, zeromask); // H: 2 * (0,-1) + row0 = _mm_cvtepu8_epi16(row0); // L: 2 * (0,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, row0); + + temp0 = _mm_loadu_si128((__m128i*) pLocalPrevRow); // Prev Row + temp1 = _mm_loadu_si128((__m128i*) (pLocalPrevRow + 8)); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, shiftedR); + + shiftedL = _mm_loadu_si128((__m128i*) pLocalCurrRow); // Current Row + shiftedR = _mm_loadu_si128((__m128i*) (pLocalCurrRow + 8)); + + temp1 = _mm_add_epi16(temp1, resultH); // Prev row + next row + temp0 = _mm_add_epi16(temp0, resultL); + + _mm_storeu_si128((__m128i*) pLocalNextRow, resultL); // Save the horizontal filtered pixels from the next row + _mm_storeu_si128((__m128i*) (pLocalNextRow + 8), resultH); + + temp1 = _mm_add_epi16(temp1, shiftedR); // Prev row + curr row + next row + temp0 = _mm_add_epi16(temp0, shiftedL); + temp1 = _mm_mulhi_epi16(temp1, divFactor); + temp0 = _mm_mulhi_epi16(temp0, divFactor); + + temp0 = _mm_packus_epi16(temp0, temp1); + _mm_store_si128((__m128i*) pLocalDst, temp0); + + pLocalSrc += 16; + pLocalDst += 16; + pLocalPrevRow += 16; + pLocalCurrRow += 16; + pLocalNextRow += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + vx_uint16 temp = (vx_uint16)pLocalSrc[-1] + (vx_uint16)pLocalSrc[0] + (vx_uint16)pLocalSrc[1]; + *pLocalNextRow++ = temp; // Save the next row temp pixels + *pLocalDst++ = (char)((float)(temp + *pLocalPrevRow++ + *pLocalCurrRow++) / 9.0f); + } + + pTemp = pPrevRow; + pPrevRow = pCurrRow; + pCurrRow = pNextRow; + pNextRow = pTemp; + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Dilate_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 
* pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + unsigned char *pLocalSrc, *pLocalDst; + __m128i row0, row1, row2, shiftedR, shiftedL; + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (unsigned char *)pDstImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + unsigned char temp1, temp2; + temp1 = max(max(pLocalSrc[-(int)srcImageStrideInBytes - 1], pLocalSrc[-(int)srcImageStrideInBytes]), pLocalSrc[-(int)srcImageStrideInBytes + 1]); + temp2 = max(max(pLocalSrc[-1], pLocalSrc[0]), pLocalSrc[1]); + temp1 = max(temp1, temp2); + temp2 = max(max(pLocalSrc[(int)srcImageStrideInBytes - 1], pLocalSrc[(int)srcImageStrideInBytes]), pLocalSrc[(int)srcImageStrideInBytes + 1]); + *pLocalDst++ = max(temp1, temp2); + } + + for (int width = 0; width < (int)(alignedWidth >> 4); width++, pLocalSrc += 16, pLocalDst += 16) + { + // For the row above + row0 = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes)); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes + 1)); + row0 = _mm_max_epu8(row0, shiftedL); + row0 = _mm_max_epu8(row0, shiftedR); + + // For the current row + row1 = _mm_loadu_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + row1 = _mm_max_epu8(row1, shiftedL); + row1 = _mm_max_epu8(row1, shiftedR); + + // For the row below + row2 = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes)); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes + 1)); + row2 = _mm_max_epu8(row2, shiftedL); + row2 = _mm_max_epu8(row2, shiftedR); + + row0 = _mm_max_epu8(row0, row1); + row0 = _mm_max_epu8(row0, row2); + _mm_store_si128((__m128i *) pLocalDst, row0); + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + unsigned char temp1, temp2; + temp1 = max(max(pLocalSrc[-(int)srcImageStrideInBytes - 1], pLocalSrc[-(int)srcImageStrideInBytes]), pLocalSrc[-(int)srcImageStrideInBytes + 1]); + temp2 = max(max(pLocalSrc[-1], pLocalSrc[0]), pLocalSrc[1]); + temp1 = max(temp1, temp2); + temp2 = max(max(pLocalSrc[(int)srcImageStrideInBytes - 1], pLocalSrc[(int)srcImageStrideInBytes]), pLocalSrc[(int)srcImageStrideInBytes + 1]); + *pLocalDst++ = max(temp1, temp2); + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Erode_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + unsigned char *pLocalSrc, *pLocalDst; + __m128i row0, row1, row2, shiftedR, shiftedL; + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (unsigned char *)pDstImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + unsigned char temp1, temp2; + temp1 = min(min(pLocalSrc[-(int)srcImageStrideInBytes - 1], pLocalSrc[-(int)srcImageStrideInBytes]), pLocalSrc[-(int)srcImageStrideInBytes + 1]); + temp2 = min(min(pLocalSrc[-1], pLocalSrc[0]), pLocalSrc[1]); + temp1 = min(temp1, temp2); + temp2 = min(min(pLocalSrc[(int)srcImageStrideInBytes - 1], pLocalSrc[(int)srcImageStrideInBytes]), pLocalSrc[(int)srcImageStrideInBytes + 1]); + *pLocalDst++ = min(temp1, temp2); + } + + for (int width = 0; width < (int)(alignedWidth >> 4); width++, pLocalSrc += 16, pLocalDst += 16) + { + // For the row above + row0 = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes)); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes + 1)); + row0 = _mm_min_epu8(row0, shiftedL); + row0 = _mm_min_epu8(row0, shiftedR); + + // For the current row + row1 = _mm_loadu_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + row1 = _mm_min_epu8(row1, shiftedL); + row1 = _mm_min_epu8(row1, shiftedR); + + // For the row below + row2 = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes)); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes + 1)); + row2 = _mm_min_epu8(row2, shiftedL); + row2 = _mm_min_epu8(row2, shiftedR); + + row0 = _mm_min_epu8(row0, row1); + row0 = _mm_min_epu8(row0, row2); + _mm_store_si128((__m128i *) pLocalDst, row0); + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + unsigned char temp1, temp2; + temp1 = min(min(pLocalSrc[-(int)srcImageStrideInBytes - 1], pLocalSrc[-(int)srcImageStrideInBytes]), pLocalSrc[-(int)srcImageStrideInBytes + 1]); + temp2 = min(min(pLocalSrc[-1], pLocalSrc[0]), pLocalSrc[1]); + temp1 = min(temp1, temp2); + temp2 = min(min(pLocalSrc[(int)srcImageStrideInBytes - 1], pLocalSrc[(int)srcImageStrideInBytes]), pLocalSrc[(int)srcImageStrideInBytes + 1]); + *pLocalDst++ = min(temp1, temp2); + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +#if USE_BMI2 +/* The function assumes that the source image pointer is 16 byte aligned, and the source stride as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth. 
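+For reference, the U8-to-U1 packing below uses BMI2 _pext_u64 with the mask 0x0101010101010101, which gathers bit 0 of each of the 8 input bytes into 8 adjacent output bits; assuming the usual 0x00/0xFF mask convention, the bytes FF 00 00 FF FF FF 00 FF (low byte first) pack to the bits 0b10111001, one U1 pixel per source byte, so two such calls cover a 16-byte SSE register.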
+The function assumes at least one pixel padding on the top, left, right and bottom */ +int HafCpu_Dilate_U1_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * src = (__m128i*) pSrcImage; + + __m128i row0, row1, row2, shiftedR, shiftedL, temp; + __m128i maskL = _mm_set_epi8((char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0xFF); + __m128i maskR = _mm_set_epi8((char)0xFF, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0); + + uint64_t maskConv = 0x0101010101010101; + uint64_t result[2]; + char lpixel, rpixel; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + row0 = _mm_load_si128(&src[(width >> 4) - (srcImageStrideInBytes >> 4)]); // row above + row1 = _mm_load_si128(&src[width >> 4]); + row2 = _mm_load_si128(&src[(width >> 4) + (srcImageStrideInBytes >> 4)]); // row below + + // For the row above + lpixel = (char)*(pSrcImage - srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage - srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row0, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row0, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row0 = _mm_or_si128(row0, shiftedL); + row0 = _mm_or_si128(row0, shiftedR); + + // For the current row + lpixel = (char)*(pSrcImage - 1); + rpixel = (char)*(pSrcImage + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row1, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row1, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row1 = _mm_or_si128(row1, shiftedL); + row1 = _mm_or_si128(row1, shiftedR); + + // For the row below + lpixel = (char)*(pSrcImage + srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage + srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row2, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row2, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row2 = _mm_or_si128(row2, shiftedL); + row2 = _mm_or_si128(row2, shiftedR); + + row0 = _mm_or_si128(row0, row1); + row0 = _mm_or_si128(row0, row2); + + // Convert U8 to U1 +#ifdef _WIN64 + result[0] = _pext_u64(row0.m128i_u64[0], maskConv); + result[1] = _pext_u64(row0.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + + *((unsigned char*)pDstImage + (width >> 4))= (unsigned char)(result[0]); + *((unsigned char*)pDstImage + (width >> 4) + 1) = (unsigned char)(result[1]); + } + src += (srcImageStrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the source image pointer is 16 byte aligned, and the source stride as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth. 
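+Note: this routine mirrors the dilate above with the neighbourhood combine switched from bitwise OR to bitwise AND; for 0x00/0xFF mask bytes OR behaves as max (dilation) and AND as min (erosion), and the same _pext_u64 step packs the result down to U1.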
+The function assumes at least one pixel padding on the top, left, right and bottom */ +int HafCpu_Erode_U1_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * src = (__m128i*) pSrcImage; + + __m128i row0, row1, row2, shiftedR, shiftedL, temp; + __m128i maskL = _mm_set_epi8((char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0xFF); + __m128i maskR = _mm_set_epi8((char)0xFF, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0); + + uint64_t maskConv = 0x0101010101010101; + uint64_t result[2]; + char lpixel, rpixel; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + row0 = _mm_load_si128(&src[(width >> 4) - (srcImageStrideInBytes >> 4)]); // row above + row1 = _mm_load_si128(&src[width >> 4]); // current row + row2 = _mm_load_si128(&src[(width >> 4) + (srcImageStrideInBytes >> 4)]); // row below + + // For the row above + lpixel = (char)*(pSrcImage - srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage - srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row0, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row0, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row0 = _mm_and_si128(row0, shiftedL); + row0 = _mm_and_si128(row0, shiftedR); + + // For the current row + lpixel = (char)*(pSrcImage - 1); + rpixel = (char)*(pSrcImage + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row1, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row1, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row1 = _mm_and_si128(row1, shiftedL); + row1 = _mm_and_si128(row1, shiftedR); + + // For the row below + lpixel = (char)*(pSrcImage + srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage + srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row2, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row2, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row2 = _mm_and_si128(row2, shiftedL); + row2 = _mm_and_si128(row2, shiftedR); + + row0 = _mm_and_si128(row0, row1); + row0 = _mm_and_si128(row0, row2); + + // Convert U8 to U1 +#ifdef _WIN64 + result[0] = _pext_u64(row0.m128i_u64[0], maskConv); + result[1] = _pext_u64(row0.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((result[1] & 0xFF) << 8) | (result[0] & 0xFF)); + } + src += (srcImageStrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the destination image pointer is 16 byte aligned, and the destination stride as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth. 
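+For reference, the U1 source rows are first widened with _pdep_u64(bits, 0x0101010101010101), which scatters 8 input bits into the least-significant bit of 8 consecutive bytes (for example, the bits 0b00000101 become the bytes 01 00 01 00 00 00 00 00, low byte first); after the 3x3 OR, the final _mm_cmpgt_epi8 against zero promotes each 0x01 byte to a full 0xFF output pixel.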
+The function assumes at least one pixel padding on the top, left, right and bottom */ +int HafCpu_Dilate_U8_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * dst = (__m128i*) pDstImage; + + __m128i row0, row1, row2, shiftedR, shiftedL, temp; + __m128i maskL = _mm_set_epi8((char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0xFF); + __m128i maskR = _mm_set_epi8((char)0xFF, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0); + + __declspec(align(16)) uint64_t pixels[2]; + uint64_t maskConv = 0x0101010101010101; + char lpixel, rpixel; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the row above + pixels[0] = (uint64_t)(*(pSrcImage - srcImageStrideInBytes)); + pixels[1] = (uint64_t)(*(pSrcImage - srcImageStrideInBytes + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row0 = _mm_load_si128((__m128i*) pixels); + + // Read the current row + pixels[0] = (uint64_t)(*pSrcImage); + pixels[1] = (uint64_t)(*(pSrcImage + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row1 = _mm_load_si128((__m128i*) pixels); + + // Read the row below + pixels[0] = (uint64_t)(*(pSrcImage + srcImageStrideInBytes)); + pixels[1] = (uint64_t)(*(pSrcImage + srcImageStrideInBytes + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row2 = _mm_load_si128((__m128i*) pixels); + + // For the row above + lpixel = (char)*(pSrcImage - srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage - srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row0, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row0, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row0 = _mm_or_si128(row0, shiftedL); + row0 = _mm_or_si128(row0, shiftedR); + + // For the current row + lpixel = (char)*(pSrcImage - 1); + rpixel = (char)*(pSrcImage + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row1, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row1, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row1 = _mm_or_si128(row1, shiftedL); + row1 = _mm_or_si128(row1, shiftedR); + + // For the row below + lpixel = (char)*(pSrcImage + srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage + srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row2, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row2, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row2 = _mm_or_si128(row2, shiftedL); + row2 = _mm_or_si128(row2, shiftedR); + + row0 = _mm_or_si128(row0, row1); + row0 = 
_mm_or_si128(row0, row2); + + // Convert the bytes from 0x01 -> 0xFF and 0x0 -> 0x0 + temp = _mm_setzero_si128(); + row0 = _mm_cmpgt_epi8(row0, temp); + + _mm_store_si128(&dst[width >> 4], row0); + } + pSrcImage += srcImageStrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +/* The function assumes that the destination image pointer is 16 byte aligned, and the destination stride as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth. +The function assumes at least one pixel padding on the top, left, right and bottom */ +int HafCpu_Erode_U8_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * dst = (__m128i*) pDstImage; + + __m128i row0, row1, row2, shiftedR, shiftedL, temp; + __m128i maskL = _mm_set_epi8((char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0xFF); + __m128i maskR = _mm_set_epi8((char)0xFF, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0); + + __declspec(align(16)) uint64_t pixels[2]; + uint64_t maskConv = 0x0101010101010101; + char lpixel, rpixel; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the row above + pixels[0] = (uint64_t)(*(pSrcImage - srcImageStrideInBytes)); + pixels[1] = (uint64_t)(*(pSrcImage - srcImageStrideInBytes + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row0 = _mm_load_si128((__m128i*) pixels); + + // Read the current row + pixels[0] = (uint64_t)(*pSrcImage); + pixels[1] = (uint64_t)(*(pSrcImage + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row1 = _mm_load_si128((__m128i*) pixels); + + // Read the row below + pixels[0] = (uint64_t)(*(pSrcImage + srcImageStrideInBytes)); + pixels[1] = (uint64_t)(*(pSrcImage + srcImageStrideInBytes + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row2 = _mm_load_si128((__m128i*) pixels); + + // For the row above + lpixel = (char)*(pSrcImage - srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage - srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row0, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row0, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row0 = _mm_and_si128(row0, shiftedL); + row0 = _mm_and_si128(row0, shiftedR); + + // For the current row + lpixel = (char)*(pSrcImage - 1); + rpixel = (char)*(pSrcImage + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row1, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row1, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row1 = _mm_and_si128(row1, shiftedL); + row1 = _mm_and_si128(row1, 
shiftedR); + + // For the row below + lpixel = (char)*(pSrcImage + srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage + srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row2, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row2, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row2 = _mm_and_si128(row2, shiftedL); + row2 = _mm_and_si128(row2, shiftedR); + + row0 = _mm_and_si128(row0, row1); + row0 = _mm_and_si128(row0, row2); + + // Convert the bytes from 0x01 -> 0xFF and 0x0 -> 0x0 + temp = _mm_setzero_si128(); + row0 = _mm_cmpgt_epi8(row0, temp); + + _mm_store_si128(&dst[width >> 4], row0); + } + pSrcImage += srcImageStrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +/* The function processes the pixels in a width which is the next highest multiple of 16 after dstWidth. +The function assumes at least one pixel padding on the top, left, right and bottom */ +int HafCpu_Dilate_U1_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i row0, row1, row2, shiftedR, shiftedL, temp; + __m128i maskL = _mm_set_epi8((char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0xFF); + __m128i maskR = _mm_set_epi8((char)0xFF, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0); + + __declspec(align(16)) uint64_t pixels[2]; + uint64_t maskConv = 0x0101010101010101; + char lpixel, rpixel; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the row above + pixels[0] = (uint64_t)(*(pSrcImage - srcImageStrideInBytes)); + pixels[1] = (uint64_t)(*(pSrcImage - srcImageStrideInBytes + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row0 = _mm_load_si128((__m128i*) pixels); + + // Read the current row + pixels[0] = (uint64_t)(*pSrcImage); + pixels[1] = (uint64_t)(*(pSrcImage + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row1 = _mm_load_si128((__m128i*) pixels); + + // Read the row below + pixels[0] = (uint64_t)(*(pSrcImage + srcImageStrideInBytes)); + pixels[1] = (uint64_t)(*(pSrcImage + srcImageStrideInBytes + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row2 = _mm_load_si128((__m128i*) pixels); + + // For the row above + lpixel = (char)*(pSrcImage - srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage - srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row0, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row0, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row0 = _mm_or_si128(row0, shiftedL); + row0 = _mm_or_si128(row0, shiftedR); + + // For the current row + lpixel = 
(char)*(pSrcImage - 1); + rpixel = (char)*(pSrcImage + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row1, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row1, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row1 = _mm_or_si128(row1, shiftedL); + row1 = _mm_or_si128(row1, shiftedR); + + // For the row below + lpixel = (char)*(pSrcImage + srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage + srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row2, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row2, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row2 = _mm_or_si128(row2, shiftedL); + row2 = _mm_or_si128(row2, shiftedR); + + row0 = _mm_or_si128(row0, row1); + row0 = _mm_or_si128(row0, row2); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels[0] = _pext_u64(row0.m128i_u64[0], maskConv); + pixels[1] = _pext_u64(row0.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels[1] & 0xFF) << 8) | (pixels[0] & 0xFF)); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function processes the pixels in a width which is the next highest multiple of 16 after dstWidth. +The function assumes at least one pixel padding on the top, left, right and bottom */ +int HafCpu_Erode_U1_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i row0, row1, row2, shiftedR, shiftedL, temp; + __m128i maskL = _mm_set_epi8((char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0xFF); + __m128i maskR = _mm_set_epi8((char)0xFF, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0, (char)0); + + __declspec(align(16)) uint64_t pixels[2]; + uint64_t maskConv = 0x0101010101010101; + char lpixel, rpixel; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the row above + pixels[0] = (uint64_t)(*(pSrcImage - srcImageStrideInBytes)); + pixels[1] = (uint64_t)(*(pSrcImage - srcImageStrideInBytes + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row0 = _mm_load_si128((__m128i*) pixels); + + // Read the current row + pixels[0] = (uint64_t)(*pSrcImage); + pixels[1] = (uint64_t)(*(pSrcImage + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + row1 = _mm_load_si128((__m128i*) pixels); + + // Read the row below + pixels[0] = (uint64_t)(*(pSrcImage + srcImageStrideInBytes)); + pixels[1] = (uint64_t)(*(pSrcImage + srcImageStrideInBytes + 8)); +#ifdef _WIN64 + pixels[0] = _pdep_u64(pixels[0], maskConv); + pixels[1] = _pdep_u64(pixels[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") 
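+/* Sketch for a possible 32-bit fallback (assumption, untested): the 64-bit deposit above could be split into two BMI2 _pdep_u32 calls, e.g. lo = _pdep_u32((unsigned)bits & 0xF, 0x01010101); hi = _pdep_u32(((unsigned)bits >> 4) & 0xF, 0x01010101); value = ((uint64_t)hi << 32) | lo; also note the TBD message mentions _pext_u32, but this read step deposits bits, so _pdep_u32 would be the matching 32-bit intrinsic. */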
+#endif + row2 = _mm_load_si128((__m128i*) pixels); + + // For the row above + lpixel = (char)*(pSrcImage - srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage - srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row0, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row0, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row0 = _mm_and_si128(row0, shiftedL); + row0 = _mm_and_si128(row0, shiftedR); + + // For the current row + lpixel = (char)*(pSrcImage - 1); + rpixel = (char)*(pSrcImage + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row1, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row1, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row1 = _mm_and_si128(row1, shiftedL); + row1 = _mm_and_si128(row1, shiftedR); + + // For the row below + lpixel = (char)*(pSrcImage + srcImageStrideInBytes - 1); + rpixel = (char)*(pSrcImage + srcImageStrideInBytes + 17); + temp = _mm_set1_epi8(lpixel); + shiftedL = _mm_slli_si128(row2, 1); + shiftedL = _mm_blendv_epi8(shiftedL, temp, maskL); + temp = _mm_set1_epi8(rpixel); + shiftedR = _mm_srli_si128(row2, 1); + shiftedR = _mm_blendv_epi8(shiftedR, temp, maskR); + row2 = _mm_and_si128(row2, shiftedL); + row2 = _mm_and_si128(row2, shiftedR); + + row0 = _mm_and_si128(row0, row1); + row0 = _mm_and_si128(row0, row2); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels[0] = _pext_u64(row0.m128i_u64[0], maskConv); + pixels[1] = _pext_u64(row0.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels[1] & 0xFF) << 8) | (pixels[0] & 0xFF)); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +#else +/* The function assumes that the source width is a multiple of 8 pixels and the source image pointer points to the second row of the image (first row of the valid region)*/ +int HafCpu_Dilate_U1_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i *pLocalSrcCurrRow, *pLocalSrcPrevRow, *pLocalSrcNextRow; + vx_int16 * pLocalDst; + + __m128i row0, row1, row2, shiftedR, shiftedL; + + int pixelmask; + int alignedWidth = (int)dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + int strideDiv16 = (int)(srcImageStrideInBytes >> 4); + int height = (int)dstHeight; + + while (height > 0) + { + pLocalSrcCurrRow = (__m128i*) pSrcImage; + pLocalSrcPrevRow = pLocalSrcCurrRow - strideDiv16; + pLocalSrcNextRow = pLocalSrcCurrRow + strideDiv16; + pLocalDst = (vx_int16 *)pDstImage; + + int width = alignedWidth >> 4; // 16 pixels (bits) are processed at a time in the inner loop + while (width > 0) + { + row0 = _mm_loadu_si128(pLocalSrcPrevRow); + row1 = _mm_loadu_si128(pLocalSrcCurrRow); + row2 = _mm_loadu_si128(pLocalSrcNextRow); + + // For the row above + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrcPrevRow - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrcPrevRow + 1)); + row0 = _mm_or_si128(row0, shiftedL); + row0 = _mm_or_si128(row0, shiftedR); + + // For the current row + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrcCurrRow - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrcCurrRow + 1)); + row1 = 
_mm_or_si128(row1, shiftedL); + row1 = _mm_or_si128(row1, shiftedR); + + // For the row below + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrcNextRow - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrcNextRow + 1)); + row2 = _mm_or_si128(row2, shiftedL); + row2 = _mm_or_si128(row2, shiftedR); + + row0 = _mm_or_si128(row0, row1); + row0 = _mm_or_si128(row0, row2); + pixelmask = _mm_movemask_epi8(row0); // Convert U8 to U1 + *pLocalDst++ = (vx_int16)(pixelmask & 0xFFFF); + pLocalSrcCurrRow++; + pLocalSrcPrevRow++; + pLocalSrcNextRow++; + width--; + } + + if (postfixWidth) // XX XX valid XX + { + vx_int16 * pRow = ((vx_int16 *)pLocalSrcPrevRow) - 1; + pixelmask = *((int *)pRow); + pixelmask = (pixelmask << 1) | pixelmask | (pixelmask >> 1); + + pRow = ((vx_int16 *)pLocalSrcCurrRow) - 1; + int temp = *((int *)pRow); + pixelmask |= ((temp << 1) | temp | (temp >> 1)); + + pRow = ((vx_int16 *)pLocalSrcNextRow) - 1; + temp = *((int *)pRow); + pixelmask |= ((temp << 1) | temp | (temp >> 1)); + + *((vx_uint8*)pLocalDst) = (vx_uint8)((pixelmask >> 8) & 0xFF); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Erode_U1_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i *pLocalSrcCurrRow, *pLocalSrcPrevRow, *pLocalSrcNextRow; + vx_int16 * pLocalDst; + + __m128i row0, row1, row2, shiftedR, shiftedL; + + int pixelmask; + int alignedWidth = (int)dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + int strideDiv16 = (int)(srcImageStrideInBytes >> 4); + int height = (int)dstHeight; + + while (height > 0) + { + pLocalSrcCurrRow = (__m128i*) pSrcImage; + pLocalSrcPrevRow = pLocalSrcCurrRow - (srcImageStrideInBytes >> 4); + pLocalSrcNextRow = pLocalSrcCurrRow + (srcImageStrideInBytes >> 4); + pLocalDst = (vx_int16 *)pDstImage; + + int width = alignedWidth >> 4; // 16 pixels (bits) are processed at a time in the inner loop + while (width > 0) + { + row0 = _mm_loadu_si128(pLocalSrcPrevRow); + row1 = _mm_loadu_si128(pLocalSrcCurrRow); + row2 = _mm_loadu_si128(pLocalSrcNextRow); + + // For the row above + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrcPrevRow - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrcPrevRow + 1)); + row0 = _mm_and_si128(row0, shiftedL); + row0 = _mm_and_si128(row0, shiftedR); + + // For the current row + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrcCurrRow - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrcCurrRow + 1)); + row1 = _mm_and_si128(row1, shiftedL); + row1 = _mm_and_si128(row1, shiftedR); + + // For the row below + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrcNextRow - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrcNextRow + 1)); + row2 = _mm_and_si128(row2, shiftedL); + row2 = _mm_and_si128(row2, shiftedR); + + row0 = _mm_and_si128(row0, row1); + row0 = _mm_and_si128(row0, row2); + pixelmask = _mm_movemask_epi8(row0); // Convert U8 to U1 + *pLocalDst++ = (vx_int16)(pixelmask & 0xFFFF); + pLocalSrcCurrRow++; + pLocalSrcPrevRow++; + pLocalSrcNextRow++; + width--; + } + + if (postfixWidth) // XX XX valid XX + { + vx_int16 * pRow = ((vx_int16 *)pLocalSrcPrevRow) - 1; + pixelmask = *((int *)pRow); + pixelmask = (pixelmask << 1) & pixelmask & (pixelmask >> 1); + + pRow = ((vx_int16 *)pLocalSrcCurrRow) - 1; + int temp = *((int *)pRow); + pixelmask &= ((temp << 1) & temp & (temp >> 1)); + + pRow = ((vx_int16 
*)pLocalSrcNextRow) - 1; + temp = *((int *)pRow); + pixelmask &= ((temp << 1) & temp & (temp >> 1)); + + *((vx_uint8*)pLocalDst) = (vx_uint8)((pixelmask >> 8) & 0xFF); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Dilate_U1_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + short *pLocalSrc, *pLocalDst; + + int pixels, row, shiftedL, shiftedR; + + int alignedWidth = (int)dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (short *) (pSrcImage - 1); + pLocalDst = (short *) pDstImage; + int width = alignedWidth >> 4; // 16 pixels processed at a time in the inner loop + + int strideDiv2 = (int)(srcImageStrideInBytes >> 1); + while (width) + { + // Each read, reads 32 bits, first 8 bits don't care, next 16 bits useful and last 8 again don't care + // Previous row + row = *((int*)(pLocalSrc - strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels = row | shiftedL | shiftedR; + + // Current row + row = *((int*)pLocalSrc); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels |= (row | shiftedL | shiftedR); + + // Next row + row = *((int*)(pLocalSrc + strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels |= (row | shiftedL | shiftedR); + + *pLocalDst++ = (short)((pixels >> 8) & 0xFFFF); + pLocalSrc++; + width--; + } + + if (postfixWidth) + { + row = *((int*)(pLocalSrc - strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels = row | shiftedL | shiftedR; + + // Current row + row = *((int*)pLocalSrc); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels |= (row | shiftedL | shiftedR); + + // Next row + row = *((int*)(pLocalSrc + strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels |= (row | shiftedL | shiftedR); + + *((vx_uint8*)pLocalDst) = (vx_uint8)((pixels >> 16) & 0xFF); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Erode_U1_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + short *pLocalSrc, *pLocalDst; + + int pixels, row, shiftedL, shiftedR; + + int alignedWidth = (int)dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (short *)(pSrcImage - 1); + pLocalDst = (short *)pDstImage; + int width = alignedWidth >> 4; + int strideDiv2 = (int)(srcImageStrideInBytes >> 1); + while (width) + { + // Each read, reads 32 bits, first 8 bits don't care, next 16 bits useful and last 8 again don't care + + // Previous row + row = *((int*)(pLocalSrc - strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels = row & shiftedL & shiftedR; + + // Current row + row = *((int*)pLocalSrc); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels &= (row & shiftedL & shiftedR); + + // Next row + row = *((int*)(pLocalSrc + strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels &= (row & shiftedL & shiftedR); + + *pLocalDst++ = (short)((pixels >> 8) & 0xFFFF); + pLocalSrc++; + width--; + } + + if (postfixWidth) + { + // Previous row + row = *((int*)(pLocalSrc - strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels = row & 
shiftedL & shiftedR; + + // Current row + row = *((int*)pLocalSrc); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels &= (row & shiftedL & shiftedR); + + // Next row + row = *((int*)(pLocalSrc + strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels &= (row & shiftedL & shiftedR); + + *((vx_uint8*)pLocalDst) = (vx_uint8)((pixels >> 16) & 0xFF); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Dilate_U8_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + vx_int16 * pLocalSrc; + vx_int32 * pLocalDst; + + int pixels, row, shiftedL, shiftedR; + + int alignedWidth = (int)dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + int height = (int)dstHeight; + int strideDiv2 = (int)(srcImageStrideInBytes >> 1); + while (height) + { + pLocalSrc = (vx_int16 *)(pSrcImage - 1); + pLocalDst = (vx_int32 *)pDstImage; + int width = alignedWidth >> 4; + + while (width) + { + // Each read, reads 32 bits, first 8 bits don't care, next 16 bits useful and last 8 again don't care + // Previous row + row = *((int*)(pLocalSrc - strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels = row | shiftedL | shiftedR; + + // Current row + row = *((int*)pLocalSrc); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels |= (row | shiftedL | shiftedR); + + // Next row + row = *((int*)(pLocalSrc + strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels |= (row | shiftedL | shiftedR); + + pixels >>= 8; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pLocalSrc++; + width--; + } + + if (postfixWidth) + { + row = *((int*)(pLocalSrc - strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels = row | shiftedL | shiftedR; + + // Current row + row = *((int*)pLocalSrc); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels |= (row | shiftedL | shiftedR); + + // Next row + row = *((int*)(pLocalSrc + strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels |= (row | shiftedL | shiftedR); + + pixels >>= 16; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Erode_U8_U1_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + short * pLocalSrc; + int * pLocalDst; + + int pixels, row, shiftedL, shiftedR; + + int alignedWidth = (int)dstWidth & ~15; + int postfixWidth = (int)dstWidth - alignedWidth; + + int height = (int)dstHeight; + int strideDiv2 = (int)(srcImageStrideInBytes >> 1); + + while (height) + { + pLocalSrc = (short *)(pSrcImage - 1); + pLocalDst = (int *)pDstImage; + int width = alignedWidth >> 4; + + while (width) + { + // Each read, reads 32 bits, first 8 bits don't care, next 16 bits useful and last 8 again don't care + // Previous row + row = *((int*)(pLocalSrc - strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels = row & shiftedL & shiftedR; + + // Current 
row + row = *((int*)pLocalSrc); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels &= (row & shiftedL & shiftedR); + + // Next row + row = *((int*)(pLocalSrc + strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels &= (row & shiftedL & shiftedR); + + pixels >>= 8; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pLocalSrc++; + width--; + } + + if (postfixWidth) + { + // Previous row + row = *((int*)(pLocalSrc - strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels = row & shiftedL & shiftedR; + + // Current row + row = *((int*)pLocalSrc); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels &= (row & shiftedL & shiftedR); + + // Next row + row = *((int*)(pLocalSrc + strideDiv2)); + shiftedL = row << 1; + shiftedR = row >> 1; + pixels &= (row & shiftedL & shiftedR); + + pixels >>= 16; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + pixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels & 0xF]; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +#endif + +/* The function assumes at least one pixel padding on the top, left, right and bottom + Separable filter + 1 1 2 1 + 2 + 1 +*/ +int HafCpu_Gaussian_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ) +{ + unsigned char *pLocalSrc = (unsigned char *)pSrcImage; + unsigned char *pLocalDst = (unsigned char *)pDstImage; + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int tmpWidth = (dstWidth + 15) & ~15; + vx_uint16 * pPrevRow = (vx_uint16*)pScratch; + vx_uint16 * pCurrRow = ((vx_uint16*)pScratch) + tmpWidth; + vx_uint16 * pNextRow = ((vx_uint16*)pScratch) + (tmpWidth + tmpWidth); + + __m128i row0, shiftedR, shiftedL, temp0, temp1, resultH, resultL; + __m128i zeromask = _mm_setzero_si128(); + + vx_uint16 * pLocalPrevRow = pPrevRow; + vx_uint16 * pLocalCurrRow = pCurrRow; + vx_uint16 * pLocalNextRow = pNextRow; + vx_uint16 * pTemp; + + // Process first two rows - Horizontal filtering + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + 2 * (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes] + (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + *pLocalCurrRow++ = (vx_uint16)pLocalSrc[-1] + 2 * (vx_uint16)pLocalSrc[0] + (vx_uint16)pLocalSrc[1]; + } + + for (int x = 0; x < (alignedWidth >> 4); x++) + { + // row above + row0 = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes)); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: 1 * (-1,-1) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: 1 * (-1,-1) + + shiftedL = _mm_unpackhi_epi8(row0, zeromask); + shiftedL = _mm_slli_epi16(shiftedL, 1); // H: 2 * (0,-1) + row0 = _mm_cvtepu8_epi16(row0); + row0 = _mm_slli_epi16(row0, 1); // L: 2 * (0,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, row0); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, shiftedR); + + _mm_storeu_si128((__m128i *) pLocalPrevRow, resultL); + _mm_storeu_si128((__m128i *) (pLocalPrevRow + 8), resultH); + + // current row + row0 = _mm_loadu_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: 1 * (-1,0) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: 1 * (-1,0) + + shiftedL = _mm_unpackhi_epi8(row0, zeromask); + shiftedL = _mm_slli_epi16(shiftedL, 1); // H: 2 * (0,0) + row0 = _mm_cvtepu8_epi16(row0); + row0 = _mm_slli_epi16(row0, 1); // L: 2 * (0,0) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, row0); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,0) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,0) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, shiftedR); + + _mm_storeu_si128((__m128i *) pLocalCurrRow, resultL); + _mm_storeu_si128((__m128i *) (pLocalCurrRow + 8), resultH); + + pLocalSrc += 16; + pLocalPrevRow += 16; + pLocalCurrRow += 16; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + 2 * (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes] + (vx_uint16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + *pLocalCurrRow++ = (vx_uint16)pLocalSrc[-1] + 2 * (vx_uint16)pLocalSrc[0] + (vx_uint16)pLocalSrc[1]; + } + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + // Process rows 3 
till the end + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)(pSrcImage + srcImageStrideInBytes); // Pointing to the row below + pLocalDst = (unsigned char *)pDstImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + vx_uint16 temp = (vx_uint16)pLocalSrc[-1] + 2 * (vx_uint16)pLocalSrc[0] + (vx_uint16)pLocalSrc[1]; + *pLocalNextRow++ = temp; // Save the next row temp pixels + *pLocalDst++ = (char)((temp + *pLocalPrevRow++ + 2*(*pLocalCurrRow++)) >> 4); + } + + int width = (int)(alignedWidth >> 4); + while (width) + { + // Horizontal Filtering + // current row + row0 = _mm_loadu_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: 1 * (-1,-1) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: 1 * (-1,-1) + + shiftedL = _mm_unpackhi_epi8(row0, zeromask); + shiftedL = _mm_slli_epi16(shiftedL, 1); // H: 2 * (0,0) + row0 = _mm_cvtepu8_epi16(row0); + row0 = _mm_slli_epi16(row0, 1); // L: 2 * (0,0) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, row0); + + temp0 = _mm_loadu_si128((__m128i*) pLocalPrevRow); // Prev Row + temp1 = _mm_loadu_si128((__m128i*) (pLocalPrevRow + 8)); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, shiftedR); + + shiftedL = _mm_loadu_si128((__m128i*) pLocalCurrRow); // Current Row + shiftedL = _mm_slli_epi16(shiftedL, 1); + shiftedR = _mm_loadu_si128((__m128i*) (pLocalCurrRow + 8)); + shiftedR = _mm_slli_epi16(shiftedR, 1); + + temp1 = _mm_add_epi16(temp1, resultH); // Prev row + next row + temp0 = _mm_add_epi16(temp0, resultL); + + _mm_storeu_si128((__m128i*) pLocalNextRow, resultL); // Save the horizontal filtered pixels from the next row + _mm_storeu_si128((__m128i*) (pLocalNextRow + 8), resultH); + + temp1 = _mm_add_epi16(temp1, shiftedR); // Prev row + curr row + next row + temp0 = _mm_add_epi16(temp0, shiftedL); + temp1 = _mm_srli_epi16(temp1, 4); + temp0 = _mm_srli_epi16(temp0, 4); + + temp0 = _mm_packus_epi16(temp0, temp1); + _mm_store_si128((__m128i*) pLocalDst, temp0); + + pLocalSrc += 16; + pLocalDst += 16; + pLocalPrevRow += 16; + pLocalCurrRow += 16; + pLocalNextRow += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + vx_uint16 temp = (vx_uint16)pLocalSrc[-1] + 2 * (vx_uint16)pLocalSrc[0] + (vx_uint16)pLocalSrc[1]; + *pLocalNextRow++ = temp; // Save the next row temp pixels + *pLocalDst++ = (char)((temp + *pLocalPrevRow++ + 2*(*pLocalCurrRow++)) >> 4); + } + + pTemp = pPrevRow; + pPrevRow = pCurrRow; + pCurrRow = pNextRow; + pNextRow = pTemp; + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} + +/* The function assumes at least one pixel padding on the top, left, right and bottom + Separable filter + 1 -1 0 1 + 2 + 1 +*/ +int HafCpu_Sobel_S16_U8_3x3_GX + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstGxImage, + vx_uint32 dstGxImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ) +{ + unsigned char *pLocalSrc = (unsigned char *)pSrcImage; + short * pLocalDst; + + int prefixWidth = intptr_t(pDstGxImage) & 15; + prefixWidth = 
(prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int tmpWidth = (dstWidth + 15) & ~15; + vx_int16 * pPrevRow = (vx_int16*)pScratch; + vx_int16 * pCurrRow = ((vx_int16*)pScratch) + tmpWidth; + vx_int16 * pNextRow = ((vx_int16*)pScratch) + (tmpWidth + tmpWidth); + + __m128i row0, shiftedR, shiftedL, temp0, temp1, resultH, resultL; + __m128i zeromask = _mm_setzero_si128(); + + vx_int16 * pLocalPrevRow = pPrevRow; + vx_int16 * pLocalCurrRow = pCurrRow; + vx_int16 * pLocalNextRow = pNextRow; + vx_int16 * pTemp; + + // Process first two rows - Horizontal filtering + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1]; + *pLocalCurrRow++ = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; + } + + for (int x = 0; x < (int)(alignedWidth >> 4); x++) + { + // row above + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes + 1)); + + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: -1 * (-1,-1) + resultL = _mm_cvtepu8_epi16(shiftedL); // L: -1 * (-1,-1) + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,-1) + resultH = _mm_sub_epi16(shiftedL, resultH); + resultL = _mm_sub_epi16(shiftedR, resultL); + + _mm_store_si128((__m128i *) pLocalPrevRow, resultL); + _mm_store_si128((__m128i *) (pLocalPrevRow + 8), resultH); + + // current row + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: -1 * (-1,0) + resultL = _mm_cvtepu8_epi16(shiftedL); // L: -1 * (-1,0) + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,0) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,0) + resultH = _mm_sub_epi16(shiftedL, resultH); + resultL = _mm_sub_epi16(shiftedR, resultL); + + _mm_store_si128((__m128i *) pLocalCurrRow, resultL); + _mm_store_si128((__m128i *) (pLocalCurrRow + 8), resultH); + + pLocalSrc += 16; + pLocalPrevRow += 16; + pLocalCurrRow += 16; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1]; + *pLocalCurrRow++ = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; + } + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + // Process rows 3 till the end + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)(pSrcImage + srcImageStrideInBytes); // Pointing to the row below + pLocalDst = (short *)pDstGxImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + vx_int16 temp = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; + *pLocalNextRow++ = temp; // Save the next row temp pixels + *pLocalDst++ = temp + *pLocalPrevRow++ + 2 * (*pLocalCurrRow++); + } + + int width = (int)(alignedWidth >> 4); + while (width) + { + // Horizontal Filtering + // current row + row0 = _mm_loadu_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: -1 * (-1,-1) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: -1 
* (-1,-1) + + temp0 = _mm_load_si128((__m128i*) pLocalPrevRow); // Prev Row + temp1 = _mm_load_si128((__m128i*) (pLocalPrevRow + 8)); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,-1) + resultH = _mm_sub_epi16(shiftedL, resultH); + resultL = _mm_sub_epi16(shiftedR, resultL); + + shiftedL = _mm_load_si128((__m128i*) pLocalCurrRow); // Current Row + shiftedR = _mm_load_si128((__m128i*) (pLocalCurrRow + 8)); + + temp1 = _mm_add_epi16(temp1, resultH); // Prev row + next row + temp0 = _mm_add_epi16(temp0, resultL); + + shiftedR = _mm_slli_epi16(shiftedR, 1); + shiftedL = _mm_slli_epi16(shiftedL, 1); + + _mm_store_si128((__m128i*) pLocalNextRow, resultL); // Save the horizontal filtered pixels from the next row + _mm_store_si128((__m128i*) (pLocalNextRow + 8), resultH); + + temp1 = _mm_add_epi16(temp1, shiftedR); // Prev row + 2*curr row + next row + temp0 = _mm_add_epi16(temp0, shiftedL); + + _mm_store_si128((__m128i*) pLocalDst, temp0); + _mm_store_si128((__m128i*) (pLocalDst + 8), temp1); + + pLocalSrc += 16; + pLocalDst += 16; + pLocalPrevRow += 16; + pLocalCurrRow += 16; + pLocalNextRow += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + vx_int16 temp = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; + *pLocalNextRow++ = temp; // Save the next row temp pixels + *pLocalDst++ = temp + *pLocalPrevRow++ + 2 * (*pLocalCurrRow++); + } + + pTemp = pPrevRow; + pPrevRow = pCurrRow; + pCurrRow = pNextRow; + pNextRow = pTemp; + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + pSrcImage += srcImageStrideInBytes; + pDstGxImage += (dstGxImageStrideInBytes >> 1); + height--; + } + return AGO_SUCCESS; +} + +/* The function assumes at least one pixel padding on the top, left, right and bottom + Separable filter: + -1 1 2 1 + 0 + 1 +*/ +int HafCpu_Sobel_S16_U8_3x3_GY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstGyImage, + vx_uint32 dstGyImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ) +{ + unsigned char *pLocalSrc = (unsigned char *)pSrcImage; + short * pLocalDst; + + int prefixWidth = intptr_t(pDstGyImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int tmpWidth = (dstWidth + 15) & ~15; + vx_int16 * pPrevRow = (vx_int16*)pScratch; + vx_int16 * pCurrRow = ((vx_int16*)pScratch) + tmpWidth; + vx_int16 * pNextRow = ((vx_int16*)pScratch) + (tmpWidth + tmpWidth); + + __m128i row0, shiftedR, shiftedL, temp0, temp1, resultH, resultL; + __m128i zeromask = _mm_setzero_si128(); + + vx_int16 * pLocalPrevRow = pPrevRow; + vx_int16 * pLocalCurrRow = pCurrRow; + vx_int16 * pLocalNextRow = pNextRow; + vx_int16 * pTemp; + + // Process first two rows - Horizontal filtering + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + ((vx_int16)pLocalSrc[-(int)srcImageStrideInBytes] << 1) + (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + *pLocalCurrRow++ = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; + } + + for (int x = 0; x < (int)(alignedWidth >> 4); x++) + { + // row above + row0 = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes)); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: 1 * (-1,-1) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: 1 * (-1,-1) + + shiftedL = _mm_unpackhi_epi8(row0, zeromask); + shiftedL = _mm_slli_epi16(shiftedL, 1); // H: 2 * (0,-1) + row0 = _mm_cvtepu8_epi16(row0); + row0 = _mm_slli_epi16(row0, 1); // L: 2 * (0,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, row0); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,-1) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, shiftedR); + + _mm_storeu_si128((__m128i *) pLocalPrevRow, resultL); + _mm_storeu_si128((__m128i *) (pLocalPrevRow + 8), resultH); + + // current row + row0 = _mm_loadu_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: 1 * (-1,0) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: 1 * (-1,0) + + shiftedL = _mm_unpackhi_epi8(row0, zeromask); + shiftedL = _mm_slli_epi16(shiftedL, 1); // H: 2 * (0,0) + row0 = _mm_cvtepu8_epi16(row0); + row0 = _mm_slli_epi16(row0, 1); // L: 2 * (0,0) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, row0); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,0) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,0) + resultH = _mm_add_epi16(resultH, shiftedL); + resultL = _mm_add_epi16(resultL, shiftedR); + + _mm_storeu_si128((__m128i *) pLocalCurrRow, resultL); + _mm_storeu_si128((__m128i *) (pLocalCurrRow + 8), resultH); + + pLocalSrc += 16; + pLocalPrevRow += 16; + pLocalCurrRow += 16; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + ((vx_int16)pLocalSrc[-(int)srcImageStrideInBytes] << 1) + (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + *pLocalCurrRow++ = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; + } + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + // Process rows 3 till 
the end + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)(pSrcImage + srcImageStrideInBytes); // Pointing to the row below + pLocalDst = (short *)pDstGyImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + vx_int16 temp = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; + *pLocalNextRow++ = temp; // Save the next row temp pixels + *pLocalDst++ = *pLocalPrevRow++ - temp; + } + + int width = (int)(alignedWidth >> 4); + while (width) + { + // Horizontal Filtering + // current row + row0 = _mm_loadu_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + resultL = _mm_cvtepu8_epi16(shiftedL); // L: 1 * (-1,1) + resultH = _mm_unpackhi_epi8(shiftedL, zeromask); // H: 1 * (-1,1) + + temp0 = _mm_load_si128((__m128i*) pLocalPrevRow); // Prev Row + temp1 = _mm_load_si128((__m128i*) (pLocalPrevRow + 8)); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // H: 1 * (1,1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // L: 1 * (1,1) + resultH = _mm_add_epi16(shiftedL, resultH); + resultL = _mm_add_epi16(shiftedR, resultL); + + shiftedL = _mm_unpackhi_epi8(row0, zeromask); + shiftedR = _mm_cvtepu8_epi16(row0); + shiftedL = _mm_slli_epi16(shiftedL, 1); // H: 2 * (0,1) + shiftedR = _mm_slli_epi16(shiftedR, 1); // L: 2 * (0,1) + resultH = _mm_add_epi16(shiftedL, resultH); // Horizontal filtered next row + resultL = _mm_add_epi16(shiftedR, resultL); + + _mm_store_si128((__m128i*) pLocalNextRow, resultL); // Save the horizontal filtered pixels from the next row + _mm_store_si128((__m128i*) (pLocalNextRow + 8), resultH); + + temp1 = _mm_sub_epi16(resultH, temp1); // Next row - prev row + temp0 = _mm_sub_epi16(resultL, temp0); + + _mm_store_si128((__m128i*) pLocalDst, temp0); + _mm_store_si128((__m128i*) (pLocalDst + 8), temp1); + + pLocalSrc += 16; + pLocalDst += 16; + pLocalPrevRow += 16; + pLocalCurrRow += 16; + pLocalNextRow += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + vx_int16 temp = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; + *pLocalNextRow++ = temp; // Save the next row temp pixels + *pLocalDst++ = *pLocalPrevRow++ - temp; + } + + pTemp = pPrevRow; + pPrevRow = pCurrRow; + pCurrRow = pNextRow; + pNextRow = pTemp; + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + pSrcImage += srcImageStrideInBytes; + pDstGyImage += (dstGyImageStrideInBytes >> 1); + height--; + } + + return AGO_SUCCESS; +} + +/* The function assumes at least one pixel padding on the top, left, right and bottom + Separable filter + 1 -1 0 1 -1 1 2 1 + Gx = 2 Gy = 0 + 1 -1 +*/ +int HafCpu_Sobel_S16S16_U8_3x3_GXY + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstGxImage, + vx_uint32 dstGxImageStrideInBytes, + vx_int16 * pDstGyImage, + vx_uint32 dstGyImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ) +{ + unsigned char *pLocalSrc = (unsigned char *)pSrcImage; + short *pLocalDstGx, *pLocalDstGy; + + int prefixWidth = intptr_t(pDstGxImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int tmpWidth = (dstWidth + 15) & ~15; + vx_int16 * pPrevRow = (vx_int16*)pScratch; + vx_int16 * pCurrRow = ((vx_int16*)pScratch) + (2 * tmpWidth); + vx_int16 * pNextRow = ((vx_int16*)pScratch) + (4 * tmpWidth); + + __m128i row0, shiftedR, shiftedL, temp0, temp1, temp2, GxH, GxL, GyH, GyL; + __m128i zeromask = _mm_setzero_si128(); + + vx_int16 * pLocalPrevRow = pPrevRow; + vx_int16 * pLocalCurrRow = pCurrRow; + vx_int16 * pLocalNextRow = pNextRow; + vx_int16 * pTemp; + + // Process first two rows - Horizontal filtering + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1]; // Gx + *pLocalPrevRow++ = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + ((vx_int16)pLocalSrc[-(int)srcImageStrideInBytes] << 1) + (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; // Gy + *pLocalCurrRow++ = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; // Gx + *pLocalCurrRow++ = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; // Gy + } + + for (int x = 0; x < (int)(alignedWidth >> 4); x++) + { + // row above + row0 = _mm_load_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes)); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes + 1)); + + GyH = _mm_unpackhi_epi8(row0, zeromask); + GyH = _mm_slli_epi16(GyH, 1); // GyH: 2 * (0,-1) + GyL = _mm_cvtepu8_epi16(row0); + GyL = _mm_slli_epi16(GyL, 1); // GyL: 2 * (0,-1) + + GxL = _mm_cvtepu8_epi16(shiftedL); // GxL: -1 * (-1,-1) GyL: 1 * (-1,-1) + GxH = _mm_unpackhi_epi8(shiftedL, zeromask); // GxH: -1 * (-1,-1) GyH: 1 * (-1,-1) + GyH = _mm_add_epi16(GyH, GxH); + GyL = _mm_add_epi16(GyL, GxL); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // GxH: 1 * (1,-1) GyH: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // GxL: 1 * (1,-1) GyL: 1 * (1,-1) + GxH = _mm_sub_epi16(shiftedL, GxH); + GxL = _mm_sub_epi16(shiftedR, GxL); + GyH = _mm_add_epi16(GyH, shiftedL); + GyL = _mm_add_epi16(GyL, shiftedR); + + _mm_store_si128((__m128i *) pLocalPrevRow, GxL); + _mm_store_si128((__m128i *) (pLocalPrevRow + 8), GxH); + _mm_store_si128((__m128i *) (pLocalPrevRow + 16), GyL); + _mm_store_si128((__m128i *) (pLocalPrevRow + 24), GyH); + + // current row + row0 = _mm_load_si128((__m128i *)pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + GyH = _mm_unpackhi_epi8(row0, zeromask); + GyH = _mm_slli_epi16(GyH, 1); // GyH: 2 * (-1, 0) + GyL = _mm_cvtepu8_epi16(row0); + GyL = _mm_slli_epi16(GyL, 1); // GyL: 2 * (-1, 0) + + GxL = _mm_cvtepu8_epi16(shiftedL); // GxL: -1 * (-1,-1) GyL: 1 * (-1,-1) + GxH = _mm_unpackhi_epi8(shiftedL, zeromask); // GxH: -1 * (-1,-1) GyH: 1 * (-1,-1) + GyH = _mm_add_epi16(GyH, GxH); + GyL = _mm_add_epi16(GyL, GxL); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // GxH: 1 * (1,-1) GyH: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // GxL: 1 * (1,-1) GyL: 1 * (1,-1) + GxH = _mm_sub_epi16(shiftedL, GxH); + GxL = _mm_sub_epi16(shiftedR, GxL); + GyH = _mm_add_epi16(GyH, shiftedL); + GyL = _mm_add_epi16(GyL, shiftedR); + + _mm_store_si128((__m128i *) pLocalCurrRow, GxL); + _mm_store_si128((__m128i *) (pLocalCurrRow + 8), GxH); + 
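+	// Illustrative note on the scratch layout: pScratch is treated as three row buffers
+	// of 2 * tmpWidth vx_int16 each (prev / curr / next). In the vector loops every block
+	// of 16 input pixels stores its horizontally filtered values as GxL, GxH, GyL, GyH at
+	// offsets +0, +8, +16, +24 and advances by 32, and the three buffers are rotated at
+	// the end of every output row so each source row is filtered horizontally only once.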
_mm_store_si128((__m128i *) (pLocalCurrRow + 16), GyL); + _mm_store_si128((__m128i *) (pLocalCurrRow + 24), GyH); + + pLocalSrc += 16; + pLocalPrevRow += 32; + pLocalCurrRow += 32; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1]; // Gx + *pLocalPrevRow++ = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + ((vx_int16)pLocalSrc[-(int)srcImageStrideInBytes] << 1) + (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; // Gy + *pLocalCurrRow++ = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; // Gx + *pLocalCurrRow++ = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; // Gy + } + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + // Process rows 3 till the end + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)(pSrcImage + srcImageStrideInBytes); // Pointing to the row below + pLocalDstGx = (short *) pDstGxImage; + pLocalDstGy = (short *) pDstGyImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + vx_int16 tempGx = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; + *pLocalNextRow++ = tempGx; + vx_int16 tempGy = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; + *pLocalNextRow++ = tempGy; + + *pLocalDstGx++ = *pLocalPrevRow++ + ((*pLocalCurrRow++) << 1) + tempGx; + *pLocalDstGy++ = tempGy - *pLocalPrevRow++; + pLocalCurrRow++; + } + + int width = (int)(dstWidth >> 4); + while (width) + { + // Horizontal Filtering + // next row + row0 = _mm_load_si128((__m128i *) pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + GyH = _mm_unpackhi_epi8(row0, zeromask); + GyH = _mm_slli_epi16(GyH, 1); // GyH: 2 * (-1, 0) + GyL = _mm_cvtepu8_epi16(row0); + GyL = _mm_slli_epi16(GyL, 1); // GyL: 2 * (-1, 0) + + GxL = _mm_cvtepu8_epi16(shiftedL); // GxL: -1 * (-1,-1) GyL: 1 * (-1,-1) + GxH = _mm_unpackhi_epi8(shiftedL, zeromask); // GxH: -1 * (-1,-1) GyH: 1 * (-1,-1) + GyH = _mm_add_epi16(GyH, GxH); + GyL = _mm_add_epi16(GyL, GxL); + + temp0 = _mm_load_si128((__m128i *) pLocalPrevRow); // Prev Row - Gx + temp1 = _mm_load_si128((__m128i *) (pLocalPrevRow + 8)); + row0 = _mm_load_si128((__m128i *) (pLocalPrevRow + 16)); // Prev Row - Gy + temp2 = _mm_load_si128((__m128i *) (pLocalPrevRow + 24)); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // GxH: 1 * (1,-1) GyH: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // GxL: 1 * (1,-1) GyL: 1 * (1,-1) + GxH = _mm_sub_epi16(shiftedL, GxH); + GxL = _mm_sub_epi16(shiftedR, GxL); + GyH = _mm_add_epi16(GyH, shiftedL); + GyL = _mm_add_epi16(GyL, shiftedR); + + shiftedL = _mm_load_si128((__m128i *) pLocalCurrRow); // Current Row + shiftedR = _mm_load_si128((__m128i *) (pLocalCurrRow + 8)); + + temp1 = _mm_add_epi16(temp1, GxH); // Prev row + next row + temp0 = _mm_add_epi16(temp0, GxL); + + shiftedR = _mm_slli_epi16(shiftedR, 1); + shiftedL = _mm_slli_epi16(shiftedL, 1); + + _mm_store_si128((__m128i *) pLocalNextRow, GxL); // Save the horizontal filtered pixels from the next row - Gx + _mm_store_si128((__m128i *) (pLocalNextRow + 8), GxH); + _mm_store_si128((__m128i *) (pLocalNextRow + 16), GyL); // Save the horizontal filtered pixels from the next row - Gy + _mm_store_si128((__m128i *) (pLocalNextRow + 24), GyH); + + + temp1 = _mm_add_epi16(temp1, shiftedR); // next row - Prev row + temp0 = 
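+	// Scalar sketch of the vertical combination done here (it mirrors the prefix loop of
+	// this function):
+	//   dstGx[x] = gxPrev[x] + 2 * gxCurr[x] + gxNext[x];   // column [1 2 1] applied to (src[x+1] - src[x-1])
+	//   dstGy[x] = gyNext[x] - gyPrev[x];                    // column [-1 0 1] applied to (src[x-1] + 2*src[x] + src[x+1])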
_mm_add_epi16(temp0, shiftedL); + row0 = _mm_sub_epi16(GyL, row0); + temp2 = _mm_sub_epi16(GyH, temp2); + + _mm_store_si128((__m128i *) pLocalDstGx, temp0); + _mm_store_si128((__m128i *) (pLocalDstGx + 8), temp1); + _mm_store_si128((__m128i *) pLocalDstGy, row0); + _mm_store_si128((__m128i *) (pLocalDstGy + 8), temp2); + + pLocalSrc += 16; + pLocalDstGx += 16; + pLocalDstGy += 16; + pLocalPrevRow += 32; + pLocalCurrRow += 32; + pLocalNextRow += 32; + width--; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + vx_int16 tempGx = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; + *pLocalNextRow++ = tempGx; + vx_int16 tempGy = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; + *pLocalNextRow++ = tempGy; + + *pLocalDstGx++ = *pLocalPrevRow++ + ((*pLocalCurrRow++) << 1) + tempGx; + *pLocalDstGy++ = tempGy - *pLocalPrevRow++; + pLocalCurrRow++; + } + + pTemp = pPrevRow; + pPrevRow = pCurrRow; + pCurrRow = pNextRow; + pNextRow = pTemp; + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + pSrcImage += srcImageStrideInBytes; + pDstGxImage += (dstGxImageStrideInBytes >> 1); + pDstGyImage += (dstGyImageStrideInBytes >> 1); + height--; + } + return AGO_SUCCESS; +} + +/* The function assumes at least one pixel padding on the top, left, right and bottom */ +int HafCpu_SobelMagnitude_S16_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstMagImage, + vx_uint32 dstMagImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + unsigned char * pLocalSrc; + short * pLocalDst; + + int prefixWidth = intptr_t(pDstMagImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i row0, row1, row2, shiftedR, shiftedL, temp, GxH, GxL, GyH, GyL; + __m128i zeromask = _mm_setzero_si128(); + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (short *) pDstMagImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + vx_int16 tempGx = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) << 1) + + (vx_int16)pLocalSrc[(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[(int)srcImageStrideInBytes - 1]; + vx_int16 tempGy = (vx_int16)pLocalSrc[(int)srcImageStrideInBytes - 1] + ((vx_int16)pLocalSrc[(int)srcImageStrideInBytes] << 1) + (vx_int16)pLocalSrc[(int)srcImageStrideInBytes + 1] - + (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] - ((vx_int16)pLocalSrc[-(int)srcImageStrideInBytes] << 1) - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + float mag = (float)(tempGx*tempGx) + (float)(tempGy*tempGy); + mag = sqrtf(mag); + *pLocalDst++ = (vx_int16)mag; + } + + int width = (int)(alignedWidth >> 4); // 16 pixels processed at a time + while (width) + { + row0 = _mm_load_si128((__m128i *) (pLocalSrc - srcImageStrideInBytes)); // row above + row1 = _mm_load_si128((__m128i *) pLocalSrc); // current row + row2 = _mm_load_si128((__m128i *) (pLocalSrc + srcImageStrideInBytes)); // row below + + // For the row below + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes + 1)); + + GxH = _mm_unpackhi_epi8(shiftedL, zeromask); // Gx, H: -1 * (-1,1) + GxL = 
_mm_cvtepu8_epi16(shiftedL); // Gx, L: -1 * (-1,1) + GyH = _mm_add_epi16(GxH, zeromask); // Gy, H: 1 * (-1,1) + GyL = _mm_add_epi16(GxL, zeromask); // Gy, L: 1 * (-1,1) + + temp = _mm_unpackhi_epi8(row2, zeromask); + temp = _mm_slli_epi16(temp, 1); // Gy, H: 2 * (0,1) + row2 = _mm_cvtepu8_epi16(row2); + row2 = _mm_slli_epi16(row2, 1); // Gy, L: 2 * (0,1) + GyH = _mm_add_epi16(GyH, temp); + GyL = _mm_add_epi16(GyL, row2); + + temp = _mm_unpackhi_epi8(shiftedR, zeromask); // Gy, H: 1 * (1,1), Gx, H: 1 * (1,1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // Gy, L: 1 * (1,1), Gx, L: 1 * (1,1) + GyH = _mm_add_epi16(GyH, temp); + GyL = _mm_add_epi16(GyL, shiftedR); + GxH = _mm_sub_epi16(temp, GxH); + GxL = _mm_sub_epi16(shiftedR, GxL); + + // For the current row + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + temp = _mm_unpackhi_epi8(shiftedR, zeromask); + temp = _mm_slli_epi16(temp, 1); // Gx, H: 2 * (1,0) + shiftedR = _mm_cvtepu8_epi16(shiftedR); + shiftedR = _mm_slli_epi16(shiftedR, 1); // Gx, L: 2 * (1,0) + GxH = _mm_add_epi16(GxH, temp); + GxL = _mm_add_epi16(GxL, shiftedR); + + temp = _mm_unpackhi_epi8(shiftedL, zeromask); + temp = _mm_slli_epi16(temp, 1); // Gx, H: -2 * (-1,0) + shiftedL = _mm_cvtepu8_epi16(shiftedL); + shiftedL = _mm_slli_epi16(shiftedL, 1); // Gx, L: -2 * (-1,0) + GxH = _mm_sub_epi16(GxH, temp); + GxL = _mm_sub_epi16(GxL, shiftedL); + + // For the row above + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes + 1)); + + temp = _mm_unpackhi_epi8(shiftedR, zeromask); // Gy, H: -1 * (1,-1), Gx, H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // Gy, L: -1 * (1,-1), Gx, L: 1 * (1,-1) + GxH = _mm_add_epi16(GxH, temp); + GxL = _mm_add_epi16(GxL, shiftedR); + GyH = _mm_sub_epi16(GyH, temp); + GyL = _mm_sub_epi16(GyL, shiftedR); + + temp = _mm_unpackhi_epi8(row0, zeromask); + row0 = _mm_cvtepu8_epi16(row0); + temp = _mm_slli_epi16(temp, 1); // Gy, H: -2 * (0,-1) + row0 = _mm_slli_epi16(row0, 1); // Gy, L: -2 * (0,-1) + GyH = _mm_sub_epi16(GyH, temp); + GyL = _mm_sub_epi16(GyL, row0); + + temp = _mm_unpackhi_epi8(shiftedL, zeromask); // Gy, H: -1 * (-1,-1), Gx, H: -1 * (-1,-1) + shiftedL = _mm_cvtepu8_epi16(shiftedL); // Gy, L: -1 * (-1,-1), Gx, L: -1 * (-1,-1) + GxH = _mm_sub_epi16(GxH, temp); + GxL = _mm_sub_epi16(GxL, shiftedL); + GyH = _mm_sub_epi16(GyH, temp); + GyL = _mm_sub_epi16(GyL, shiftedL); + + // Magnitude + row0 = _mm_srli_si128(GxH, 8); + row1 = _mm_srli_si128(GxL, 8); + row0 = _mm_cvtepi16_epi32(row0); // GxH: Upper 4 words to dwords + GxH = _mm_cvtepi16_epi32(GxH); // GxH: Lower 4 words to dwords + row1 = _mm_cvtepi16_epi32(row1); // GxL: Upper 4 words to dwords + GxL = _mm_cvtepi16_epi32(GxL); // GxL: Lower 4 words to dwords + + row2 = _mm_srli_si128(GyH, 8); + temp = _mm_srli_si128(GyL, 8); + row2 = _mm_cvtepi16_epi32(row2); // GyH: Upper 4 words to dwords + GyH = _mm_cvtepi16_epi32(GyH); // GyH: Lower 4 words to dwords + temp = _mm_cvtepi16_epi32(temp); // GyL: Upper 4 words to dwords + GyL = _mm_cvtepi16_epi32(GyL); // GyL: Lower 4 words to dwords + + row0 = _mm_mullo_epi32(row0, row0); // Square + GxH = _mm_mullo_epi32(GxH, GxH); + row1 = _mm_mullo_epi32(row1, row1); + GxL = _mm_mullo_epi32(GxL, GxL); + row2 = _mm_mullo_epi32(row2, row2); + GyH = _mm_mullo_epi32(GyH, GyH); + temp = _mm_mullo_epi32(temp, temp); + GyL = _mm_mullo_epi32(GyL, GyL); + + row0 = _mm_add_epi32(row0, 
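+	// Scalar sketch of the magnitude step: with Gx and Gy widened to 32 bits,
+	//   mag[x] = (vx_int16)sqrt((double)gx * gx + (double)gy * gy);
+	// the vector code squares in epi32, takes _mm_sqrt_pd on pairs of doubles and packs
+	// the rounded results back to 16 bits with unsigned saturation (_mm_packus_epi32).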
row2); // Add + GxH = _mm_add_epi32(GxH, GyH); + row1 = _mm_add_epi32(row1, temp); + GxL = _mm_add_epi32(GxL, GyL); + + temp = _mm_srli_si128(row0, 8); + __m128d d_pix1 = _mm_cvtepi32_pd(temp); // Pixels 15, 14 + __m128d d_pix0 = _mm_cvtepi32_pd(row0); // Pixels 13, 12 + d_pix1 = _mm_sqrt_pd(d_pix1); + d_pix0 = _mm_sqrt_pd(d_pix0); + row0 = _mm_cvtpd_epi32(d_pix1); + temp = _mm_cvtpd_epi32(d_pix0); + row0 = _mm_slli_si128(row0, 8); + row0 = _mm_or_si128(row0, temp); // Pixels 15, 14, 13, 12 (DWORDS) + + temp = _mm_srli_si128(GxH, 8); + d_pix1 = _mm_cvtepi32_pd(temp); // Pixels 11, 10 + d_pix0 = _mm_cvtepi32_pd(GxH); // Pixels 9, 8 + d_pix1 = _mm_sqrt_pd(d_pix1); + d_pix0 = _mm_sqrt_pd(d_pix0); + GxH = _mm_cvtpd_epi32(d_pix1); + temp = _mm_cvtpd_epi32(d_pix0); + GxH = _mm_slli_si128(GxH, 8); + GxH = _mm_or_si128(GxH, temp); // Pixels 11, 10, 9, 8 (DWORDS) + row0 = _mm_packus_epi32(GxH, row0); // Pixels 15, 14, 13, 12, 11, 10, 9, 8 (WORDS) + + temp = _mm_srli_si128(row1, 8); + d_pix1 = _mm_cvtepi32_pd(temp); // Pixels 7, 6 + d_pix0 = _mm_cvtepi32_pd(row1); // Pixels 5, 4 + d_pix1 = _mm_sqrt_pd(d_pix1); + d_pix0 = _mm_sqrt_pd(d_pix0); + row1 = _mm_cvtpd_epi32(d_pix1); + temp = _mm_cvtpd_epi32(d_pix0); + row1 = _mm_slli_si128(row1, 8); + row1 = _mm_or_si128(row1, temp); // Pixels 7, 6, 5, 4 (DWORDS) + + temp = _mm_srli_si128(GxL, 8); + d_pix1 = _mm_cvtepi32_pd(temp); // Pixels 3, 2 + d_pix0 = _mm_cvtepi32_pd(GxL); // Pixels 1, 0 + d_pix1 = _mm_sqrt_pd(d_pix1); + d_pix0 = _mm_sqrt_pd(d_pix0); + GxL = _mm_cvtpd_epi32(d_pix1); + temp = _mm_cvtpd_epi32(d_pix0); + GxL = _mm_slli_si128(GxL, 8); + GxL = _mm_or_si128(GxL, temp); // Pixels 3, 2, 1, 0 (DWORDS) + row1 = _mm_packus_epi32(GxL, row1); // Pixels 7, 6, 5, 4, 3, 2, 1, 0 (WORDS) + + _mm_store_si128((__m128i *) pLocalDst, row1); + _mm_store_si128((__m128i *) (pLocalDst + 8), row0); + + pLocalSrc += 16; + pLocalDst += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + vx_int16 tempGx = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) << 1) + + (vx_int16)pLocalSrc[(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[(int)srcImageStrideInBytes - 1]; + vx_int16 tempGy = (vx_int16)pLocalSrc[(int)srcImageStrideInBytes - 1] + ((vx_int16)pLocalSrc[(int)srcImageStrideInBytes] << 1) + (vx_int16)pLocalSrc[(int)srcImageStrideInBytes + 1] - + (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] - ((vx_int16)pLocalSrc[-(int)srcImageStrideInBytes] << 1) - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + float mag = (float)(tempGx*tempGx) + (float)(tempGy*tempGy); + mag = sqrtf(mag); + *pLocalDst++ = (vx_int16)mag; + } + + pSrcImage += srcImageStrideInBytes; + pDstMagImage += (dstMagImageStrideInBytes >> 1); + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Convolve_S16_U8_3xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ) +{ + __m128i *pLocalDst_xmm; + unsigned char *pLocalSrc; + short * pLocalDst; + short *pLocalConvMat; + + __m128i result0, result1, result2, result3, row, mul, temp0, temp1; + __m128i zeromask = _mm_setzero_si128(); + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + int srcStride = (int)srcImageStrideInBytes; + int rowLimit = (int)(convolutionHeight >> 1); + int numConvCoeffs = 3 * (int)convolutionHeight; + + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (short *)pDstImage; + + for (int w = 0; w < prefixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -1; j <= 1; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, SHRT_MAX); + temp = max(temp, SHRT_MIN); + *pLocalDst++ = (short)temp; + } + + pLocalDst_xmm = (__m128i *) pLocalDst; + int width = (int)(alignedWidth >> 4); // Each loop processess 16 pixels + while (width) + { + pLocalConvMat = convMatrix + numConvCoeffs - 1; + result0 = _mm_setzero_si128(); + result1 = _mm_setzero_si128(); + result2 = _mm_setzero_si128(); + result3 = _mm_setzero_si128(); + + for (int y = -rowLimit; y <= rowLimit; y++) + { + int offset = y * srcStride; + + row = _mm_loadu_si128((__m128i *)(pLocalSrc + offset - 1)); // shifted left pixels + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + + // Upper 4 bytes - shiftedL pixels + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes - shiftedL pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes - shiftedL pixels + temp1 = _mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + row = _mm_loadu_si128((__m128i *)(pLocalSrc + offset)); // pixels at the location + + // Lowest 4 bytes - shiftedL pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + // Upper 4 bytes - at loc pixels + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes - at loc pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes - at loc pixels + temp1 = _mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + row = _mm_loadu_si128((__m128i *)(pLocalSrc + offset + 1)); // shifted right pixels + + // Lowest 4 bytes - at loc pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + // Upper 4 bytes - shiftedR pixels + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes - shiftedR pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes - shiftedR pixels + temp1 = 
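+	// Illustrative note: each SSE iteration keeps four epi32 accumulators (result0..result3)
+	// covering 16 output pixels. For every kernel coefficient the 16 source bytes are widened
+	// u8 -> i16 -> i32, multiplied by the broadcast coefficient (_mm_set1_epi32 / _mm_mullo_epi32)
+	// and accumulated. convMatrix is read from its last element backwards while the neighbourhood
+	// is scanned top-left to bottom-right, i.e. the kernel is applied flipped (convolution rather
+	// than correlation), matching the scalar prefix/postfix loops.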
_mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + // Lowest 4 bytes - shiftedR pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + } + + result0 = _mm_srli_epi32(result0, shift); + result1 = _mm_srli_epi32(result1, shift); + result2 = _mm_srli_epi32(result2, shift); + result3 = _mm_srli_epi32(result3, shift); + + row = _mm_packs_epi32(result2, result3); + temp0 = _mm_packs_epi32(result0, result1); + _mm_store_si128(pLocalDst_xmm++, temp0); + _mm_store_si128(pLocalDst_xmm++, row); + + pLocalSrc += 16; + width--; + } + + pLocalDst = (short *)pLocalDst_xmm; + for (int w = 0; w < postfixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -1; j <= 1; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, SHRT_MAX); + temp = max(temp, SHRT_MIN); + *pLocalDst++ = (short)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Convolve_U8_U8_3xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ) +{ + __m128i *pLocalDst_xmm; + unsigned char *pLocalSrc, *pLocalDst; + short *pLocalConvMat; + + __m128i result0, result1, result2, result3, row, mul, temp0, temp1; + __m128i zeromask = _mm_setzero_si128(); + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + int srcStride = (int)srcImageStrideInBytes; + int rowLimit = (int)(convolutionHeight >> 1); + int numConvCoeffs = 3 * (int)convolutionHeight; + + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (unsigned char *)pDstImage; + + for (int w = 0; w < prefixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -1; j <= 1; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, 255); + temp = max(temp, 0); + *pLocalDst++ = (unsigned char)temp; + } + + pLocalDst_xmm = (__m128i *) pLocalDst; + int width = (int)(alignedWidth >> 4); // Each loop processess 16 pixels + while (width) + { + pLocalConvMat = convMatrix + numConvCoeffs - 1; + result0 = _mm_setzero_si128(); + result1 = _mm_setzero_si128(); + result2 = _mm_setzero_si128(); + result3 = _mm_setzero_si128(); + + for (int y = -rowLimit; y <= rowLimit; y++) + { + int offset = y * srcStride; + row = _mm_loadu_si128((__m128i *)(pLocalSrc + offset - 1)); // shifted left pixels + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + + // Upper 4 bytes - shiftedL pixels + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes - shiftedL pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes - shiftedL pixels + temp1 = _mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + row = _mm_loadu_si128((__m128i *)(pLocalSrc + offset)); // pixels at the location + + // Lowest 4 bytes - shiftedL pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + // Upper 4 bytes - at loc pixels + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes - at loc pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes - at loc pixels + temp1 = _mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + row = _mm_loadu_si128((__m128i *)(pLocalSrc + offset + 1)); // shifted right pixels + + // Lowest 4 bytes - at loc pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + // Upper 4 bytes - shiftedR pixels + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes - shiftedR pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes - shiftedR pixels + temp1 = _mm_cvtepu8_epi16(row); + temp0 = 
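+	// Note: this U8-output variant differs from the S16 one only in the final store - after
+	// the right shift the sums are packed i32 -> i16 with _mm_packs_epi32 and then i16 -> u8
+	// with _mm_packus_epi16, which saturates to [0, 255] just like the scalar clamp in the
+	// prefix/postfix loops.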
_mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + // Lowest 4 bytes - shiftedR pixels + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + } + + result0 = _mm_srli_epi32(result0, shift); + result1 = _mm_srli_epi32(result1, shift); + result2 = _mm_srli_epi32(result2, shift); + result3 = _mm_srli_epi32(result3, shift); + + row = _mm_packs_epi32(result2, result3); + temp0 = _mm_packs_epi32(result0, result1); + row = _mm_packus_epi16(temp0, row); + _mm_store_si128(pLocalDst_xmm++, row); + + pLocalSrc += 16; + width--; + } + + pLocalDst = (unsigned char *)pLocalDst_xmm; + for (int w = 0; w < postfixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -1; j <= 1; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, 255); + temp = max(temp, 0); + *pLocalDst++ = (unsigned char)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Convolve_S16_U8_5xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ) +{ + __m128i *pLocalDst_xmm; + unsigned char *pLocalSrc; + short * pLocalDst; + short *pLocalConvMat; + + __m128i result0, result1, result2, result3, row, mul, temp0, temp1; + __m128i zeromask = _mm_setzero_si128(); + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + int srcStride = (int)srcImageStrideInBytes; + int rowLimit = (int)(convolutionHeight >> 1); + int numConvCoeffs = 5 * (int)convolutionHeight; + + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (short *)pDstImage; + + for (int w = 0; w < prefixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -2; j <= 2; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, SHRT_MAX); + temp = max(temp, SHRT_MIN); + *pLocalDst++ = (short)temp; + } + + pLocalDst_xmm = (__m128i *) pLocalDst; + int width = (int)(alignedWidth >> 4); // Each loop processess 16 pixels + while (width) + { + pLocalConvMat = convMatrix + numConvCoeffs - 1; + result0 = _mm_setzero_si128(); + result1 = _mm_setzero_si128(); + result2 = _mm_setzero_si128(); + result3 = _mm_setzero_si128(); + + for (int y = -rowLimit; y <= rowLimit; y++) + { + for (int x = -2; x <= 2; x++) + { + row = _mm_loadu_si128((__m128i *)(pLocalSrc + (y * srcStride) + x)); + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + + // Upper 4 bytes + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes + temp1 = 
_mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + // Lowest 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + } + } + + result0 = _mm_srli_epi32(result0, shift); + result1 = _mm_srli_epi32(result1, shift); + result2 = _mm_srli_epi32(result2, shift); + result3 = _mm_srli_epi32(result3, shift); + + row = _mm_packs_epi32(result2, result3); + temp0 = _mm_packs_epi32(result0, result1); + _mm_store_si128(pLocalDst_xmm++, temp0); + _mm_store_si128(pLocalDst_xmm++, row); + + pLocalSrc += 16; + width--; + } + + pLocalDst = (short *)pLocalDst_xmm; + for (int w = 0; w < postfixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -2; j <= 2; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, SHRT_MAX); + temp = max(temp, SHRT_MIN); + *pLocalDst++ = (short)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Convolve_U8_U8_5xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ) +{ + __m128i *pLocalDst_xmm; + unsigned char *pLocalSrc, *pLocalDst; + short *pLocalConvMat; + + __m128i result0, result1, result2, result3, row, mul, temp0, temp1; + __m128i zeromask = _mm_setzero_si128(); + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + int srcStride = (int)srcImageStrideInBytes; + int rowLimit = (int)(convolutionHeight >> 1); + int numConvCoeffs = 5 * (int)convolutionHeight; + + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (unsigned char *)pDstImage; + + for (int w = 0; w < prefixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -2; j <= 2; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, 255); + temp = max(temp, 0); + *pLocalDst++ = (unsigned char)temp; + } + + pLocalDst_xmm = (__m128i *) pLocalDst; + int width = (int)(alignedWidth >> 4); // Each loop processess 16 pixels + while (width) + { + pLocalConvMat = convMatrix + numConvCoeffs - 1; + result0 = _mm_setzero_si128(); + result1 = _mm_setzero_si128(); + result2 = _mm_setzero_si128(); + result3 = _mm_setzero_si128(); + + for (int y = -rowLimit; y <= rowLimit; y++) + { + for (int x = -2; x <= 2; x++) + { + row = _mm_loadu_si128((__m128i *)(pLocalSrc + (y * srcStride) + x)); + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + + // Upper 4 bytes + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes + temp1 = _mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + // Lowest 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + } + } + + result0 = _mm_srli_epi32(result0, shift); + result1 = _mm_srli_epi32(result1, shift); + result2 = _mm_srli_epi32(result2, shift); + result3 = _mm_srli_epi32(result3, shift); + + row = _mm_packs_epi32(result2, result3); + temp0 = _mm_packs_epi32(result0, result1); + row = _mm_packus_epi16(temp0, row); + _mm_store_si128(pLocalDst_xmm++, row); + + pLocalSrc += 16; + width--; + } + + pLocalDst = (unsigned char *)pLocalDst_xmm; + for (int w = 0; w < postfixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -2; j <= 2; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, 255); + temp = max(temp, 0); + *pLocalDst++ = (unsigned char)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Convolve_S16_U8_7xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ) +{ + __m128i *pLocalDst_xmm; + unsigned char *pLocalSrc; + short * pLocalDst; + short *pLocalConvMat; + + __m128i result0, result1, result2, result3, row, mul, temp0, temp1; + __m128i zeromask = _mm_setzero_si128(); + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + int srcStride = (int)srcImageStrideInBytes; + int rowLimit = (int)(convolutionHeight >> 1); + int numConvCoeffs = 7 * (int)convolutionHeight; + + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (short *)pDstImage; + + for (int w = 0; w < prefixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -3; j <= 3; j++) + { + temp += ((int)pLocalSrc[i*srcStride - j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, SHRT_MAX); + temp = max(temp, SHRT_MIN); + *pLocalDst++ = (short)temp; + } + + pLocalDst_xmm = (__m128i *) pLocalDst; + int width = (int)(alignedWidth >> 4); // Each loop processess 16 pixels + while (width) + { + pLocalConvMat = convMatrix + numConvCoeffs - 1; + result0 = _mm_setzero_si128(); + result1 = _mm_setzero_si128(); + result2 = _mm_setzero_si128(); + result3 = _mm_setzero_si128(); + + for (int y = -rowLimit; y <= rowLimit; y++) + { + for (int x = -3; x <= 3; x++) + { + row = _mm_loadu_si128((__m128i *)(pLocalSrc + (y * srcStride) + x)); + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + + // Upper 4 bytes + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes + temp1 = _mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + // Lowest 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + } + } + + result0 = _mm_srli_epi32(result0, shift); + result1 = _mm_srli_epi32(result1, shift); + result2 = _mm_srli_epi32(result2, shift); + result3 = _mm_srli_epi32(result3, shift); + + row = _mm_packs_epi32(result2, result3); + temp0 = _mm_packs_epi32(result0, result1); + _mm_store_si128(pLocalDst_xmm++, temp0); + _mm_store_si128(pLocalDst_xmm++, row); + + pLocalSrc += 16; + width--; + } + + pLocalDst = (short *)pLocalDst_xmm; + for (int w = 0; w < postfixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -3; j <= 3; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, SHRT_MAX); + temp = max(temp, SHRT_MIN); + *pLocalDst++ = (short)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Convolve_U8_U8_7xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ) +{ + __m128i *pLocalDst_xmm; + unsigned char *pLocalSrc, *pLocalDst; + short *pLocalConvMat; + + __m128i result0, result1, result2, result3, row, mul, temp0, temp1; + __m128i zeromask = _mm_setzero_si128(); + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + int srcStride = (int)srcImageStrideInBytes; + int rowLimit = (int)(convolutionHeight >> 1); + int numConvCoeffs = 7 * (int)convolutionHeight; + + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (unsigned char *)pDstImage; + + for (int w = 0; w < prefixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -3; j <= 3; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, 255); + temp = max(temp, 0); + *pLocalDst++ = (unsigned char)temp; + } + + pLocalDst_xmm = (__m128i *) pLocalDst; + int width = (int)(alignedWidth >> 4); // Each loop processess 16 pixels + while (width) + { + pLocalConvMat = convMatrix + numConvCoeffs - 1; + result0 = _mm_setzero_si128(); + result1 = _mm_setzero_si128(); + result2 = _mm_setzero_si128(); + result3 = _mm_setzero_si128(); + + for (int y = -rowLimit; y <= rowLimit; y++) + { + for (int x = -3; x <= 3; x++) + { + row = _mm_loadu_si128((__m128i *)(pLocalSrc + (y * srcStride) + x)); + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + + // Upper 4 bytes + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes + temp1 = _mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + // Lowest 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + } + } + + result0 = _mm_srli_epi32(result0, shift); + result1 = _mm_srli_epi32(result1, shift); + result2 = _mm_srli_epi32(result2, shift); + result3 = _mm_srli_epi32(result3, shift); + + row = _mm_packs_epi32(result2, result3); + temp0 = _mm_packs_epi32(result0, result1); + row = _mm_packus_epi16(temp0, row); + _mm_store_si128(pLocalDst_xmm++, row); + + pLocalSrc += 16; + width--; + } + + pLocalDst = (unsigned char *)pLocalDst_xmm; + for (int w = 0; w < postfixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -3; j <= 3; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, 255); + temp = max(temp, 0); + *pLocalDst++ = (unsigned char)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Convolve_S16_U8_9xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ) +{ + __m128i *pLocalDst_xmm; + unsigned char *pLocalSrc; + short * pLocalDst; + short *pLocalConvMat; + + __m128i result0, result1, result2, result3, row, mul, temp0, temp1; + __m128i zeromask = _mm_setzero_si128(); + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + prefixWidth >>= 1; // 2 bytes = 1 pixel + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + int srcStride = (int)srcImageStrideInBytes; + int rowLimit = (int)(convolutionHeight >> 1); + int numConvCoeffs = 9 * (int)convolutionHeight; + + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (short *)pDstImage; + + for (int w = 0; w < prefixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -4; j <= 4; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, SHRT_MAX); + temp = max(temp, SHRT_MIN); + *pLocalDst++ = (short)temp; + } + + pLocalDst_xmm = (__m128i *) pLocalDst; + int width = (int)(alignedWidth >> 4); // Each loop processess 16 pixels + while (width) + { + pLocalConvMat = convMatrix + numConvCoeffs - 1; + result0 = _mm_setzero_si128(); + result1 = _mm_setzero_si128(); + result2 = _mm_setzero_si128(); + result3 = _mm_setzero_si128(); + + for (int y = -rowLimit; y <= rowLimit; y++) + { + for (int x = -4; x <= 4; x++) + { + row = _mm_loadu_si128((__m128i *)(pLocalSrc + (y * srcStride) + x)); + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + + // Upper 4 bytes + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes + temp1 = _mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + // Lowest 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + } + } + + result0 = _mm_srli_epi32(result0, shift); + result1 = _mm_srli_epi32(result1, shift); + result2 = _mm_srli_epi32(result2, shift); + result3 = _mm_srli_epi32(result3, shift); + + row = _mm_packs_epi32(result2, result3); + temp0 = _mm_packs_epi32(result0, result1); + _mm_store_si128(pLocalDst_xmm++, temp0); + _mm_store_si128(pLocalDst_xmm++, row); + + pLocalSrc += 16; + width--; + } + + pLocalDst = (short *)pLocalDst_xmm; + for (int w = 0; w < postfixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -4; j <= 4; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, SHRT_MAX); + temp = max(temp, SHRT_MIN); + *pLocalDst++ = (short)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += (dstImageStrideInBytes >> 1); + + height--; + } + return AGO_SUCCESS; +} + +int HafCpu_Convolve_U8_U8_9xN + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_int16 * convMatrix, + vx_size convolutionHeight, + vx_int32 shift + ) +{ + __m128i *pLocalDst_xmm; + unsigned char *pLocalSrc, *pLocalDst; + short *pLocalConvMat; + + __m128i result0, result1, result2, result3, row, mul, temp0, temp1; + __m128i zeromask = _mm_setzero_si128(); + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; // 16 pixels processed at a time in SSE loop + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + int height = (int)dstHeight; + int srcStride = (int)srcImageStrideInBytes; + int rowLimit = (int)(convolutionHeight >> 1); + int numConvCoeffs = 9 * (int)convolutionHeight; + + while (height) + { + pLocalSrc = (unsigned char *)pSrcImage; + pLocalDst = (unsigned char *)pDstImage; + + for (int w = 0; w < prefixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -4; j <= 4; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, 255); + temp = max(temp, 0); + *pLocalDst++ = (unsigned char)temp; + } + + pLocalDst_xmm = (__m128i *) pLocalDst; + int width = (int)(alignedWidth >> 4); // Each loop processess 16 pixels + while (width) + { + pLocalConvMat = convMatrix + numConvCoeffs - 1; + result0 = _mm_setzero_si128(); + result1 = _mm_setzero_si128(); + result2 = _mm_setzero_si128(); + result3 = _mm_setzero_si128(); + + for (int y = -rowLimit; y <= rowLimit; y++) + { + for (int x = -4; x <= 4; x++) + { + row = _mm_loadu_si128((__m128i *)(pLocalSrc + (y * srcStride) + x)); + mul = _mm_set1_epi32((int)(*pLocalConvMat--)); + + // Upper 4 bytes + temp1 = _mm_unpackhi_epi8(row, zeromask); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result3 = _mm_add_epi32(result3, temp0); + + // Next 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result2 = _mm_add_epi32(result2, temp0); + + // Next 4 bytes + temp1 = _mm_cvtepu8_epi16(row); + temp0 = _mm_unpackhi_epi16(temp1, zeromask); + temp0 = _mm_mullo_epi32(temp0, mul); + result1 = _mm_add_epi32(result1, temp0); + + // Lowest 4 bytes + temp0 = _mm_cvtepi16_epi32(temp1); + temp0 = _mm_mullo_epi32(temp0, mul); + result0 = _mm_add_epi32(result0, temp0); + } + } + + result0 = _mm_srli_epi32(result0, shift); + result1 = _mm_srli_epi32(result1, shift); + result2 = _mm_srli_epi32(result2, shift); + result3 = _mm_srli_epi32(result3, shift); + + row = _mm_packs_epi32(result2, result3); + temp0 = _mm_packs_epi32(result0, result1); + row = _mm_packus_epi16(temp0, row); + _mm_store_si128(pLocalDst_xmm++, row); + + pLocalSrc += 16; + width--; + } + + pLocalDst = (unsigned char *)pLocalDst_xmm; + for (int w = 0; w < postfixWidth; w++, pLocalSrc++) + { + int temp = 0; + int idx = numConvCoeffs - 1; + for (int i = -rowLimit; i <= rowLimit; i++) + { + for (int j = -4; j <= 4; j++) + { + temp += ((int)pLocalSrc[i*srcStride + j] * (int)convMatrix[idx--]); + } + } + temp = min(temp, 255); + temp = max(temp, 0); + *pLocalDst++ = (unsigned char)temp; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + + height--; + } + return AGO_SUCCESS; +} + +static inline void CompareAndSwap(__m128i& p1, __m128i& p2) +{ + __m128i First = _mm_min_epu8(p1, p2); + __m128i Sec = _mm_max_epu8(p1, p2); + p1 = First; + p2 = Sec; +} + +int compareTwo(const void * a, const void * b) +{ + return(*(unsigned char *)a > *(unsigned char *)b ? 1 : -1); +} + +int HafCpu_Median_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 
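+	// Illustrative note: the vector path below computes the median of each 3x3 neighbourhood
+	// with a fixed network of 19 CompareAndSwap (min/max) steps, so there is no branching or
+	// data-dependent shuffling; only pixels4 ends up holding the median, the other registers
+	// are left partially ordered. A scalar equivalent of one network step is:
+	//   if (a > b) { unsigned char t = a; a = b; b = t; }   // a = min, b = max
+	// Boundary pixels fall back to qsort over a 9-element array.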
0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i pixels0, pixels1, pixels2, pixels3, pixels4, pixels5, pixels6, pixels7, pixels8; + unsigned char *pLocalSrc, *pPrevSrc, *pNextSrc, *pLocalDst; + unsigned char pixelArr[9]; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalDst = (unsigned char *)pDstImage; + pLocalSrc = (unsigned char *)pSrcImage; + pPrevSrc = pLocalSrc - srcImageStrideInBytes; + pNextSrc = pLocalSrc + srcImageStrideInBytes; + + for (int x = 0; x < prefixWidth; x++, pLocalDst++, pLocalSrc++, pPrevSrc++, pNextSrc++) + { + pixelArr[0] = pPrevSrc[-1]; + pixelArr[1] = pPrevSrc[0]; + pixelArr[2] = pPrevSrc[1]; + pixelArr[3] = pLocalSrc[-1]; + pixelArr[4] = pLocalSrc[0]; + pixelArr[5] = pLocalSrc[1]; + pixelArr[6] = pNextSrc[-1]; + pixelArr[7] = pNextSrc[0]; + pixelArr[8] = pNextSrc[1]; + qsort(pixelArr, 9, sizeof(unsigned char), compareTwo); + *pLocalDst = pixelArr[4]; + } + + for (int width = 0; width < (alignedWidth >> 4); width++) + { + pixels0 = _mm_loadu_si128((__m128i *)(pPrevSrc - 1)); + pixels1 = _mm_loadu_si128((__m128i *)(pPrevSrc)); + pixels2 = _mm_loadu_si128((__m128i *)(pPrevSrc + 1)); + pixels3 = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + pixels4 = _mm_loadu_si128((__m128i *)(pLocalSrc)); + pixels5 = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + pixels6 = _mm_loadu_si128((__m128i *)(pNextSrc - 1)); + pixels7 = _mm_loadu_si128((__m128i *)(pNextSrc)); + pixels8 = _mm_loadu_si128((__m128i *)(pNextSrc + 1)); + + // sort by compare and swap : no branching required + CompareAndSwap(pixels1, pixels2); + CompareAndSwap(pixels4, pixels5); + CompareAndSwap(pixels7, pixels8); + CompareAndSwap(pixels0, pixels1); + CompareAndSwap(pixels3, pixels4); + CompareAndSwap(pixels6, pixels7); + CompareAndSwap(pixels1, pixels2); + CompareAndSwap(pixels4, pixels5); + CompareAndSwap(pixels7, pixels8); + CompareAndSwap(pixels0, pixels3); + CompareAndSwap(pixels5, pixels8); + CompareAndSwap(pixels4, pixels7); + CompareAndSwap(pixels3, pixels6); + CompareAndSwap(pixels1, pixels4); + CompareAndSwap(pixels2, pixels5); + CompareAndSwap(pixels4, pixels7); + CompareAndSwap(pixels4, pixels2); + CompareAndSwap(pixels6, pixels4); + CompareAndSwap(pixels4, pixels2); + + // store median value + _mm_store_si128((__m128i *)pLocalDst, pixels4); + + pPrevSrc += 16; + pLocalSrc += 16; + pNextSrc += 16; + pLocalDst += 16; + } + + for (int x = 0; x < postfixWidth; x++, pLocalDst++, pLocalSrc++, pPrevSrc++, pNextSrc++) + { + pixelArr[0] = pPrevSrc[-1]; + pixelArr[1] = pPrevSrc[0]; + pixelArr[2] = pPrevSrc[1]; + pixelArr[3] = pLocalSrc[-1]; + pixelArr[4] = pLocalSrc[0]; + pixelArr[5] = pLocalSrc[1]; + pixelArr[6] = pNextSrc[-1]; + pixelArr[7] = pNextSrc[0]; + pixelArr[8] = pNextSrc[1]; + qsort(pixelArr, 9, sizeof(unsigned char), compareTwo); + *pLocalDst = pixelArr[4]; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_SobelPhase_U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstPhaseImage, + vx_uint32 dstPhaseImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ) +{ + // calculate Gx and Gy and compute phase + vx_int16 *Gx, *Gy; + vx_uint8 *scratchPad; + vx_uint32 dstride = (dstWidth + 15)&~15; + + Gx = (vx_int16*)pScratch; + Gy = (vx_int16*)(pScratch + ((dstride + 15) & ~15) * dstHeight * sizeof(vx_int16)); + scratchPad = pScratch 
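+	// Illustrative note on how pScratch is carved up here: a Gx plane and a Gy plane of
+	// dstride x dstHeight vx_int16 each, followed by the row scratch that
+	// HafCpu_Sobel_S16S16_U8_3x3_GXY needs; HafCpu_Phase_U8_S16S16 then derives the phase
+	// image from those two planes.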
+ ((dstride + 15) & ~15) * dstHeight * sizeof(vx_int16) * 2; + + HafCpu_Sobel_S16S16_U8_3x3_GXY(dstWidth, dstHeight, Gx, dstride, Gy, dstride, pSrcImage, srcImageStrideInBytes, scratchPad); + HafCpu_Phase_U8_S16S16(dstWidth, dstHeight, pDstPhaseImage, dstPhaseImageStrideInBytes, Gx, dstride, Gy, dstride); + + return AGO_SUCCESS; +} + +int HafCpu_SobelMagnitudePhase_S16U8_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_int16 * pDstMagImage, + vx_uint32 dstMagImageStrideInBytes, + vx_uint8 * pDstPhaseImage, + vx_uint32 dstPhaseImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + vx_uint8 *pLocalSrc, *pLocalDstPhase; + vx_int16 * pLocalDstMag; + + int prefixWidth = intptr_t(pDstMagImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + + __m128i row0, row1, row2, shiftedR, shiftedL, temp, GxH, GxL, GyH, GyL; + __m128i zeromask = _mm_setzero_si128(); + + float scale = (float)128 / 180.f; // For arctan + + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (vx_uint8 *)pSrcImage; + pLocalDstMag = (vx_int16 *)pDstMagImage; + pLocalDstPhase = (vx_uint8 *)pDstPhaseImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc++) + { + vx_int16 tempGx = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) << 1) + + (vx_int16)pLocalSrc[(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[(int)srcImageStrideInBytes - 1]; + vx_int16 tempGy = (vx_int16)pLocalSrc[(int)srcImageStrideInBytes - 1] + ((vx_int16)pLocalSrc[(int)srcImageStrideInBytes] << 1) + (vx_int16)pLocalSrc[(int)srcImageStrideInBytes + 1] - + (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] - ((vx_int16)pLocalSrc[-(int)srcImageStrideInBytes] << 1) - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + float mag = (float)(tempGx*tempGx) + (float)(tempGy*tempGy); + mag = sqrtf(mag); + *pLocalDstMag++ = (vx_int16)mag; + + float arct = HafCpu_FastAtan2_deg(tempGx, tempGy); + *pLocalDstPhase++ = (vx_uint8)((vx_uint32)(arct*scale + 0.5) & 0xFF); + } + + int width = (int)(alignedWidth >> 4); // 16 pixels processed at a time + while (width) + { + row0 = _mm_load_si128((__m128i *) (pLocalSrc - srcImageStrideInBytes)); // row above + row1 = _mm_load_si128((__m128i *) pLocalSrc); // current row + row2 = _mm_load_si128((__m128i *) (pLocalSrc + srcImageStrideInBytes)); // row below + + // For the row below + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + srcImageStrideInBytes + 1)); + + GxH = _mm_unpackhi_epi8(shiftedL, zeromask); // Gx, H: -1 * (-1,1) + GxL = _mm_cvtepu8_epi16(shiftedL); // Gx, L: -1 * (-1,1) + GyH = _mm_add_epi16(GxH, zeromask); // Gy, H: 1 * (-1,1) + GyL = _mm_add_epi16(GxL, zeromask); // Gy, L: 1 * (-1,1) + + temp = _mm_unpackhi_epi8(row2, zeromask); + temp = _mm_slli_epi16(temp, 1); // Gy, H: 2 * (0,1) + row2 = _mm_cvtepu8_epi16(row2); + row2 = _mm_slli_epi16(row2, 1); // Gy, L: 2 * (0,1) + GyH = _mm_add_epi16(GyH, temp); + GyL = _mm_add_epi16(GyL, row2); + + temp = _mm_unpackhi_epi8(shiftedR, zeromask); // Gy, H: 1 * (1,1), Gx, H: 1 * (1,1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // Gy, L: 1 * (1,1), Gx, L: 1 * (1,1) + GyH = _mm_add_epi16(GyH, temp); + GyL = _mm_add_epi16(GyL, shiftedR); + GxH = _mm_sub_epi16(temp, GxH); + GxL = 
_mm_sub_epi16(shiftedR, GxL); + + // For the current row + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + temp = _mm_unpackhi_epi8(shiftedR, zeromask); + temp = _mm_slli_epi16(temp, 1); // Gx, H: 2 * (1,0) + shiftedR = _mm_cvtepu8_epi16(shiftedR); + shiftedR = _mm_slli_epi16(shiftedR, 1); // Gx, L: 2 * (1,0) + GxH = _mm_add_epi16(GxH, temp); + GxL = _mm_add_epi16(GxL, shiftedR); + + temp = _mm_unpackhi_epi8(shiftedL, zeromask); + temp = _mm_slli_epi16(temp, 1); // Gx, H: -2 * (-1,0) + shiftedL = _mm_cvtepu8_epi16(shiftedL); + shiftedL = _mm_slli_epi16(shiftedL, 1); // Gx, L: -2 * (-1,0) + GxH = _mm_sub_epi16(GxH, temp); + GxL = _mm_sub_epi16(GxL, shiftedL); + + // For the row above + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcImageStrideInBytes + 1)); + + temp = _mm_unpackhi_epi8(shiftedR, zeromask); // Gy, H: -1 * (1,-1), Gx, H: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // Gy, L: -1 * (1,-1), Gx, L: 1 * (1,-1) + GxH = _mm_add_epi16(GxH, temp); + GxL = _mm_add_epi16(GxL, shiftedR); + GyH = _mm_sub_epi16(GyH, temp); + GyL = _mm_sub_epi16(GyL, shiftedR); + + temp = _mm_unpackhi_epi8(row0, zeromask); + row0 = _mm_cvtepu8_epi16(row0); + temp = _mm_slli_epi16(temp, 1); // Gy, H: -2 * (0,-1) + row0 = _mm_slli_epi16(row0, 1); // Gy, L: -2 * (0,-1) + GyH = _mm_sub_epi16(GyH, temp); + GyL = _mm_sub_epi16(GyL, row0); + + temp = _mm_unpackhi_epi8(shiftedL, zeromask); // Gy, H: -1 * (-1,-1), Gx, H: -1 * (-1,-1) + shiftedL = _mm_cvtepu8_epi16(shiftedL); // Gy, L: -1 * (-1,-1), Gx, L: -1 * (-1,-1) + GxH = _mm_sub_epi16(GxH, temp); + GxL = _mm_sub_epi16(GxL, shiftedL); + GyH = _mm_sub_epi16(GyH, temp); + GyL = _mm_sub_epi16(GyL, shiftedL); + + // Calculate phase + for (int i = 0; i < 8; i++) + { + float arct = HafCpu_FastAtan2_deg(M128I(GxL).m128i_i16[i], M128I(GyL).m128i_i16[i]); + *pLocalDstPhase++ = (vx_uint8)((vx_uint32)(arct*scale + 0.5) & 0xFF); + } + + for (int i = 0; i < 8; i++) + { + float arct = HafCpu_FastAtan2_deg(M128I(GxH).m128i_i16[i], M128I(GyH).m128i_i16[i]); + *pLocalDstPhase++ = (vx_uint8)((vx_uint32)(arct*scale + 0.5) & 0xFF); + } + + // Magnitude + row0 = _mm_srli_si128(GxH, 8); + row1 = _mm_srli_si128(GxL, 8); + row0 = _mm_cvtepi16_epi32(row0); // GxH: Upper 4 words to dwords + GxH = _mm_cvtepi16_epi32(GxH); // GxH: Lower 4 words to dwords + row1 = _mm_cvtepi16_epi32(row1); // GxL: Upper 4 words to dwords + GxL = _mm_cvtepi16_epi32(GxL); // GxL: Lower 4 words to dwords + + row2 = _mm_srli_si128(GyH, 8); + temp = _mm_srli_si128(GyL, 8); + row2 = _mm_cvtepi16_epi32(row2); // GyH: Upper 4 words to dwords + GyH = _mm_cvtepi16_epi32(GyH); // GyH: Lower 4 words to dwords + temp = _mm_cvtepi16_epi32(temp); // GyL: Upper 4 words to dwords + GyL = _mm_cvtepi16_epi32(GyL); // GyL: Lower 4 words to dwords + + row0 = _mm_mullo_epi32(row0, row0); // Square + GxH = _mm_mullo_epi32(GxH, GxH); + row1 = _mm_mullo_epi32(row1, row1); + GxL = _mm_mullo_epi32(GxL, GxL); + row2 = _mm_mullo_epi32(row2, row2); + GyH = _mm_mullo_epi32(GyH, GyH); + temp = _mm_mullo_epi32(temp, temp); + GyL = _mm_mullo_epi32(GyL, GyL); + + row0 = _mm_add_epi32(row0, row2); // Add + GxH = _mm_add_epi32(GxH, GyH); + row1 = _mm_add_epi32(row1, temp); + GxL = _mm_add_epi32(GxL, GyL); + + temp = _mm_srli_si128(row0, 8); + __m128d d_pix1 = _mm_cvtepi32_pd(temp); // Pixels 15, 14 + __m128d d_pix0 = _mm_cvtepi32_pd(row0); // Pixels 13, 12 + d_pix1 = 
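+	// Note on the phase loops above: assuming HafCpu_FastAtan2_deg returns degrees in
+	// [0, 360), scale = 128 / 180 maps the angle onto 0..255 so a single byte covers the
+	// full circle (256 levels, about 1.4 degrees each); the final "& 0xFF" wraps 360 back to 0.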
_mm_sqrt_pd(d_pix1); + d_pix0 = _mm_sqrt_pd(d_pix0); + row0 = _mm_cvtpd_epi32(d_pix1); + temp = _mm_cvtpd_epi32(d_pix0); + row0 = _mm_slli_si128(row0, 8); + row0 = _mm_or_si128(row0, temp); // Pixels 15, 14, 13, 12 (DWORDS) + + temp = _mm_srli_si128(GxH, 8); + d_pix1 = _mm_cvtepi32_pd(temp); // Pixels 11, 10 + d_pix0 = _mm_cvtepi32_pd(GxH); // Pixels 9, 8 + d_pix1 = _mm_sqrt_pd(d_pix1); + d_pix0 = _mm_sqrt_pd(d_pix0); + GxH = _mm_cvtpd_epi32(d_pix1); + temp = _mm_cvtpd_epi32(d_pix0); + GxH = _mm_slli_si128(GxH, 8); + GxH = _mm_or_si128(GxH, temp); // Pixels 11, 10, 9, 8 (DWORDS) + row0 = _mm_packus_epi32(GxH, row0); // Pixels 15, 14, 13, 12, 11, 10, 9, 8 (WORDS) + + temp = _mm_srli_si128(row1, 8); + d_pix1 = _mm_cvtepi32_pd(temp); // Pixels 7, 6 + d_pix0 = _mm_cvtepi32_pd(row1); // Pixels 5, 4 + d_pix1 = _mm_sqrt_pd(d_pix1); + d_pix0 = _mm_sqrt_pd(d_pix0); + row1 = _mm_cvtpd_epi32(d_pix1); + temp = _mm_cvtpd_epi32(d_pix0); + row1 = _mm_slli_si128(row1, 8); + row1 = _mm_or_si128(row1, temp); // Pixels 7, 6, 5, 4 (DWORDS) + + temp = _mm_srli_si128(GxL, 8); + d_pix1 = _mm_cvtepi32_pd(temp); // Pixels 3, 2 + d_pix0 = _mm_cvtepi32_pd(GxL); // Pixels 1, 0 + d_pix1 = _mm_sqrt_pd(d_pix1); + d_pix0 = _mm_sqrt_pd(d_pix0); + GxL = _mm_cvtpd_epi32(d_pix1); + temp = _mm_cvtpd_epi32(d_pix0); + GxL = _mm_slli_si128(GxL, 8); + GxL = _mm_or_si128(GxL, temp); // Pixels 3, 2, 1, 0 (DWORDS) + row1 = _mm_packus_epi32(GxL, row1); // Pixels 7, 6, 5, 4, 3, 2, 1, 0 (WORDS) + + _mm_store_si128((__m128i *) pLocalDstMag, row1); + _mm_store_si128((__m128i *) (pLocalDstMag + 8), row0); + + pLocalSrc += 16; + pLocalDstMag += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc++) + { + vx_int16 tempGx = (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) << 1) + + (vx_int16)pLocalSrc[(int)srcImageStrideInBytes + 1] - (vx_int16)pLocalSrc[(int)srcImageStrideInBytes - 1]; + vx_int16 tempGy = (vx_int16)pLocalSrc[(int)srcImageStrideInBytes - 1] + ((vx_int16)pLocalSrc[(int)srcImageStrideInBytes] << 1) + (vx_int16)pLocalSrc[(int)srcImageStrideInBytes + 1] - + (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes - 1] - ((vx_int16)pLocalSrc[-(int)srcImageStrideInBytes] << 1) - (vx_int16)pLocalSrc[-(int)srcImageStrideInBytes + 1]; + float mag = (float)(tempGx*tempGx) + (float)(tempGy*tempGy); + mag = sqrtf(mag); + *pLocalDstMag++ = (vx_int16)mag; + + float arct = HafCpu_FastAtan2_deg(tempGx, tempGy); + *pLocalDstPhase++ = (vx_uint8)((vx_uint32)(arct*scale + 0.5) & 0xFF); + } + + pSrcImage += srcImageStrideInBytes; + pDstMagImage += (dstMagImageStrideInBytes >> 1); + pDstPhaseImage += dstPhaseImageStrideInBytes; + height--; + } + return AGO_SUCCESS; +} \ No newline at end of file diff --git a/openvx/ago/ago_haf_cpu_geometric.cpp b/openvx/ago/ago_haf_cpu_geometric.cpp new file mode 100644 index 0000000..d3bd845 --- /dev/null +++ b/openvx/ago/ago_haf_cpu_geometric.cpp @@ -0,0 +1,2785 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +//======================================= +// Types +//======================================= +typedef signed char S8; +typedef unsigned char U8; +typedef unsigned short U16; +typedef unsigned int U32; +typedef unsigned long long U64; +typedef signed short S16; +typedef signed int S32; +typedef signed long long S64; + +union DECL_ALIGN(16) XMM128 +{ + __m128 f; + __m128d d; + __m128i i; + __m64 m64[2]; + double f64[2]; + U64 u64[2]; + S64 s64[2]; + float f32[4]; + S32 s32[4]; + U32 u32[4]; + S16 s16[8]; + U16 u16[8]; + U8 u8[16]; + S8 s8[16]; +} ATTR_ALIGN(16); + +#define FP_BITS 18 +#define FP_MUL (1<> 2); + while (pdst < pdstLast) + { + __m128i temp0; + // read remap table location for (x,y) + mapxy = _mm_loadu_si128((__m128i *)pMapY_X ); // mapped table [src_y4,src_x4 .....src_y2,src_x1,src_y0,src_x0] + // check for boundary values: will be substituted by 1. + temp0 = _mm_cmpeq_epi16(mapxy, CONST_FFFF); + mapxy = _mm_andnot_si128(temp0, mapxy); + temp0 = _mm_and_si128(temp0, CONST_7); // frac= 7 will be rounded to 1 + mapxy = _mm_or_si128(mapxy, temp0); // combined result + + // get the fractional part for rounding + mapfrac = _mm_and_si128(mapxy, CONST_7); + mapxy = _mm_srli_epi16(mapxy, 3); // mapxy is the int part. + + // check if the fractional part if >3, then round to next location + mapfrac = _mm_cmpgt_epi16(mapfrac, CONST_3); + mapfrac = _mm_and_si128(mapfrac, _mm_set1_epi16((short)1) ); + // add rounding + mapxy = _mm_add_epi16(mapxy, mapfrac); + + // getPixel from src at mapxy position + // calculate (mapxy.y*srcImageStrideInBytes + mapxy.x) + temp0 = _mm_srli_epi32(mapxy, 16); //[0000src_y4......0000src_y0] + mapxy = _mm_and_si128(mapxy, CONST_0000FFFF); // [0000src_x4......0000src_x0] + temp0 = _mm_mullo_epi32(temp0, sstride); // temp0 = src_y*stride; + mapxy = _mm_add_epi32(mapxy, temp0); // mapxy = src_y*stride + src_x; + + // read each src pixel from mapped position and copy to dst + *pdst++ = pSrcImage[M128I(mapxy).m128i_i32[0]] | (pSrcImage[M128I(mapxy).m128i_i32[1]] << 8) | + (pSrcImage[M128I(mapxy).m128i_i32[2]] << 16) | (pSrcImage[M128I(mapxy).m128i_i32[3]] << 24); + pMapY_X += 4; + } + // process extra pixels if any + if (extra_pixels){ + unsigned char *pd = (unsigned char *)pdst; + for (unsigned int i = 0; i < extra_pixels; i++, pMapY_X++){ + int x = (pMapY_X->x != 0xFFFF) ? (pMapY_X->x >> 3) + ((pMapY_X->x&7)>>2): 0; + int y = (pMapY_X->y != 0xFFFF) ? 
(pMapY_X->y >> 3) + ((pMapY_X->y&7)>>2) : 0; + pd[i] = pSrcImage[y*srcImageStrideInBytes + x]; + } + } + pchDst += dstImageStrideInBytes; + pchMap += mapStrideInBytes; + } + + return AGO_SUCCESS; +} + +/* +Remap with nearest neighbor interpolation type +The map table has 16 bit values out of which 13 bits are used for integer position and 3 bit for fractional. +Assumption: the value of 0xffff in map table corresponds to border and border. +The BORDER policy is constant +*/ + +int HafCpu_Remap_U8_U8_Nearest_Constant +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_coord2d_ushort_t * pMap, + vx_uint32 mapStrideInBytes, + vx_uint8 border +) +{ + __m128i zeromask = _mm_setzero_si128(); + __m128i mapxy, mapfrac, sstride; + + const __m128i srcb = _mm_set1_epi32((srcHeight*srcImageStrideInBytes) - 1); + sstride = _mm_set1_epi32(srcImageStrideInBytes); + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pchMap = (unsigned char *)pMap; + vx_uint32 extra_pixels = dstWidth&3; + + while (pchDst < pchDstlast) + { + ago_coord2d_short_t *pMapY_X = (ago_coord2d_short_t *)pchMap; + unsigned int *pdst = (unsigned int *)pchDst; + unsigned int *pdstLast = pdst + (dstWidth>> 2); + while (pdst < pdstLast) + { + __m128i temp0, temp1; + int mask; + // read remap table location for (x,y) + mapxy = _mm_loadu_si128((__m128i *)pMapY_X); // mapped table [src_y4,src_x4 .....src_y2,src_x1,src_y0,src_x0] + // check for boundary values: will be substituted by 1. + temp1 = _mm_cmpeq_epi16(mapxy, CONST_FFFF); + mapxy = _mm_andnot_si128(temp1, mapxy); + temp0 = _mm_and_si128(temp1, zeromask); + mapxy = _mm_or_si128(mapxy, temp0); // combined result + + // get the fractional part for rounding + mapfrac = _mm_and_si128(mapxy, CONST_7); + mapxy = _mm_srli_epi16(mapxy, 3); + // check if the fractional part if >3, then round to next location + mapfrac = _mm_cmpgt_epi16(mapfrac, CONST_3); + mapfrac = _mm_and_si128(mapfrac, _mm_set1_epi16((short)1)); + // add rounding + mapxy = _mm_add_epi16(mapxy, mapfrac); + // getPixel from src at mapxy position + // calculate (mapxy.y*srcImageStrideInBytes + mapxy.x) + temp0 = _mm_srli_epi32(mapxy, 16); //[0000src_y4......0000src_y0] + mapxy = _mm_and_si128(mapxy, CONST_0000FFFF); // [0000src_x4......0000src_x0] + temp0 = _mm_mullo_epi32(temp0, sstride); // temp0 = src_y*stride; + mapxy = _mm_add_epi32(mapxy, temp0); // mapxy = src_y*stride + src_x; + // check if pixels exceed boundary + temp0 = _mm_cmpgt_epi32(mapxy, srcb); + temp1 = _mm_or_si128(temp1, temp0); + mask = _mm_movemask_epi8(temp1); + + // read each src pixel from mapped position and copy to dst + if (!mask){ + *pdst++ = pSrcImage[M128I(mapxy).m128i_i32[0]] | (pSrcImage[M128I(mapxy).m128i_i32[1]] << 8) | + (pSrcImage[M128I(mapxy).m128i_i32[2]] << 16) | (pSrcImage[M128I(mapxy).m128i_i32[3]] << 24); + } + else + { + // copy each checking for boundary + unsigned int dstpel = (mask & 0xf) ? border : pSrcImage[M128I(mapxy).m128i_i32[0]]; + dstpel |= (mask & 0xf0) ? (border << 8) : (pSrcImage[M128I(mapxy).m128i_i32[1]] << 8); + dstpel |= (mask & 0xf00) ? (border << 16) : (pSrcImage[M128I(mapxy).m128i_i32[2]] << 16); + dstpel |= (mask & 0xf000) ? 
(border << 24) : (pSrcImage[M128I(mapxy).m128i_i32[3]] << 24); + *pdst++ = dstpel; + } + pMapY_X += 4; + } + // process extra pixels if any + if (extra_pixels){ + unsigned char *pd = (unsigned char *)pdst; + for (unsigned int i = 0; i < extra_pixels; i++, pMapY_X++){ + int x = (pMapY_X->x != 0xFFFF) ? (pMapY_X->x >> 3) + ((pMapY_X->x & 7) >> 2) : border; + int y = (pMapY_X->y != 0xFFFF) ? (pMapY_X->y >> 3) + ((pMapY_X->y & 7) >> 2) : border; + pd[i] = pSrcImage[y*srcImageStrideInBytes + x]; + } + } + pchDst += dstImageStrideInBytes; + pchMap += mapStrideInBytes; + } + + return AGO_SUCCESS; +} + +int HafCpu_Remap_U8_U8_Bilinear +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_coord2d_ushort_t * pMap, + vx_uint32 mapStrideInBytes +) +{ + __m128i zeromask = _mm_setzero_si128(); + __m128i mapxy, mapfrac; + + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pchMap = (unsigned char *)pMap; + const __m128i sstride = _mm_set1_epi32(srcImageStrideInBytes); + const __m128i round = _mm_set1_epi32((int)32); + + while (pchDst < pchDstlast) + { + ago_coord2d_short_t *pMapY_X = (ago_coord2d_short_t *)pchMap; + unsigned int *pdst = (unsigned int *)pchDst; + unsigned int *pdstLast = pdst + ((dstWidth+3) >> 2); + while (pdst < pdstLast) + { + __m128i temp0, temp1, w_xy, oneminusxy, p12, p34; + unsigned char *p0; + // read remap table location for (x,y) + mapxy = _mm_loadu_si128((__m128i *)pMapY_X); // mapped table [src_y3,src_x3 .....src_y2,src_x1,src_y0,src_x0] + // check for boundary values: will be substituted by 1. 
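+			// Map entries are 13.3 fixed point: the upper 13 bits are the integer source
+			// coordinate and the lower 3 bits are the fraction in 1/8ths; 0xFFFF flags an
+			// unmapped location. The fractions below become bilinear weights in the range
+			// 0..8, so each per-pixel weight product lies in 0..64 and the weighted sum is
+			// renormalized with (+32) >> 6.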
+ temp0 = _mm_cmpeq_epi16(mapxy, CONST_FFFF); + mapxy = _mm_andnot_si128(temp0, mapxy); + temp0 = _mm_and_si128(temp0, _mm_set1_epi16(0x8)); + mapxy = _mm_or_si128(mapxy, temp0); // combined result + + // get the fractional part for rounding + mapfrac = _mm_and_si128(mapxy, CONST_7); // [dy3, dx3.........dy0, dx0] + oneminusxy = _mm_sub_epi16(_mm_set1_epi16(8), mapfrac); // [1-dy3, 1-dx3........1-dy0, 1-dx0] + mapxy = _mm_srli_epi16(mapxy, 3); // [y3, x3.............y0, x0] + // calculate (mapxy.y*srcImageStrideInBytes + mapxy.x) + temp0 = _mm_srli_epi32(mapxy, 16); //[0000src_y4......0000src_y0] + mapxy = _mm_and_si128(mapxy, CONST_0000FFFF); // [0000src_x4......0000src_x0] + temp0 = _mm_mullo_epi32(temp0, sstride); // temp0 = src_y*stride; + mapxy = _mm_add_epi32(mapxy, temp0); // mapxy = src_y*stride + src_x; + + // load the pixels 2 pixels in one load + p0 = &pSrcImage[M128I(mapxy).m128i_i32[0]]; + p12 = _mm_cvtsi32_si128(((unsigned int *)p0)[0]); + p34 = _mm_cvtsi32_si128(((unsigned int *)(p0 + srcImageStrideInBytes))[0]); + temp0 = _mm_unpacklo_epi16(oneminusxy, mapfrac); // [dy1, 1-dy1, dx1, 1-dx1, dy0, 1-dy0, dx0, 1-dx0] + temp1 = _mm_unpacklo_epi32(temp0, temp0); // [dy0, 1-dy0, dy0, 1-dy0, dx0, 1-dx0, dx0, 1-dx0] + temp0 = _mm_unpackhi_epi32(temp0, temp0); // [dy1, 1-dy1, dy1, 1-dy1, dx1, 1-dx1, dx1, 1-dx1] + + w_xy = _mm_unpacklo_epi64(temp1, temp0); // [dx1, 1-dx1, dx1, 1-dx1 dx0, 1-dx0, dx0, 1-dx0] + temp1 = _mm_unpackhi_epi64(temp1, temp0); // [dy1, 1-dy1, dy1, 1-dy1, dy0, 1-dy0, dy0, 1-dy0] + temp1 = _mm_shufflelo_epi16(temp1, 0xd8); + temp1 = _mm_shufflehi_epi16(temp1, 0xd8); // [dy1, dy1, 1-dy1, 1-dy1, dy0, dy0, 1-dy0, 1-dy0] + // calculate weight + w_xy = _mm_mullo_epi16(w_xy, temp1); // [w3, w2, w1, w0] // for 2 + p12 = _mm_unpacklo_epi16(p12, p34); + p0 = &pSrcImage[M128I(mapxy).m128i_i32[1]]; + temp0 = _mm_cvtsi32_si128(((unsigned int *)p0)[0]); + p34 = _mm_cvtsi32_si128(((unsigned int *)(p0 + srcImageStrideInBytes))[0]); + temp0 = _mm_unpacklo_epi16(temp0, p34); +// w_xy = _mm_srli_epi16(w_xy, 6); + p12 = _mm_unpacklo_epi32(p12, temp0); + p12 = _mm_unpacklo_epi8(p12, zeromask); // [p2, p2, p1, p0] for 2 + + // multiply add with weight + p12 = _mm_madd_epi16(p12, w_xy); // (w3p3+w2p2),(w0p0+w1p1) for 2 + p34 = _mm_hadd_epi32(p12, p12); // dst 0 and 1 + + // do computation for dst 2 and 3 + temp0 = _mm_unpackhi_epi16(oneminusxy, mapfrac); // [dy3, 1-dy3, dx3, 1-dx3, dy2, 1-dy2, dx2, 1-dx2] + temp1 = _mm_unpacklo_epi32(temp0, temp0); // [dy2, 1-dy2, dy2, 1-dy2, dx2, 1-dx2, dx2, 1-dx2] + temp0 = _mm_unpackhi_epi32(temp0, temp0); // [dy3, 1-dy3, dy3, 1-dy3, dx3, 1-dx3, dx3, 1-dx3] + w_xy = _mm_unpacklo_epi64(temp1, temp0); // [dx3, 1-dx3, dx3, 1-dx3, dx2, 1-dx2, dx2, 1-dx2] + temp1 = _mm_unpackhi_epi64(temp1, temp0); // [dy3, 1-dy3, dy3, 1-dy3, dy2, 1-dy2, dy2, 1-dy2] + temp1 = _mm_shufflelo_epi16(temp1, 0xd8); + temp1 = _mm_shufflehi_epi16(temp1, 0xd8); // [dy1, dy1, 1-dy1, 1-dy1, dy0, dy0, 1-dy0, 1-dy0] + // calculate weight + w_xy = _mm_mullo_epi16(w_xy, temp1); // [w3, w2, w1, w0] // for 2 and 3 + p0 = &pSrcImage[M128I(mapxy).m128i_i32[2]]; + p12 = _mm_cvtsi32_si128(((unsigned int *)p0)[0]); + temp0 = _mm_cvtsi32_si128(((unsigned int *)(p0 + srcImageStrideInBytes))[0]); + p12 = _mm_unpacklo_epi16(p12, temp0); + p0 = &pSrcImage[M128I(mapxy).m128i_i32[3]]; + temp0 = _mm_cvtsi32_si128(((unsigned int *)p0)[0]); + temp1 = _mm_cvtsi32_si128(((unsigned int *)(p0 + srcImageStrideInBytes))[0]); + //w_xy = _mm_srli_epi16(w_xy, 6); + temp0 = _mm_unpacklo_epi16(temp0, temp1); + 
p12 = _mm_unpacklo_epi32(p12, temp0); + + p12 = _mm_unpacklo_epi8(p12, zeromask); // [p2, p2, p1, p0] for 2 + + // multiply add with weight + p12 = _mm_madd_epi16(p12, w_xy); // (w3p3+w2p2),(w0p0+w1p1) for 2 + temp0 = _mm_hadd_epi32(p12, p12); // dst 2 and 3 + + p34 = _mm_unpacklo_epi64(p34, temp0); + p34 = _mm_add_epi32(p34, round); + p34 = _mm_srli_epi32(p34, 6); + // convert 32 bit to 8 bit + p34 = _mm_packus_epi32(p34, zeromask); + p34 = _mm_packus_epi16(p34, zeromask); + + // read each src pixel from mapped position and copy to dst + *pdst++ = M128I(p34).m128i_i32[0]; + pMapY_X += 4; + } + pchDst += dstImageStrideInBytes; + pchMap += mapStrideInBytes; + + } + + return AGO_SUCCESS; +} + +int HafCpu_Remap_U8_U8_Bilinear_Constant +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + ago_coord2d_ushort_t * pMap, + vx_uint32 mapStrideInBytes, + vx_uint8 border +) +{ + __m128i zeromask = _mm_setzero_si128(); + __m128i mapxy, mapfrac; + + const __m128i sstride = _mm_set1_epi32(srcImageStrideInBytes); + const __m128i pborder = _mm_set1_epi32(border); + const __m128i round = _mm_set1_epi32((int)32); + + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + unsigned char *pchMap = (unsigned char *)pMap; + + while (pchDst < pchDstlast) + { + ago_coord2d_short_t *pMapY_X = (ago_coord2d_short_t *)pchMap; + unsigned int *pdst = (unsigned int *)pchDst; + unsigned int *pdstLast = pdst + ((dstWidth+3) >> 2); + while (pdst < pdstLast) + { + __m128i temp0, temp1, w_xy, oneminusxy, p12, p34, mask; + unsigned char *p0; + // read remap table location for (x,y) + mapxy = _mm_loadu_si128((__m128i *)pMapY_X); // mapped table [src_y3,src_x3 .....src_y2,src_x1,src_y0,src_x0] + // check for boundary values: will be substituted by border. 
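+			// Out-of-bounds map entries (0xFFFF) are recorded in 'mask'; interpolation still
+			// runs with those coordinates forced to 0, and the flagged result lanes are
+			// overwritten with the constant border value after the (+32) >> 6 normalization.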
+ mask = _mm_cmpeq_epi16(mapxy, CONST_FFFF); + mapxy = _mm_andnot_si128(mask, mapxy); + temp0 = _mm_and_si128(mask, zeromask); + mapxy = _mm_or_si128(mapxy, temp0); // combined result + //mask = _mm_movemask_epi8(temp1); + + // get the fractional part for rounding + // get the fractional part for rounding + mapfrac = _mm_and_si128(mapxy, CONST_7); // [dy3, dx3.........dy0, dx0] + oneminusxy = _mm_sub_epi16(_mm_set1_epi16(8), mapfrac); // [1-dy3, 1-dx3........1-dy0, 1-dx0] + mapxy = _mm_srli_epi16(mapxy, 3); // [y3, x3.............y0, x0] + // calculate (mapxy.y*srcImageStrideInBytes + mapxy.x) + temp0 = _mm_srli_epi32(mapxy, 16); //[0000src_y4......0000src_y0] + mapxy = _mm_and_si128(mapxy, CONST_0000FFFF); // [0000src_x4......0000src_x0] + temp0 = _mm_mullo_epi32(temp0, sstride); // temp0 = src_y*stride; + mapxy = _mm_add_epi32(mapxy, temp0); // mapxy = src_y*stride + src_x; + + // load the pixels 2 pixels in one load + p0 = &pSrcImage[M128I(mapxy).m128i_i32[0]]; + p12 = _mm_cvtsi32_si128(((unsigned int *)p0)[0]); + p34 = _mm_cvtsi32_si128(((unsigned int *)(p0 + srcImageStrideInBytes))[0]); + temp0 = _mm_unpacklo_epi16(oneminusxy, mapfrac); // [dy1, 1-dy1, dx1, 1-dx1, dy0, 1-dy0, dx0, 1-dx0] + temp1 = _mm_unpacklo_epi32(temp0, temp0); // [dy0, 1-dy0, dy0, 1-dy0, dx0, 1-dx0, dx0, 1-dx0] + temp0 = _mm_unpackhi_epi32(temp0, temp0); // [dy1, 1-dy1, dy1, 1-dy1, dx1, 1-dx1, dx1, 1-dx1] + + w_xy = _mm_unpacklo_epi64(temp1, temp0); // [dx1, 1-dx1, dx1, 1-dx1 dx0, 1-dx0, dx0, 1-dx0] + temp1 = _mm_unpackhi_epi64(temp1, temp0); // [dy1, 1-dy1, dy1, 1-dy1, dy0, 1-dy0, dy0, 1-dy0] + temp1 = _mm_shufflelo_epi16(temp1, 0xd8); + temp1 = _mm_shufflehi_epi16(temp1, 0xd8); // [dy1, dy1, 1-dy1, 1-dy1, dy0, dy0, 1-dy0, 1-dy0] + + // calculate weight + w_xy = _mm_mullo_epi16(w_xy, temp1); // [w3, w2, w1, w0] // for 2 + p12 = _mm_unpacklo_epi16(p12, p34); + p0 = &pSrcImage[M128I(mapxy).m128i_i32[1]]; + temp0 = _mm_cvtsi32_si128(((unsigned int *)p0)[0]); + p34 = _mm_cvtsi32_si128(((unsigned int *)(p0 + srcImageStrideInBytes))[0]); + temp0 = _mm_unpacklo_epi16(temp0, p34); + p12 = _mm_unpacklo_epi32(p12, temp0); + p12 = _mm_unpacklo_epi8(p12, zeromask); // [p3, p2, p1, p0] for 2 + + // multiply add with weight + p12 = _mm_madd_epi16(p12, w_xy); // (w3p3+w2p2),(w0p0+w1p1) for 2 + p34 = _mm_hadd_epi32(p12, p12); // dst 0 and 1 + + // do computation for dst 2 and 3 + temp0 = _mm_unpackhi_epi16(oneminusxy, mapfrac); // [dy3, 1-dy3, dx3, 1-dx3, dy2, 1-dy2, dx2, 1-dx2] + temp1 = _mm_unpacklo_epi32(temp0, temp0); // [dy2, 1-dy2, dy2, 1-dy2, dx2, 1-dx2, dx2, 1-dx2] + temp0 = _mm_unpackhi_epi32(temp0, temp0); // [dy3, 1-dy3, dy3, 1-dy3, dx3, 1-dx3, dx3, 1-dx3] + w_xy = _mm_unpacklo_epi64(temp1, temp0); // [dx3, 1-dx3, dx3, 1-dx3, dx2, 1-dx2, dx2, 1-dx2] + temp1 = _mm_unpackhi_epi64(temp1, temp0); // [dy3, 1-dy3, dy3, 1-dy3, dy2, 1-dy2, dy2, 1-dy2] + temp1 = _mm_shufflelo_epi16(temp1, 0xd8); + temp1 = _mm_shufflehi_epi16(temp1, 0xd8); // [dy3, dy3, 1-dy3, 1-dy3, dy0, dy2, 1-dy2, 1-dy2] + + // calculate weight + w_xy = _mm_mullo_epi16(w_xy, temp1); // [w3, w2, w1, w0] // for 2 and 3 + p0 = &pSrcImage[M128I(mapxy).m128i_i32[2]]; + p12 = _mm_cvtsi32_si128(((unsigned int *)p0)[0]); + temp0 = _mm_cvtsi32_si128(((unsigned int *)(p0 + srcImageStrideInBytes))[0]); + p12 = _mm_unpacklo_epi16(p12, temp0); + p0 = &pSrcImage[M128I(mapxy).m128i_i32[3]]; + temp0 = _mm_cvtsi32_si128(((unsigned int *)p0)[0]); + temp1 = _mm_cvtsi32_si128(((unsigned int *)(p0 + srcImageStrideInBytes))[0]); + temp0 = _mm_unpacklo_epi16(temp0, temp1); + p12 = 
_mm_unpacklo_epi32(p12, temp0); + //w_xy = _mm_shuffle_epi32(w_xy, 0x4e); + p12 = _mm_unpacklo_epi8(p12, zeromask); // [p3, p2, p1, p0] for 2 + + // multiply add with weight + p12 = _mm_madd_epi16(p12, w_xy); // (w3p3+w2p2),(w0p0+w1p1) for 2 + temp0 = _mm_hadd_epi32(p12, p12); // dst 0 and 1 + //p34 = _mm_shuffle_epi32(p34, 0xd8); + + //temp0 = _mm_shuffle_epi32(temp0, 0xd8); + p34 = _mm_unpacklo_epi64(p34, temp0); + p34 = _mm_add_epi32(p34, round); + p34 = _mm_srli_epi32(p34, 6); + + p34 = _mm_andnot_si128(mask, p34); + mask = _mm_and_si128(mask, pborder); + p34 = _mm_or_si128(p34, mask); // combined result + // convert 32 bit to 8 bit + p34 = _mm_packus_epi32(p34, zeromask); + p34 = _mm_packus_epi16(p34, zeromask); + + // read each src pixel from mapped position and copy to dst + *pdst++ = M128I(p34).m128i_i32[0]; + + pMapY_X += 4; + } + pchDst += dstImageStrideInBytes; + pchMap += mapStrideInBytes; + } + + return AGO_SUCCESS; +} + +// The dst pixels are nearest affine transformed (truncate towards zero rounding). Bounday_mode is not specified. +// If the transformed location is out of bounds: 0 or max pixel will be used as substitution. +int HafCpu_WarpAffine_U8_U8_Nearest +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_affine_matrix_t * matrix, +vx_uint8 * pLocalData +) +{ + __m128 ymap, xmap, ydest, xdest; + __m128i srcb, src_s; + __m128i zeromask = _mm_setzero_si128(); + + const float r00 = matrix->matrix[0][0]; + const float r10 = matrix->matrix[0][1]; + const float r01 = matrix->matrix[1][0]; + const float r11 = matrix->matrix[1][1]; + const float const1 = matrix->matrix[2][0]; + const float const2 = matrix->matrix[2][1]; + + const __m128 srcbx = _mm_set1_ps((float)srcWidth); + const __m128 srcby = _mm_set1_ps((float)srcHeight); + const __m128 zero = _mm_set1_ps(0); + srcb = _mm_set1_epi32((srcHeight*srcImageStrideInBytes) - 1); + src_s = _mm_set1_epi32(srcImageStrideInBytes); + + // check if all mapped pixels are valid or not + bool bBoder = (const1 < 0) | (const2 < 0) | (const1 >= srcWidth) | (const2 >= srcHeight); + // check for (dstWidth, 0) + float x1 = (r00*dstWidth + const1); + float y1 = (r10*dstWidth + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + // check for (0, dstHeight) + x1 = (r01*dstHeight + const1); + y1 = (r11*dstHeight + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + // check for (dstWidth, dstHeight) + x1 = (r00*dstWidth + r01*dstHeight + const1); + y1 = (r10*dstWidth + r11*dstHeight + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + + XMM128 mask; + unsigned int x, y; + float *r00_x, *r10_x; + r00_x = (float*)pLocalData; + r10_x = r00_x + dstWidth; + for (x = 0; xmatrix[0][0]; + const float r10 = matrix->matrix[0][1]; + const float r01 = matrix->matrix[1][0]; + const float r11 = matrix->matrix[1][1]; + const float const1 = matrix->matrix[2][0]; + const float const2 = matrix->matrix[2][1]; + + const __m128 srcbx = _mm_set1_ps((float)srcWidth); + const __m128 srcby = _mm_set1_ps((float)srcHeight); + const __m128 zero = _mm_set1_ps(0); + srcb = _mm_set1_epi32((srcHeight*srcImageStrideInBytes) - 1); + src_s = _mm_set1_epi32(srcImageStrideInBytes); + pborder = _mm_cvtsi32_si128((int)border); + pborder = _mm_shuffle_epi32(pborder, 0); + // check if all mapped pixels are valid or not + bool bBoder = 
(const1 < 0) | (const2 < 0) | (const1 >= srcWidth) | (const2 >= srcHeight); + // check for (dstWidth, 0) + float x1 = (r00*dstWidth + const1); + float y1 = (r10*dstWidth + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + // check for (0, dstHeight) + x1 = (r01*dstHeight + const1); + y1 = (r11*dstHeight + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + // check for (dstWidth, dstHeight) + x1 = (r00*dstWidth + r01*dstHeight + const1); + y1 = (r10*dstWidth + r11*dstHeight + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + + XMM128 mask; + unsigned int x, y; + float *r00_x = (float*)pLocalData; + float *r10_x = r00_x + dstWidth; + for (x = 0; xmatrix[0][0]; + const float r10 = matrix->matrix[0][1]; + const float r01 = matrix->matrix[1][0]; + const float r11 = matrix->matrix[1][1]; + const float const1 = matrix->matrix[2][0]; + const float const2 = matrix->matrix[2][1]; + + const __m128 zero = _mm_set1_ps(0); + const __m128i srcbx_i = _mm_set1_epi32(srcWidth); + const __m128i srcby_i = _mm_set1_epi32(srcHeight); + const __m128 srcbx = _mm_cvtepi32_ps(srcbx_i); + const __m128 srcby = _mm_cvtepi32_ps(srcby_i); + + const __m128i p0mask = _mm_set1_epi32((int)0xFF); + const __m128 oneFloat = _mm_set1_ps(1.0); + srcb = _mm_set1_epi32((srcHeight*srcImageStrideInBytes) - 1); + src_s = _mm_set1_epi32(srcImageStrideInBytes); + + XMM128 mask; + unsigned int x, y; + float *r00_x = (float*)pLocalData; + float *r10_x = (float *)ALIGN16(r00_x + dstWidth); + for (x = 0; x= srcWidth) | (const2 >= srcHeight); + // check for (dstWidth, 0) + float x1 = (r00*dstWidth + const1); + float y1 = (r10*dstWidth + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + // check for (0, dstHeight) + x1 = (r01*dstHeight + const1); + y1 = (r11*dstHeight + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + // check for (dstWidth, dstHeight) + x1 = (r00*dstWidth + r01*dstHeight + const1); + y1 = (r10*dstWidth + r11*dstHeight + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + + y = 0; + if (bBoder){ + __m128i srcb = _mm_set1_epi32((srcHeight-1)*srcImageStrideInBytes - 1); + __m128i src_s = _mm_set1_epi32(srcImageStrideInBytes); + + while (y < dstHeight) + { + // calculate (y*m[0][1] + m[0][2]) for x and y + xdest = _mm_set1_ps(y*r01 + const1); + ydest = _mm_set1_ps(y*r11 + const2); + + x = 0; + unsigned int *dst = (unsigned int *)pDstImage; + while (x < dstWidth) + { + __m128 xFraction, yFraction, one_minus_xFraction, one_minus_yFraction; + __m128 p0_f, p1_f, p2_f, p3_f; + __m128i p0, p1, p2, p3, xint, yint; // pixels in src + unsigned char *psrc; + + // read x into xpel + xmap = _mm_load_ps(&r00_x[x]); + xmap = _mm_add_ps(xmap, xdest); // xf = dst[x3, x2, x1, x0] + ymap = _mm_load_ps(&r10_x[x]); + ymap = _mm_add_ps(ymap, ydest); // ymap <- r10*x + ty + + mask.f = _mm_cmpge_ps(xmap, zero); + mask.f = _mm_and_ps(mask.f, _mm_cmplt_ps(xmap, srcbx)); + mask.f = _mm_and_ps(mask.f, _mm_cmpge_ps(ymap, zero)); + mask.f = _mm_and_ps(mask.f, _mm_cmplt_ps(ymap, srcby)); + int m = _mm_movemask_ps(mask.f); + if (m){ + // convert to integer with rounding towards zero + xint = _mm_cvttps_epi32(xmap); + yint = _mm_cvttps_epi32(ymap); + + //xFraction = xmap-xint; + //yFraction = ymap-yint; + xFraction = _mm_cvtepi32_ps(xint); + yFraction = _mm_cvtepi32_ps(yint); + xFraction = _mm_sub_ps(xmap, xFraction); + yFraction = _mm_sub_ps(ymap, yFraction); + + // 
clip for boundary + yint = _mm_mullo_epi32(yint, src_s); + yint = _mm_add_epi32(yint, xint); + //(1-xFraction) + //(1-yFraction) + one_minus_xFraction = _mm_sub_ps(oneFloat, xFraction); + one_minus_yFraction = _mm_sub_ps(oneFloat, yFraction); + yint = _mm_min_epi32(yint, srcb); + yint = _mm_max_epi32(yint, zeromask); + + // read pixels from src and re-arrange + psrc = pSrcImage + M128I(yint).m128i_u32[0]; + M128I(p0).m128i_u32[0] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[0] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + M128I(yint).m128i_u32[1]; + M128I(p0).m128i_u32[1] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[1] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + M128I(yint).m128i_u32[2]; + M128I(p0).m128i_u32[2] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[2] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + M128I(yint).m128i_u32[3]; + M128I(p0).m128i_u32[3] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[3] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + // get p0, p1, p2, p3 by masking and shifting + p1 = p0; + p0 = _mm_and_si128(p0, p0mask); + p1 = _mm_srli_epi32(p1, 8); + p3 = p2; + p2 = _mm_and_si128(p2, p0mask); + p3 = _mm_srli_epi32(p3, 8); + p1 = _mm_and_si128(p1, p0mask); + p3 = _mm_and_si128(p3, p0mask); + + p0_f = _mm_cvtepi32_ps(p0); + p1_f = _mm_cvtepi32_ps(p1); + p2_f = _mm_cvtepi32_ps(p2); + p3_f = _mm_cvtepi32_ps(p3); + + p0_f = _mm_mul_ps(p0_f, one_minus_xFraction); + p0_f = _mm_mul_ps(p0_f, one_minus_yFraction); + p1_f = _mm_mul_ps(p1_f, xFraction); + p1_f = _mm_mul_ps(p1_f, one_minus_yFraction); + p2_f = _mm_mul_ps(p2_f, one_minus_xFraction); + p2_f = _mm_mul_ps(p2_f, yFraction); + p3_f = _mm_mul_ps(p3_f, xFraction); + p3_f = _mm_mul_ps(p3_f, yFraction); + + p0_f = _mm_add_ps(p0_f, p1_f); + p2_f = _mm_add_ps(p2_f, p3_f); + p0_f = _mm_add_ps(p0_f, p2_f); + p0 = _mm_cvtps_epi32(p0_f); + // mask for boundary + p0 = _mm_and_si128(mask.i, p0); + + // convert to unsigned char and write to dst + p0 = _mm_packus_epi32(p0, zeromask); + p0 = _mm_packus_epi16(p0, zeromask); + *dst++ = M128I(p0).m128i_i32[0]; + } + else + { + *dst++ = 0; + } + x += 4; + } + y++; + pDstImage += dstImageStrideInBytes; + } + } + else{ + XMM128 xint = { 0 }, yint = { 0 }; + while (y < dstHeight) + { + // calculate (y*m[0][1] + m[0][2]) for x and y + xdest = _mm_set1_ps(y*r01 + const1); + ydest = _mm_set1_ps(y*r11 + const2); + + x = 0; + unsigned int *dst = (unsigned int *)pDstImage; + while (x < dstWidth) + { + __m128 xFraction, yFraction, one_minus_xFraction, one_minus_yFraction; + __m128 p0_f, p1_f, p2_f, p3_f; + __m128i p0, p1, p2, p3; // pixels in src + unsigned char *psrc; + // read x into xpel + xmap = _mm_load_ps(&r00_x[x]); + xmap = _mm_add_ps(xmap, xdest); // xf = dst[x3, x2, x1, x0] + ymap = _mm_load_ps(&r10_x[x]); + ymap = _mm_add_ps(ymap, ydest); // ymap <- r10*x + ty + // convert to integer with rounding towards zero + xint.i = _mm_cvttps_epi32(xmap); + yint.i = _mm_cvttps_epi32(ymap); + + //xFraction = xmap-xint; + //yFraction = ymap-yint; + xFraction = _mm_cvtepi32_ps(xint.i); + yFraction = _mm_cvtepi32_ps(yint.i); + xFraction = _mm_sub_ps(xmap, xFraction); + yFraction = _mm_sub_ps(ymap, yFraction); + + //(1-xFraction) + //(1-yFraction) + one_minus_xFraction = _mm_sub_ps(oneFloat, xFraction); + one_minus_yFraction = _mm_sub_ps(oneFloat, yFraction); + + // read pixels from src and re-arrange + psrc = pSrcImage + (yint.s32[0] * srcImageStrideInBytes + xint.s32[0]); + 
M128I(p0).m128i_u32[0] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[0] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + (yint.s32[1] * srcImageStrideInBytes + xint.s32[1]); + M128I(p0).m128i_u32[1] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[1] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + (yint.s32[2] * srcImageStrideInBytes + xint.s32[2]); + M128I(p0).m128i_u32[2] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[2] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + (yint.s32[3] * srcImageStrideInBytes + xint.s32[3]); + M128I(p0).m128i_u32[3] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[3] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + // get p0, p1, p2, p3 by masking and shifting + p1 = p0; + p0 = _mm_and_si128(p0, p0mask); + p1 = _mm_srli_epi32(p1, 8); + p3 = p2; + p2 = _mm_and_si128(p2, p0mask); + p3 = _mm_srli_epi32(p3, 8); + p1 = _mm_and_si128(p1, p0mask); + p3 = _mm_and_si128(p3, p0mask); + + p0_f = _mm_cvtepi32_ps(p0); + p1_f = _mm_cvtepi32_ps(p1); + p2_f = _mm_cvtepi32_ps(p2); + p3_f = _mm_cvtepi32_ps(p3); + + p0_f = _mm_mul_ps(p0_f, one_minus_xFraction); + p0_f = _mm_mul_ps(p0_f, one_minus_yFraction); + p1_f = _mm_mul_ps(p1_f, xFraction); + p1_f = _mm_mul_ps(p1_f, one_minus_yFraction); + p2_f = _mm_mul_ps(p2_f, one_minus_xFraction); + p2_f = _mm_mul_ps(p2_f, yFraction); + p3_f = _mm_mul_ps(p3_f, xFraction); + p3_f = _mm_mul_ps(p3_f, yFraction); + + p0_f = _mm_add_ps(p0_f, p1_f); + p2_f = _mm_add_ps(p2_f, p3_f); + p0_f = _mm_add_ps(p0_f, p2_f); + p0 = _mm_cvtps_epi32(p0_f); + + // convert to unsigned char and write to dst + p0 = _mm_packus_epi32(p0, zeromask); + p0 = _mm_packus_epi16(p0, zeromask); + *dst++ = M128I(p0).m128i_i32[0]; + x += 4; + } + y++; + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + +// the implementation currently uses floating point. +// TODO:: used fixed point for bilinear interpolation. We can do 8 pixels in the innel loop. 
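// Illustrative scalar reference (a sketch, not part of this file): the SIMD routine
// below is easier to follow against the same backward-mapped bilinear sampling written
// per pixel. The function name and plain C types are illustrative only, and it assumes
// every mapped coordinate lands inside the source image; the real code adds border
// handling and processes four destination pixels per iteration.
#include <cstdint>

static void warp_affine_bilinear_scalar(
	uint32_t dstWidth, uint32_t dstHeight, uint8_t * dst, uint32_t dstStride,
	const uint8_t * src, uint32_t srcStride, const float m[3][2])
{
	// m[0][0]=r00, m[0][1]=r10, m[1][0]=r01, m[1][1]=r11, m[2][0]=tx, m[2][1]=ty,
	// matching the layout read from ago_affine_matrix_t in the routines above/below.
	for (uint32_t y = 0; y < dstHeight; y++) {
		for (uint32_t x = 0; x < dstWidth; x++) {
			float sx = m[0][0] * x + m[1][0] * y + m[2][0];   // xmap
			float sy = m[0][1] * x + m[1][1] * y + m[2][1];   // ymap
			int xi = (int)sx, yi = (int)sy;                   // truncate towards zero
			float fx = sx - xi, fy = sy - yi;                 // fractional parts
			const uint8_t * p = src + yi * srcStride + xi;
			float v = p[0]             * (1 - fx) * (1 - fy)
			        + p[1]             * fx       * (1 - fy)
			        + p[srcStride]     * (1 - fx) * fy
			        + p[srcStride + 1] * fx       * fy;
			dst[y * dstStride + x] = (uint8_t)(v + 0.5f);
		}
	}
}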
+int HafCpu_WarpAffine_U8_U8_Bilinear_Constant +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_affine_matrix_t * matrix, +vx_uint8 border, +vx_uint8 * pLocalData +) +{ + __m128 ymap, xmap, ydest, xdest; + __m128i srcb, src_s; + const unsigned int u32_border = border | (border << 8) | (border << 16) | (border << 24); + const __m128i zeromask = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i pborder = _mm_set1_epi32((int)border); + + // do backward mapping to find the (x, y) locations in source corresponding to (x', y') from dest by doing inverse matrix + const float r00 = matrix->matrix[0][0]; + const float r10 = matrix->matrix[0][1]; + const float r01 = matrix->matrix[1][0]; + const float r11 = matrix->matrix[1][1]; + const float const1 = matrix->matrix[2][0]; + const float const2 = matrix->matrix[2][1]; + + const __m128 zero = _mm_set1_ps(0); + const __m128i srcbx_i = _mm_set1_epi32(srcWidth); + const __m128i srcby_i = _mm_set1_epi32(srcHeight); + const __m128 srcbx = _mm_cvtepi32_ps(srcbx_i); + const __m128 srcby = _mm_cvtepi32_ps(srcby_i); + + const __m128i p0mask = _mm_set1_epi32((int)0xFF); + const __m128 oneFloat = _mm_set1_ps(1.0); + srcb = _mm_set1_epi32((srcHeight-1)*srcImageStrideInBytes - 1); + src_s = _mm_set1_epi32(srcImageStrideInBytes); + + XMM128 xint = { 0 }, yint = { 0 }, mask; + unsigned int x, y; + float *r00_x = (float*)pLocalData; + float *r10_x = (float *)ALIGN16(r00_x + dstWidth); + for (x = 0; x= srcWidth) | (const2 >= srcHeight); + // check for (dstWidth, 0) + float x1 = (r00*dstWidth + const1); + float y1 = (r10*dstWidth + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + // check for (0, dstHeight) + x1 = (r01*dstHeight + const1); + y1 = (r11*dstHeight + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + // check for (dstWidth, dstHeight) + x1 = (r00*dstWidth + r01*dstHeight + const1); + y1 = (r10*dstWidth + r11*dstHeight + const2); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); + + y = 0; + if (bBoder){ + while (y < dstHeight) + { + // calculate (y*m[0][1] + m[0][2]) for x and y + xdest = _mm_set1_ps(y*r01 + const1); + ydest = _mm_set1_ps(y*r11 + const2); + + unsigned int x = 0; + unsigned int *dst = (unsigned int *)pDstImage; + while (x < dstWidth) + { + __m128 xFraction, yFraction, one_minus_xFraction, one_minus_yFraction; + __m128 p0_f, p1_f, p2_f, p3_f; + __m128i p0, p1, p2, p3; // pixels in src + unsigned char *psrc; + + // read x into xpel + xmap = _mm_load_ps(&r00_x[x]); + xmap = _mm_add_ps(xmap, xdest); // xf = dst[x3, x2, x1, x0] + ymap = _mm_load_ps(&r10_x[x]); + ymap = _mm_add_ps(ymap, ydest); // ymap <- r10*x + ty + + mask.f = _mm_cmpge_ps(xmap, zero); + mask.f = _mm_and_ps(mask.f, _mm_cmplt_ps(xmap, srcbx)); + mask.f = _mm_and_ps(mask.f, _mm_cmpge_ps(ymap, zero)); + mask.f = _mm_and_ps(mask.f, _mm_cmplt_ps(ymap, srcby)); + int m = _mm_movemask_ps(mask.f); + if (m){ + // convert to integer with rounding towards zero + xint.i = _mm_cvttps_epi32(xmap); + yint.i = _mm_cvttps_epi32(ymap); + + //xFraction = xmap-xint; + //yFraction = ymap-yint; + xFraction = _mm_cvtepi32_ps(xint.i); + yFraction = _mm_cvtepi32_ps(yint.i); + xFraction = _mm_sub_ps(xmap, xFraction); + yFraction = _mm_sub_ps(ymap, yFraction); + + yint.i = _mm_mullo_epi32(yint.i, src_s); + yint.i = 
_mm_add_epi32(yint.i, xint.i); + //(1-xFraction) + //(1-yFraction) + one_minus_xFraction = _mm_sub_ps(oneFloat, xFraction); + one_minus_yFraction = _mm_sub_ps(oneFloat, yFraction); + yint.i = _mm_min_epi32(yint.i, srcb); + yint.i = _mm_max_epi32(yint.i, zeromask); + + // read pixels from src and re-arrange + psrc = pSrcImage + yint.s32[0]; + M128I(p0).m128i_u32[0] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[0] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + yint.s32[1]; + M128I(p0).m128i_u32[1] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[1] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + yint.s32[2]; + M128I(p0).m128i_u32[2] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[2] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + yint.s32[3]; + M128I(p0).m128i_u32[3] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[3] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + // get p0, p1, p2, p3 by masking and shifting + p1 = p0; + p0 = _mm_and_si128(p0, p0mask); + p1 = _mm_srli_epi32(p1, 8); + p3 = p2; + p2 = _mm_and_si128(p2, p0mask); + p3 = _mm_srli_epi32(p3, 8); + p1 = _mm_and_si128(p1, p0mask); + p3 = _mm_and_si128(p3, p0mask); + + p0_f = _mm_cvtepi32_ps(p0); + p1_f = _mm_cvtepi32_ps(p1); + p2_f = _mm_cvtepi32_ps(p2); + p3_f = _mm_cvtepi32_ps(p3); + + p0_f = _mm_mul_ps(p0_f, one_minus_xFraction); + p0_f = _mm_mul_ps(p0_f, one_minus_yFraction); + p1_f = _mm_mul_ps(p1_f, xFraction); + p1_f = _mm_mul_ps(p1_f, one_minus_yFraction); + p2_f = _mm_mul_ps(p2_f, one_minus_xFraction); + p2_f = _mm_mul_ps(p2_f, yFraction); + p3_f = _mm_mul_ps(p3_f, xFraction); + p3_f = _mm_mul_ps(p3_f, yFraction); + + p0_f = _mm_add_ps(p0_f, p1_f); + p2_f = _mm_add_ps(p2_f, p3_f); + p0_f = _mm_add_ps(p0_f, p2_f); + p0 = _mm_cvtps_epi32(p0_f); + // mask for boundary + p0 = _mm_and_si128(mask.i, p0); + p0 = _mm_or_si128(p0, _mm_andnot_si128(mask.i, pborder)); // combined result + + // convert to unsigned char and write to dst + p0 = _mm_packus_epi32(p0, zeromask); + p0 = _mm_packus_epi16(p0, zeromask); + *dst++ = M128I(p0).m128i_i32[0]; + } + else + { + *dst++ = u32_border; + } + x += 4; + } + y++; + pDstImage += dstImageStrideInBytes; + } + } + else{ + while (y < dstHeight) + { + // calculate (y*m[0][1] + m[0][2]) for x and y + xdest = _mm_set1_ps(y*r01 + const1); + ydest = _mm_set1_ps(y*r11 + const2); + + unsigned int x = 0; + unsigned int *dst = (unsigned int *)pDstImage; + while (x < dstWidth) + { + __m128 xFraction, yFraction, one_minus_xFraction, one_minus_yFraction; + __m128 p0_f, p1_f, p2_f, p3_f; + __m128i p0, p1, p2, p3; // pixels in src + unsigned char *psrc; + // read x into xpel + xmap = _mm_load_ps(&r00_x[x]); + xmap = _mm_add_ps(xmap, xdest); // xf = dst[x3, x2, x1, x0] + ymap = _mm_load_ps(&r10_x[x]); + ymap = _mm_add_ps(ymap, ydest); // ymap <- r10*x + ty + // convert to integer with rounding towards zero + xint.i = _mm_cvttps_epi32(xmap); + yint.i = _mm_cvttps_epi32(ymap); + + //xFraction = xmap-xint; + //yFraction = ymap-yint; + xFraction = _mm_cvtepi32_ps(xint.i); + yFraction = _mm_cvtepi32_ps(yint.i); + xFraction = _mm_sub_ps(xmap, xFraction); + yFraction = _mm_sub_ps(ymap, yFraction); + + //(1-xFraction) + //(1-yFraction) + one_minus_xFraction = _mm_sub_ps(oneFloat, xFraction); + one_minus_yFraction = _mm_sub_ps(oneFloat, yFraction); + + // read pixels from src and re-arrange + psrc = pSrcImage + (yint.s32[0] * srcImageStrideInBytes + xint.s32[0]); + M128I(p0).m128i_u32[0] = ((unsigned int 
*)psrc)[0]; + M128I(p2).m128i_u32[0] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + (yint.s32[1] * srcImageStrideInBytes + xint.s32[1]); + M128I(p0).m128i_u32[1] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[1] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + (yint.s32[2] * srcImageStrideInBytes + xint.s32[2]); + M128I(p0).m128i_u32[2] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[2] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + (yint.s32[3] * srcImageStrideInBytes + xint.s32[3]); + M128I(p0).m128i_u32[3] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[3] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + // get p0, p1, p2, p3 by masking and shifting + p1 = p0; + p0 = _mm_and_si128(p0, p0mask); + p1 = _mm_srli_epi32(p1, 8); + p3 = p2; + p2 = _mm_and_si128(p2, p0mask); + p3 = _mm_srli_epi32(p3, 8); + p1 = _mm_and_si128(p1, p0mask); + p3 = _mm_and_si128(p3, p0mask); + + p0_f = _mm_cvtepi32_ps(p0); + p1_f = _mm_cvtepi32_ps(p1); + p2_f = _mm_cvtepi32_ps(p2); + p3_f = _mm_cvtepi32_ps(p3); + + p0_f = _mm_mul_ps(p0_f, one_minus_xFraction); + p0_f = _mm_mul_ps(p0_f, one_minus_yFraction); + p1_f = _mm_mul_ps(p1_f, xFraction); + p1_f = _mm_mul_ps(p1_f, one_minus_yFraction); + p2_f = _mm_mul_ps(p2_f, one_minus_xFraction); + p2_f = _mm_mul_ps(p2_f, yFraction); + p3_f = _mm_mul_ps(p3_f, xFraction); + p3_f = _mm_mul_ps(p3_f, yFraction); + + p0_f = _mm_add_ps(p0_f, p1_f); + p2_f = _mm_add_ps(p2_f, p3_f); + p0_f = _mm_add_ps(p0_f, p2_f); + p0 = _mm_cvtps_epi32(p0_f); + + // convert to unsigned char and write to dst + p0 = _mm_packus_epi32(p0, zeromask); + p0 = _mm_packus_epi16(p0, zeromask); + *dst++ = M128I(p0).m128i_i32[0]; + x += 4; + } + y++; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +int HafCpu_WarpPerspective_U8_U8_Nearest +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_perspective_matrix_t * matrix, +vx_uint8 * pLocalData +) +{ + return HafCpu_WarpPerspective_U8_U8_Nearest_Constant(dstWidth, dstHeight, pDstImage, dstImageStrideInBytes, srcWidth, + srcHeight, pSrcImage, srcImageStrideInBytes, matrix, (unsigned char)0, pLocalData); +} + + +// This alogorithm implements Constant Denominator method described in +// " A Novel Architechture for real time sprite decoding". +// The idea is to do perpective warping along the lines of constant divisor.. 
+// The number of floating point divisions are reduced from O(Nsqared) to O(N) +/* + forward mapping: + x' = (ax+by+c)/(gx+hy+1) + y' = (dx+ey+f)/(gx+hy+1) + backward mapping: + x = ((hf-e)x'+(b-hc)y'+(ec-bf))/(eg-dh)x'+(ah-bg)y'+(db-ae)) + y = ((d-fg)x'+(cg-a)y'+(af-dc))/(eg-dh)x'+(ah-bg)y'+(db-ae)) + +*/ + + +int HafCpu_WarpPerspective_U8_U8_Nearest_Constant +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_perspective_matrix_t * matrix, +vx_uint8 border, +vx_uint8 * pLocalData +) +{ + // calculate inverse mapping coefficients for x and y + const float a = matrix->matrix[0][0]; + const float d = matrix->matrix[0][1]; + const float g = matrix->matrix[0][2]; + const float b = matrix->matrix[1][0]; + const float e = matrix->matrix[1][1]; + const float h = matrix->matrix[1][2]; + const float c = matrix->matrix[2][0]; + const float f = matrix->matrix[2][1]; + const float i = matrix->matrix[2][2]; + + // can't assume if end points in the warped image is within boundary, all the warped image is within boundary + bool bBoder = 1; + + XMM128 mask; + __m128 xdest, ydest, zdest; + const __m128i zeromask = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i pborder = _mm_set1_epi32((int)border); + const __m128i srcb = _mm_set1_epi32((srcHeight*srcImageStrideInBytes) - 1); + const __m128i src_s = _mm_set1_epi32(srcImageStrideInBytes); + const __m128 zero = _mm_set1_ps(0); + const __m128 srcbx = _mm_set1_ps((float)srcWidth); + const __m128 srcby = _mm_set1_ps((float)srcHeight); + const __m128 oneFloat = _mm_set1_ps(1.0); + + unsigned int x; + float *A_x = (float*)pLocalData; + float *D_x = (float *)ALIGN16(A_x + dstWidth); + float *G_x = (float *)ALIGN16(D_x + dstWidth); + for (x = 0; xmatrix[0][0]; + const float d = matrix->matrix[0][1]; + const float g = matrix->matrix[0][2]; + const float b = matrix->matrix[1][0]; + const float e = matrix->matrix[1][1]; + const float h = matrix->matrix[1][2]; + const float c = matrix->matrix[2][0]; + const float f = matrix->matrix[2][1]; + const float i = matrix->matrix[2][2]; + + XMM128 xint, yint, xmask, ymask; + __m128 xdest, ydest, zdest; + const __m128i zeromask = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i pborder = _mm_set1_epi32((int)border); + const __m128 zero = _mm_set1_ps(0); + const __m128 oneFloat = _mm_set1_ps(1.0); + const __m128i srcbx = _mm_set1_epi32((int)srcWidth); + const __m128i srcby = _mm_set1_epi32((int)srcHeight); + const __m128i p0mask = _mm_set1_epi32((int)0xFF); + const __m128i srcb = _mm_set1_epi32((srcHeight-1)*srcImageStrideInBytes - 1); + const __m128i src_s = _mm_set1_epi32(srcImageStrideInBytes); + const __m128i srcbx1 = _mm_set1_epi32((int)(srcWidth-1)); + const __m128i srcby1 = _mm_set1_epi32((int)(srcHeight - 1)); + const __m128i negone = _mm_set1_epi32((int)-1); + + unsigned int x; + float *A_x = (float*)pLocalData; + float *D_x = (float *)ALIGN16(A_x + dstWidth); + float *G_x = (float *)ALIGN16(D_x + dstWidth); + for (x = 0; x= srcWidth) | (y0 >= srcHeight); + bBoder |= (x1 < 0) | (y1 < 0) | (x1 >= srcWidth) | (y1 >= srcHeight); +#endif + bool bBoder = 1; + + unsigned int y = 0; + if (bBoder){ + // do the plain vanilla version with floating point division in inner_loop + while (y < dstHeight) + { + xdest = _mm_set1_ps(y*b + c); + ydest = _mm_set1_ps(y*e + f); + zdest = _mm_set1_ps(y*h + i); + x = 0; + unsigned 
int *dst = (unsigned int *)pDstImage; + while (x < dstWidth) + { + __m128 xmap, ymap, zmap; + __m128 xFraction, yFraction, one_minus_xFraction, one_minus_yFraction; + __m128 p0_f, p1_f, p2_f, p3_f; + __m128i p0 = _mm_set1_epi8(border), p1, p2 = _mm_set1_epi8(border), p3; + __m128i mask, mask1; // mask for boundary checking + unsigned char *psrc; + zmap = _mm_load_ps(&G_x[x]); + xmap = _mm_load_ps(&A_x[x]); + zmap = _mm_add_ps(zmap, zdest); + ymap = _mm_load_ps(&D_x[x]); + zmap = _mm_div_ps(oneFloat, zmap); + xmap = _mm_add_ps(xmap, xdest); + ymap = _mm_add_ps(ymap, ydest); + xmap = _mm_mul_ps(xmap, zmap); + ymap = _mm_mul_ps(ymap, zmap); + xmask.f = _mm_cmplt_ps(xmap, zero); + ymask.f = _mm_cmplt_ps(ymap, zero); + // convert to integer with rounding towards zero + xint.i = _mm_cvttps_epi32(xmap); + xint.i = _mm_sub_epi32(xint.i, _mm_srli_epi32(xmask.i, 31)); + yint.i = _mm_cvttps_epi32(ymap); + yint.i = _mm_sub_epi32(yint.i, _mm_srli_epi32(ymask.i, 31)); + mask = _mm_cmplt_epi32(xint.i, srcbx); + mask = _mm_andnot_si128(_mm_cmplt_epi32(xint.i, zeromask), mask); + mask = _mm_and_si128(mask, _mm_cmplt_epi32(yint.i, srcby)); + mask = _mm_andnot_si128(_mm_cmplt_epi32(yint.i, zeromask), mask); + mask1 = _mm_cmplt_epi32(xint.i, srcbx1); // xmap+1 < srcWidth; + mask1 = _mm_andnot_si128(_mm_cmplt_epi32(xint.i, negone), mask1); + mask1 = _mm_and_si128(mask1, _mm_cmplt_epi32(yint.i, srcby1)); + mask1 = _mm_andnot_si128(_mm_cmplt_epi32(yint.i, negone), mask1); + + //xFraction = xmap-xint; + //yFraction = ymap-yint; + xFraction = _mm_cvtepi32_ps(xint.i); + yFraction = _mm_cvtepi32_ps(yint.i); + xFraction = _mm_sub_ps(xmap, xFraction); + yFraction = _mm_sub_ps(ymap, yFraction); + + // clip for boundary + yint.i = _mm_mullo_epi32(yint.i, src_s); + yint.i = _mm_add_epi32(yint.i, xint.i); + //(1-xFraction) + //(1-yFraction) + one_minus_xFraction = _mm_sub_ps(oneFloat, xFraction); + one_minus_yFraction = _mm_sub_ps(oneFloat, yFraction); + yint.i = _mm_min_epi32(yint.i, srcb); + yint.i = _mm_max_epi32(yint.i, zeromask); + + // read pixels from src and re-arrange + psrc = pSrcImage + yint.s32[0]; + M128I(p0).m128i_u32[0] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[0] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + yint.s32[1]; + M128I(p0).m128i_u32[1] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[1] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + yint.s32[2]; + M128I(p0).m128i_u32[2] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[2] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + yint.s32[3]; + M128I(p0).m128i_u32[3] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[3] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + // get p0, p1, p2, p3 by masking and shifting + p1 = p0; + p0 = _mm_and_si128(p0, p0mask); + p1 = _mm_srli_epi32(p1, 8); + p3 = p2; + p2 = _mm_and_si128(p2, p0mask); + p3 = _mm_srli_epi32(p3, 8); + p1 = _mm_and_si128(p1, p0mask); + p3 = _mm_and_si128(p3, p0mask); + // mask p0, p2 with border + p0 = _mm_and_si128(p0, mask); + p0 = _mm_or_si128(p0, _mm_andnot_si128(mask, pborder)); // combined result + p2 = _mm_and_si128(p2, mask); + p2 = _mm_or_si128(p2, _mm_andnot_si128(mask, pborder)); // combined result + // mask p1 and p3 with border + p1 = _mm_and_si128(p1, mask1); + p1 = _mm_or_si128(p1, _mm_andnot_si128(mask1, pborder)); // combined result + p3 = _mm_and_si128(p3, mask1); + p3 = _mm_or_si128(p3, _mm_andnot_si128(mask1, pborder)); // combined result + + p0_f = 
_mm_cvtepi32_ps(p0); + p1_f = _mm_cvtepi32_ps(p1); + p2_f = _mm_cvtepi32_ps(p2); + p3_f = _mm_cvtepi32_ps(p3); + + p0_f = _mm_mul_ps(p0_f, one_minus_xFraction); + p0_f = _mm_mul_ps(p0_f, one_minus_yFraction); + p1_f = _mm_mul_ps(p1_f, xFraction); + p1_f = _mm_mul_ps(p1_f, one_minus_yFraction); + p2_f = _mm_mul_ps(p2_f, one_minus_xFraction); + p2_f = _mm_mul_ps(p2_f, yFraction); + p3_f = _mm_mul_ps(p3_f, xFraction); + p3_f = _mm_mul_ps(p3_f, yFraction); + + p0_f = _mm_add_ps(p0_f, p1_f); + p0_f = _mm_add_ps(p0_f, p2_f); + p0_f = _mm_add_ps(p0_f, p3_f); + p0 = _mm_cvttps_epi32(p0_f); + + // convert to unsigned char and write to dst + p0 = _mm_packus_epi32(p0, zeromask); + p0 = _mm_packus_epi16(p0, zeromask); + *dst++ = M128I(p0).m128i_i32[0]; + x += 4; + } + y++; + pDstImage += dstImageStrideInBytes; + } + }else{ + // do the plain vanilla version with floating point division in inner_loop + while (y < dstHeight) + { + xdest = _mm_set1_ps(y*b + c); + ydest = _mm_set1_ps(y*e + f); + zdest = _mm_set1_ps(y*h + i); + x = 0; + unsigned int *dst = (unsigned int *)pDstImage; + while (x < dstWidth) + { + __m128 xmap, ymap, zmap; + __m128 xFraction, yFraction, one_minus_xFraction, one_minus_yFraction; + __m128 p0_f, p1_f, p2_f, p3_f; + __m128i p0, p1, p2, p3; // pixels in src + unsigned char *psrc; + + zmap = _mm_load_ps(&G_x[x]); + xmap = _mm_load_ps(&A_x[x]); + zmap = _mm_add_ps(zmap, zdest); + ymap = _mm_load_ps(&D_x[x]); + zmap = _mm_div_ps(oneFloat, zmap); + xmap = _mm_add_ps(xmap, xdest); + ymap = _mm_add_ps(ymap, ydest); + xmap = _mm_mul_ps(xmap, zmap); + ymap = _mm_mul_ps(ymap, zmap); + + // convert to integer with rounding towards zero + xint.i = _mm_cvttps_epi32(xmap); + yint.i = _mm_cvttps_epi32(ymap); + + //xFraction = xmap-xint; + //yFraction = ymap-yint; + xFraction = _mm_cvtepi32_ps(xint.i); + yFraction = _mm_cvtepi32_ps(yint.i); + xFraction = _mm_sub_ps(xmap, xFraction); + yFraction = _mm_sub_ps(ymap, yFraction); + + //(1-xFraction) + //(1-yFraction) + one_minus_xFraction = _mm_sub_ps(oneFloat, xFraction); + one_minus_yFraction = _mm_sub_ps(oneFloat, yFraction); + + // read pixels from src and re-arrange + psrc = pSrcImage + (yint.s32[0] * srcImageStrideInBytes + xint.s32[0]); + M128I(p0).m128i_u32[0] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[0] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + (yint.s32[1] * srcImageStrideInBytes + xint.s32[1]); + M128I(p0).m128i_u32[1] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[1] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + (yint.s32[2] * srcImageStrideInBytes + xint.s32[2]); + M128I(p0).m128i_u32[2] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[2] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + psrc = pSrcImage + (yint.s32[3] * srcImageStrideInBytes + xint.s32[3]); + M128I(p0).m128i_u32[3] = ((unsigned int *)psrc)[0]; + M128I(p2).m128i_u32[3] = ((unsigned int *)(psrc + srcImageStrideInBytes))[0]; + + // get p0, p1, p2, p3 by masking and shifting + p1 = p0; + p0 = _mm_and_si128(p0, p0mask); + p1 = _mm_srli_epi32(p1, 8); + p3 = p2; + p2 = _mm_and_si128(p2, p0mask); + p3 = _mm_srli_epi32(p3, 8); + p1 = _mm_and_si128(p1, p0mask); + p3 = _mm_and_si128(p3, p0mask); + + p0_f = _mm_cvtepi32_ps(p0); + p1_f = _mm_cvtepi32_ps(p1); + p2_f = _mm_cvtepi32_ps(p2); + p3_f = _mm_cvtepi32_ps(p3); + + p0_f = _mm_mul_ps(p0_f, one_minus_xFraction); + p0_f = _mm_mul_ps(p0_f, one_minus_yFraction); + p1_f = _mm_mul_ps(p1_f, xFraction); + p1_f = _mm_mul_ps(p1_f, 
one_minus_yFraction); + p2_f = _mm_mul_ps(p2_f, one_minus_xFraction); + p2_f = _mm_mul_ps(p2_f, yFraction); + p3_f = _mm_mul_ps(p3_f, xFraction); + p3_f = _mm_mul_ps(p3_f, yFraction); + + p0_f = _mm_add_ps(p0_f, p1_f); + p2_f = _mm_add_ps(p2_f, p3_f); + p0_f = _mm_add_ps(p0_f, p2_f); + p0 = _mm_cvttps_epi32(p0_f); + + // convert to unsigned char and write to dst + p0 = _mm_packus_epi32(p0, zeromask); + p0 = _mm_packus_epi16(p0, zeromask); + *dst++ = M128I(p0).m128i_i32[0]; + x += 4; + } + y++; + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + + +int HafCpu_ScaleImage_U8_U8_Nearest +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_scale_matrix_t * matrix +) +{ + int xinc, yinc, ypos, xpos, yoffs, xoffs;// , newDstHeight, newDstWidth; + + // precompute Xmap and Ymap based on scale factors + unsigned short *Xmap = (unsigned short *)((vx_uint8*)matrix + sizeof(AgoConfigScaleMatrix)); + unsigned short *Ymap = Xmap + ((dstWidth+15)&~15); + unsigned int x, y; + + yinc = (int)(FP_MUL * matrix->yscale); // to convert to fixed point + xinc = (int)(FP_MUL * matrix->xscale); + yoffs = (int)(FP_MUL * matrix->yoffset); // to convert to fixed point + xoffs = (int)(FP_MUL * matrix->xoffset); + // generate ymap; + for (y = 0, ypos = yoffs; y < (int)dstHeight; y++, ypos += yinc) + { + int ymap; + ymap = (ypos >> FP_BITS); + if (ymap > (int)(srcHeight - 1)){ + ymap = srcHeight - 1; + } + if (ymap < 0) ymap = 0; + Ymap[y] = (unsigned short)ymap; + + } + // generate xmap; + for (x = 0, xpos = xoffs; x < (int)dstWidth; x++, xpos += xinc) + { + int xmap; + xmap = (xpos >> FP_BITS); + if (xmap > (int)(srcWidth - 1)){ + xmap = (srcWidth - 1); + } + if (xmap < 0) xmap = 0; + Xmap[x] = (unsigned short)xmap; + } + // now do the scaling + __m128i zeromask = _mm_set1_epi32((int)0); + if (dstWidth >= 16){ + for (y = 0; y < dstHeight; y++) + { + unsigned int yadd = Ymap[y] * srcImageStrideInBytes; + __m128i syint = _mm_set1_epi32(yadd); + unsigned int *pdst = (unsigned int *)pDstImage; + for (x = 0; x <= (dstWidth - 16); x += 16) + { + __m128i mapx0, mapx1, mapx2, mapx3; + mapx0 = _mm_load_si128((__m128i *)&Xmap[x]); + mapx1 = _mm_load_si128((__m128i *)&Xmap[x + 8]); + mapx2 = _mm_unpackhi_epi16(mapx0, zeromask); + mapx0 = _mm_cvtepi16_epi32(mapx0); + mapx3 = _mm_unpackhi_epi16(mapx1, zeromask); + mapx1 = _mm_cvtepi16_epi32(mapx1); + mapx0 = _mm_add_epi32(mapx0, syint); + mapx2 = _mm_add_epi32(mapx2, syint); + mapx1 = _mm_add_epi32(mapx1, syint); + mapx3 = _mm_add_epi32(mapx3, syint); + // copy to dst + *pdst++ = pSrcImage[M128I(mapx0).m128i_i32[0]] | (pSrcImage[M128I(mapx0).m128i_i32[1]] << 8) | + (pSrcImage[M128I(mapx0).m128i_i32[2]] << 16) | (pSrcImage[M128I(mapx0).m128i_i32[3]] << 24); + + *pdst++ = pSrcImage[M128I(mapx2).m128i_i32[0]] | (pSrcImage[M128I(mapx2).m128i_i32[1]] << 8) | + (pSrcImage[M128I(mapx2).m128i_i32[2]] << 16) | (pSrcImage[M128I(mapx2).m128i_i32[3]] << 24); + + *pdst++ = pSrcImage[M128I(mapx1).m128i_i32[0]] | (pSrcImage[M128I(mapx1).m128i_i32[1]] << 8) | + (pSrcImage[M128I(mapx1).m128i_i32[2]] << 16) | (pSrcImage[M128I(mapx1).m128i_i32[3]] << 24); + + *pdst++ = pSrcImage[M128I(mapx3).m128i_i32[0]] | (pSrcImage[M128I(mapx3).m128i_i32[1]] << 8) | + (pSrcImage[M128I(mapx3).m128i_i32[2]] << 16) | (pSrcImage[M128I(mapx3).m128i_i32[3]] << 24); + + } + while (x < dstWidth) + pDstImage[x] = pSrcImage[Xmap[x++] + yadd]; + + 
pDstImage += dstImageStrideInBytes; + + } + } + else + { + for (y = 0; y < dstHeight; y++) + { + unsigned int yadd = Ymap[y] * srcImageStrideInBytes; + x = 0; + while (x < dstWidth) + pDstImage[x] = pSrcImage[Xmap[x++] + yadd]; + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + +int HafCpu_ScaleImage_U8_U8_Bilinear +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_scale_matrix_t * matrix +) +{ + int xinc, yinc,xoffs, yoffs; + + unsigned char *pdst = pDstImage; + yinc = (int)(FP_MUL * matrix->yscale); // to convert to fixed point + xinc = (int)(FP_MUL * matrix->xscale); + yoffs = (int)(FP_MUL * matrix->yoffset); // to convert to fixed point + xoffs = (int)(FP_MUL * matrix->xoffset); + + // SSE4 version + int alignW = (dstWidth + 15)&~15; + unsigned short *Xmap = (unsigned short *)((vx_uint8*)matrix + sizeof(AgoConfigScaleMatrix)); + unsigned short *Xfrac = Xmap + alignW; + unsigned short *One_min_xf = Xfrac + alignW; + + int xpos = xoffs; + for (unsigned int x = 0; x < dstWidth; x++, xpos += xinc) + { + int xf; + int xmap = (xpos >> FP_BITS); + if (xmap >= (int)(srcWidth - 1)){ + Xmap[x] = (unsigned short)(srcWidth - 1); + } + Xmap[x] = (xmap<0)? 0: (unsigned short)xmap; + xf = ((xpos & 0x3ffff)+0x200)>>10; + Xfrac[x] = xf; + One_min_xf[x] = (0x100 - xf); + } + + XMM128 pp1 = { 0 }, pp2 = { 0 }; + const __m128i mask = _mm_set1_epi16((short)0xff); + const __m128i round = _mm_set1_epi16((short)0x80); + unsigned int newDstWidth = dstWidth & ~7; // nearest multiple of 8 + + for (int y = 0, ypos = yoffs; y < (int)dstHeight; y++, ypos += yinc) + { + int ym, yf, one_min_yf; + __m128i rxmm0, rxmm7; + vx_uint8 *pSrc1, *pSrc2; + + ym = (ypos >> FP_BITS); + yf = ((ypos & 0x3ffff)+0x200)>>10; + one_min_yf = (0x100 - yf); + yoffs = ym*srcImageStrideInBytes; + if (ym < 0){ + ym = yoffs = 0; + pSrc1 = pSrc2 = pSrcImage; + } + else if (ym >= (int)(srcHeight - 1)){ + ym = srcHeight - 1; + pSrc1 = pSrc2 = pSrcImage + ym*srcImageStrideInBytes; + } + else + { + pSrc1 = pSrcImage + ym*srcImageStrideInBytes; + pSrc2 = pSrc1 + srcImageStrideInBytes; + } + rxmm0 = _mm_set1_epi16((unsigned short)one_min_yf); + rxmm7 = _mm_set1_epi16((unsigned short)yf); + unsigned int x = 0; + for (; x < newDstWidth; x += 8) + { + __m128i mapxy, rxmm1, rxmm2, rxmm3, rxmm4; + mapxy = _mm_load_si128((__m128i *)&Xmap[x]); // mapped table [srcx7...src_x3,src_x2,src_x1,src_x0] + // load pixels for mapxy + for (int xx = 0; xx < 8; xx++) + { + pp1.u16[xx] = ((unsigned short*)&pSrc1[M128I(mapxy).m128i_i16[xx]])[0]; + pp2.u16[xx] = ((unsigned short*)&pSrc2[M128I(mapxy).m128i_i16[xx]])[0]; + } + // unpack src for p1 and p2 + rxmm1 = _mm_and_si128(pp1.i, mask); // p1 + pp1.i = _mm_srli_epi16(pp1.i, 8); // p2 + // unpack pp2 for p3 and p4 + rxmm4 = _mm_and_si128(pp2.i, mask); // p3 + pp2.i = _mm_srli_epi16(pp2.i, 8); // p4 + + // load xf and 1-xf + rxmm2 = _mm_load_si128((__m128i *)&Xfrac[x]); // xf + rxmm3 = _mm_load_si128((__m128i *)&One_min_xf[x]); // 1-xf + + // t1 = (unsigned char)((ione_minus_x *p1 + ifraction_x *p2) >> FW_WEIGHT); + rxmm1 = _mm_mullo_epi16(rxmm1, rxmm3); // ione_minus_xf *p1 + pp1.i = _mm_mullo_epi16(pp1.i, rxmm2); // ifraction_x *p2 + rxmm1 = _mm_add_epi16(rxmm1, pp1.i); + rxmm1 = _mm_add_epi16(rxmm1, round); + rxmm1 = _mm_srli_epi16(rxmm1, 8); + + // t2 = (unsigned char)((ione_minus_x *p3 + ifraction_x *p4) >> FW_WEIGHT); + rxmm4 = 
_mm_mullo_epi16(rxmm4, rxmm3); // ione_minus_x *p3 + pp2.i = _mm_mullo_epi16(pp2.i, rxmm2); // ifraction_x *p4 + rxmm4 = _mm_add_epi16(rxmm4, pp2.i); + rxmm4 = _mm_add_epi16(rxmm4, round); + rxmm4 = _mm_srli_epi16(rxmm4, 8); + + + // *(pDst + x + y*dstStep) = (unsigned char)((ione_minus_y *t1 + ifraction_y * t2) >> FW_WEIGHT) + rxmm1 = _mm_mullo_epi16(rxmm1, rxmm0); // ione_minus_y * t1 + rxmm4 = _mm_mullo_epi16(rxmm4, rxmm7); // ifraction_y * t2 + rxmm1 = _mm_add_epi16(rxmm1, rxmm4); + rxmm1 = _mm_add_epi16(rxmm1, round); + rxmm1 = _mm_srli_epi16(rxmm1, 8); + rxmm1 = _mm_packus_epi16(rxmm1, rxmm1); + + _mm_storel_epi64((__m128i *)(pDstImage + x), rxmm1); + } + for (x = newDstWidth; x < dstWidth; x++) { + const unsigned char *p0 = pSrc1 + Xmap[x]; + const unsigned char *p1 = pSrc2 + Xmap[x]; + pDstImage[x] = ((One_min_xf[x] * one_min_yf*p0[0]) + (Xfrac[x] * one_min_yf*p0[1]) + (One_min_xf[x] * yf*p1[0]) + (Xfrac[x] * yf*p1[1]) + 0x8000) >> 16; + } + + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + + +int HafCpu_ScaleImage_U8_U8_Bilinear_Replicate +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_scale_matrix_t * matrix +) +{ + + // SSE4 version + int xinc, yinc, xoffs, yoffs; + + unsigned char *pdst = pDstImage; + yinc = (int)(FP_MUL * matrix->yscale); // to convert to fixed point + xinc = (int)(FP_MUL * matrix->xscale); + yoffs = (int)(FP_MUL * matrix->yoffset); // to convert to fixed point + xoffs = (int)(FP_MUL * matrix->xoffset); + int alignW = (dstWidth + 15)&~15; + unsigned short *Xmap = (unsigned short *)((vx_uint8*)matrix + sizeof(AgoConfigScaleMatrix)); + unsigned short *Xfrac = Xmap + alignW; + unsigned short *One_min_xf = Xfrac + alignW; + + int xpos = xoffs; + vx_uint32 newDstWidth = 0; + for (unsigned int x = 0; x < dstWidth; x++, xpos += xinc) + { + int xf; + int xmap = (xpos >> FP_BITS); + if (xmap >= (int)(srcWidth - 1)){ + if (!newDstWidth) newDstWidth = x - 1; + Xmap[x] = (unsigned short)(srcWidth - 1); + } + else { + Xmap[x] = (xmap < 0) ? 
0 : (unsigned short)xmap; + } + xf = ((xpos & 0x3ffff)+0x200)>>10; + Xfrac[x] = xf; + One_min_xf[x] = (0x100 - xf); + } + if (dstWidth & 7) + { + newDstWidth &= ~7; // nearest multiple of 8 + } + + XMM128 pp1 = { 0 }, pp2 = { 0 }; + const __m128i mask = _mm_set1_epi16((short)0xff); + const __m128i round = _mm_set1_epi16((short)0x80); + for (int y = 0, ypos = yoffs; y < (int)dstHeight; y++, ypos += yinc) + { + int ym, yf, one_min_yf; + __m128i rxmm0, rxmm7; + unsigned int yoffs; + vx_uint8 *pSrc1, *pSrc2; + + ym = (ypos >> FP_BITS); + yf = ((ypos & 0x3ffff)+0x200)>>10; + one_min_yf = (0x100 - yf); + yoffs = ym*srcImageStrideInBytes; + if (ym < 0){ + ym = yoffs = 0; + pSrc1 = pSrc2 = pSrcImage; + } + else if (ym >= (int)(srcHeight - 1)){ + ym = srcHeight - 1; + pSrc1 = pSrc2 = pSrcImage + ym*srcImageStrideInBytes; + } + else + { + pSrc1 = pSrcImage + ym*srcImageStrideInBytes; + pSrc2 = pSrc1 + srcImageStrideInBytes; + } + rxmm0 = _mm_set1_epi16((unsigned short)one_min_yf); + rxmm7 = _mm_set1_epi16((unsigned short)yf); + unsigned int x = 0; + for (; x < newDstWidth; x += 8) + { + __m128i mapxy, rxmm1, rxmm2, rxmm3, rxmm4; + mapxy = _mm_load_si128((__m128i *)&Xmap[x]); // mapped table [srcx7...src_x3,src_x2,src_x1,src_x0] + + // load pixels for mapxy + for (int xx = 0; xx < 8; xx++) + { + pp1.u16[xx] = ((unsigned short*)&pSrc1[M128I(mapxy).m128i_i16[xx]])[0]; + pp2.u16[xx] = ((unsigned short*)&pSrc2[M128I(mapxy).m128i_i16[xx]])[0]; + } + // unpack src for p1 and p2 + rxmm1 = _mm_and_si128(pp1.i, mask); // p1 + pp1.i = _mm_srli_epi16(pp1.i, 8); // p2 + // unpack pp2 for p3 and p4 + rxmm4 = _mm_and_si128(pp2.i, mask); // p3 + pp2.i = _mm_srli_epi16(pp2.i, 8); // p4 + + // load xf and 1-xf + rxmm2 = _mm_load_si128((__m128i *)&Xfrac[x]); // xf + rxmm3 = _mm_load_si128((__m128i *)&One_min_xf[x]); // 1-xf + + // t1 = (unsigned char)((ione_minus_x *p1 + ifraction_x *p2) >> FW_WEIGHT); + rxmm1 = _mm_mullo_epi16(rxmm1, rxmm3); // ione_minus_xf *p1 + pp1.i = _mm_mullo_epi16(pp1.i, rxmm2); // ifraction_x *p2 + rxmm1 = _mm_add_epi16(rxmm1, pp1.i); + rxmm1 = _mm_add_epi16(rxmm1, round); + rxmm1 = _mm_srli_epi16(rxmm1, 8); + + // t2 = (unsigned char)((ione_minus_x *p3 + ifraction_x *p4) >> FW_WEIGHT); + rxmm4 = _mm_mullo_epi16(rxmm4, rxmm3); // ione_minus_x *p3 + pp2.i = _mm_mullo_epi16(pp2.i, rxmm2); // ifraction_x *p4 + rxmm4 = _mm_add_epi16(rxmm4, pp2.i); + rxmm4 = _mm_add_epi16(rxmm4, round); + rxmm4 = _mm_srli_epi16(rxmm4, 8); + + + // *(pDst + x + y*dstStep) = (unsigned char)((ione_minus_y *t1 + ifraction_y * t2) >> FW_WEIGHT) + rxmm1 = _mm_mullo_epi16(rxmm1, rxmm0); // ione_minus_y * t1 + rxmm4 = _mm_mullo_epi16(rxmm4, rxmm7); // ifraction_y * t2 + rxmm1 = _mm_add_epi16(rxmm1, rxmm4); + rxmm1 = _mm_add_epi16(rxmm1, round); + rxmm1 = _mm_srli_epi16(rxmm1, 8); + rxmm1 = _mm_packus_epi16(rxmm1, rxmm1); + + _mm_storel_epi64((__m128i *)(pDstImage + x), rxmm1); + } + // todo: if (upscale; recompute x=0, x=dwidth-1) + if (matrix->xscale < 1){ + unsigned int p0, p1, p2, p3; + p0 = p1 = pSrc1[0]; + p2 = p3 = pSrc2[0]; + pDstImage[0] = ((One_min_xf[0] * one_min_yf*p0) + (Xfrac[0] * one_min_yf*p1) + (One_min_xf[0] * yf*p2) + (Xfrac[0]*yf*p3) + 0x8000) >> 16; + } + x = newDstWidth; + while (x < dstWidth){ + unsigned int p0, p1, p2, p3; + const unsigned char *p = pSrc1 + Xmap[x]; + p0 = p[0]; + p1 = (Xmap[x] < (srcWidth - 1)) ? p[1] : p0; + p = pSrc2 + Xmap[x]; + p2 = p[0]; + p3 = (Xmap[x] < (srcWidth - 1)) ? 
p[1]: p2; + pDstImage[x] = ((One_min_xf[x] * one_min_yf*p0) + (Xfrac[x] * one_min_yf*p1) + (One_min_xf[x] * yf*p2) + (Xfrac[x] * yf*p3) + 0x8000) >> 16; + x++; + } + + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +int HafCpu_ScaleImage_U8_U8_Bilinear_Constant +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_scale_matrix_t * matrix, +vx_uint8 border +) +{ + int xinc, yinc, xoffs, yoffs; + + unsigned int sline = srcImageStrideInBytes; + unsigned char *pdst = pDstImage; + unsigned char *pSrcLast = pSrcImage + (srcImageStrideInBytes*(srcWidth - 1)); + yinc = (int)(FP_MUL * matrix->yscale); // to convert to fixed point + xinc = (int)(FP_MUL * matrix->xscale); + yoffs = (int)(FP_MUL * matrix->yoffset); // to convert to fixed point + xoffs = (int)(FP_MUL * matrix->xoffset); + int alignW = (dstWidth + 15)&~15; + unsigned short *Xmap = (unsigned short *)((vx_uint8*)matrix + sizeof(AgoConfigScaleMatrix)); + unsigned short *Xfrac = Xmap + alignW; + unsigned short *One_min_xf = Xfrac + alignW; + vx_uint8 *pSrcBorder = (vx_uint8 *)(One_min_xf + alignW); + memset(pSrcBorder, border, srcWidth); + + int xpos = xoffs; + vx_uint32 newDstWidth = 0; + for (unsigned int x = 0; x < dstWidth; x++, xpos += xinc) + { + int xf; + int xmap = (xpos >> FP_BITS); + if (xmap >= (int)(srcWidth - 1)){ + if (!newDstWidth) newDstWidth = x - 1; + Xmap[x] = (unsigned short)(srcWidth - 1); + } + else { + Xmap[x] = (xmap < 0) ? 0 : (unsigned short)xmap; + } + xf = ((xpos & 0x3ffff)+0x200)>>10; + Xfrac[x] = xf; + One_min_xf[x] = (0x100 - xf); + } + if (dstWidth & 7) + { + newDstWidth &= ~7; // nearest multiple of 8 + } + + XMM128 pp1 = { 0 }, pp2 = { 0 }; + const __m128i mask = _mm_set1_epi16((short)0xff); + const __m128i round = _mm_set1_epi16((short)0x80); + for (int y = 0, ypos = yoffs; y < (int)dstHeight; y++, ypos += yinc) + { + int ym, yf, one_min_yf; + unsigned int yoffs; + vx_uint8 *pSrc1, *pSrc2; + + ym = (ypos >> FP_BITS); + yf = ((ypos & 0x3ffff)+0x200)>>10; + one_min_yf = (0x100 - yf); + if (ym < 0){ + ym = yoffs = 0; + pSrc1 = pSrcBorder; + pSrc2 = pSrcImage; + } + else if (ym >= (int)(srcHeight - 1)){ + ym = srcHeight - 1; + pSrc1 = pSrcImage + ym*srcImageStrideInBytes; + pSrc2 = pSrcBorder; + yoffs = ym*srcImageStrideInBytes; + } + else + { + pSrc1 = pSrcImage + ym*srcImageStrideInBytes; + pSrc2 = pSrc1 + srcImageStrideInBytes; + yoffs = ym*srcImageStrideInBytes; + } + + __m128i rxmm0, rxmm7; + rxmm0 = _mm_set1_epi16((unsigned short)one_min_yf); + rxmm7 = _mm_set1_epi16((unsigned short)yf); + unsigned int x = 0; + for (; x < newDstWidth; x += 8) + { + __m128i mapxy, rxmm1, rxmm2, rxmm3, rxmm4; + mapxy = _mm_load_si128((__m128i *)&Xmap[x]); // mapped table [srcx7...src_x3,src_x2,src_x1,src_x0] + // load pixels for mapxy + for (int xx = 0; xx < 8; xx++) + { + pp1.u16[xx] = ((unsigned short*)&pSrc1[M128I(mapxy).m128i_i16[xx]])[0]; + pp2.u16[xx] = ((unsigned short*)&pSrc2[M128I(mapxy).m128i_i16[xx]])[0]; + } + // unpack src for p1 and p2 + rxmm1 = _mm_and_si128(pp1.i, mask); // p1 + pp1.i = _mm_srli_epi16(pp1.i, 8); // p2 + // unpack pp2 for p3 and p4 + rxmm4 = _mm_and_si128(pp2.i, mask); // p3 + pp2.i = _mm_srli_epi16(pp2.i, 8); // p4 + + // load xf and 1-xf + rxmm2 = _mm_load_si128((__m128i *)&Xfrac[x]); // xf + rxmm3 = _mm_load_si128((__m128i *)&One_min_xf[x]); // 1-xf + + // t1 = (unsigned char)((ione_minus_x *p1 + ifraction_x *p2) >> 
FW_WEIGHT); + rxmm1 = _mm_mullo_epi16(rxmm1, rxmm3); // ione_minus_xf *p1 + pp1.i = _mm_mullo_epi16(pp1.i, rxmm2); // ifraction_x *p2 + rxmm1 = _mm_add_epi16(rxmm1, pp1.i); + rxmm1 = _mm_add_epi16(rxmm1, round); + rxmm1 = _mm_srli_epi16(rxmm1, 8); + + // t2 = (unsigned char)((ione_minus_x *p3 + ifraction_x *p4) >> FW_WEIGHT); + rxmm4 = _mm_mullo_epi16(rxmm4, rxmm3); // ione_minus_x *p3 + pp2.i = _mm_mullo_epi16(pp2.i, rxmm2); // ifraction_x *p4 + rxmm4 = _mm_add_epi16(rxmm4, pp2.i); + rxmm4 = _mm_add_epi16(rxmm4, round); + rxmm4 = _mm_srli_epi16(rxmm4, 8); + + + // *(pDst + x + y*dstStep) = (unsigned char)((ione_minus_y *t1 + ifraction_y * t2) >> FW_WEIGHT) + rxmm1 = _mm_mullo_epi16(rxmm1, rxmm0); // ione_minus_y * t1 + rxmm4 = _mm_mullo_epi16(rxmm4, rxmm7); // ifraction_y * t2 + rxmm1 = _mm_add_epi16(rxmm1, rxmm4); + rxmm1 = _mm_add_epi16(rxmm1, round); + rxmm1 = _mm_srli_epi16(rxmm1, 8); + rxmm1 = _mm_packus_epi16(rxmm1, rxmm1); + + _mm_storel_epi64((__m128i *)(pDstImage + x), rxmm1); + } + // todo: if (upscale; recompute x=0, x=dwidth-1) + if (matrix->xscale < 1){ + unsigned int p0, p1, p2, p3; + p0 = border; + p1 = (ypos >> 8) < 0 ? border : pSrc1[0]; + p2 = border; + p3 = pSrc2[0]; + pDstImage[0] = ((One_min_xf[0] * one_min_yf*p0) + (Xfrac[0] * one_min_yf*p1) + (One_min_xf[0] * yf*p2) + (Xfrac[0] * yf*p3) + 0x8000) >> 16; + } + x = newDstWidth ; + while (x < dstWidth){ + unsigned int p0, p1, p2, p3; + const unsigned char *p = pSrc1 + Xmap[x]; + p0 = p[0]; + p1 = (Xmap[x] < (srcWidth - 1)) ? p[1] : border; + p = pSrc2 + Xmap[x]; + p2 = p[0]; + p3 = (Xmap[x] < (srcWidth - 1)) ? p[1] : border; + pDstImage[x] = ((One_min_xf[x] * one_min_yf*p0) + (Xfrac[x] * one_min_yf*p1) + (One_min_xf[x] * yf*p2) + (Xfrac[x] * yf*p3) + 0x8000) >> 16; + x++; + } + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + + +// upsample 2x2 (used for 4:2:0 to 4:4:4 conversion) +int HafCpu_ScaleUp2x2_U8_U8 +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes +) +{ + + __m128i pixels1, pixels2; + + unsigned char *pchDst = (unsigned char*)pDstImage; + unsigned char *pchDstlast = (unsigned char*)pDstImage + dstHeight*dstImageStrideInBytes; + while (pchDst < pchDstlast) + { + __m128i * src = (__m128i*)pSrcImage; + __m128i * dst = (__m128i*)pchDst; + __m128i * dstNext = (__m128i*)(pchDst + dstImageStrideInBytes); + __m128i * dstlast = dst + (dstWidth >> 4); + while (dst < dstlast) + { + pixels1 = _mm_loadu_si128(src++); // src (0-15) + pixels2 = _mm_unpacklo_epi8(pixels1, pixels1); // dst (0-15) + pixels1 = _mm_unpackhi_epi8(pixels1, pixels1); // dst (16-31) + _mm_store_si128(dst++, pixels2); + _mm_store_si128(dst++, pixels1); + _mm_store_si128(dstNext++, pixels2); + _mm_store_si128(dstNext++, pixels1); + } + pchDst += (dstImageStrideInBytes * 2); + pSrcImage += srcImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +int HafCpu_ScaleImage_U8_U8_Area +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_scale_matrix_t * matrix +) +{ + if (matrix->xscale == 1.0f && matrix->yscale == 1.0f) + { + vx_uint8 *pSrcB = pSrcImage + (dstHeight - 1)*srcImageStrideInBytes; + // no scaling. 
Just do a copy from src to dst + for (unsigned int y = 0; y < dstHeight; y++) + { + vx_uint8 *pSrc = pSrcImage + (int)(matrix->yoffset+y)*srcImageStrideInBytes + (int)matrix->xoffset; + // clamp to boundary + if (pSrc < pSrcImage) pSrc = pSrcImage; + if (pSrc > pSrcB) pSrc = pSrcB; + memcpy(pDstImage, pSrc, dstWidth); + pDstImage += dstImageStrideInBytes; + } + } + else if (matrix->xscale == 2.0f && matrix->yscale == 2.0f) + { + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi16(2); + __m128i masklow = _mm_set1_epi16(0x00ff); + vx_uint8 *pSrcB = pSrcImage + (srcHeight - 2)*srcImageStrideInBytes; + // 2x2 image scaling + for (unsigned int y = 0; y < dstHeight; y++) + { + vx_uint8 *S0 = pSrcImage + (int)(matrix->yoffset+(y*2))*srcImageStrideInBytes + (int)(matrix->xoffset); + if (S0 < pSrcImage) S0 = pSrcImage; + if (S0 > pSrcB) S0 = pSrcB; + vx_uint8 *S1 = S0 + srcImageStrideInBytes; + vx_uint8 *D = pDstImage; + for (unsigned int dx = 0; dx <= dstWidth - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); + __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); + s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); + s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + pDstImage += dstImageStrideInBytes; + } + } + else + { + int xinc, yinc, xoffs, yoffs, xpos, ypos, x, y; + // Intermideate buffers to store results between horizontally filtered rows + int alignWidth = (dstWidth + 15) & ~15; + vx_uint16 *Xmap = (unsigned short *)((vx_uint8*)matrix + sizeof(ago_scale_matrix_t)); + vx_uint16 *Ymap = Xmap + alignWidth+8; + __m128i z = _mm_setzero_si128(); + // do generic area scaling + yinc = (int)(FP_MUL * matrix->yscale); // to convert to fixed point + xinc = (int)(FP_MUL * matrix->xscale); + yoffs = (int)(FP_MUL * matrix->yoffset); // to convert to fixed point + xoffs = (int)(FP_MUL * matrix->xoffset); + int xscale = (int)(matrix->xscale + 0.5); + int yscale = (int)(matrix->yscale + 0.5); + float inv_scale = 1.0f / (xscale*yscale); + int area_div = (int)(FP_MUL * inv_scale); + vx_uint8 *src_b = pSrcImage + srcWidth*(srcHeight - 1); + //int area_sz = (area + (1 << (FP_BITS - 1))) >> FP_BITS; + // generate xmap; + for (x = 0, xpos = xoffs; x <= (int)dstWidth; x++, xpos += xinc) + { + int xmap; + xmap = ((xpos + FP_ROUND) >> FP_BITS); + if (xmap >(int)(srcWidth - 1)){ + xmap = (srcWidth - 1); + } + if (xmap < 0) xmap = 0; + Xmap[x] = (unsigned short)xmap; + } + for (y = 0, ypos = yoffs; y < (int)dstHeight; y++, ypos += yinc) + { + int ymap; + ymap = ((ypos + FP_ROUND )>> FP_BITS); + if (ymap >(int)(srcHeight - 1)){ + ymap = srcHeight - 1; + } + if (ymap < 0) ymap = 0; + // compute vertical sum and store in intermediate buffer + vx_uint8 *S0 = pSrcImage + (int)ymap*srcImageStrideInBytes; + vx_uint8 *D = pDstImage; + for (x = Xmap[0]; x <= (Xmap[dstWidth] - 7); x += 8) + { + __m128i r0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(S0 + x)), z); + vx_uint8 *S1 = S0 + srcImageStrideInBytes; + for (int i = 1; i < yscale; i++){ + if (S1 > src_b)S1 = src_b; + __m128i r1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(S1 + x)), z); + r0 = _mm_add_epi16(r0, r1); + S1 += srcImageStrideInBytes; + } + _mm_store_si128((__m128i*)&Ymap[x], r0); + } + // do horizontal scaling on intermediate buffer + for (x = 0; x < (int)dstWidth; x++) + { 
+ int x0 = Xmap[x]; + int x1 = x0 + xscale; + int sum = Ymap[x0]; + while (++x0 < x1) sum += Ymap[x0]; + D[x] = (vx_uint8)((sum * area_div + FP_ROUND) >> FP_BITS); // normalize the window sum to an average + + } + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + +int HafCpu_ScaleImage_U8_U8_Area_Constant +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_scale_matrix_t * matrix, +vx_uint8 border +) +{ + if (matrix->xscale == 1.0f && matrix->yscale == 1.0f) + { + vx_uint8 *pSrcB = pSrcImage + (dstHeight - 1)*srcImageStrideInBytes; + // no scaling. Just do a copy from src to dst + for (unsigned int y = 0; y < dstHeight; y++) + { + vx_uint8 *pSrc = pSrcImage + (int)(matrix->yoffset + y)*srcImageStrideInBytes + (int)matrix->xoffset; + // clamp to boundary + if ((pSrc < pSrcImage) || (pSrc > pSrcB)){ + memset(pDstImage, border, dstWidth) ; + } + else + memcpy(pDstImage, pSrc, dstWidth); + pDstImage += dstImageStrideInBytes; + } + } + else if (matrix->xscale == 2.0f && matrix->yscale == 2.0f) + { + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi16(2); + __m128i masklow = _mm_set1_epi16(0x00ff); + __m128i bound = _mm_set1_epi16(border); + vx_uint8 *pSrcB = pSrcImage + (srcHeight - 2)*srcImageStrideInBytes; + // 2x2 image scaling + for (unsigned int y = 0; y < dstHeight; y++) + { + vx_uint8 *S0 = pSrcImage + (int)(matrix->yoffset + (y*2))*srcImageStrideInBytes + (int)(matrix->xoffset); + if (S0 < pSrcImage) S0 = pSrcImage; + if (S0 > pSrcB) S0 = pSrcB; + vx_uint8 *S1 = S0 + srcImageStrideInBytes; + vx_uint8 *D = pDstImage; + for (unsigned int dx = 0; dx <= dstWidth - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); + __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); + s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); + s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + pDstImage += dstImageStrideInBytes; + } + } + else + { + int xinc, yinc, xoffs, yoffs, xpos, ypos, x, y; + vx_uint16 *Xmap = (unsigned short *)((vx_uint8*)matrix + sizeof(ago_scale_matrix_t)); + vx_uint16 *Ymap = Xmap + dstWidth; + __m128i z = _mm_setzero_si128(); + // do generic area scaling + yinc = (int)(FP_MUL * matrix->yscale); // to convert to fixed point + xinc = (int)(FP_MUL * matrix->xscale); + yoffs = (int)(FP_MUL * matrix->yoffset); // to convert to fixed point + xoffs = (int)(FP_MUL * matrix->xoffset); + int xscale = (int)(matrix->xscale + 0.5); + int yscale = (int)(matrix->yscale + 0.5); + float inv_scale = 1.0f / (xscale*yscale); + int area_div = (int)(FP_MUL * inv_scale); + vx_uint8 *src_b = pSrcImage + srcWidth*(srcHeight - 1); + // generate xmap; + for (x = 0, xpos = xoffs; x <= (int)dstWidth; x++, xpos += xinc) + { + int xmap; + xmap = ((xpos + FP_ROUND) >> FP_BITS); + if (xmap >(int)(srcWidth - 1)){ + xmap = (srcWidth - 1); + } + if (xmap < 0) xmap = 0; + Xmap[x] = (unsigned short)xmap; + } + for (y = 0, ypos = yoffs; y < (int)dstHeight; y++, ypos += yinc) + { + int ymap; + ymap = ((ypos + FP_ROUND) >> FP_BITS); + if (ymap >(int)(srcHeight - 1)){ + ymap = srcHeight - 1; + } + if (ymap < 0) ymap = 0; + // compute vertical sum and store in intermediate buffer + vx_uint8 *S0 = pSrcImage + (int)ymap*srcImageStrideInBytes; + vx_uint8 *D = pDstImage; + for (x = Xmap[0]; x <= 
(Xmap[dstWidth] - 7); x += 8) + { + __m128i r0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(S0 + x)), z); + vx_uint8 *S1 = S0 + srcImageStrideInBytes; + for (int i = 1; i < yscale; i++){ + if (S1 > src_b)S1 = src_b; + __m128i r1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(S1 + x)), z); + r0 = _mm_add_epi16(r0, r1); + S1 += srcImageStrideInBytes; + } + _mm_store_si128((__m128i*)&Ymap[x], r0); + } + // do horizontal scaling on intermediate buffer + for (x = 0; x < (int)dstWidth; x++) + { + int x0 = Xmap[x]; + int x1 = x0 + xscale; + int sum = Ymap[x0]; + while (++x0 < x1) sum += Ymap[x0]; + D[x] = (vx_uint8)((sum * area_div + FP_ROUND) >> FP_BITS); // normalize the window sum to an average + + } + pDstImage += dstImageStrideInBytes; + } + } + return AGO_SUCCESS; +} + +int HafCpu_ScaleImage_U8_U8_Area_Replicate +( +vx_uint32 dstWidth, +vx_uint32 dstHeight, +vx_uint8 * pDstImage, +vx_uint32 dstImageStrideInBytes, +vx_uint32 srcWidth, +vx_uint32 srcHeight, +vx_uint8 * pSrcImage, +vx_uint32 srcImageStrideInBytes, +ago_scale_matrix_t * matrix +) +{ + return HafCpu_ScaleImage_U8_U8_Area(dstWidth, dstHeight, pDstImage, dstImageStrideInBytes, srcWidth, srcHeight, pSrcImage, srcImageStrideInBytes, matrix); +} + +/* +Performs a Gaussian blur(3x3) and half scales it +gaussian filter +Kernel 1 2 1 1 1 2 1 + 2 4 2 2 >>4 + 1 2 1 = 1 +*/ + +int HafCpu_ScaleGaussianHalf_U8_U8_3x3 +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pLocalData +) +{ + unsigned int x, y; + // float scale = (float)128 / 180.f; + + pSrcImage += srcImageStrideInBytes; + __m128i z = _mm_setzero_si128(), mask = _mm_set1_epi32((int)0x0000FFFF); + vx_uint16 *r0 = (vx_uint16*)(pLocalData + 16); + unsigned int W = 2 * dstWidth; + + for (y = 0; y < dstHeight; y++) + { + const vx_uint8* srow0 = pSrcImage - srcImageStrideInBytes; + const vx_uint8* srow1 = pSrcImage; + const vx_uint8* srow2 = pSrcImage + srcImageStrideInBytes; + vx_uint8* pDst = (vx_uint8*)pDstImage; + + // do vertical convolution + x = 0; + for (; x <= W - 8; x += 8) + { + __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z); + __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z); + __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z); + __m128i t0 = _mm_add_epi16(_mm_add_epi16(s0, s2), _mm_slli_epi16(s1, 1)); + _mm_store_si128((__m128i*)(r0 + x), t0); + } + + // do horizontal convolution, interleave the results and store them to dst + x = 0; + for (; x <= W - 16; x += 16, pDst+=8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)(r0 + x - 1)); + __m128i s1 = _mm_loadu_si128((const __m128i*)(r0 + x)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(r0 + x + 1)); + + __m128i t0 = _mm_add_epi16(_mm_add_epi16(s0, s2), _mm_slli_epi16(s1, 1)); + s0 = _mm_loadu_si128((const __m128i*)(r0 + x + 7)); + s1 = _mm_loadu_si128((const __m128i*)(r0 + x + 8)); + s2 = _mm_loadu_si128((const __m128i*)(r0 + x + 9)); + s0 = _mm_add_epi16(_mm_add_epi16(s0, s2), _mm_slli_epi16(s1, 1)); + + t0 = _mm_packus_epi32(_mm_and_si128(t0, mask), _mm_and_si128(s0, mask)); + t0 = _mm_srli_epi16(t0, 4); + t0 = _mm_packus_epi16(t0, t0); + _mm_storel_epi64((__m128i*)pDst, t0); + } + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); // do alternate rows for /2 scaling + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} \ No newline at end of file diff --git a/openvx/ago/ago_haf_cpu_harris.cpp b/openvx/ago/ago_haf_cpu_harris.cpp new file mode 100644 index 0000000..f81fbfe
--- /dev/null +++ b/openvx/ago/ago_haf_cpu_harris.cpp @@ -0,0 +1,1006 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +typedef struct { + vx_float32 GxGx; + vx_float32 GxGy; + vx_float32 GyGy; +} ago_harris_Gxy_t; + +void insertAtLocation(vx_uint32 listCapacity, vx_keypoint_t * pList, vx_uint32 * cornerCount, vx_keypoint_t itemToBeAdded, vx_uint32 loc) +{ + vx_keypoint_t incoming_keypt = itemToBeAdded; + vx_keypoint_t temp; + + for (int i = (int)loc; i <= (int)*cornerCount; i++) + { + temp = pList[i]; + pList[i] = incoming_keypt; + incoming_keypt = temp; + } + + *cornerCount = *cornerCount + 1; +} + +void AddToTheSortedKeypointList(vx_uint32 listCapacity, vx_keypoint_t * pList, vx_uint32 * cornerCount, vx_keypoint_t itemToBeAdded) +{ + if (*cornerCount == 0) // Add the item to the head + { + pList[0] = itemToBeAdded; + *cornerCount = 1; + } + else + { + if (itemToBeAdded.strength <= pList[*cornerCount - 1].strength) + { + if (*cornerCount == listCapacity) + return; + else + { + pList[*cornerCount] = itemToBeAdded; + *cornerCount = *cornerCount + 1; + } + } + else + { + int idx = 0; + while (pList[idx].strength > itemToBeAdded.strength) + idx++; + insertAtLocation(listCapacity, pList, cornerCount, itemToBeAdded, idx); + } + } +} + +// Using Separable filter: +// For Gx: +// -1 0 1 -1 0 1 1 +// -2 0 2 = 2 +// -1 0 1 1 +// For Gy: +// -1 -2 -1 1 2 1 -1 +// 0 0 0 = 0 +// 1 2 1 1 +int HafCpu_HarrisSobel_HG3_U8_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstGxy_, + vx_uint32 dstGxyStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ) +{ + ago_harris_Gxy_t * pDstGxy = (ago_harris_Gxy_t *)((vx_uint8 *) pDstGxy_ + dstGxyStrideInBytes); + + int tmpWidth = (dstWidth + 15) & ~15; + tmpWidth <<= 1; + vx_int16 * pPrevRow = (vx_int16*)pScratch; + vx_int16 * pCurrRow = ((vx_int16*)pScratch) + tmpWidth; + vx_int16 * pNextRow = ((vx_int16*)pScratch) + (tmpWidth + tmpWidth); + + vx_int16 * pLocalPrevRow = pPrevRow; + vx_int16 * pLocalCurrRow = pCurrRow; + vx_int16 * pLocalNextRow = pNextRow; + + // Horizontal filtering for the first row - row 0 + vx_uint8 * pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalPrevRow++ = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; + *pLocalPrevRow++ = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; + } + + // Horizontal filtering for the 
second row - row 1 + pSrcImage += srcImageStrideInBytes; + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalCurrRow++ = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; + *pLocalCurrRow++ = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; + } + + pSrcImage += srcImageStrideInBytes; + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + + vx_float32 div_factor = 1; // 4.0f * 255; + + // Process rows 2 until end + for(int y = 0; y < (int) dstHeight - 2; y++) + { + pLocalSrc = pSrcImage; + ago_harris_Gxy_t * pLocalDst = pDstGxy; + for(int x = 0; x < (int) dstWidth; x++) + { + vx_int16 gx, gy; + gx = (vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]; + gy = (vx_int16)pLocalSrc[-1] + ((vx_int16)pLocalSrc[0] << 1) + (vx_int16)pLocalSrc[1]; + + *pLocalNextRow++ = gx; + *pLocalNextRow++ = gy; + + gx += *pLocalPrevRow++ + (*pLocalCurrRow++ << 1); + gy -= *pLocalPrevRow++; + pLocalCurrRow++; + + pLocalDst->GxGx = ((vx_float32)gx * (vx_float32)gx) / div_factor; + pLocalDst->GxGy = ((vx_float32)gx * (vx_float32)gy) / div_factor; + pLocalDst->GyGy = ((vx_float32)gy * (vx_float32)gy) / div_factor; + + pLocalDst++; + pLocalSrc++; + } + + vx_int16 * pTemp = pPrevRow; + pPrevRow = pCurrRow; + pCurrRow = pNextRow; + pNextRow = pTemp; + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + pSrcImage += srcImageStrideInBytes; + pDstGxy += (dstGxyStrideInBytes / sizeof(ago_harris_Gxy_t)); + } + + +#if 0 + pSrcImage += srcImageStrideInBytes; // First row not processed + unsigned char *pLocalSrc = (unsigned char *)pSrcImage; + __declspec(align(16)) short r0[3840 * 2], r1[3840 * 2], r2[3840 * 2]; // Intermideate buffers to store results between horizontally filtered rows - [GxL GxH GyL GyH] + + __m128i * pPrevRow = (__m128i *) r0; + __m128i * pCurrRow = (__m128i *) r1; + __m128i * pNextRow = (__m128i *) r2; + + __m128i row0, temp0, temp1, temp2, temp3, Gx, Gy; + __m128i zeromask = _mm_setzero_si128(); + + __m128i * pLocalPrevRow = pPrevRow; + __m128i * pLocalCurrRow = pCurrRow; + __m128i * pLocalNextRow = pNextRow; + //__m128i * pTemp; + + int alignedWidth = dstWidth & ~15; // Sixteen pixels processed in a go for first two rows + int postfixWidth = dstWidth & 15; + int srcStride = (int)srcImageStrideInBytes; + + // Process first two rows + // Process first two rows - Horizontal filtering + for (int x = 0; x < (int)(alignedWidth >> 4); x++) + { + __m128i shiftedR, shiftedL; + + // row above + row0 = _mm_load_si128((__m128i *)(pLocalSrc - srcStride)); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - srcStride - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc - srcStride + 1)); + + temp0 = _mm_unpackhi_epi8(row0, zeromask); + temp0 = _mm_slli_epi16(temp0, 1); // GyH: 2 * (0,-1) + Gy = _mm_cvtepu8_epi16(row0); + Gy = _mm_slli_epi16(Gy, 1); // GyL: 2 * (0,-1) + + Gx = _mm_cvtepu8_epi16(shiftedL); // GxL: -1 * (-1,-1) GyL: 1 * (-1,-1) + temp1 = _mm_unpackhi_epi8(shiftedL, zeromask); // GxH: -1 * (-1,-1) GyH: 1 * (-1,-1) + temp1 = _mm_add_epi16(temp0, temp1); + Gy = _mm_add_epi16(Gy, Gx); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // GxH: 1 * (1,-1) GyH: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // GxL: 1 * (1,-1) GyL: 1 * (1,-1) + temp1 = _mm_sub_epi16(shiftedL, temp1); + Gx = _mm_sub_epi16(shiftedR, Gx); + temp0 = _mm_add_epi16(temp0, shiftedL); + Gy = _mm_add_epi16(Gy, shiftedR); + + _mm_store_si128(pLocalPrevRow++, Gx); + _mm_store_si128(pLocalPrevRow++, temp1); + 
_mm_store_si128(pLocalPrevRow++, Gy); + _mm_store_si128(pLocalPrevRow++, temp0); + + // current row + row0 = _mm_load_si128((__m128i *)pLocalSrc); + shiftedL = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + shiftedR = _mm_loadu_si128((__m128i *)(pLocalSrc + 1)); + + temp0 = _mm_unpackhi_epi8(row0, zeromask); + temp0 = _mm_slli_epi16(temp0, 1); // GyH: 2 * (-1, 0) + Gy = _mm_cvtepu8_epi16(row0); + Gy = _mm_slli_epi16(Gy, 1); // GyL: 2 * (-1, 0) + + Gx = _mm_cvtepu8_epi16(shiftedL); // GxL: -1 * (-1,-1) GyL: 1 * (-1,-1) + temp1 = _mm_unpackhi_epi8(shiftedL, zeromask); // GxH: -1 * (-1,-1) GyH: 1 * (-1,-1) + temp0 = _mm_add_epi16(temp0, temp1); + Gy = _mm_add_epi16(Gy, Gx); + + shiftedL = _mm_unpackhi_epi8(shiftedR, zeromask); // GxH: 1 * (1,-1) GyH: 1 * (1,-1) + shiftedR = _mm_cvtepu8_epi16(shiftedR); // GxL: 1 * (1,-1) GyL: 1 * (1,-1) + temp1 = _mm_sub_epi16(shiftedL, temp1); + Gx = _mm_sub_epi16(shiftedR, Gx); + temp0 = _mm_add_epi16(temp0, shiftedL); + Gy = _mm_add_epi16(Gy, shiftedR); + + _mm_store_si128(pLocalCurrRow++, Gx); + _mm_store_si128(pLocalCurrRow++, temp1); + _mm_store_si128(pLocalCurrRow++, Gy); + _mm_store_si128(pLocalCurrRow++, temp0); + + pLocalSrc += 16; + } + + short * pShort_Prev = (short *)pLocalPrevRow; + short * pShort_Curr = (short *)pLocalCurrRow; + for (int x = 0; x < postfixWidth; x++) + { + // Row above + *pShort_Prev++ = (short)pLocalSrc[-srcStride + 1] - (short)pLocalSrc[-srcStride - 1]; // Gx + *pShort_Prev++ = (short)pLocalSrc[-srcStride + 1] + (short)pLocalSrc[-srcStride] + (short)pLocalSrc[-srcStride - 1]; // Gy + + // Current row + *pShort_Curr++ = (short)pLocalSrc[1] - (short)pLocalSrc[-1]; // Gx + *pShort_Curr++ = (short)pLocalSrc[1] + (short)pLocalSrc[0] + (short)pLocalSrc[-1]; // Gy + } + + pLocalPrevRow = pPrevRow; + pLocalCurrRow = pCurrRow; + pLocalNextRow = pNextRow; + + // Process rows 3 till the end + int height = (int)(dstHeight - 2); + while (height) + { + pLocalSrc = (unsigned char *)(pSrcImage + srcImageStrideInBytes); // Pointing to the row below + + int width = (int)(alignedWidth >> 3); // Eight pixels processed in a go + while (width) + { + __m128i prevRowFiltered, currRowFiltered; + + // Horizontal filtering - next row + row0 = _mm_loadu_si128((__m128i *)(pLocalSrc - 1)); + Gx = _mm_cvtepu8_epi16(row0); // 1 * (-1,1) + Gy = _mm_add_epi16(Gx, zeromask); // 1 * (-1,1) + + prevRowFiltered = _mm_load_si128(pLocalPrevRow++); + + row0 = _mm_srli_si128(row0, 1); + temp0 = _mm_cvtepu8_epi16(row0); + temp0 = _mm_slli_epi16(temp0, 1); // 2 * (0,1) + Gy = _mm_add_epi16(Gy, temp0); + + currRowFiltered = _mm_load_si128(pLocalCurrRow++); + + row0 = _mm_srli_si128(row0, 1); + temp0 = _mm_cvtepu8_epi16(row0); // 1 * (1,1) + Gx = _mm_sub_epi16(temp0, Gx); + Gy = _mm_add_epi16(Gy, temp0); + + currRowFiltered = _mm_slli_epi16(currRowFiltered, 1); // 2 * filteredCurrRow + Gx = _mm_add_epi16(Gx, currRowFiltered); + + Gx = _mm_add_epi16(Gx, prevRowFiltered); // Gx0 Gx1 Gx2 Gx3 Gx4 Gx5 Gx6 Gx7 + Gy = _mm_subs_epi16(Gy, prevRowFiltered); // Gy0 Gy1 Gy2 Gy3 Gy4 Gy5 Gy6 Gy7 + + prevRowFiltered = _mm_cvtepi16_epi32(Gx); // Gx0 Gx1 Gx2 Gx3 + currRowFiltered = _mm_cvtepi16_epi32(Gy); // Gy0 Gy1 Gy2 Gy3 + + temp0 = _mm_shuffle_epi32(prevRowFiltered, 64); // Gx0 Gx0 Gx0 Gx1 + temp1 = _mm_shuffle_epi32(currRowFiltered, 64); // Gy0 Gy0 Gy0 Gy1 + + temp2 = _mm_blend_epi16(temp0, temp1, 0x10); // Gx0 Gx0 Gy0 Gx1 + temp3 = _mm_blend_epi32(temp0, temp1, 0x14); // Gx0 Gy0 Gy0 Gx1 + + + width--; + + } + height--; + } +#endif + return AGO_SUCCESS; +} + +// Using separable 
filter +// -1 -2 0 2 1 1 +// 4 +// Gx = 6 +// 4 +// 1 +int HafCpu_HarrisSobel_HG3_U8_5x5 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstGxy_, + vx_uint32 dstGxyStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ) +{ + ago_harris_Gxy_t * pDstGxy = (ago_harris_Gxy_t *)((vx_uint8 *)pDstGxy_ + 2*dstGxyStrideInBytes); + + int tmpWidth = (dstWidth + 15) & ~15; + tmpWidth <<= 1; + vx_int16 * pRowMinus2 = (vx_int16*)pScratch; + vx_int16 * pRowMinus1 = ((vx_int16*)pScratch) + tmpWidth; + vx_int16 * pRowCurr = ((vx_int16*)pScratch) + (2*tmpWidth); + vx_int16 * pRowPlus1 = ((vx_int16*)pScratch) + (3*tmpWidth); + vx_int16 * pRowPlus2 = ((vx_int16*)pScratch) + (4*tmpWidth); + + vx_int16 * pLocalRowMinus2 = pRowMinus2; + vx_int16 * pLocalRowMinus1 = pRowMinus1; + vx_int16 * pLocalRowCurr = pRowCurr; + vx_int16 * pLocalRowPlus1 = pRowPlus1; + vx_int16 * pLocalRowPlus2 = pRowPlus2; + + // Horizontal filtering for the first row - row 0 + vx_uint8 * pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowMinus2++ = (vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) << 1); + *pLocalRowMinus2++ = (vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[0] + (vx_int16)pLocalSrc[-1]) << 2) + ((vx_int16)pLocalSrc[0] << 1); + } + + // Horizontal filtering for the second row - row 1 + pSrcImage += srcImageStrideInBytes; + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowMinus1++ = (vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) << 1); + *pLocalRowMinus1++ = (vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[0] + (vx_int16)pLocalSrc[-1]) << 2) + ((vx_int16)pLocalSrc[0] << 1); + } + + // Horizontal filtering for the second row - row 2 + pSrcImage += srcImageStrideInBytes; + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowCurr++ = (vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) << 1); + *pLocalRowCurr++ = (vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[0] + (vx_int16)pLocalSrc[-1]) << 2) + ((vx_int16)pLocalSrc[0] << 1); + } + + // Horizontal filtering for the second row - row 3 + pSrcImage += srcImageStrideInBytes; + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowPlus1++ = (vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) << 1); + *pLocalRowPlus1++ = (vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[0] + (vx_int16)pLocalSrc[-1]) << 2) + ((vx_int16)pLocalSrc[0] << 1); + } + + pSrcImage += srcImageStrideInBytes; + + pLocalRowMinus2 = pRowMinus2; + pLocalRowMinus1 = pRowMinus1; + pLocalRowCurr = pRowCurr; + pLocalRowPlus1 = pRowPlus1; + + // Process rows 4 until end + for (int y = 0; y < (int)dstHeight - 4; y++) + { + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++) + { + vx_int16 gx, gy; + + gx = (vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) << 1); + gy = (vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2] + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[0] + (vx_int16)pLocalSrc[-1]) << 2) + ((vx_int16)pLocalSrc[0] << 
1); + + *pLocalRowPlus2++ = gx; + *pLocalRowPlus2++ = gy; + + gx += *pLocalRowMinus2++ + ((*pLocalRowMinus1++ + *pLocalRowCurr + *pLocalRowPlus1++) << 2) + (*pLocalRowCurr << 1); + gy += ((*pLocalRowPlus1++ - *pLocalRowMinus1++) << 1) - *pLocalRowMinus2++; + pLocalRowCurr += 2; + + pDstGxy->GxGx = ((vx_float32)gx * (vx_float32)gx); // / 16.0f; + pDstGxy->GxGy = ((vx_float32)gx * (vx_float32)gy); // / 16.0f; + pDstGxy->GyGy = ((vx_float32)gy * (vx_float32)gy); // / 16.0f; + + pDstGxy++; + pLocalSrc++; + } + + vx_int16 * pTemp = pRowMinus2; + pRowMinus2 = pRowMinus1; + pRowMinus1 = pRowCurr; + pRowCurr = pRowPlus1; + pRowPlus1 = pRowPlus2; + pRowPlus2 = pTemp; + + pLocalRowMinus2 = pRowMinus2; + pLocalRowMinus1 = pRowMinus1; + pLocalRowCurr = pRowCurr; + pLocalRowPlus1 = pRowPlus1; + pLocalRowPlus2 = pRowPlus2; + + pSrcImage += srcImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +// Using separable filter +// -1 -4 -5 0 5 4 1 1 +// 6 +// 15 +// Gx = 20 +// 15 +// 6 +// 1 +int HafCpu_HarrisSobel_HG3_U8_7x7 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstGxy_, + vx_uint32 dstGxyStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint8 * pScratch + ) +{ + ago_harris_Gxy_t * pDstGxy = (ago_harris_Gxy_t *)((vx_uint8 *)pDstGxy_ + 3*dstGxyStrideInBytes); + + int tmpWidth = (dstWidth + 15) & ~15; + tmpWidth <<= 1; + vx_int16 * pRowMinus3 = (vx_int16*)pScratch; + vx_int16 * pRowMinus2 = ((vx_int16*)pScratch) + tmpWidth; + vx_int16 * pRowMinus1 = ((vx_int16*)pScratch) + (2 * tmpWidth); + vx_int16 * pRowCurr = ((vx_int16*)pScratch) + (3 * tmpWidth); + vx_int16 * pRowPlus1 = ((vx_int16*)pScratch) + (4 * tmpWidth); + vx_int16 * pRowPlus2 = ((vx_int16*)pScratch) + (5 * tmpWidth); + vx_int16 * pRowPlus3 = ((vx_int16*)pScratch) + (6 * tmpWidth); + + vx_int16 * pLocalRowMinus3 = pRowMinus3; + vx_int16 * pLocalRowMinus2 = pRowMinus2; + vx_int16 * pLocalRowMinus1 = pRowMinus1; + vx_int16 * pLocalRowCurr = pRowCurr; + vx_int16 * pLocalRowPlus1 = pRowPlus1; + vx_int16 * pLocalRowPlus2 = pRowPlus2; + vx_int16 * pLocalRowPlus3 = pRowPlus3; + + // Horizontal filtering for the first row - row 0 + vx_uint8 * pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowMinus3++ = (vx_int16)pLocalSrc[3] - (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2]) << 2) + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) * 5); + *pLocalRowMinus3++ = (vx_int16)pLocalSrc[3] + (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2]) * 6) + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[-1]) * 15) + ((vx_int16)pLocalSrc[0] * 20); + } + + // Horizontal filtering for the second row - row 1 + pSrcImage += srcImageStrideInBytes; + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowMinus2++ = (vx_int16)pLocalSrc[3] - (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2]) << 2) + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) * 5); + *pLocalRowMinus2++ = (vx_int16)pLocalSrc[3] + (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2]) * 6) + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[-1]) * 15) + ((vx_int16)pLocalSrc[0] * 20); + } + + // Horizontal filtering for the second row - row 2 + pSrcImage += srcImageStrideInBytes; + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowMinus1++ = (vx_int16)pLocalSrc[3] - (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] - 
(vx_int16)pLocalSrc[-2]) << 2) + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) * 5); + *pLocalRowMinus1++ = (vx_int16)pLocalSrc[3] + (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2]) * 6) + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[-1]) * 15) + ((vx_int16)pLocalSrc[0] * 20); + } + + // Horizontal filtering for the second row - row 3 + pSrcImage += srcImageStrideInBytes; + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowCurr++ = (vx_int16)pLocalSrc[3] - (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2]) << 2) + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) * 5); + *pLocalRowCurr++ = (vx_int16)pLocalSrc[3] + (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2]) * 6) + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[-1]) * 15) + ((vx_int16)pLocalSrc[0] * 20); + } + + // Horizontal filtering for the second row - row 4 + pSrcImage += srcImageStrideInBytes; + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowPlus1++ = (vx_int16)pLocalSrc[3] - (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2]) << 2) + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) * 5); + *pLocalRowPlus1++ = (vx_int16)pLocalSrc[3] + (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2]) * 6) + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[-1]) * 15) + ((vx_int16)pLocalSrc[0] * 20); + } + + // Horizontal filtering for the second row - row 5 + pSrcImage += srcImageStrideInBytes; + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++, pLocalSrc++) + { + *pLocalRowPlus2++ = (vx_int16)pLocalSrc[3] - (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2]) << 2) + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) * 5); + *pLocalRowPlus2++ = (vx_int16)pLocalSrc[3] + (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2]) * 6) + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[-1]) * 15) + ((vx_int16)pLocalSrc[0] * 20); + } + + pSrcImage += srcImageStrideInBytes; + + pLocalRowMinus3 = pRowMinus3; + pLocalRowMinus2 = pRowMinus2; + pLocalRowMinus1 = pRowMinus1; + pLocalRowCurr = pRowCurr; + pLocalRowPlus1 = pRowPlus1; + pLocalRowPlus2 = pRowPlus2; + + // Process rows 4 until end + for (int y = 0; y < (int)dstHeight - 6; y++) + { + pLocalSrc = pSrcImage; + for (int x = 0; x < (int)dstWidth; x++) + { + vx_int16 gx, gy; + + gx = (vx_int16)pLocalSrc[3] - (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] - (vx_int16)pLocalSrc[-2]) << 2) + (((vx_int16)pLocalSrc[1] - (vx_int16)pLocalSrc[-1]) * 5); + gy = (vx_int16)pLocalSrc[3] + (vx_int16)pLocalSrc[-3] + (((vx_int16)pLocalSrc[2] + (vx_int16)pLocalSrc[-2]) * 6) + (((vx_int16)pLocalSrc[1] + (vx_int16)pLocalSrc[-1]) * 15) + ((vx_int16)pLocalSrc[0] * 20); + + *pLocalRowPlus3++ = gx; + *pLocalRowPlus3++ = gy; + + gx += *pLocalRowMinus3++ + ((*pLocalRowMinus2++ + *pLocalRowPlus2++) * 6) + ((*pLocalRowMinus1++ + *pLocalRowPlus1++) * 15) + (*pLocalRowCurr++ * 20); + gy += ((*pLocalRowPlus2++ - *pLocalRowMinus2++) << 2) + ((*pLocalRowPlus1++ - *pLocalRowMinus1++) * 5) - *pLocalRowMinus3++; + pLocalRowCurr++; + + pDstGxy->GxGx = ((vx_float32)gx * (vx_float32)gx); // / 64.0f; + pDstGxy->GxGy = ((vx_float32)gx * (vx_float32)gy); // / 64.0f; + pDstGxy->GyGy = ((vx_float32)gy * (vx_float32)gy); // / 64.0f; + + pDstGxy++; + pLocalSrc++; + } + + vx_int16 * pTemp = pRowMinus3; + pRowMinus3 = pRowMinus2; + pRowMinus2 = 
pRowMinus1; + pRowMinus1 = pRowCurr; + pRowCurr = pRowPlus1; + pRowPlus1 = pRowPlus2; + pRowPlus2 = pRowPlus3; + pRowPlus3 = pTemp; + + pLocalRowMinus3 = pRowMinus3; + pLocalRowMinus2 = pRowMinus2; + pLocalRowMinus1 = pRowMinus1; + pLocalRowCurr = pRowCurr; + pLocalRowPlus1 = pRowPlus1; + pLocalRowPlus2 = pRowPlus2; + pLocalRowPlus3 = pRowPlus3; + + pSrcImage += srcImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +int HafCpu_HarrisScore_HVC_HG3_3x3 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstVc, + vx_uint32 dstVcStrideInBytes, + vx_float32 * pSrcGxy_, + vx_uint32 srcGxyStrideInBytes, + vx_float32 sensitivity, + vx_float32 strength_threshold, + vx_float32 normalization_factor + ) +{ + ago_harris_Gxy_t * pSrcGxy = (ago_harris_Gxy_t *)pSrcGxy_; + vx_float32 Tc = strength_threshold; + vx_int32 srcStride = srcGxyStrideInBytes / sizeof(ago_harris_Gxy_t); + vx_int32 dstStride = dstVcStrideInBytes / sizeof(vx_float32); + pSrcGxy += srcStride; // Skip first row + memset(pDstVc, 0, dstVcStrideInBytes); // Zero the thresholds of first row + pDstVc += dstStride; + + for (int y = 1; y < (int)dstHeight - 1; y++) + { + ago_harris_Gxy_t * pLocalSrc = pSrcGxy; + vx_float32 * pLocalDst = pDstVc; + + *pLocalDst = 0; // First column Vc = 0; + pLocalDst++; + pLocalSrc++; + for (int x = 1; x < (int)dstWidth - 1; x++) + { + vx_float32 gx2 = 0; + vx_float32 gy2 = 0; + vx_float32 gxy2 = 0; + + // Windowing + for (int j = -1; j <= 1; j++) + { + ago_harris_Gxy_t * pTemp = pLocalSrc + j * srcStride; + for (int i = -1; i <= 1; i++) + { + gx2 += pTemp[i].GxGx; + gxy2 += pTemp[i].GxGy; + gy2 += pTemp[i].GyGy; + } + } + + vx_float32 traceA = gx2 + gy2; + vx_float32 detA = (gx2 * gy2) - (gxy2 * gxy2); + vx_float32 Mc = detA - (sensitivity * traceA * traceA); + Mc /= normalization_factor; + *pLocalDst = (Mc > Tc) ? 
Mc : 0; + + pLocalSrc++; + pLocalDst++; + } + + *pLocalDst = 0; // Last column Vc = 0; + pSrcGxy += srcStride; + pDstVc += dstStride; + } + memset(pDstVc, 0, dstVcStrideInBytes); // Zero the thresholds of last row + return AGO_SUCCESS; +} + +int HafCpu_HarrisScore_HVC_HG3_5x5 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstVc, + vx_uint32 dstVcStrideInBytes, + vx_float32 * pSrcGxy_, + vx_uint32 srcGxyStrideInBytes, + vx_float32 sensitivity, + vx_float32 strength_threshold, + vx_float32 normalization_factor + ) +{ + ago_harris_Gxy_t * pSrcGxy = (ago_harris_Gxy_t *)pSrcGxy_; + vx_float32 Tc = strength_threshold; + vx_int32 srcStride = srcGxyStrideInBytes / sizeof(ago_harris_Gxy_t); + vx_int32 dstStride = dstVcStrideInBytes / sizeof(vx_float32); + pSrcGxy += (srcStride + srcStride); // Skip first two rows + memset(pDstVc, 0, dstVcStrideInBytes + dstVcStrideInBytes); // Zero the thresholds of first two rows + pDstVc += (dstStride + dstStride); + + for (int y = 2; y < (int)dstHeight - 2; y++) + { + ago_harris_Gxy_t * pLocalSrc = pSrcGxy; + vx_float32 * pLocalDst = pDstVc; + + *pLocalDst = 0; // First column Vc = 0; + pLocalDst++; + *pLocalDst = 0; // Second column Vc = 0; + pLocalDst++; + pLocalSrc += 2; + + for (int x = 2; x < (int)dstWidth - 2; x++) + { + vx_float32 gx2 = 0; + vx_float32 gy2 = 0; + vx_float32 gxy2 = 0; + + // Windowing + for (int j = -2; j <= 2; j++) + { + for (int i = -2; i <= 2; i++) + { + gx2 += pLocalSrc[j * srcStride + i].GxGx; + gxy2 += pLocalSrc[j * srcStride + i].GxGy; + gy2 += pLocalSrc[j * srcStride + i].GyGy; + } + } + + vx_float32 traceA = gx2 + gy2; + vx_float32 detA = (gx2 * gy2) - (gxy2 * gxy2); + vx_float32 Mc = detA - (sensitivity * traceA * traceA); + Mc /= normalization_factor; + *pLocalDst = (Mc > Tc) ? 
Mc : 0; + + pLocalSrc++; + pLocalDst++; + } + + *pLocalDst = 0; // second to last column Vc = 0; + pLocalDst++; + *pLocalDst = 0; // last column Vc = 0; + + pSrcGxy += srcStride; + pDstVc += dstStride; + } + memset(pDstVc, 0, dstVcStrideInBytes + dstVcStrideInBytes); // Zero the thresholds of last rows + return AGO_SUCCESS; +} + +int HafCpu_HarrisScore_HVC_HG3_7x7 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_float32 * pDstVc, + vx_uint32 dstVcStrideInBytes, + vx_float32 * pSrcGxy_, + vx_uint32 srcGxyStrideInBytes, + vx_float32 sensitivity, + vx_float32 strength_threshold, + vx_float32 normalization_factor + ) +{ + ago_harris_Gxy_t * pSrcGxy = (ago_harris_Gxy_t *)pSrcGxy_; + vx_float32 Tc = strength_threshold; + vx_int32 srcStride = srcGxyStrideInBytes / sizeof(ago_harris_Gxy_t); + vx_int32 dstStride = dstVcStrideInBytes / sizeof(vx_float32); + pSrcGxy += (srcStride * 3); // Skip first three rows + memset(pDstVc, 0, dstVcStrideInBytes * 3); // Zero the thresholds of first three rows + pDstVc += (dstStride * 3); + + for (int y = 3; y < (int)dstHeight - 3; y++) + { + ago_harris_Gxy_t * pLocalSrc = pSrcGxy; + vx_float32 * pLocalDst = pDstVc; + + *pLocalDst = 0; // First column Vc = 0; + pLocalDst++; + *pLocalDst = 0; // Second column Vc = 0; + pLocalDst++; + *pLocalDst = 0; // Third column Vc = 0; + pLocalSrc += 3; + + for (int x = 3; x < (int)dstWidth - 3; x++) + { + vx_float32 gx2 = 0; + vx_float32 gy2 = 0; + vx_float32 gxy2 = 0; + + // Windowing + for (int j = -3; j <= 3; j++) + { + for (int i = -3; i <= 3; i++) + { + gx2 += pLocalSrc[j * srcStride + i].GxGx; + gxy2 += pLocalSrc[j * srcStride + i].GxGy; + gy2 += pLocalSrc[j * srcStride + i].GyGy; + } + } + + vx_float32 traceA = gx2 + gy2; + vx_float32 detA = (gx2 * gy2) - (gxy2 * gxy2); + vx_float32 Mc = detA - (sensitivity * traceA * traceA); + Mc /= normalization_factor; + *pLocalDst = (Mc > Tc) ? 
Mc : 0; + + pLocalSrc++; + pLocalDst++; + } + + *pLocalDst = 0; // third to last column Vc = 0; + pLocalDst++; + *pLocalDst = 0; // second to last column Vc = 0; + pLocalDst++; + *pLocalDst = 0; // last column Vc = 0; + + pSrcGxy += srcStride; + pDstVc += dstStride; + } + memset(pDstVc, 0, dstVcStrideInBytes * 3); // Zero the thresholds of last rows + return AGO_SUCCESS; +} + +int HafCpu_HarrisMergeSortAndPick_XY_HVC + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t dstCorner[], + vx_uint32 * pDstCornerCount, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_float32 * pSrcVc, + vx_uint32 srcVcStrideInBytes, + vx_float32 min_distance + ) +{ + vx_float32 * pLocalSrc; + vx_float32 * pSrcVc_NMS = pSrcVc; + vx_int32 radius = (vx_int32) min_distance; + + // Non max supression + for (vx_int32 y = 0; y < (vx_int32)srcHeight; y++) + { + pLocalSrc = pSrcVc_NMS; + for (vx_int32 x = 0; x < (vx_int32)srcWidth; x++) + { + vx_float32 Vc = *pLocalSrc; + if (Vc) + { + + for (vx_int32 i = max(y - radius, 0); i <= min(y + radius, (vx_int32) srcHeight - 1); i++) + { + for (vx_int32 j = max(x - radius, 0); j <= min(x + radius, (vx_int32) srcWidth - 1); j++) + { + if ((vx_float32)((y-i)*(y-i)) + (vx_float32)((x-j)*(x-j)) <= radius*radius) + { + vx_float32 * neighbor = (vx_float32 *)(((char *)pLocalSrc) + (i - y) * (vx_int32)srcVcStrideInBytes + (j - x) * sizeof(vx_float32)); + if (*neighbor < Vc) + *neighbor = 0; + } + } + } + } + pLocalSrc++; + } + + pSrcVc_NMS = (vx_float32 *)((char *)pSrcVc_NMS + srcVcStrideInBytes); + } + + // Populate the sorted list + vx_keypoint_t cand; + vx_uint32 numCorners = 0; + + for (vx_uint32 y = 0; y < srcHeight; y++) + { + pLocalSrc = pSrcVc; + for (vx_uint32 x = 0; x < srcWidth; x++) + { + if (*pLocalSrc) + { + cand.x = x; + cand.y = y; + cand.strength = *pLocalSrc; + cand.scale = 0; + cand.orientation = 0; + cand.error = 0; + cand.tracking_status = 1; + if (numCorners < capacityOfDstCorner) + AddToTheSortedKeypointList(capacityOfDstCorner, dstCorner, &numCorners, cand); + else + numCorners++; + } + pLocalSrc++; + } + pSrcVc = (vx_float32 *)((char *)pSrcVc + srcVcStrideInBytes); + } + + *pDstCornerCount = numCorners; + + return AGO_SUCCESS; +} + +int HafCpu_NonMaxSupp_XY_ANY_3x3 + ( + vx_uint32 capacityOfList, + ago_keypoint_xys_t * dstList, + vx_uint32 * pDstListCount, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_float32 * pSrcImg, + vx_uint32 srcStrideInBytes + ) +{ + vx_uint32 count = 0; + const vx_uint8 * pImg = (const vx_uint8 *)pSrcImg; + for (vx_uint32 y = 1; y < srcHeight - 1; y++, pImg += srcStrideInBytes) { + if (count >= capacityOfList) + break; + const vx_float32 * p9 = (const vx_float32 *)&pImg[0]; + const vx_float32 * p0 = (const vx_float32 *)&pImg[srcStrideInBytes]; + const vx_float32 * p1 = (const vx_float32 *)&pImg[srcStrideInBytes << 1]; + for (vx_uint32 x = 1; x < srcWidth - 1; x++) { + if (p0[1] >= p9[0] && p0[1] >= p9[1] && p0[1] >= p9[2] && + p0[1] >= p0[0] && p0[1] > p0[2] && + p0[1] > p1[0] && p0[1] > p1[1] && p0[1] > p1[2]) + { + dstList->x = x; + dstList->y = y; + dstList->s = p0[1]; + dstList++; + count++; + if (count >= capacityOfList) + break; + } + p9++; + p0++; + p1++; + } + } + *pDstListCount = count; + return AGO_SUCCESS; +} + +int HafCpu_HarrisMergeSortAndPick_XY_XYS + ( + vx_uint32 capacityOfDstCorner, + vx_keypoint_t * dstCorner, + vx_uint32 * pDstCornerCount, + ago_keypoint_xys_t * srcList, + vx_uint32 srcListCount, + vx_float32 min_distance, + ago_harris_grid_header_t * gridInfo, + ago_coord2d_short_t * gridBuf + ) +{ + // sort the 
keypoint XYS list + std::sort((vx_int64 *)&srcList[0], (vx_int64 *)&srcList[srcListCount], std::greater()); + // extract useful keypoints from XYS list into corners array + vx_uint32 count = 0; + if (gridInfo) { + // get grid info and initialize grid buffer if (-1,-1) coordinate values indicating no presence of values + vx_uint32 gridWidth = gridInfo->width; + vx_uint32 gridHeight = gridInfo->height; + vx_uint32 cellSize = gridInfo->cellSize; + HafCpu_MemSet_U32(gridInfo->gridBufSize >> 2, (vx_uint32 *)gridBuf, (vx_uint32)-1); + // filter the keypoints with min_distance + vx_int32 min_dist2 = (vx_int32)ceilf(min_distance * min_distance); + vx_keypoint_t * corner = dstCorner; + for (vx_uint32 i = 0; i < srcListCount; i++) { + vx_uint32 x = srcList[i].x, y = srcList[i].y; + bool found = true; + vx_int32 cx = (vx_int32)x / cellSize, cy = (vx_int32)y / cellSize; + ago_coord2d_short_t * cgrid = gridBuf + cy * gridWidth + cx; + if (cgrid->x < 0) { + vx_int32 cxmin = max(cx - 2, 0), cxmax = min(cx + 2, (vx_int32)gridWidth - 1), cw = cxmax - cxmin + 1; + vx_int32 cymin = max(cy - 2, 0), cymax = min(cy + 2, (vx_int32)gridHeight - 1), ch = cymax - cymin + 1; + ago_coord2d_short_t * grid = gridBuf + cxmin + cymin * gridWidth; + for (vx_int32 icy = 0; icy < ch; icy++, grid += gridWidth) { + for (vx_int32 icx = 0; icx < cw; icx++) { + int ix = grid[icx].x; + if (ix >= 0) { + int iy = grid[icx].y; + ix -= x; iy -= y; + int dist2 = ix*ix + iy*iy; + if (dist2 < min_dist2) { + goto search_done; + } + } + } + } + found = false; + } + search_done: + if (!found) { + if (count < capacityOfDstCorner) { + corner->x = x; + corner->y = y; + corner->strength = srcList[i].s; + corner->tracking_status = 1; + corner->error = 0; + corner->scale = 0.0f; + corner->orientation = 0.0f; + corner++; + } + count++; + cgrid->x = x; + cgrid->y = y; + } + } + } + else { + // copy all points into output array + count = (srcListCount < capacityOfDstCorner) ? srcListCount : capacityOfDstCorner; + for (vx_uint32 i = 0; i < count; i++, dstCorner++, srcList++) { + dstCorner->x = srcList->x; + dstCorner->y = srcList->y; + dstCorner->strength = srcList->s; + dstCorner->tracking_status = 1; + dstCorner->error = 0; + dstCorner->scale = 0.0f; + dstCorner->orientation = 0.0f; + } + } + *pDstCornerCount = count; + return AGO_SUCCESS; +} diff --git a/openvx/ago/ago_haf_cpu_histogram.cpp b/openvx/ago/ago_haf_cpu_histogram.cpp new file mode 100644 index 0000000..7bbb94c --- /dev/null +++ b/openvx/ago/ago_haf_cpu_histogram.cpp @@ -0,0 +1,605 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +// The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well +// It processes the pixels in a width which is the next highest multiple of 16 after dstWidth +static int HafCpu_Histogram1Threshold_DATA_U8 + ( + vx_uint32 dstHist[], + vx_uint8 distThreshold, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + // thresh: source threshold in -128..127 range + __m128i offset = _mm_set1_epi8((char)0x80); + __m128i thresh = _mm_set1_epi8((char)((distThreshold - 1) ^ 0x80)); + __m128i onemask = _mm_set1_epi8((char)1); + // process one pixel row at a time that counts "pixel < srcThreshold" + __m128i count = _mm_set1_epi8((char)0); + vx_uint8 * srcRow = pSrcImage; + vx_uint32 width = (srcWidth + 15) >> 4; + for (unsigned int y = 0; y < srcHeight; y++) { + __m128i * src = (__m128i *)srcRow; + for (unsigned int x = 0; x < width; x++) { + __m128i pixels = _mm_load_si128(src++); + pixels = _mm_xor_si128(pixels, offset); + pixels = _mm_cmpgt_epi8(pixels, thresh); + pixels = _mm_and_si128(pixels, onemask); + pixels = _mm_sad_epu8(pixels, onemask); + count = _mm_add_epi32(count, pixels); + } + srcRow += srcImageStrideInBytes; + } + // extract histogram from count + dstHist[0] = M128I(count).m128i_u32[0] + M128I(count).m128i_u32[2]; + dstHist[1] = srcWidth * srcHeight - dstHist[0]; + return AGO_SUCCESS; +} + +static int HafCpu_Histogram3Thresholds_DATA_U8 + ( + vx_uint32 dstHist[], + vx_uint8 distThreshold0, + vx_uint8 distThreshold1, + vx_uint8 distThreshold2, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + // thresh: source threshold in -128..127 range + __m128i offset = _mm_set1_epi8((char)0x80); + __m128i T0 = _mm_set1_epi8((char)((distThreshold0 - 1) ^ 0x80)); + __m128i T1 = _mm_set1_epi8((char)((distThreshold1 - 1) ^ 0x80)); + __m128i T2 = _mm_set1_epi8((char)((distThreshold2 - 1) ^ 0x80)); + __m128i onemask = _mm_set1_epi8((char)1); + // process one pixel row at a time that counts "pixel < srcThreshold" + __m128i count0 = _mm_set1_epi8((char)0); + __m128i count1 = _mm_set1_epi8((char)0); + __m128i count2 = _mm_set1_epi8((char)0); + vx_uint8 * srcRow = pSrcImage; + vx_uint32 width = (srcWidth + 15) >> 4; + for (unsigned int y = 0; y < srcHeight; y++) { + __m128i * src = (__m128i *)srcRow; + for (unsigned int x = 0; x < width; x++) { + __m128i pixels = _mm_load_si128(src++); + pixels = _mm_xor_si128(pixels, offset); + __m128i cmpout; + cmpout = _mm_cmpgt_epi8(pixels, T0); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count0 = _mm_add_epi32(count0, cmpout); + cmpout = _mm_cmpgt_epi8(pixels, T1); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count1 = _mm_add_epi32(count1, cmpout); + cmpout = _mm_cmpgt_epi8(pixels, T2); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + 
count2 = _mm_add_epi32(count2, cmpout); + } + srcRow += srcImageStrideInBytes; + } + // extract histogram from count: special case needed when T1 == T2 + dstHist[0] = M128I(count0).m128i_u32[0] + M128I(count0).m128i_u32[2]; + dstHist[1] = M128I(count1).m128i_u32[0] + M128I(count1).m128i_u32[2] - dstHist[0]; + dstHist[2] = M128I(count2).m128i_u32[0] + M128I(count2).m128i_u32[2] - dstHist[0] - dstHist[1]; + dstHist[3] = srcWidth * srcHeight - dstHist[0] - dstHist[1] - dstHist[2]; + if (M128I(T1).m128i_i8[0] == M128I(T2).m128i_i8[0]) { + dstHist[2] = dstHist[3]; + dstHist[3] = 0; + } + return AGO_SUCCESS; +} + +static int HafCpu_Histogram8Bins_DATA_U8 + ( + vx_uint32 * dstHist, + vx_uint8 distOffset, + vx_uint8 distWindow, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + // thresh: source threshold in -128..127 range + __m128i offset = _mm_set1_epi8((char)0x80); + __m128i T0 = _mm_set1_epi8((char)(((distOffset ? distOffset : distWindow) - 1) ^ 0x80)); + __m128i dT = _mm_set1_epi8((char)distWindow); + __m128i onemask = _mm_set1_epi8((char)1); + // process one pixel row at a time that counts "pixel < srcThreshold" + vx_uint32 count[9] = { 0 }; + vx_uint8 * srcRow = pSrcImage; + vx_uint32 width = (srcWidth + 15) >> 4; + for (unsigned int y = 0; y < srcHeight; y++) { + __m128i * src = (__m128i *)srcRow; + __m128i count0 = _mm_set1_epi8((char)0); + __m128i count1 = _mm_set1_epi8((char)0); + __m128i count2 = _mm_set1_epi8((char)0); + for (unsigned int x = 0; x < width; x++) { + __m128i pixels = _mm_load_si128(src++); + pixels = _mm_xor_si128(pixels, offset); + __m128i cmpout, Tnext = T0; + // 0..3 + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count0 = _mm_add_epi32(count0, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 16); + count0 = _mm_add_epi32(count0, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 32); + count0 = _mm_add_epi32(count0, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 48); + count0 = _mm_add_epi32(count0, cmpout); + // 4..7 + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count1 = _mm_add_epi32(count1, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 16); + count1 = _mm_add_epi32(count1, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 32); + count1 = _mm_add_epi32(count1, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, 
onemask); + cmpout = _mm_slli_epi64(cmpout, 48); + count1 = _mm_add_epi32(count1, cmpout); + // 8 + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count2 = _mm_add_epi32(count2, cmpout); + } + srcRow += srcImageStrideInBytes; + // move counts from count0..2 into count[] + for (int i = 0; i < 4; i++) { + count[ 0 + i] += M128I(count0).m128i_u16[i] + M128I(count0).m128i_u16[4 + i]; + count[ 4 + i] += M128I(count1).m128i_u16[i] + M128I(count1).m128i_u16[4 + i]; + } + count[8 + 0] += M128I(count2).m128i_u16[0] + M128I(count2).m128i_u16[4 + 0]; + } + // extract histogram from count + if (distOffset == 0) { + vx_uint32 last = (distWindow >= 32) ? srcWidth * srcHeight : count[7]; + for (int i = 6; i >= 0; i--) { + count[i] = last - count[i]; + last -= count[i]; + } + dstHist[0] = last; + for (int i = 1; i < 8; i++) + dstHist[i] = count[i - 1]; + } + else { + vx_uint32 last = (distOffset + distWindow * 8 - 1 > 255) ? srcWidth * srcHeight : count[8]; + for (int i = 7; i >= 0; i--) { + count[i] = last - count[i]; + last -= count[i]; + dstHist[i] = count[i]; + } + } + return AGO_SUCCESS; +} + +static int HafCpu_Histogram9Bins_DATA_U8 + ( + vx_uint32 * dstHist, + vx_uint8 distOffset, + vx_uint8 distWindow, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + // thresh: source threshold in -128..127 range + __m128i offset = _mm_set1_epi8((char)0x80); + __m128i T0 = _mm_set1_epi8((char)(((distOffset ? distOffset : distWindow) - 1) ^ 0x80)); + __m128i dT = _mm_set1_epi8((char)distWindow); + __m128i onemask = _mm_set1_epi8((char)1); + // process one pixel row at a time that counts "pixel < srcThreshold" + vx_uint32 count[10] = { 0 }; + vx_uint8 * srcRow = pSrcImage; + vx_uint32 width = (srcWidth + 15) >> 4; + for (unsigned int y = 0; y < srcHeight; y++) { + __m128i * src = (__m128i *)srcRow; + __m128i count0 = _mm_set1_epi8((char)0); + __m128i count1 = _mm_set1_epi8((char)0); + __m128i count2 = _mm_set1_epi8((char)0); + for (unsigned int x = 0; x < width; x++) { + __m128i pixels = _mm_load_si128(src++); + pixels = _mm_xor_si128(pixels, offset); + __m128i cmpout, Tnext = T0; + // 0..3 + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count0 = _mm_add_epi32(count0, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 16); + count0 = _mm_add_epi32(count0, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 32); + count0 = _mm_add_epi32(count0, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 48); + count0 = _mm_add_epi32(count0, cmpout); + // 4..7 + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count1 = _mm_add_epi32(count1, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + 
cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 16); + count1 = _mm_add_epi32(count1, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 32); + count1 = _mm_add_epi32(count1, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 48); + count1 = _mm_add_epi32(count1, cmpout); + // 8..9 + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count2 = _mm_add_epi32(count2, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 16); + count2 = _mm_add_epi32(count2, cmpout); + } + srcRow += srcImageStrideInBytes; + // move counts from count0..2 into count[] + for (int i = 0; i < 4; i++) { + count[0 + i] += M128I(count0).m128i_u16[i] + M128I(count0).m128i_u16[4 + i]; + count[4 + i] += M128I(count1).m128i_u16[i] + M128I(count1).m128i_u16[4 + i]; + } + count[8 + 0] += M128I(count2).m128i_u16[0] + M128I(count2).m128i_u16[4 + 0]; + count[8 + 1] += M128I(count2).m128i_u16[1] + M128I(count2).m128i_u16[4 + 1]; + } + // extract histogram from count + if (distOffset == 0) { + vx_uint32 last = (distWindow >= 29) ? srcWidth * srcHeight : count[8]; + for (int i = 7; i >= 0; i--) { + count[i] = last - count[i]; + last -= count[i]; + } + dstHist[0] = last; + for (int i = 1; i < 9; i++) + dstHist[i] = count[i - 1]; + } + else { + vx_uint32 last = (distOffset + distWindow * 9 - 1 > 255) ? srcWidth * srcHeight : count[9]; + for (int i = 8; i >= 0; i--) { + count[i] = last - count[i]; + last -= count[i]; + dstHist[i] = count[i]; + } + } + return AGO_SUCCESS; +} + +static int HafCpu_Histogram16Bins_DATA_U8 + ( + vx_uint32 * dstHist, + vx_uint8 distOffset, + vx_uint8 distWindow, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + // offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes + // thresh: source threshold in -128..127 range + __m128i offset = _mm_set1_epi8((char)0x80); + __m128i T0 = _mm_set1_epi8((char)(((distOffset ? 
distOffset : distWindow) - 1) ^ 0x80)); + __m128i dT = _mm_set1_epi8((char)distWindow); + __m128i onemask = _mm_set1_epi8((char)1); + // process one pixel row at a time that counts "pixel < srcThreshold" + vx_uint32 count[16] = { 0 }; + vx_uint8 * srcRow = pSrcImage; + vx_uint32 width = (srcWidth + 15) >> 4; + for (unsigned int y = 0; y < srcHeight; y++) { + __m128i * src = (__m128i *)srcRow; + __m128i count0 = _mm_set1_epi8((char)0); + __m128i count1 = _mm_set1_epi8((char)0); + __m128i count2 = _mm_set1_epi8((char)0); + __m128i count3 = _mm_set1_epi8((char)0); + for (unsigned int x = 0; x < width; x++) { + __m128i pixels = _mm_load_si128(src++); + pixels = _mm_xor_si128(pixels, offset); + __m128i cmpout, Tnext = T0; + // 0..3 + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count0 = _mm_add_epi32(count0, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 16); + count0 = _mm_add_epi32(count0, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 32); + count0 = _mm_add_epi32(count0, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 48); + count0 = _mm_add_epi32(count0, cmpout); + // 4..7 + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count1 = _mm_add_epi32(count1, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 16); + count1 = _mm_add_epi32(count1, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 32); + count1 = _mm_add_epi32(count1, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 48); + count1 = _mm_add_epi32(count1, cmpout); + // 8..11 + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count2 = _mm_add_epi32(count2, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 16); + count2 = _mm_add_epi32(count2, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 32); + count2 = _mm_add_epi32(count2, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 48); + count2 = _mm_add_epi32(count2, cmpout); + // 12..15 + Tnext = 
_mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + count3 = _mm_add_epi32(count3, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 16); + count3 = _mm_add_epi32(count3, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 32); + count3 = _mm_add_epi32(count3, cmpout); + Tnext = _mm_add_epi8(Tnext, dT); + cmpout = _mm_cmpgt_epi8(pixels, Tnext); + cmpout = _mm_and_si128(cmpout, onemask); + cmpout = _mm_sad_epu8(cmpout, onemask); + cmpout = _mm_slli_epi64(cmpout, 48); + count3 = _mm_add_epi32(count3, cmpout); + } + srcRow += srcImageStrideInBytes; + // move counts from count0..2 into count[] + for (int i = 0; i < 4; i++) { + count[ 0 + i] += M128I(count0).m128i_u16[i] + M128I(count0).m128i_u16[4 + i]; + count[ 4 + i] += M128I(count1).m128i_u16[i] + M128I(count1).m128i_u16[4 + i]; + count[ 8 + i] += M128I(count2).m128i_u16[i] + M128I(count2).m128i_u16[4 + i]; + count[12 + i] += M128I(count3).m128i_u16[i] + M128I(count3).m128i_u16[4 + i]; + } + } + // extract histogram from count + if (distOffset == 0) { + vx_uint32 last = (distWindow >= 16) ? srcWidth * srcHeight : count[15]; + for (int i = 14; i >= 0; i--) { + count[i] = last - count[i]; + last -= count[i]; + } + dstHist[0] = last; + for (int i = 1; i < 16; i++) + dstHist[i] = count[i - 1]; + } + else { + vx_uint32 last = srcWidth * srcHeight; + for (int i = 15; i >= 0; i--) { + count[i] = last - count[i]; + last -= count[i]; + dstHist[i] = count[i]; + } + } + return AGO_SUCCESS; +} + +int HafCpu_HistogramFixedBins_DATA_U8 + ( + vx_uint32 dstHist[], + vx_uint32 distBinCount, + vx_uint32 distOffset, + vx_uint32 distRange, + vx_uint32 distWindow, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + int status = AGO_ERROR_HAFCPU_NOT_IMPLEMENTED; + + // compute number of split points in [0..255] range to compute the histogram + vx_int32 numSplits = (distBinCount - 1) + ((distOffset > 0) ? 1 : 0) + (((distOffset + distRange) < 256) ? 1 : 0); + bool useGeneral = (srcWidth & 7) || (((intptr_t)pSrcImage) & 15); // Use general code if width is not multiple of 8 or the buffer is unaligned + if ((numSplits < 1 && distBinCount > 1) || (distBinCount == 0)) return status; + + if (numSplits <= 3 && !useGeneral) { + if (numSplits == 0) { + dstHist[0] = srcWidth * srcHeight; + status = VX_SUCCESS; + } + else if (numSplits == 1) { + vx_uint32 hist[2]; + status = HafCpu_Histogram1Threshold_DATA_U8(hist, distOffset ? distOffset : distWindow, srcWidth, srcHeight, pSrcImage, srcImageStrideInBytes); + if (distBinCount == 1) { + dstHist[0] = hist[distOffset > 0 ? 
1 : 0]; + } + else { + dstHist[0] = hist[0]; + dstHist[1] = hist[1]; + } + } + else { + // compute thresholds (split-points) + vx_uint8 thresh[3], tlast = 0; + vx_uint32 split = 0; + if (distOffset > 0) + tlast = thresh[split++] = distOffset; + for (vx_uint32 bin = 1; bin < distBinCount; bin++) + tlast = thresh[split++] = tlast + distWindow; + if (split < 3) { + if (((int)distOffset + distRange) < 256) + tlast = thresh[split++] = tlast + distWindow; + while (split < 3) + thresh[split++] = tlast; + } + vx_uint32 count[4]; + status = HafCpu_Histogram3Thresholds_DATA_U8(count, thresh[0], thresh[1], thresh[2], srcWidth, srcHeight, pSrcImage, srcImageStrideInBytes); + if (!status) { + for (vx_uint32 i = 0; i < distBinCount; i++) { + dstHist[i] = count[i + (distOffset ? 1 : 0)]; + } + } + } + } + else if (distBinCount == 8 && !useGeneral) { + status = HafCpu_Histogram8Bins_DATA_U8(dstHist, distOffset, distWindow, srcWidth, srcHeight, pSrcImage, srcImageStrideInBytes); + } + else if (distBinCount == 9 && !useGeneral) { + status = HafCpu_Histogram9Bins_DATA_U8(dstHist, distOffset, distWindow, srcWidth, srcHeight, pSrcImage, srcImageStrideInBytes); + } + else if (distBinCount == 16 && numSplits <= 16 && !useGeneral) { + status = HafCpu_Histogram16Bins_DATA_U8(dstHist, distOffset, distWindow, srcWidth, srcHeight, pSrcImage, srcImageStrideInBytes); + } + else { + // use general 256-bin histogram + vx_uint32 histTmp[256]; + status = HafCpu_Histogram_DATA_U8(histTmp, srcWidth, srcHeight, pSrcImage, srcImageStrideInBytes); + if (!status) { + // convert [256] histogram into [numbins] + if (distWindow == 1) { + memcpy(dstHist, &histTmp[distOffset], distBinCount * sizeof(vx_uint32)); + } + else { + for (vx_uint32 i = 0, j = distOffset; i < distBinCount; i++) { + vx_uint32 count = 0, end = distOffset + distRange; + for (vx_uint32 jend = ((j + distWindow) < end) ? (j + distWindow) : end; j < jend; j++) { + count += histTmp[j]; + } + dstHist[i] = count; + } + } + } + } + return status; +} diff --git a/openvx/ago/ago_haf_cpu_logical.cpp b/openvx/ago/ago_haf_cpu_logical.cpp new file mode 100644 index 0000000..ffc163e --- /dev/null +++ b/openvx/ago/ago_haf_cpu_logical.cpp @@ -0,0 +1,4759 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#include "ago_internal.h" + +vx_uint32 dataConvertU1ToU8_4bytes[16] = { 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, + 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, + 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, + 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF }; + +int HafCpu_Not_U8_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage | (intptr_t)pDstImage) & 0xF) == 0) ? true : false; // Check if src and dst buffers are 16 byte aligned + + __m128i *pLocalSrc_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc, *pLocalDst; + + int height = (int)dstHeight; + int alignedWidth = (int)(dstWidth & ~63); + int postfixWidth = dstWidth - alignedWidth; + + __m128i ones = _mm_setzero_si128(); + ones = _mm_cmpeq_epi32(ones, ones); + + if (useAligned) + { + while (height > 0) + { + + pLocalSrc_xmm = (__m128i*) pSrcImage; + pLocalDst_xmm = (__m128i*) pDstImage; + int width = alignedWidth >> 6; + + while (width > 0) + { + __m128i pixels0 = _mm_load_si128(pLocalSrc_xmm++); + + __m128i pixels1 = _mm_load_si128(pLocalSrc_xmm++); + pixels0 = _mm_andnot_si128(pixels0, ones); + + __m128i pixels2 = _mm_load_si128(pLocalSrc_xmm++); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(pLocalDst_xmm++, pixels0); + + __m128i pixels3 = _mm_load_si128(pLocalSrc_xmm++); + pixels2 = _mm_andnot_si128(pixels2, ones); + _mm_store_si128(pLocalDst_xmm++, pixels1); + + pixels3 = _mm_andnot_si128(pixels3, ones); + _mm_store_si128(pLocalDst_xmm++, pixels2); + + _mm_store_si128(pLocalDst_xmm++, pixels3); + + width--; + } + + width = postfixWidth; + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + while (width > 0) + { + *pLocalDst++ = ~(*pLocalSrc++); + width--; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + } + else // Unaligned access + { + while (height > 0) + { + + pLocalSrc_xmm = (__m128i*) pSrcImage; + pLocalDst_xmm = (__m128i*) pDstImage; + int width = alignedWidth >> 6; + + while (width > 0) + { + __m128i pixels0 = _mm_loadu_si128(pLocalSrc_xmm++); + + __m128i pixels1 = _mm_loadu_si128(pLocalSrc_xmm++); + pixels0 = _mm_andnot_si128(pixels0, ones); + + __m128i pixels2 = _mm_loadu_si128(pLocalSrc_xmm++); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels0); + + __m128i pixels3 = _mm_loadu_si128(pLocalSrc_xmm++); + pixels2 = _mm_andnot_si128(pixels2, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + + pixels3 = _mm_andnot_si128(pixels3, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels2); + + _mm_storeu_si128(pLocalDst_xmm++, pixels3); + + width--; + } + + width = postfixWidth; + pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + while (width > 0) + { + *pLocalDst++ = ~(*pLocalSrc++); + width--; + } + + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + } + } + +#if 0 + __m128i *pLocalSrc, *pLocalDst; + + int height = (int)dstHeight, width = (int)(dstWidth >> 7); + + __m128i ones = _mm_setzero_si128(); + ones = _mm_cmpeq_epi32(ones, ones); + + while (height > 0) + { + pLocalSrc = (__m128i*) pSrcImage; + pLocalDst = (__m128i*) pDstImage; + while (width > 0) + { + __m128i pixels0 = _mm_load_si128(pLocalSrc++); + __m128i pixels1 = _mm_load_si128(pLocalSrc++); + __m128i pixels2 = _mm_load_si128(pLocalSrc++); + __m128i pixels3 = 
_mm_load_si128(pLocalSrc++); + __m128i pixels4 = _mm_load_si128(pLocalSrc++); + __m128i pixels5 = _mm_load_si128(pLocalSrc++); + __m128i pixels6 = _mm_load_si128(pLocalSrc++); + __m128i pixels7 = _mm_load_si128(pLocalSrc++); + + pixels0 = _mm_andnot_si128(pixels0, ones); + pixels1 = _mm_andnot_si128(pixels1, ones); + pixels2 = _mm_andnot_si128(pixels2, ones); + pixels3 = _mm_andnot_si128(pixels3, ones); + pixels4 = _mm_andnot_si128(pixels4, ones); + pixels5 = _mm_andnot_si128(pixels5, ones); + pixels6 = _mm_andnot_si128(pixels6, ones); + pixels7 = _mm_andnot_si128(pixels7, ones); + + _mm_store_si128(pLocalDst++, pixels0); + _mm_store_si128(pLocalDst++, pixels1); + _mm_store_si128(pLocalDst++, pixels2); + _mm_store_si128(pLocalDst++, pixels3); + _mm_store_si128(pLocalDst++, pixels4); + _mm_store_si128(pLocalDst++, pixels5); + _mm_store_si128(pLocalDst++, pixels6); + _mm_store_si128(pLocalDst++, pixels7); + + width--; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + height--; + width = (int)(dstWidth >> 7); + } +#endif +#if 0 + _asm { + mov ebx, dstHeight + mov ecx, pDstImage + mov edx, pSrcImage + + pxor xmm0, xmm0 + pcmpeqd xmm0, xmm0 + + OUTERLOOP: + mov eax, dstWidth + INNERLOOP: + movdqa xmm1, [edx] + movdqa xmm2, [edx + 10h] + movdqa xmm3, [edx + 20h] + movdqa xmm4, [edx + 30h] + + pandn xmm1, xmm0 + pandn xmm2, xmm0 + pandn xmm3, xmm0 + pandn xmm4, xmm0 + + movdqa [ecx], xmm1 + movdqa [ecx + 10h], xmm2 + movdqa [ecx + 20h], xmm3 + movdqa [ecx + 30h], xmm4 + + add edx, 40h + add ecx, 40h + sub eax, 40h + jnz INNERLOOP + + dec ebx + jnz OUTERLOOP + } +#endif + return AGO_SUCCESS; +} + +#if USE_BMI2 +/* The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Not_U8_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + __m128i zeromask = _mm_setzero_si128(); + + uint64_t maskConv = 0x0101010101010101; // Getting LSB out of each byte + __declspec(align(16)) uint64_t pixels_u64[2]; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels_u64[0] = (uint64_t)(*pSrcImage); + pixels_u64[1] = (uint64_t)(*(pSrcImage + 8)); + + // Convert U1 to U8 +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + + pixels = _mm_load_si128((__m128i*) pixels_u64); + pixels = _mm_cmpgt_epi8(pixels, zeromask); // Convert 0x01 to 0xFF + + pixels = _mm_andnot_si128(pixels, ones); + _mm_store_si128(&dst[width >> 4], pixels); + } + pSrcImage += srcImageStrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +/* The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Not_U1_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * src = 
(__m128i*)pSrcImage; + __m128i pixels; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels = _mm_load_si128(&src[width >> 4]); + pixels = _mm_andnot_si128(pixels, ones); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src += (srcImageStrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function processes the pixels in a width which is the next highest multiple of 2 bytes after dstWidth */ +int HafCpu_Not_U1_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i pixels; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 2) + { + pixels_u64[0] = (uint64_t)(*pSrcImage); + pixels_u64[1] = (uint64_t)(*(pSrcImage + 8)); + + // Convert U1 to U8 +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels = _mm_load_si128((__m128i*) pixels_u64); + pixels = _mm_andnot_si128(pixels, ones); // Only LSB of each byte counts, because of extract and deposit + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 1)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else +/* The function processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Not_U8_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + vx_int16 inputPixels; + vx_int16 * pLocalSrc; + int *pLocalDst; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = (vx_int16 *)pSrcImage; + pLocalDst = (int *)pDstImage; + for (int width = 0; width < alignedWidth; width += 16) + { + inputPixels = *pLocalSrc++; + inputPixels = ~inputPixels; + *pLocalDst++ = dataConvertU1ToU8_4bytes[inputPixels & 0xF]; + inputPixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[inputPixels & 0xF]; + inputPixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[inputPixels & 0xF]; + inputPixels >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[inputPixels & 0xF]; + } + + if (postfixWidth) + { + vx_uint8 pix = *((vx_uint8 
*)pLocalSrc); + *pLocalDst++ = dataConvertU1ToU8_4bytes[pix & 0xF]; + pix >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pix & 0xF]; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels */ +int HafCpu_Not_U1_U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + __m128i * pLocalSrc_xmm; + vx_int16 * pLocalDst_16; + + __m128i pixels; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + int pixelmask; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc_xmm = (__m128i*)pSrcImage; + pLocalDst_16 = (vx_int16 *)pDstImage; + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc_xmm++); + pixels = _mm_andnot_si128(pixels, ones); + pixelmask = _mm_movemask_epi8(pixels); + + *pLocalDst_16++ = (vx_int16)(pixelmask & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc = (vx_uint8 *)pLocalSrc_xmm; + vx_uint8 * pLocalDst = (vx_uint8 *)pLocalDst_16; + + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= (*pLocalSrc++ >> 7) & 1; + temp <<= 1; + } + *pLocalDst = ~temp; + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the width is a multiple of 8 pixels */ +int HafCpu_Not_U1_U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes + ) +{ + vx_int16 *pLocalSrc, *pLocalDst; + vx_int16 pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc = (short *)pSrcImage; + pLocalDst = (short *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = *pLocalSrc++; + pixels = ~pixels; + *pLocalDst++ = pixels; + } + + if (postfixWidth) + { + *((vx_uint8*)pLocalDst) = ~(*((vx_uint8*)pLocalSrc)); + } + pSrcImage += srcImageStrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +#endif + +int HafCpu_And_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_and_si128(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc1++ & *pLocalSrc2++; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_and_si128(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc1++ & *pLocalSrc2++; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +#if USE_BMI2 +/* The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_And_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_cmpgt_epi8(pixels2, zeromask); // Convert 0x01 to 0xFF + pixels1 = _mm_and_si128(pixels1, pixels2); + _mm_store_si128(&dst[width >> 4], pixels1); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + 
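+/* Illustrative note (sketch only, not one of the library kernels): the BMI2 paths in this
+file all rely on the same U1<->U8 conversion trick. _pdep_u64 deposits the 8 packed pixel
+bits of a U1 byte into the least significant bit of each of 8 output bytes (0x00 or 0x01),
+_mm_cmpgt_epi8 against zero then widens 0x01 to a full 0xFF mask so it can be used with the
+byte-wise logical ops, and _pext_u64 performs the inverse packing for U1 outputs. A minimal
+scalar sketch of that round trip, assuming a 64-bit build with BMI2 available, is kept under
+#if 0 below for reference only. */
+#if 0
+#include <immintrin.h>
+#include <stdint.h>
+#include <string.h>
+static inline void u1_to_u8_8pixels(uint8_t bits, uint8_t dst[8])
+{
+	// deposit bit i of the packed U1 byte into the LSB of output byte i (each byte becomes 0x00 or 0x01)
+	uint64_t bytes = _pdep_u64((uint64_t)bits, 0x0101010101010101ull);
+	memcpy(dst, &bytes, 8);
+}
+static inline uint8_t u8_to_u1_8pixels(const uint8_t src[8])
+{
+	uint64_t bytes;
+	memcpy(&bytes, src, 8);
+	// gather the LSB of each of the 8 bytes back into one packed U1 byte
+	return (uint8_t)_pext_u64(bytes, 0x0101010101010101ull);
+}
+#endif
+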
+/* The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_And_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_and_si128(pixels1, pixels2); // Only the LSB here has the AND value + pixels1 = _mm_cmpgt_epi8(pixels1, zeromask); // Convert 0x01 to 0xFF + _mm_store_si128(&dst[width >> 4], pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +/* The function assumes that the source image pointers are 16 byte aligned, and the source strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_And_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i pixels1, pixels2; + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + pixels2 = _mm_load_si128(&src2[width >> 4]); + pixels1 = _mm_and_si128(pixels1, pixels2); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + src2 += (srcImage2StrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +/* The function assumes that the image pointers are 16 
byte aligned, and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_And_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels1 = _mm_and_si128(pixels1, pixels2); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_And_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_and_si128(pixels1, pixels2); // Only the LSB here has the AND value + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + 
*((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else + +int HafCpu_And_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i *pLocalSrc1_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalDst; + vx_int16 *pLocalSrc2; + __m128i pixels1, pixels2; + vx_int16 U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_and_si128(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + *pLocalDst = (temp & 1) * (vx_uint8)(*pLocalSrc1); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_and_si128(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + *pLocalDst = (temp & 1) * (vx_uint8)(*pLocalSrc1); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the width is a multiple of 8 pixels */ +int HafCpu_And_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + 
vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_uint8 *pLocalSrc1, *pLocalSrc2; + vx_int32 * pLocalDst; + vx_uint8 pixels1, pixels2; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_uint8 *)pSrcImage1; + pLocalSrc2 = (vx_uint8 *)pSrcImage2; + pLocalDst = (vx_int32 *)pDstImage; + + for (int width = 0; width < (int)dstWidth; width += 8) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = pixels1 & pixels2; + + // U1 to U8 + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + pixels1 >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels*/ +int HafCpu_And_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i * pLocalSrc1_xmm, *pLocalSrc2_xmm; + __m128i pixels1, pixels2; + int U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_and_si128(pixels1, pixels2); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++ & *pLocalSrc2++) >> 7) & 1; // the signed bit has the information + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_and_si128(pixels1, pixels2); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++ & *pLocalSrc2++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels*/ +int HafCpu_And_U1_U8U1 
+ ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * pLocalSrc1_xmm; + + __m128i pixels; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + vx_int16 * pLocalSrc2_16 = (vx_int16 *)pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels1 = (vx_int16)(_mm_movemask_epi8(pixels) & 0xFFFF); + pixels2 = *pLocalSrc2_16++; + + pixels1 = pixels1 & pixels2; + *pLocalDst_16++ = pixels1; + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + vx_uint8 * pLocalDst = (vx_uint8 *)pLocalDst_16; + vx_uint8 pix = *((vx_uint8 *)pLocalSrc2_16); + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = temp & pix; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the widths are a multiple of 8 pixels*/ +int HafCpu_And_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_int16 *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst = (vx_int16 *)pDstImage; + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = pixels1 & pixels2; + *pLocalDst++ = pixels1; + } + + if (postfixWidth) + { + *((vx_uint8*)pLocalDst) = *((vx_uint8*)pLocalSrc1) & *((vx_uint8*)pLocalSrc2); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#endif + +int HafCpu_Or_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_or_si128(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc1++ | *pLocalSrc2++; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_or_si128(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc1++ | *pLocalSrc2++; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +#if USE_BMI2 +/* The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Or_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_cmpgt_epi8(pixels2, zeromask); // Convert 0x01 to 0xFF + pixels1 = _mm_or_si128(pixels1, pixels2); + _mm_store_si128(&dst[width >> 4], pixels1); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +/* The 
function assumes that the destination image pointer is 16 byte aligned, and the destination stride as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Or_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_or_si128(pixels1, pixels2); // Only the LSB here has the AND value + pixels1 = _mm_cmpgt_epi8(pixels1, zeromask); // Convert 0x01 to 0xFF + _mm_store_si128(&dst[width >> 4], pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +/* The function assumes that the source image pointers are 16 byte aligned, and the source strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Or_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i pixels1, pixels2; + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + pixels2 = _mm_load_si128(&src2[width >> 4]); + pixels1 = _mm_or_si128(pixels1, pixels2); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + src2 += (srcImage2StrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +/* The function assumes that the image pointers are 16 byte aligned, 
and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Or_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels1 = _mm_or_si128(pixels1, pixels2); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Or_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_or_si128(pixels1, pixels2); // Only the LSB here has the AND value + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short 
*)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else +/* The function assumes that the widths are a multiple of 8 pixels*/ +int HafCpu_Or_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i *pLocalSrc1_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalDst; + vx_int16 *pLocalSrc2; + __m128i pixels1, pixels2; + vx_int16 U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_or_si128(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + *pLocalDst = (temp & 1) ? (vx_uint8)255 : (vx_uint8)(*pLocalSrc1); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_or_si128(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + *pLocalDst = (temp & 1) ? 
(vx_uint8)255 : (vx_uint8)(*pLocalSrc1); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the width is a multiple of 8 pixels */ +int HafCpu_Or_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_uint8 *pLocalSrc1, *pLocalSrc2; + vx_int32 * pLocalDst; + vx_uint8 pixels1, pixels2; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_uint8 *)pSrcImage1; + pLocalSrc2 = (vx_uint8 *)pSrcImage2; + pLocalDst = (vx_int32 *)pDstImage; + + for (int width = 0; width < (int)dstWidth; width += 8) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = pixels1 | pixels2; + + // U1 to U8 + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + pixels1 >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels*/ +int HafCpu_Or_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i * pLocalSrc1_xmm, *pLocalSrc2_xmm; + __m128i pixels1, pixels2; + int U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_or_si128(pixels1, pixels2); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++ | *pLocalSrc2++) >> 7) & 1; // the signed bit has the information + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_or_si128(pixels1, pixels2); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst 
= (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++ | *pLocalSrc2++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels*/ +int HafCpu_Or_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * pLocalSrc1_xmm; + + __m128i pixels; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + vx_int16 * pLocalSrc2_16 = (vx_int16 *)pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels1 = (vx_int16)(_mm_movemask_epi8(pixels) & 0xFFFF); + pixels2 = *pLocalSrc2_16++; + + pixels1 = pixels1 | pixels2; + *pLocalDst_16++ = pixels1; + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + vx_uint8 * pLocalDst = (vx_uint8 *)pLocalDst_16; + vx_uint8 pix = *((vx_uint8 *)pLocalSrc2_16); + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = temp | pix; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the widths are a multiple of 8 pixels */ +int HafCpu_Or_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_int16 *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst = (vx_int16 *)pDstImage; + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = pixels1 | pixels2; + *pLocalDst++ = pixels1; + } + + if (postfixWidth) + { + *((vx_uint8*)pLocalDst) = *((vx_uint8*)pLocalSrc1) | *((vx_uint8*)pLocalSrc2); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#endif + +int HafCpu_Xor_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes +) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_xor_si128(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc1++ ^ *pLocalSrc2++; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_xor_si128(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = *pLocalSrc1++ ^ *pLocalSrc2++; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +#if USE_BMI2 +/* The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Xor_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_cmpgt_epi8(pixels2, zeromask); // Convert 0x01 to 0xFF + pixels1 = _mm_xor_si128(pixels1, pixels2); + _mm_store_si128(&dst[width >> 4], pixels1); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + 
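/* A minimal standalone sketch of the U1 -> U8 expansion that the BMI2 variants
   in this file rely on: each U1 bit is deposited into the least significant bit
   of a byte with _pdep_u64, and the resulting 0x01/0x00 bytes are widened to
   0xFF/0x00 with a signed compare against zero.  The helper below is only
   illustrative (its name is not part of this diff); it assumes a 64-bit BMI2
   target with <immintrin.h> available, as the surrounding code already does. */
static inline __m128i ExpandU1ToU8Mask_sketch(const vx_uint8 * pU1)
{
	// Two input bytes cover 16 pixels: bit i of each byte lands in bit 8*i,
	// i.e. in the LSB of byte i of the corresponding 64-bit lane.
	uint64_t lane0 = _pdep_u64((uint64_t)pU1[0], 0x0101010101010101ULL);
	uint64_t lane1 = _pdep_u64((uint64_t)pU1[1], 0x0101010101010101ULL);
	__m128i pixels = _mm_set_epi64x((long long)lane1, (long long)lane0);
	// 0x01 -> 0xFF, 0x00 -> 0x00, matching the _mm_cmpgt_epi8 step used above;
	// the inverse direction (U8 -> U1) is handled with _pext_u64 elsewhere in this file.
	return _mm_cmpgt_epi8(pixels, _mm_setzero_si128());
}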
+/* The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Xor_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_xor_si128(pixels1, pixels2); // Only the LSB here has the AND value + pixels1 = _mm_cmpgt_epi8(pixels1, zeromask); // Convert 0x01 to 0xFF + _mm_store_si128(&dst[width >> 4], pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +/* The function assumes that the source image pointers are 16 byte aligned, and the source strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Xor_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i pixels1, pixels2; + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + pixels2 = _mm_load_si128(&src2[width >> 4]); + pixels1 = _mm_xor_si128(pixels1, pixels2); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + src2 += (srcImage2StrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +/* The function assumes that the image pointers are 16 
byte aligned, and the source and destination strides as well +It processes the pixels in a width which is the next highest multiple of 16 after dstWidth */ +int HafCpu_Xor_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + + pixels1 = _mm_xor_si128(pixels1, pixels2); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Xor_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_xor_si128(pixels1, pixels2); // Only the LSB here has the AND value + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + 
*((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else +/* The function assumes that the image widths are a multiple of 8 pixels */ +int HafCpu_Xor_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i *pLocalSrc1_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalDst; + vx_int16 *pLocalSrc2; + __m128i pixels1, pixels2; + vx_int16 U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_xor_si128(pixels1, pixels2); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + vx_uint8 pix; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + pix = (temp & 1) ? (vx_uint8)255 : 0; + *pLocalDst = pix ^ ((vx_uint8)(*pLocalSrc1)); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_xor_si128(pixels1, pixels2); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + vx_uint8 pix; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + pix = (temp & 1) ? 
(vx_uint8)255 : 0; + *pLocalDst = pix ^ ((vx_uint8)(*pLocalSrc1)); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the width is a multiple of 8 pixels */ +int HafCpu_Xor_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_uint8 *pLocalSrc1, *pLocalSrc2; + vx_int32 * pLocalDst; + vx_uint8 pixels1, pixels2; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_uint8 *)pSrcImage1; + pLocalSrc2 = (vx_uint8 *)pSrcImage2; + pLocalDst = (vx_int32 *)pDstImage; + + for (int width = 0; width < (int)dstWidth; width += 8) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = pixels1 ^ pixels2; + + // U1 to U8 + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + pixels1 >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels */ +int HafCpu_Xor_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i * pLocalSrc1_xmm, *pLocalSrc2_xmm; + __m128i pixels1, pixels2; + int U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_xor_si128(pixels1, pixels2); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++ ^ *pLocalSrc2++) >> 7) & 1; // the signed bit has the information + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_xor_si128(pixels1, pixels2); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = 
(vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++ ^ *pLocalSrc2++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels*/ +int HafCpu_Xor_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * pLocalSrc1_xmm; + + __m128i pixels; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + vx_int16 * pLocalSrc2_16 = (vx_int16 *)pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels1 = (vx_int16)(_mm_movemask_epi8(pixels) & 0xFFFF); + pixels2 = *pLocalSrc2_16++; + + pixels1 = pixels1 ^ pixels2; + *pLocalDst_16++ = pixels1; + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + vx_uint8 * pLocalDst = (vx_uint8 *)pLocalDst_16; + vx_uint8 pix = *((vx_uint8 *)pLocalSrc2_16); + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = temp ^ pix; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the widths are a multiple of 8 pixels*/ +int HafCpu_Xor_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_int16 *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst = (vx_int16 *)pDstImage; + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = pixels1 ^ pixels2; + *pLocalDst++ = pixels1; + } + + if (postfixWidth) + { + *((vx_uint8*)pLocalDst) = *((vx_uint8*)pLocalSrc1) ^ *((vx_uint8*)pLocalSrc2); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#endif + +int HafCpu_Nand_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_and_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = ~(*pLocalSrc1++ & *pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_and_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = ~(*pLocalSrc1++ & *pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +#if USE_BMI2 +int HafCpu_Nand_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_cmpgt_epi8(pixels2, zeromask); // Convert 0x01 to 0xFF + pixels1 = _mm_and_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(&dst[width >> 4], pixels1); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } 
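	// The NAND itself comes from _mm_andnot_si128(a, ones): andnot computes (~a) & b,
	// so with b set to all-ones it is simply the complement of the AND computed in the
	// loop above, applied to every byte of the row.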
+ return AGO_SUCCESS; +} + +int HafCpu_Nand_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_and_si128(pixels1, pixels2); // Only the LSB here has the AND value + pixels1 = _mm_cmpgt_epi8(pixels1, zeromask); // Convert 0x01 to 0xFF + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(&dst[width >> 4], pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; + +} + +int HafCpu_Nand_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i pixels1, pixels2; + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + pixels2 = _mm_load_si128(&src2[width >> 4]); + pixels1 = _mm_and_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + src2 += (srcImage2StrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +int HafCpu_Nand_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + 
) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels1 = _mm_and_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Nand_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_and_si128(pixels1, pixels2); // Only the LSB here has the AND value + pixels1 = _mm_andnot_si128(pixels1, ones); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += 
dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else +/* The function assumes that the image widths are a multiple of 8 pixels */ +int HafCpu_Nand_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i *pLocalSrc1_xmm, *pLocalDst_xmm; + __m128i ones = _mm_set1_epi32(0xFFFFFFFF); + + vx_uint8 *pLocalSrc1, *pLocalDst; + vx_int16 *pLocalSrc2; + __m128i pixels1, pixels2; + vx_int16 U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_and_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + *pLocalDst = ~((temp & 1) * (vx_uint8)(*pLocalSrc1)); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_and_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + *pLocalDst = ~((temp & 1) * (vx_uint8)(*pLocalSrc1)); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the width is a multiple of 8 pixels */ +int HafCpu_Nand_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 
srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_uint8 *pLocalSrc1, *pLocalSrc2; + vx_int32 * pLocalDst; + vx_uint8 pixels1, pixels2; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_uint8 *)pSrcImage1; + pLocalSrc2 = (vx_uint8 *)pSrcImage2; + pLocalDst = (vx_int32 *)pDstImage; + + for (int width = 0; width < (int)dstWidth; width += 8) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = ~(pixels1 & pixels2); + + // U1 to U8 + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + pixels1 >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels */ +int HafCpu_Nand_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i * pLocalSrc1_xmm, *pLocalSrc2_xmm; + __m128i pixels1, pixels2; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + int U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_and_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++ & *pLocalSrc2++) >> 7) & 1; // the signed bit has the information + temp <<= 1; + } + *pLocalDst++ = ~temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_and_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++ & *pLocalSrc2++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = ~temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; 
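+			// Note on the U8 -> U1 conversion in both branches of this function: _mm_movemask_epi8
+			// gathers bit 7 (the sign bit) of each of the 16 NAND result bytes into one 16-bit word,
+			// so bit 7 of each U8 pixel determines its U1 value, with pixel 0 landing in the least
+			// significant bit of the output word.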
+ } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels */ +int HafCpu_Nand_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * pLocalSrc1_xmm; + + __m128i pixels; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + vx_int16 * pLocalSrc2_16 = (vx_int16 *)pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels1 = (vx_int16)(_mm_movemask_epi8(pixels) & 0xFFFF); + pixels2 = *pLocalSrc2_16++; + + pixels1 = pixels1 & pixels2; + *pLocalDst_16++ = ~pixels1; + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + vx_uint8 * pLocalDst = (vx_uint8 *)pLocalDst_16; + vx_uint8 pix = *((vx_uint8 *)pLocalSrc2_16); + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = ~(temp & pix); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Nand_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_int16 *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst = (vx_int16 *)pDstImage; + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = pixels1 & pixels2; + *pLocalDst++ = ~pixels1; + } + + if (postfixWidth) + { + *((vx_uint8*)pLocalDst) = ~(*((vx_uint8*)pLocalSrc1) & *((vx_uint8*)pLocalSrc2)); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#endif + +int HafCpu_Nor_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_or_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = ~(*pLocalSrc1++ | *pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_or_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = ~(*pLocalSrc1++ | *pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} +#if USE_BMI2 +int HafCpu_Nor_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_cmpgt_epi8(pixels2, zeromask); // Convert 0x01 to 0xFF + pixels1 = _mm_or_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(&dst[width >> 4], pixels1); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + 
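+	// For reference, the U1 -> U8 expansion in the loop above uses the BMI2 deposit
+	// instruction: _pdep_u64(bits, 0x0101010101010101) places bit i of the source byte into
+	// bit 0 of result byte i (illustrative value: _pdep_u64(0xB2, 0x0101010101010101) ==
+	// 0x0100010100000100), and _mm_cmpgt_epi8 against zero then widens each 0x01 byte to 0xFF.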
return AGO_SUCCESS; +} + +int HafCpu_Nor_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_or_si128(pixels1, pixels2); // Only the LSB here has the AND value + pixels1 = _mm_cmpgt_epi8(pixels1, zeromask); // Convert 0x01 to 0xFF + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(&dst[width >> 4], pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +int HafCpu_Nor_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i pixels1, pixels2; + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + pixels2 = _mm_load_si128(&src2[width >> 4]); + pixels1 = _mm_or_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + src2 += (srcImage2StrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +int HafCpu_Nor_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + 
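+	// This variant combines both BMI2 tricks used in this file: _pdep_u64 with mask
+	// 0x0101010101010101 expands the packed U1 operand to one byte per pixel, the NOR is
+	// computed bytewise, and _pext_u64 with the same mask gathers bit 0 of every result byte
+	// back into packed U1 form (the inverse of the deposit step).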
__m128i * src1 = (__m128i*)pSrcImage1; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels1 = _mm_or_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Nor_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_or_si128(pixels1, pixels2); // Only the LSB here has the AND value + pixels1 = _mm_andnot_si128(pixels1, ones); + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += 
dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else +/* The function assumes that the widths are a multiple of 8 pixels */ +int HafCpu_Nor_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i *pLocalSrc1_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalDst; + vx_int16 *pLocalSrc2; + __m128i pixels1, pixels2; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + vx_int16 U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_or_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + *pLocalDst = (temp & 1) ? (vx_uint8)(*pLocalSrc1) : (vx_uint8)255; + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_or_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + *pLocalDst = (temp & 1) ? 
(vx_uint8)(*pLocalSrc1) : (vx_uint8)255; + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the width is a multiple of 8 pixels */ +int HafCpu_Nor_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_uint8 *pLocalSrc1, *pLocalSrc2; + vx_int32 * pLocalDst; + vx_uint8 pixels1, pixels2; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_uint8 *)pSrcImage1; + pLocalSrc2 = (vx_uint8 *)pSrcImage2; + pLocalDst = (vx_int32 *)pDstImage; + + for (int width = 0; width < (int)dstWidth; width += 8) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = ~(pixels1 | pixels2); + + // U1 to U8 + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + pixels1 >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels */ +int HafCpu_Nor_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i * pLocalSrc1_xmm, *pLocalSrc2_xmm; + __m128i pixels1, pixels2; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + int U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_or_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= (~((*pLocalSrc1++ | *pLocalSrc2++) >> 7)) & 1; // the signed bit has the information + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_or_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if 
(postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= (~((*pLocalSrc1++ | *pLocalSrc2++) >> 7)) & 1; + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels */ +int HafCpu_Nor_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * pLocalSrc1_xmm; + + __m128i pixels; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + vx_int16 * pLocalSrc2_16 = (vx_int16 *)pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels1 = (vx_int16)(_mm_movemask_epi8(pixels) & 0xFFFF); + pixels2 = *pLocalSrc2_16++; + + pixels1 = pixels1 | pixels2; + *pLocalDst_16++ = ~pixels1; + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + vx_uint8 * pLocalDst = (vx_uint8 *)pLocalDst_16; + vx_uint8 pix = *((vx_uint8 *)pLocalSrc2_16); + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = ~(temp | pix); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the widths are a multiple of 8 pixels */ +int HafCpu_Nor_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_int16 *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst = (vx_int16 *)pDstImage; + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = pixels1 | pixels2; + *pLocalDst++ = ~pixels1; + } + + if (postfixWidth) + { + *((vx_uint8*)pLocalDst) = ~(*((vx_uint8*)pLocalSrc1) | *((vx_uint8*)pLocalSrc2)); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#endif + +int HafCpu_Xnor_U8_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2 | (intptr_t)pDstImage) & 0xF) == 0) ? 
true : false; + + __m128i *pLocalSrc1_xmm, *pLocalSrc2_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + __m128i pixels1, pixels2; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_xor_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = ~(*pLocalSrc1++ ^ *pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i*) pSrcImage1; + pLocalSrc2_xmm = (__m128i*) pSrcImage2; + pLocalDst_xmm = (__m128i*) pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_xor_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalSrc2 = (vx_uint8 *)pLocalSrc2_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + for (int width = 0; width < postfixWidth; width++) + { + *pLocalDst++ = ~(*pLocalSrc1++ ^ *pLocalSrc2++); + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +#if USE_BMI2 +int HafCpu_Xnor_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_cmpgt_epi8(pixels2, zeromask); // Convert 0x01 to 0xFF + pixels1 = _mm_xor_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(&dst[width >> 4], pixels1); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } 
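+	// Summary of the loop above: the U1 operand is expanded to 0x00/0xFF bytes via
+	// _pdep_u64 followed by _mm_cmpgt_epi8, XORed with the U8 operand, and then complemented
+	// with _mm_andnot_si128 against an all-ones register, which is the idiom this file uses
+	// for a bitwise NOT.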
+ return AGO_SUCCESS; +} + +int HafCpu_Xnor_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * dst = (__m128i*)pDstImage; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_xor_si128(pixels1, pixels2); // Only the LSB here has the AND value + pixels1 = _mm_andnot_si128(pixels1, ones); + pixels1 = _mm_cmpgt_epi8(pixels1, zeromask); // Convert 0x01 to 0xFF + _mm_store_si128(&dst[width >> 4], pixels1); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + dst += (dstImageStrideInBytes >> 4); + } + return AGO_SUCCESS; +} + +int HafCpu_Xnor_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i * src2 = (__m128i*)pSrcImage2; + __m128i pixels1, pixels2; + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + pixels2 = _mm_load_si128(&src2[width >> 4]); + pixels1 = _mm_xor_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + src2 += (srcImage2StrideInBytes >> 4); + pDstImage += dstImageStrideInBytes; + } + + return AGO_SUCCESS; +} + +int HafCpu_Xnor_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) 
+{ + __m128i * src1 = (__m128i*)pSrcImage1; + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[2]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + pixels1 = _mm_load_si128(&src1[width >> 4]); + + // Read the U1 values + pixels_u64[0] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels2 = _mm_load_si128((__m128i*) pixels_u64); + pixels1 = _mm_xor_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + src1 += (srcImage1StrideInBytes >> 4); + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +int HafCpu_Xnor_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i pixels1, pixels2; + __m128i zeromask = _mm_setzero_si128(); + + __declspec(align(16)) uint64_t pixels_u64[4]; + uint64_t maskConv = 0x0101010101010101; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + for (unsigned int height = 0; height < dstHeight; height++) + { + for (unsigned int width = 0; width < dstWidth; width += 16) + { + // Read the U1 values from src1 + pixels_u64[0] = (uint64_t)(*(pSrcImage1 + (width >> 3))); + pixels_u64[1] = (uint64_t)(*(pSrcImage1 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[0] = _pdep_u64(pixels_u64[0], maskConv); + pixels_u64[1] = _pdep_u64(pixels_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + // Read the U1 values from src2 + pixels_u64[2] = (uint64_t)(*(pSrcImage2 + (width >> 3))); + pixels_u64[3] = (uint64_t)(*(pSrcImage2 + (width >> 3) + 1)); +#ifdef _WIN64 + pixels_u64[2] = _pdep_u64(pixels_u64[2], maskConv); + pixels_u64[3] = _pdep_u64(pixels_u64[3], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + pixels1 = _mm_load_si128((__m128i*) pixels_u64); + pixels2 = _mm_load_si128((__m128i*) (pixels_u64 + 2)); + + pixels1 = _mm_xor_si128(pixels1, pixels2); // Only the LSB here has the AND value + pixels1 = _mm_andnot_si128(pixels1, ones); + + // Convert U8 to U1 +#ifdef _WIN64 + pixels_u64[0] = _pext_u64(pixels1.m128i_u64[0], maskConv); + pixels_u64[1] = _pext_u64(pixels1.m128i_u64[1], maskConv); +#else +#pragma message("Warning: TBD: need a 32-bit implementation using _pext_u32") +#endif + *((unsigned short *)pDstImage + (width >> 4)) = (unsigned short)(((pixels_u64[1] & 0xFF) << 8) | (pixels_u64[0] & 0xFF)); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += 
dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#else +/* The function assumes that the image widths are a multiple of 8 pixels */ +int HafCpu_Xnor_U8_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i *pLocalSrc1_xmm, *pLocalDst_xmm; + vx_uint8 *pLocalSrc1, *pLocalDst; + vx_int16 *pLocalSrc2; + __m128i pixels1, pixels2; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + vx_int16 U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_xor_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_store_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + vx_uint8 pix; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + pix = (temp & 1) ? (vx_uint8)255 : 0; + *pLocalDst = ~(pix ^ ((vx_uint8)(*pLocalSrc1))); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst_xmm = (__m128i *)pDstImage; + int width; + for (width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + + U1pixels = *pLocalSrc2++; + M128I(pixels2).m128i_i32[0] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[1] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[2] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + U1pixels >>= 4; + M128I(pixels2).m128i_i32[3] = dataConvertU1ToU8_4bytes[U1pixels & 0xF]; + + pixels1 = _mm_xor_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + _mm_storeu_si128(pLocalDst_xmm++, pixels1); + } + pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + pLocalDst = (vx_uint8 *)pLocalDst_xmm; + + width = 0; + vx_int16 temp = *pLocalSrc2++; + vx_uint8 pix; + for (int width = 0; width < postfixWidth; width++, pLocalSrc1++, pLocalDst++) + { + pix = (temp & 1) ? 
(vx_uint8)255 : 0; + *pLocalDst = ~(pix ^ ((vx_uint8)(*pLocalSrc1))); + temp >>= 1; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the width is a multiple of 8 pixels */ +int HafCpu_Xnor_U8_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_uint8 *pLocalSrc1, *pLocalSrc2; + vx_int32 * pLocalDst; + vx_uint8 pixels1, pixels2; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_uint8 *)pSrcImage1; + pLocalSrc2 = (vx_uint8 *)pSrcImage2; + pLocalDst = (vx_int32 *)pDstImage; + + for (int width = 0; width < (int)dstWidth; width += 8) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = ~(pixels1 ^ pixels2); + + // U1 to U8 + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + pixels1 >>= 4; + *pLocalDst++ = dataConvertU1ToU8_4bytes[pixels1 & 0xF]; + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels */ +int HafCpu_Xnor_U1_U8U8 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + bool useAligned = ((((intptr_t)pSrcImage1 | (intptr_t)pSrcImage2) & 0xF) == 0) ? true : false; + + __m128i * pLocalSrc1_xmm, *pLocalSrc2_xmm; + __m128i pixels1, pixels2; + __m128i ones = _mm_set1_epi16((short)0xFFFF); + + int U1pixels; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + if (useAligned) + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_load_si128(pLocalSrc1_xmm++); + pixels2 = _mm_load_si128(pLocalSrc2_xmm++); + pixels1 = _mm_xor_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = (vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= (~((*pLocalSrc1++ ^ *pLocalSrc2++) >> 7)) & 1; // the signed bit has the information + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + else + { + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + pLocalSrc2_xmm = (__m128i *) pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels2 = _mm_loadu_si128(pLocalSrc2_xmm++); + pixels1 = _mm_xor_si128(pixels1, pixels2); + pixels1 = _mm_andnot_si128(pixels1, ones); + + U1pixels = _mm_movemask_epi8(pixels1); + *pLocalDst_16++ = 
(vx_int16)(U1pixels & 0xFFFF); + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8*)pLocalSrc1_xmm; + vx_uint8 * pLocalSrc2 = (vx_uint8*)pLocalSrc2_xmm; + vx_uint8 * pLocalDst = (vx_uint8*)pLocalDst_16; + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= (~((*pLocalSrc1++ ^ *pLocalSrc2++) >> 7)) & 1; + temp <<= 1; + } + *pLocalDst++ = temp; + } + + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + } + + return AGO_SUCCESS; +} + +/* The function assumes that the input widths are a multiple of 8 pixels */ +int HafCpu_Xnor_U1_U8U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + __m128i * pLocalSrc1_xmm; + + __m128i pixels; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1_xmm = (__m128i *) pSrcImage1; + vx_int16 * pLocalSrc2_16 = (vx_int16 *)pSrcImage2; + vx_int16 * pLocalDst_16 = (vx_int16 *)pDstImage; + + for (int width = 0; width < alignedWidth; width += 16) + { + pixels = _mm_loadu_si128(pLocalSrc1_xmm++); + pixels1 = (vx_int16)(_mm_movemask_epi8(pixels) & 0xFFFF); + pixels2 = *pLocalSrc2_16++; + + pixels1 = pixels1 ^ pixels2; + *pLocalDst_16++ = ~pixels1; + } + + if (postfixWidth) + { + vx_uint8 * pLocalSrc1 = (vx_uint8 *)pLocalSrc1_xmm; + vx_uint8 * pLocalDst = (vx_uint8 *)pLocalDst_16; + vx_uint8 pix = *((vx_uint8 *)pLocalSrc2_16); + vx_uint8 temp = 0; + for (int i = 0; i < 8; i++) + { + temp |= ((*pLocalSrc1++) >> 7) & 1; + temp <<= 1; + } + *pLocalDst++ = ~(temp ^ pix); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} + +/* The function assumes that the widths are a multiple of 8 pixels */ +int HafCpu_Xnor_U1_U1U1 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage1, + vx_uint32 srcImage1StrideInBytes, + vx_uint8 * pSrcImage2, + vx_uint32 srcImage2StrideInBytes + ) +{ + vx_int16 *pLocalSrc1, *pLocalSrc2, *pLocalDst; + vx_int16 pixels1, pixels2; + + int alignedWidth = dstWidth & ~15; + int postfixWidth = dstWidth - alignedWidth; + + for (int height = 0; height < (int)dstHeight; height++) + { + pLocalSrc1 = (vx_int16 *)pSrcImage1; + pLocalSrc2 = (vx_int16 *)pSrcImage2; + pLocalDst = (vx_int16 *)pDstImage; + for (int width = 0; width < alignedWidth; width += 16) + { + pixels1 = *pLocalSrc1++; + pixels2 = *pLocalSrc2++; + pixels1 = pixels1 ^ pixels2; + *pLocalDst++ = ~pixels1; + } + + if (postfixWidth) + { + *((vx_uint8*)pLocalDst) = ~(*((vx_uint8*)pLocalSrc1) ^ *((vx_uint8*)pLocalSrc2)); + } + pSrcImage1 += srcImage1StrideInBytes; + pSrcImage2 += srcImage2StrideInBytes; + pDstImage += dstImageStrideInBytes; + } + return AGO_SUCCESS; +} +#endif \ No newline at end of file diff --git a/openvx/ago/ago_haf_cpu_opticalflow.cpp b/openvx/ago/ago_haf_cpu_opticalflow.cpp new file mode 100644 index 0000000..271896a --- /dev/null +++ b/openvx/ago/ago_haf_cpu_opticalflow.cpp @@ -0,0 +1,549 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +/* Implements OpticalFlow pyramid algorithm*/ + +typedef struct { + vx_float32 x; /*!< \brief The x coordinate. */ + vx_float32 y; /*!< \brief The y coordinate. */ +}pt2f; + +typedef struct { + vx_int32 x; /*!< \brief The x coordinate. */ + vx_int32 y; /*!< \brief The y coordinate. */ +}pt2i; + +// helper functions for floating point math: used in haf_cpu implementation +// flRound: convert to nearest integer +inline int flRound(float value) +{ + __m128d t = _mm_set_sd(value); + return _mm_cvtsd_si32(t); +} + +inline int flFloor(float value) +{ + int i = flRound(value); + return i - (((float)(value - 1)) < 0); +} + +static const int W_BITS = 14; +static const float FLT_SCALE = 1.f / (1 << 20); +static const float MinEugThreshold = 1.0e-04F; +static const float Epsilon = 1.0e-07F; + +#define DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) + +// helper functions +static inline void pt_copy(pt2f &pt1, pt2f &pt2) { pt1.x = pt2.x; pt1.y = pt2.y; } +static inline void pt_copy_scale(pt2f &pt1, pt2f &pt2, float &s) { pt1.x = pt2.x*s; pt1.y = pt2.y*s; } + + + +static void ComputeSharr( + vx_uint32 dstImageStrideInBytes, + vx_uint8 *dst, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint32 srcImageStrideInBytes, + vx_uint8 *src, + vx_uint8 *pScharrScratch +) +{ + unsigned int y,x; + __m128i z = _mm_setzero_si128(), c3 = _mm_set1_epi16(3), c10 = _mm_set1_epi16(10); + vx_uint16 *_tempBuf = (vx_uint16*)pScharrScratch; + vx_uint16 *trow0 = (vx_uint16 *)ALIGN16(_tempBuf+1); + vx_uint16 *trow1 = (vx_uint16 *)ALIGN16(trow0 + srcWidth+2); + +#if 0 // C reference code for testing + vx_int16 ops[] = { 3, 10, 3, -3, -10, -3 }; + src += srcImageStrideInBytes; + dst += dstImageStrideInBytes; + for (y = 1; y < srcHeight - 1; y++) + { + const vx_uint8* srow0 = src - srcImageStrideInBytes; + const vx_uint8* srow1 = src; + const vx_uint8* srow2 = src + srcImageStrideInBytes; + vx_int16* drow = (vx_int16*)dst; + drow+=2; + for (x = 1; x < srcWidth - 1; x++, drow+=2) + { + // calculate g_x + drow[0] = (srow0[x + 1] * ops[0]) + (srow1[x + 1] * ops[1]) + (srow2[x + 1] * ops[2]) + + (srow0[x - 1] * ops[3]) + (srow1[x - 1] * ops[4]) + (srow2[x - 1] * ops[5]); + drow[1] = (srow2[x - 1] * ops[0]) + (srow2[x] * ops[1]) + (srow2[x + 1] * ops[2]) + + (srow0[x - 1] * ops[3]) + (srow0[x] * ops[4]) + (srow0[x + 1] * ops[5]); + } + src += srcImageStrideInBytes; + dst += dstImageStrideInBytes; + } +#else + src += srcImageStrideInBytes; + dst += dstImageStrideInBytes; + 
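+	// The loop below applies the 3x3 Scharr operator in separable form:
+	//     gx = [ -3   0   3 ]          gy = [ -3  -10  -3 ]
+	//          [ -10  0  10 ]               [  0    0   0 ]
+	//          [ -3   0   3 ]               [  3   10   3 ]
+	// Vertical pass:   trow0 = 3*(row[-1] + row[+1]) + 10*row[0]   (smoothing used for gx)
+	//                  trow1 = row[+1] - row[-1]                   (difference used for gy)
+	// Horizontal pass: gx = trow0[x+1] - trow0[x-1]
+	//                  gy = 3*(trow1[x-1] + trow1[x+1]) + 10*trow1[x]
+	// The (gx, gy) pairs are then interleaved into dst as 16-bit values.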
for (y = 1; y < srcHeight-1; y++) + { + const vx_uint8* srow0 = y > 0 ? src - srcImageStrideInBytes : src; + const vx_uint8* srow1 = src; + const vx_uint8* srow2 = y < srcHeight - 1 ? src + srcImageStrideInBytes : src; + vx_uint16* drow = (vx_uint16*)dst; + + // do vertical convolution + x = 0; + for (; x <= srcWidth - 8; x += 8) + { + __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z); + __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z); + __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z); + __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s0, s2), c3), _mm_mullo_epi16(s1, c10)); + __m128i t1 = _mm_sub_epi16(s2, s0); + _mm_store_si128((__m128i*)(trow0 + x), t0); + _mm_store_si128((__m128i*)(trow1 + x), t1); + } + // make border: is this really needed. + //trow0[-1] = trow0[0]; trow0[srcWidth] = trow0[srcWidth-1]; + //trow1[-1] = trow1[0]; trow1[srcWidth] = trow1[srcWidth - 1]; + + // do horizontal convolution, interleave the results and store them to dst + x = 0; + for (; x <= srcWidth - 8; x += 8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)(trow0 + x - 1)); + __m128i s1 = _mm_loadu_si128((const __m128i*)(trow0 + x + 1)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(trow1 + x - 1)); + __m128i s3 = _mm_loadu_si128((const __m128i*)(trow1 + x)); + __m128i s4 = _mm_loadu_si128((const __m128i*)(trow1 + x + 1)); + + __m128i t0 = _mm_sub_epi16(s1, s0); + __m128i t1 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s2, s4), c3), _mm_mullo_epi16(s3, c10)); + __m128i t2 = _mm_unpacklo_epi16(t0, t1); + t0 = _mm_unpackhi_epi16(t0, t1); + // this can probably be replaced with aligned stores if we aligned dst properly. + _mm_storeu_si128((__m128i*)(drow + x * 2), t2); + _mm_storeu_si128((__m128i*)(drow + x * 2 + 8), t0); + } + src += srcImageStrideInBytes; + dst += dstImageStrideInBytes; + } +#endif +} + +int HafCpu_OpticalFlowPyrLK_XY_XY_Generic +( +vx_keypoint_t newKeyPoint[], +vx_float32 pyramidScale, +vx_uint32 pyramidLevelCount, +ago_pyramid_u8_t * oldPyramid, +ago_pyramid_u8_t * newPyramid, +vx_uint32 keyPointCount, +vx_keypoint_t oldKeyPoint[], +vx_keypoint_t newKeyPointEstimate[], +vx_enum termination, +vx_float32 epsilon, +vx_uint32 num_iterations, +vx_bool use_initial_estimate, +vx_uint32 dataStrideInBytes, +vx_uint8 * DataPtr, +vx_int32 winsz +) +{ + vx_size halfWin = (vx_size)(winsz>>1); //(winsz *0.5f); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS - 5 - 1)); + // allocate matrix for I and dI + vx_int16 Imat[256]; // enough to accomodate max win size of 15 + vx_int16 dIMat[256*2]; + vx_uint8 * pScharrScratch = DataPtr; + vx_uint8 * pScratch = DataPtr + (oldPyramid[0].width + 2) * 4 + 64; + ago_keypoint_t *pNextPtArray = (ago_keypoint_t *)(pScratch + (oldPyramid[0].width*oldPyramid[0].height * 4)); + + for (int level = pyramidLevelCount - 1; level >= 0; level--) + { + int bBound; + vx_uint32 dWidth = oldPyramid[level].width-2; + vx_uint32 dHeight = oldPyramid[level].height-2; // first and last row is not accounted + vx_uint32 JWidth = newPyramid[level].width; + vx_uint32 JHeight = newPyramid[level].height; + vx_uint32 IStride = oldPyramid[level].strideInBytes, JStride = newPyramid[level].strideInBytes; + vx_uint32 dStride = dataStrideInBytes>>1; //in #of elements + vx_uint8 *SrcBase = oldPyramid[level].pImage; + vx_uint8 *JBase = newPyramid[level].pImage; + vx_int16 *DIBase = (vx_int16 *)pScratch; 
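+		// Per-level note: DIBase points at the interleaved (Ix, Iy) Scharr derivatives of the
+		// old image at this level (dStride elements per row); dWidth/dHeight exclude the
+		// one-pixel border where no derivative is computed, and ptScale = pyramidScale^level
+		// rescales the level-0 keypoint coordinates. For each keypoint the loop below
+		// accumulates the 2x2 structure tensor over a winsz x winsz window,
+		//     G = | sum(Ix*Ix)   sum(Ix*Iy) |
+		//         | sum(Ix*Iy)   sum(Iy*Iy) |
+		// (A11, A12, A22, scaled by FLT_SCALE), rejects the point if G is near-singular or its
+		// minimum eigenvalue is below threshold, and then iteratively refines nextPt.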
+ + // calculate Scharr derivatives Ix and Iy + ComputeSharr(dataStrideInBytes, pScratch, oldPyramid[level].width, oldPyramid[level].height, oldPyramid[level].strideInBytes, oldPyramid[level].pImage, pScharrScratch); + float ptScale = (float)(pow(pyramidScale, level)); + // printf("\nLevel : %d***************\n", level); + + // do the Lucas-Kanade tracking for each feature point + for (unsigned int pt = 0; pt < keyPointCount; pt++){ + if (!oldKeyPoint[pt].tracking_status) { + newKeyPoint[pt].x = oldKeyPoint[pt].x; + newKeyPoint[pt].y = oldKeyPoint[pt].y; + newKeyPoint[pt].strength = oldKeyPoint[pt].strength; + newKeyPoint[pt].tracking_status = oldKeyPoint[pt].tracking_status; + newKeyPoint[pt].scale = oldKeyPoint[pt].scale; + newKeyPoint[pt].error = oldKeyPoint[pt].error; + continue; + } + + pt2f PrevPt, nextPt; + bool bUseIE = false; + PrevPt.x = oldKeyPoint[pt].x*ptScale; + PrevPt.y = oldKeyPoint[pt].y*ptScale; + if (level == pyramidLevelCount-1){ + if (use_initial_estimate){ + nextPt.x = newKeyPointEstimate[pt].x*ptScale; + nextPt.y = newKeyPointEstimate[pt].y*ptScale; + bUseIE = true; + newKeyPoint[pt].strength = newKeyPointEstimate[pt].strength; + newKeyPoint[pt].tracking_status = newKeyPointEstimate[pt].tracking_status; + newKeyPoint[pt].error = newKeyPointEstimate[pt].error; + } + else + { + pt_copy(nextPt, PrevPt); + newKeyPoint[pt].tracking_status = oldKeyPoint[pt].tracking_status; + newKeyPoint[pt].strength = oldKeyPoint[pt].strength; + } + pNextPtArray[pt].x = nextPt.x; + pNextPtArray[pt].y = nextPt.y; + } + else + { + pNextPtArray[pt].x *= 2.0f; + pNextPtArray[pt].y *= 2.0f; + nextPt.x = pNextPtArray[pt].x; + nextPt.y = pNextPtArray[pt].y; + } + + if (!newKeyPoint[pt].tracking_status){ + continue; + } + + pt2i iprevPt, inextPt; + PrevPt.x = PrevPt.x - halfWin; + PrevPt.y = PrevPt.y - halfWin; + nextPt.x = nextPt.x - halfWin; + nextPt.y = nextPt.y - halfWin; + + iprevPt.x = (vx_int32)floor(PrevPt.x); + iprevPt.y = (vx_int32)floor(PrevPt.y); + // check if the point is out of bounds in the derivative image + bBound = (iprevPt.x >> 31) | (iprevPt.x >= (vx_int32)(dWidth - winsz)) | (iprevPt.y >> 31) | (iprevPt.y >= (vx_int32)(dHeight - winsz)); + if (bBound){ + if (!level){ + newKeyPoint[pt].x = (vx_int32)nextPt.x; + newKeyPoint[pt].y = (vx_int32)nextPt.y; + newKeyPoint[pt].tracking_status = 0; + newKeyPoint[pt].error = 0; + } + continue; // go to next point. 
+ } + // calulate weights for interpolation + float a = PrevPt.x - iprevPt.x; + float b = PrevPt.y - iprevPt.y; + float A11 = 0, A12 = 0, A22 = 0; + int x, y; + int iw00, iw01, iw10, iw11; + if ((a==0.0) && (b==0.0)) + { + // no need to do interpolation for the source and derivatives + int x, y; + for (y = 0; y < winsz; y++) + { + const unsigned char* src = SrcBase + (y + iprevPt.y)*IStride + iprevPt.x; + const vx_int16* dsrc = DIBase + (y + iprevPt.y)*dStride + iprevPt.x * 2; + + vx_int16* Iptr = &Imat[y*winsz]; + vx_int16* dIptr = &dIMat[y*winsz * 2]; + x = 0; + for (; x < winsz - 4; x += 4, dsrc += 8, dIptr += 8) + { + __m128i v00, v01, v10, v11, v12; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_loadu_si128((const __m128i*)(dsrc)); + v10 = _mm_shufflelo_epi16(v01, 0xd8); // copy with shuffle + v10 = _mm_shufflehi_epi16(v10, 0xd8); // iy3, iy2, ix3,ix2, iy1, iy0, ix1,ix0 + v10 = _mm_shuffle_epi32(v10, 0xd8); // iy3, iy2, iy1, iy0, ix3,ix2, ix1,ix0 + v11 = _mm_shuffle_epi32(v10, 0xe4); // copy + v12 = _mm_shuffle_epi32(v10, 0x4e); // ix3,ix2, ix1,ix0, iy3, iy2, iy1, iy0 + v00 = _mm_slli_epi16(v00, 5); + v12 = _mm_madd_epi16(v12, v10); // A121, A120 + v10 = _mm_madd_epi16(v10, v11); // A221, A220, A111, A110 + A11 += (float)(M128I(v10).m128i_i32[0] + M128I(v10).m128i_i32[1]); + A22 += (float)(M128I(v10).m128i_i32[2] + M128I(v10).m128i_i32[3]); + A12 += (float)(M128I(v12).m128i_i32[0] + M128I(v12).m128i_i32[1]); + _mm_storeu_si128((__m128i*)dIptr, v01); + _mm_storel_epi64((__m128i*)(Iptr + x), v00); + } + for (; x < winsz; x ++, dsrc += 2, dIptr += 2) + { + + int ival = (src[x]<<5); + int ixval = dsrc[0]; + int iyval = dsrc[1]; + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + } + } + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } + else + { + int iw00 = (int)(((1.f - a)*(1.f - b)*(1 << W_BITS)) + 0.5); + int iw01 = (int)((a*(1.f - b)*(1 << W_BITS)) + 0.5); + int iw10 = (int)(((1.f - a)*b*(1 << W_BITS)) + 0.5); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + // extract the patch from the old image, compute covariation matrix of derivatives + for (y = 0; y < winsz; y++) + { + const unsigned char* src = SrcBase + (y + iprevPt.y)*IStride + iprevPt.x; + const vx_int16* dsrc = DIBase + (y + iprevPt.y)*dStride + iprevPt.x * 2; + + vx_int16* Iptr = &Imat[y*winsz]; + vx_int16* dIptr = &dIMat[y*winsz * 2]; + + x = 0; + for (; x <= winsz - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i v00, v01, v10, v11, t0, t1; + + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + 1)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + IStride)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + IStride + 1)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS - 5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + 2)); + v10 = _mm_loadu_si128((const 
__m128i*)(dsrc + dStride)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dStride + 2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } + // do computation for remaining x if any + for (; x < winsz; x++, dsrc += 2, dIptr += 2) + { + int ival = DESCALE(src[x] * iw00 + src[x + 1] * iw01 + + src[x + IStride] * iw10 + src[x + IStride + 1] * iw11, W_BITS - 5); + int ixval = DESCALE(dsrc[0] * iw00 + dsrc[2] * iw01 + + dsrc[dStride] * iw10 + dsrc[dStride + 2] * iw11, W_BITS); + int iyval = DESCALE(dsrc[1] * iw00 + dsrc[3] * iw01 + dsrc[dStride + 1] * iw10 + + dsrc[dStride + 3] * iw11, W_BITS); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + } + } + // add with SSE output + if (winsz >= 4){ + float DECL_ALIGN(16) A11buf[4] ATTR_ALIGN(16), A12buf[4] ATTR_ALIGN(16), A22buf[4] ATTR_ALIGN(16); + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + } + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + + 4.f*A12*A12)) / (2 * winsz*winsz); + + if (minEig < 1.0e-04F || D < 1.0e-07F) + { + if (!level){ + newKeyPoint[pt].x = (vx_int32)nextPt.x; + newKeyPoint[pt].y = (vx_int32)nextPt.y; + newKeyPoint[pt].tracking_status = 0; + newKeyPoint[pt].error = 0; + } + continue; + } + D = 1.f / D; + float prevDelta_x = 0.f, prevDelta_y = 0.f; + float delta_dx = 0.f, delta_dy = 0.f; + unsigned int j = 0; + while (j < num_iterations || termination == VX_TERM_CRITERIA_EPSILON) + { + __m128i qw0, qw1; + inextPt.x = (vx_int32)floor(nextPt.x); + inextPt.y = (vx_int32)floor(nextPt.y); + bBound = (inextPt.x >> 31) | (inextPt.x >=(vx_int32)(JWidth - winsz)) | (inextPt.y >> 31) | (inextPt.y >= (vx_int32)(JHeight - winsz)); + if (bBound){ + if (!level){ + newKeyPoint[pt].tracking_status = 0; + newKeyPoint[pt].error = 0; + } + break; // go to next point. 
+ } + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = (int)(((1.f - a)*(1.f - b)*(1 << W_BITS)) +0.5); + iw01 = (int)((a*(1.f - b)*(1 << W_BITS)) + 0.5); + iw10 = (int)(((1.f - a)*b*(1 << W_BITS))+0.5); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + double ib1 = 0, ib2 = 0; + float b1, b2; + //double b1, b2; + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + for (y = 0; y < winsz; y++) + { + const unsigned char* Jptr = JBase + (y + inextPt.y)*JStride + inextPt.x;; + vx_int16* Iptr = &Imat[y*winsz]; + vx_int16* dIptr = &dIMat[y*winsz*2]; + + x = 0; + for (; x <= winsz - 8; x += 8, dIptr += 8 * 2) + { + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + 1)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + JStride)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + JStride + 1)), z); + + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS - 5); + diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); + diff1 = _mm_unpackhi_epi16(diff0, diff0); + diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ... + v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
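+ // The products It*Ix and It*Iy are widened to 32 bits before accumulation:
+ // _mm_mullo_epi16/_mm_mulhi_epi16 produce the low/high halves of the signed
+ // 16x16-bit multiply, and the unpacks below re-interleave those halves into
+ // full 32-bit products, which are converted to float and added to qb0/qb1
+ // (interleaved partial sums of b1 = sum(It*Ix) and b2 = sum(It*Iy)).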
+ v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + v10 = _mm_mullo_epi16(v00, diff0); + v11 = _mm_mulhi_epi16(v00, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + v10 = _mm_mullo_epi16(v01, diff1); + v11 = _mm_mulhi_epi16(v01, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + } + for (; x < winsz; x++, dIptr += 2) + { + int diff = DESCALE(Jptr[x] * iw00 + Jptr[x + 1] * iw01 + + Jptr[x + JStride] * iw10 + Jptr[x + JStride + 1] * iw11, + W_BITS - 5); + diff -= Iptr[x]; + ib1 += (float)(diff*dIptr[0]); + ib2 += (float)(diff*dIptr[1]); + } + } + if (winsz >= 8) + { + float DECL_ALIGN(16) bbuf[4] ATTR_ALIGN(16); + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + ib1 += bbuf[0] + bbuf[2]; + ib2 += bbuf[1] + bbuf[3]; + + } + b1 = (float)(ib1*FLT_SCALE); + b2 = (float)(ib2*FLT_SCALE); + // calculate delta + float delta_x = (float)((A12*b2 - A22*b1) * D); + float delta_y = (float)((A12*b1 - A11*b2) * D); + // add to nextPt + nextPt.x += delta_x; + nextPt.y += delta_y; + if ((delta_x*delta_x + delta_y*delta_y) <= epsilon && (termination == VX_TERM_CRITERIA_EPSILON || termination == VX_TERM_CRITERIA_BOTH)){ + break; + } + if (j > 0 && abs(delta_x + prevDelta_x) < 0.01 && abs(delta_y + prevDelta_y) < 0.01) + { + delta_dx = delta_x*0.5f; + delta_dy = delta_y*0.5f; + break; + } + prevDelta_x = delta_x; + prevDelta_y = delta_y; + j++; + } + if (!level){ + newKeyPoint[pt].x = (vx_int32)(nextPt.x + halfWin - delta_dx + 0.5f); + newKeyPoint[pt].y = (vx_int32)(nextPt.y + halfWin - delta_dy + 0.5f); + //printf("Level: %d Key Point: %d x: %d y: %d\n", level, pt, newKeyPoint[pt].x, newKeyPoint[pt].y); + } + else + { + pNextPtArray[pt].x = (nextPt.x + halfWin - delta_dx); + pNextPtArray[pt].y = (nextPt.y + halfWin - delta_dy); + //printf("Level: %d Key Point: %d x: %d y: %d\n", level, pt, (int)pNextPtArray[pt].x, (int)pNextPtArray[pt].y); + } + } + } + return AGO_SUCCESS; +} diff --git a/openvx/ago/ago_haf_cpu_pyramid.cpp b/openvx/ago/ago_haf_cpu_pyramid.cpp new file mode 100644 index 0000000..c5e684d --- /dev/null +++ b/openvx/ago/ago_haf_cpu_pyramid.cpp @@ -0,0 +1,407 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#include "ago_internal.h" + +static inline unsigned short Horizontal5x5GaussianFilter_C(unsigned char * srcImage) +{ + return((unsigned short)srcImage[-2] + 4 * (unsigned short)srcImage[-1] + 6 * (unsigned short)srcImage[0] + 4 * (unsigned short)srcImage[1] + (unsigned short)srcImage[2]); +} + +static inline __m128i Horizontal3x3GaussianFilter_SampleFirstPixel_SSE(unsigned char * srcImage) +{ + __m128i shiftedL2, shiftedL1, row, shiftedR2, shiftedR1; + __m128i resultH, resultL; + __m128i zeromask = _mm_setzero_si128(); + __m128i mask = _mm_set1_epi32((int)0x0000FFFF); + + shiftedL2 = _mm_loadu_si128((__m128i *) (srcImage - 2)); // -2 + shiftedR2 = _mm_loadu_si128((__m128i *) (srcImage + 2)); // +2 + + resultH = _mm_unpackhi_epi8(shiftedL2, zeromask); // r[-2] + resultL = _mm_cvtepu8_epi16(shiftedL2); // r[-2] + shiftedL2 = _mm_unpackhi_epi8(shiftedR2, zeromask); // r[2] + shiftedR2 = _mm_cvtepu8_epi16(shiftedR2); // r[2] + + shiftedL1 = _mm_loadu_si128((__m128i *) (srcImage - 1)); // -1 + + resultH = _mm_add_epi16(resultH, shiftedL2); // r[-2] + r[2] + resultL = _mm_add_epi16(resultL, shiftedR2); // r[-2] + r[2] + + shiftedR1 = _mm_loadu_si128((__m128i *) (srcImage + 1)); // +1 + + shiftedL2 = _mm_unpackhi_epi8(shiftedL1, zeromask); // r[-1] + shiftedL1 = _mm_cvtepu8_epi16(shiftedL1); // r[-1] + + row = _mm_loadu_si128((__m128i *) srcImage); // 0 + + shiftedR2 = _mm_unpackhi_epi8(shiftedR1, zeromask); // r[+1] + shiftedR1 = _mm_cvtepu8_epi16(shiftedR1); // r[+1] + + shiftedL2 = _mm_add_epi16(shiftedL2, shiftedR2); // r[-1] + r[1] + shiftedL1 = _mm_add_epi16(shiftedL1, shiftedR1); // r[-1] + r[1] + + shiftedR1 = _mm_unpackhi_epi8(row, zeromask); // r[0] + row = _mm_cvtepu8_epi16(row); // r[0] + + shiftedL2 = _mm_add_epi16(shiftedL2, shiftedR1); // r[-1] + r[0] + r[1] + shiftedL1 = _mm_add_epi16(shiftedL1, row); // r[-1] + r[0] + r[1] + shiftedL2 = _mm_slli_epi16(shiftedL2, 2); // 4*r[-1] + 4*r[0] + 4*r[1] + shiftedL1 = _mm_slli_epi16(shiftedL1, 2); // 4*r[-1] + 4*r[0] + 4*r[1] + + shiftedR1 = _mm_slli_epi16(shiftedR1, 1); // 2*r[0] + row = _mm_slli_epi16(row, 1); // 2*r[0] + shiftedL2 = _mm_add_epi16(shiftedL2, shiftedR1); // 4*r[-1] + 6*r[0] + 4*r[1] + shiftedL1 = _mm_add_epi16(shiftedL1, row); // 4*r[-1] + 6*r[0] + 4*r[1] + + resultH = _mm_add_epi16(resultH, shiftedL2); // r[-2] + 4*r[-1] + 6*r[0] + 4*r[1] + r[2] + resultL = _mm_add_epi16(resultL, shiftedL1); // r[-2] + 4*r[-1] + 6*r[0] + 4*r[1] + r[2] + + resultH = _mm_and_si128(resultH, mask); // Select words: 0, 2, 4, 6 + resultL = _mm_and_si128(resultL, mask); // Select words: 0, 2, 4, 6 + + resultL = _mm_packus_epi32(resultL, resultH); + + return(resultL); +} + +/* Kernel 1 4 6 4 1 1 1 4 6 4 1 + 4 16 24 16 4 4 + 1/256 6 24 36 24 6 = 6 >> 8 + 4 16 24 16 4 4 + 1 4 6 4 1 1 +*/ +int HafCpu_ScaleGaussianHalf_U8_U8_5x5 + ( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + bool sampleFirstRow, + bool sampleFirstColumn, + vx_uint8 * pScratch + ) +{ + int alignedDstStride = (dstImageStrideInBytes + 15) & ~15; + alignedDstStride <<= 2; // Each row stores two short values (Gx,Gy) for each pixel + unsigned short * r0 = (unsigned short *)pScratch; + unsigned short * r1 = (unsigned short *)(pScratch + alignedDstStride); + unsigned short * r2 = (unsigned short *)(pScratch + 2 * alignedDstStride); + unsigned short * r3 = (unsigned short *)(pScratch + 3 * alignedDstStride); + unsigned short * r4 = (unsigned short *)(pScratch + 4 
* alignedDstStride); + + int prefixWidth = intptr_t(pDstImage) & 15; + prefixWidth = (prefixWidth == 0) ? 0 : (16 - prefixWidth); + int postfixWidth = ((int)dstWidth - prefixWidth) & 15; + int alignedWidth = (int)dstWidth - prefixWidth - postfixWidth; + int srcRowOffset = sampleFirstRow ? 0 : srcImageStrideInBytes; + int srcColOffset = sampleFirstColumn ? 0 : 1; + + pSrcImage += srcRowOffset; // Offset for odd/even row sampling + unsigned char *pLocalSrc = (unsigned char *)pSrcImage; + unsigned char *pLocalDst = (unsigned char *)pDstImage; + + unsigned short * pRowMinus2 = r0; + unsigned short * pRowMinus1 = r1; + unsigned short * pRowCurr = r2; + unsigned short * pRowPlus1 = r3; + unsigned short * pRowPlus2 = r4; + + __m128i temp0, temp1, temp2, temp3, pixels_plus1H, pixels_plus1L, pixels_plus2H, pixels_plus2L; + + unsigned short * pLocalRowMinus2 = pRowMinus2; + unsigned short * pLocalRowMinus1 = pRowMinus1; + unsigned short * pLocalRowCurr = pRowCurr; + unsigned short * pLocalRowPlus1 = pRowPlus1; + unsigned short * pLocalRowPlus2 = pRowPlus2; + unsigned short * pTemp0, *pTemp1; + + int srcStride = (int)srcImageStrideInBytes; + pLocalSrc += srcColOffset; + + // Process first three rows - Horizontal filtering + for (int x = 0; x < (prefixWidth << 1); x++, pLocalSrc += 2) + { + *pLocalRowMinus2++ = Horizontal5x5GaussianFilter_C(pLocalSrc - (srcStride + srcStride)); + *pLocalRowMinus1++ = Horizontal5x5GaussianFilter_C(pLocalSrc - srcStride); + *pLocalRowCurr++ = Horizontal5x5GaussianFilter_C(pLocalSrc); + } + + for (int x = 0; x < (alignedWidth >> 3); x++) + { + __m128i temp0, temp1; + + temp0 = Horizontal3x3GaussianFilter_SampleFirstPixel_SSE(pLocalSrc - (srcStride + srcStride)); + _mm_storeu_si128((__m128i *)pLocalRowMinus2, temp0); + + temp1 = Horizontal3x3GaussianFilter_SampleFirstPixel_SSE(pLocalSrc - srcStride); + _mm_storeu_si128((__m128i *)pLocalRowMinus1, temp1); + + temp0 = Horizontal3x3GaussianFilter_SampleFirstPixel_SSE(pLocalSrc); + _mm_storeu_si128((__m128i *)pLocalRowCurr, temp0); + + pLocalSrc += 16; + pLocalRowMinus2 += 8; + pLocalRowMinus1 += 8; + pLocalRowCurr += 8; + } + + for (int x = 0; x < (postfixWidth << 1); x++, pLocalSrc += 2) + { + *pLocalRowMinus2++ = Horizontal5x5GaussianFilter_C(pLocalSrc - (srcStride + srcStride)); + *pLocalRowMinus1++ = Horizontal5x5GaussianFilter_C(pLocalSrc - srcStride); + *pLocalRowCurr++ = Horizontal5x5GaussianFilter_C(pLocalSrc); + } + + pLocalRowMinus2 = pRowMinus2; + pLocalRowMinus1 = pRowMinus1; + pLocalRowCurr = pRowCurr; + + // Process rows 4 till the end + int height = (int)dstHeight; + while (height) + { + pLocalSrc = (unsigned char *)(pSrcImage + srcStride + srcColOffset); // Pointing to the row below + unsigned char * pLocalSrc_NextRow = pLocalSrc + srcStride; + pLocalDst = (unsigned char *)pDstImage; + + for (int x = 0; x < prefixWidth; x++, pLocalSrc += 2) + { + short temp_plus1 = Horizontal5x5GaussianFilter_C(pLocalSrc); // row + 1 + *pLocalRowPlus1++ = temp_plus1; + short temp_plus2 = Horizontal5x5GaussianFilter_C(pLocalSrc_NextRow); // row + 2 + *pLocalRowPlus2++ = temp_plus2; + + *pLocalDst++ = (unsigned char)((*pLocalRowMinus2++ + 4 * (*pLocalRowMinus1++) + 6 * (*pLocalRowCurr++) + 4 * temp_plus1 + temp_plus2) >> 8); + } + + int width = (int)(alignedWidth >> 4); // 16 dst pixels processed in one go + while (width) + { + temp0 = _mm_loadu_si128((__m128i *) pLocalRowCurr); // c[0] + temp1 = _mm_loadu_si128((__m128i *) (pLocalRowCurr + 8)); // c[0] + + pixels_plus1L = 
Horizontal3x3GaussianFilter_SampleFirstPixel_SSE(pLocalSrc); // Horizontal filtering - c[1] + _mm_storeu_si128((__m128i *)pLocalRowPlus1, pixels_plus1L); + pixels_plus1H = Horizontal3x3GaussianFilter_SampleFirstPixel_SSE(pLocalSrc + 16); // Horizontal filtering - c[1] + _mm_storeu_si128((__m128i *)(pLocalRowPlus1 + 8), pixels_plus1H); + + pixels_plus1H = _mm_add_epi16(pixels_plus1H, temp1); // c[0] + c[1] + pixels_plus1L = _mm_add_epi16(pixels_plus1L, temp0); // c[0] + c[1] + + pixels_plus2L = Horizontal3x3GaussianFilter_SampleFirstPixel_SSE(pLocalSrc_NextRow); // Horizontal filtering - c[2] + _mm_storeu_si128((__m128i *)pLocalRowPlus2, pixels_plus2L); + pixels_plus2H = Horizontal3x3GaussianFilter_SampleFirstPixel_SSE(pLocalSrc_NextRow + 16); // Horizontal filtering - c[2] + _mm_storeu_si128((__m128i *)(pLocalRowPlus2 + 8), pixels_plus2H); + + temp2 = _mm_loadu_si128((__m128i *) pLocalRowMinus1); // c[-1] + temp3 = _mm_loadu_si128((__m128i *) (pLocalRowMinus1 + 8)); // c[-1] + + temp1 = _mm_slli_epi16(temp1, 1); // 2*c[0] + temp0 = _mm_slli_epi16(temp0, 1); // 2*c[0] + + pixels_plus1H = _mm_add_epi16(pixels_plus1H, temp3); // c[-1] + c[0] + c[1] + pixels_plus1L = _mm_add_epi16(pixels_plus1L, temp2); // c[-1] + c[0] + c[1] + + temp2 = _mm_loadu_si128((__m128i *) pLocalRowMinus2); // c[-2] + temp3 = _mm_loadu_si128((__m128i *) (pLocalRowMinus2 + 8)); // c[-2] + + pixels_plus1H = _mm_slli_epi16(pixels_plus1H, 2); // 4*c[-1] + 4*c[0] + 4*c[1] + pixels_plus1L = _mm_slli_epi16(pixels_plus1L, 2); // 4*c[-1] + 4*c[0] + 4*c[1] + + pixels_plus1H = _mm_add_epi16(pixels_plus1H, temp1); // 4*c[-1] + 6*c[0] + 4*c[1] + pixels_plus1L = _mm_add_epi16(pixels_plus1L, temp0); // 4*c[-1] + 6*c[0] + 4*c[1] + + pixels_plus2H = _mm_add_epi16(pixels_plus2H, temp3); // c[-2] + c[2] + pixels_plus2L = _mm_add_epi16(pixels_plus2L, temp2); // c[-2] + c[2] + + pixels_plus1H = _mm_add_epi16(pixels_plus1H, pixels_plus2H); // c[-2] + 4*c[-1] + 4*c[0] + 4*c[1] + c[2] + pixels_plus1L = _mm_add_epi16(pixels_plus1L, pixels_plus2L); // c[-2] + 4*c[-1] + 4*c[0] + 4*c[1] + c[2] + + pixels_plus1H = _mm_srli_epi16(pixels_plus1H, 8); // Divide by 256 + pixels_plus1L = _mm_srli_epi16(pixels_plus1L, 8); // Divide by 256 + + pixels_plus1L = _mm_packus_epi16(pixels_plus1L, pixels_plus1H); + _mm_store_si128((__m128i *)pLocalDst, pixels_plus1L); + + pLocalSrc += 32; + pLocalSrc_NextRow += 32; + pLocalDst += 16; + pLocalRowMinus2 += 16; + pLocalRowMinus1 += 16; + pLocalRowCurr += 16; + pLocalRowPlus1 += 16; + pLocalRowPlus2 += 16; + width--; + } + + for (int x = 0; x < postfixWidth; x++, pLocalSrc += 2, pLocalSrc_NextRow += 2) + { + short temp_plus1 = Horizontal5x5GaussianFilter_C(pLocalSrc); // row + 1 + *pLocalRowPlus1++ = temp_plus1; + short temp_plus2 = Horizontal5x5GaussianFilter_C(pLocalSrc_NextRow); // row + 2 + *pLocalRowPlus2++ = temp_plus2; + + *pLocalDst++ = (unsigned char)((*pLocalRowMinus2++ + 4 * (*pLocalRowMinus1++) + 6 * (*pLocalRowCurr++) + 4 * temp_plus1 + temp_plus2) >> 8); + } + + // Move two rows ahead + pTemp0 = pRowMinus2; + pTemp1 = pRowMinus1; + pRowMinus2 = pRowCurr; + pRowMinus1 = pRowPlus1; + pRowCurr = pRowPlus2; + pRowPlus1 = pTemp1; + pRowPlus2 = pTemp0; + + pLocalRowMinus2 = pRowMinus2; + pLocalRowMinus1 = pRowMinus1; + pLocalRowCurr = pRowCurr; + pLocalRowPlus1 = pRowPlus1; + pLocalRowPlus2 = pRowPlus2; + + pSrcImage += (srcImageStrideInBytes + srcImageStrideInBytes); + pDstImage += dstImageStrideInBytes; + height--; + } + + return AGO_SUCCESS; +} + +#define FP_BITS 16 +#define FP_MUL (1<> 8 + 4 16 24 16 4 
4 + 1 4 6 4 1 1 +*/ + +int HafCpu_ScaleGaussianOrb_U8_U8_5x5 +( + vx_uint32 dstWidth, + vx_uint32 dstHeight, + vx_uint8 * pDstImage, + vx_uint32 dstImageStrideInBytes, + vx_uint8 * pSrcImage, + vx_uint32 srcImageStrideInBytes, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint8 * pLocalData + ) +{ + int xpos, ypos, x; + // need to recalculate scale_factor because they might differ from global scale_factor for different pyramid levels. + float xcale = (float)srcWidth / dstWidth; + float yscale = (float)srcHeight / (dstHeight + 4); + int xinc = (int)(FP_MUL * xcale); // to convert to fixed point + int yinc = (int)(FP_MUL * yscale); + unsigned short *Xmap = (unsigned short *)pLocalData; + unsigned short *r0 = (Xmap + ((dstWidth+15)&~15)); + vx_uint8 *r1 = (vx_uint8 *)(r0 + ((srcWidth&15)&~15)); + __m128i z = _mm_setzero_si128(), c6 = _mm_set1_epi16(6); + + // generate xmap for orbit scaling + // generate xmap; + xpos = (int)(0.5f*xinc); + for (x = 0; x < (int)dstWidth; x++, xpos += xinc) + { + int xmap; + xmap = (xpos >> FP_BITS); + Xmap[x] = (unsigned short)xmap; + } + + ypos = (int)((2.5f) * yinc); //starting from row 2 of dstimage + // do gaussian verical filter for ypos + for (int y = 0; y < (int)dstHeight; y++, ypos += yinc) + { + unsigned int x; + unsigned int *pdst = (unsigned int *)pDstImage; + const vx_uint8* pSrc = pSrcImage + (ypos >> FP_BITS)*srcImageStrideInBytes; + const vx_uint8* srow0 = pSrc - 2 * srcImageStrideInBytes; + const vx_uint8* srow1 = pSrc - srcImageStrideInBytes; + const vx_uint8* srow2 = pSrc + srcImageStrideInBytes; + const vx_uint8* srow3 = pSrc + 2*srcImageStrideInBytes; + // do vertical convolution + for (x = 0; x < srcWidth; x += 16) + { + __m128i s0 = _mm_load_si128((const __m128i*)(srow0 + x)); + __m128i s1 = _mm_load_si128((const __m128i*)(srow1 + x)); + __m128i s2 = _mm_load_si128((const __m128i*)(pSrc + x)); + __m128i s3 = _mm_load_si128((const __m128i*)(srow2 + x)); + __m128i s4 = _mm_load_si128((const __m128i*)(srow3 + x)); + __m128i s0_L = _mm_unpacklo_epi8(s0, z); + __m128i s4_L = _mm_unpacklo_epi8(s4, z); + s0 = _mm_unpackhi_epi8(s0, z); + s4 = _mm_unpackhi_epi8(s4, z); + s0_L = _mm_add_epi16(s0_L, s4_L); + s0 = _mm_add_epi16(s0, s4); + __m128i s1_L = _mm_add_epi16(_mm_unpacklo_epi8(s1, z), _mm_unpacklo_epi8(s3, z)); + s1 = _mm_add_epi16(_mm_unpackhi_epi8(s1, z), _mm_unpackhi_epi8(s3, z)); + s4_L = _mm_unpacklo_epi8(s2, z); + s2 = _mm_unpackhi_epi8(s2, z); + s0_L = _mm_add_epi16(s0_L, _mm_slli_epi16(s1_L, 2)); + s0 = _mm_add_epi16(s0, _mm_slli_epi16(s1, 2)); + s0_L = _mm_add_epi16(s0_L, _mm_mullo_epi16(s4_L, c6)); // low 8 filtered + s0 = _mm_add_epi16(s0, _mm_mullo_epi16(s2, c6)); // Hi 8 filtered. + // copy to temp + _mm_store_si128((__m128i*)(r0 + x), s0_L); + _mm_store_si128((__m128i*)(r0 + x+ 8), s0); + } + + // do horizontal convolution and copy to r1 + for (x = 0; x 2)) { + dType = "uint2"; + dTypeShift = 3; +#if ENABLE_UINT4_FOR_LOCAL_MEMORY_LOADS + if (LMdivWGWidthShift > 3) { + dType = "uint4"; + dTypeShift = 4; +#if ENABLE_UINT8_FOR_LOCAL_MEMORY_LOADS + if (LMdivWGWidthShift > 4) { + dType = "uint8"; + dTypeShift = 5; + } +#endif + } +#endif + } + int dGroupsShift = LMdivWGWidthShift - dTypeShift; + int dGroups = 1 << dGroupsShift; + bool use_vload = ((dTypeShift > 2) && (gxoffset & ((1 << dTypeShift) - 1))) ? 
true : false; + + // generate code + sprintf(item, + OPENCL_FORMAT( + " { // load %dx%d bytes into local memory using %dx%d workgroup\n" // LMWidth, LMHeight, WGWidth, WGHeight + " int loffset = ly * %d + (lx << %d);\n" // LMWidth, dTypeShift + " int goffset = (gy - %d) * gstride + (gx << %d) - %d;\n" // gyoffset, dTypeShift, gxoffset + ), LMWidth, LMHeight, WGWidth, WGHeight, LMWidth, dTypeShift, gyoffset, dTypeShift, gxoffset); + code += item; + int LMHeightRemain = LMHeight - WGHeight; + int LMRemain = (LMWidthRemain * LMHeight + (LMWidth - LMWidthRemain) * LMHeightRemain) >> dTypeShift; + if (dGroups == 1 && LMWidthRemain > 0 && LMHeightRemain > 0 && LMRemain < (WGWidth * WGHeight)) { + // sanity check + if (LMWidthRemain & ((1 << dTypeShift) - 1)) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: HafGpu_Load_Local(%dx%d,%dx%d,(%d,%d)): doesn't support LMWidthRemain=%d with %s\n", WGWidth, WGHeight, LMWidth, LMHeight, gxoffset, gyoffset, LMWidthRemain, dType); + return -1; + } + if (use_vload) { + sprintf(item, " *(__local %s *)(lbuf + loffset) = vload%c(0, (__global uint *)(gbuf + goffset));\n", dType, dType[4]); + } + else { + sprintf(item, " *(__local %s *)(lbuf + loffset) = *(__global %s *)(gbuf + goffset);\n", dType, dType); + } + code += item; + // get configuration for extra load + int dWidth = LMWidthRemain >> dTypeShift; + int dHeight = LMHeight; + int dSize = dWidth * dHeight; + int dWidthShift = leftmostbit(dWidth); + if (dWidth != (1 << dWidthShift)) dWidthShift = -1; + sprintf(item, + OPENCL_FORMAT( + " bool doExtraLoad = false;\n" + " if (ly < %d) {\n" // LMHeight - WGHeight + " loffset += %d * %d;\n" // WGHeight, LMWidth + " goffset += %d * gstride;\n" // WGHeight + " doExtraLoad = true;\n" + " }\n" + " else {\n" + " int id = (ly - %d) * %d + lx;\n" // LMHeight - WGHeight, WGWidth + " int ry = id %s %d;\n" // (id / dWidth) or (id >> dWidthShift) + " int rx = id %s %d;\n" // (id - ry * dWidth) or (id & (dWidth-1)) + " loffset = ry * %d + (rx << %d) + %d;\n" // LMWidth, dTypeShift + " goffset = (gy - ly + ry - %d) * gstride + ((gx - lx + rx) << %d) + %d;\n" // gyoffset, dTypeShift, (WGWidth << LMdivWGWidthShift) - gxoffset + " doExtraLoad = (ry < %d) ? true : false;\n" // LMHeight + " }\n" + " if (doExtraLoad) {\n") + , LMHeight - WGHeight, WGHeight, LMWidth, WGHeight + , LMHeight - WGHeight, WGWidth, (dWidthShift < 0) ? "/" : ">>", (dWidthShift < 0) ? dWidth : dWidthShift, (dWidthShift < 0) ? "- ry *" : "&", (dWidthShift < 0) ? 
dWidth : dWidth - 1 + , LMWidth, dTypeShift, (WGWidth << LMdivWGWidthShift) + , gyoffset, dTypeShift, (WGWidth << LMdivWGWidthShift) - gxoffset, LMHeight); + code += item; + if (use_vload) { + sprintf(item, " *(__local %s *)(lbuf + loffset) = vload%c(0, (__global uint *)(gbuf + goffset));\n", dType, dType[4]); + } + else { + sprintf(item, " *(__local %s *)(lbuf + loffset) = *(__global %s *)(gbuf + goffset);\n", dType, dType); + } + code += item; + code += " }\n"; + } + else { + for (int y = 0; y < LMHeight; y += WGHeight) { + if ((LMHeight - y) < WGHeight) { + sprintf(item, " if (ly < %d) {\n", LMHeight - y); + code += item; + } + if (y > 0) { + sprintf(item, + " loffset += %d * %d;\n" // WGHeight, LMWidth + " goffset += %d * gstride;\n" // WGHeight + , WGHeight, LMWidth, WGHeight); + code += item; + } + if (use_vload) { + sprintf(item, " *(__local %s *)(lbuf + loffset) = vload%c(0, (__global uint *)(gbuf + goffset));\n", dType, dType[4]); + } + else { + sprintf(item, " *(__local %s *)(lbuf + loffset) = *(__global %s *)(gbuf + goffset);\n", dType, dType); + } + code += item; + if (dGroups > 1) { + if (y > 0) { + code += + " loffset_t = loffset;\n" + " goffset_t = goffset;\n"; + } + else { + code += + " int loffset_t = loffset;\n" + " int goffset_t = goffset;\n"; + } + for (int ix = 1; ix < dGroups; ix++) { + sprintf(item, + " loffset_t += %d;\n" // WGWidth << dTypeShift + " goffset_t += %d;\n" // WGWidth << dTypeShift + , WGWidth << dTypeShift, WGWidth << dTypeShift); + code += item; + if (use_vload) { + sprintf(item, " *(__local %s *)(lbuf + loffset_t) = vload%c(0, (__global uint *)(gbuf + goffset_t));\n", dType, dType[4]); + } + else { + sprintf(item, " *(__local %s *)(lbuf + loffset_t) = *(__global %s *)(gbuf + goffset_t);\n", dType, dType); + } + code += item; + } + } + if ((LMHeight - y) < WGHeight) { + code += " }\n"; + } + } + if (LMWidthRemain > 0) { + // sanity check + if (LMWidthRemain & ((1 << dTypeShift) - 1)) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: HafGpu_Load_Local(%dx%d,%dx%d,(%d,%d)): doesn't support LMWidthRemain=%d with %s\n", WGWidth, WGHeight, LMWidth, LMHeight, gxoffset, gyoffset, LMWidthRemain, dType); + return -1; + } + // get configuration + int dWidth = LMWidthRemain >> dTypeShift; + int dHeight = LMHeight; + int dSize = dWidth * dHeight; + int dWidthShift = leftmostbit(dWidth); + if (dWidth != (1 << dWidthShift)) dWidthShift = -1; + // compute start addresses + sprintf(item, + " __local uchar * lbufptr = lbuf + %d;\n" // (WGWidth << LMdivWGWidthShift) + " goffset = (gy - ly - %d) * gstride + ((gx - lx) << %d) + %d;\n" // gyoffset, dTypeShift, (WGWidth << LMdivWGWidthShift) - gxoffset + , (WGWidth << LMdivWGWidthShift), gyoffset, dTypeShift, (WGWidth << LMdivWGWidthShift) - gxoffset); + code += item; + // load memory + for (int dCount = 0; dCount < dSize; dCount += WGWidth * WGHeight) { + // compute rx & ry of remaining region + if (dCount == 0) sprintf(item, " int id = ly * %d + lx, rx, ry;\n", WGWidth); + else sprintf(item, " id += %d;\n", WGWidth * WGHeight); + code += item; + if (dWidthShift < 0) { + sprintf(item, + " ry = id / %d;\n" // dWidth + " rx = id - ry * %d;\n" // dWidth + , dWidth, dWidth); + } + else { + sprintf(item, + " ry = id >> %d;\n" // dWidthShift + " rx = id & %d;\n" // dWidth-1 + , dWidthShift, dWidth - 1); + } + code += item; + if ((dSize - dCount) < (WGWidth * WGHeight)) { + sprintf(item, " if (ry < %d) {\n", dHeight); + code += item; + } + if (use_vload) { + sprintf(item, " *(__local %s *)(lbufptr + ry * %d + (rx << %d)) = vload%c(0, 
(__global uint *)(gbuf + goffset + ry * gstride + (rx << %d)));\n", dType, LMWidth, dTypeShift, dType[4], dTypeShift); + } + else { + sprintf(item, " *(__local %s *)(lbufptr + ry * %d + (rx << %d)) = *(__global %s *)(gbuf + goffset + ry * gstride + (rx << %d));\n", dType, LMWidth, dTypeShift, dType, dTypeShift); + } + code += item; + if ((dSize - dCount) < (WGWidth * WGHeight)) { + code += " }\n"; + } + } + } + } + + code += + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " }\n"; + + return VX_SUCCESS; +} + +#endif diff --git a/openvx/ago/ago_haf_gpu_conversion.cpp b/openvx/ago/ago_haf_gpu_conversion.cpp new file mode 100644 index 0000000..d386533 --- /dev/null +++ b/openvx/ago/ago_haf_gpu_conversion.cpp @@ -0,0 +1,1260 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#include "ago_haf_gpu.h" + +#if ENABLE_OPENCL + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following channel combine kernels +// VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_UYVY +// VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_YUYV +// +int HafGpu_ChannelCombine_U32_422(AgoNode * node) +{ + int status = VX_SUCCESS; + + // configuration + vx_enum kernel = node->akernel->id; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + int stride0 = node->paramList[0]->u.img.stride_in_bytes; + int stride1 = node->paramList[1]->u.img.stride_in_bytes; + int stride2 = node->paramList[2]->u.img.stride_in_bytes; + int stride3 = node->paramList[3]->u.img.stride_in_bytes; + int work_group_width = 16; + int work_group_height = 4; + + char combineCode[1024]; + if (kernel == VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_UYVY) { + sprintf(combineCode, + OPENCL_FORMAT( + " out.s0 = amd_pack((float4)(amd_unpack0(pU), amd_unpack0(pY.s0), amd_unpack0(pV), amd_unpack1(pY.s0)));\n" + " out.s1 = amd_pack((float4)(amd_unpack1(pU), amd_unpack2(pY.s0), amd_unpack1(pV), amd_unpack3(pY.s0)));\n" + " out.s2 = amd_pack((float4)(amd_unpack2(pU), amd_unpack0(pY.s1), amd_unpack2(pV), amd_unpack1(pY.s1)));\n" + " out.s3 = amd_pack((float4)(amd_unpack3(pU), amd_unpack2(pY.s1), amd_unpack3(pV), amd_unpack3(pY.s1)));\n" + )); + } + else if (kernel == VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_YUYV) { + sprintf(combineCode, + OPENCL_FORMAT( + " out.s0 = amd_pack((float4)(amd_unpack0(pY.s0), amd_unpack0(pU), amd_unpack1(pY.s0), amd_unpack0(pV)));\n" + " out.s1 = amd_pack((float4)(amd_unpack2(pY.s0), amd_unpack1(pU), amd_unpack3(pY.s0), amd_unpack1(pV)));\n" + " out.s2 = amd_pack((float4)(amd_unpack0(pY.s1), amd_unpack2(pU), amd_unpack1(pY.s1), amd_unpack2(pV)));\n" + " out.s3 = amd_pack((float4)(amd_unpack2(pY.s1), amd_unpack3(pU), amd_unpack3(pY.s1), amd_unpack3(pV)));\n" + )); + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_ChannelCombine_U32_422 doesn't support kernel %s\n", node->akernel->name); + return -1; + } + + // kernel body + char item[8192]; + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset,\n" + " uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset,\n" + " uint p2_width, uint p2_height, __global uchar * p2_buf, uint p2_stride, uint p2_offset,\n" + " uint p3_width, uint p3_height, __global uchar * p3_buf, uint p3_stride, uint p3_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, height + " p0_buf += p0_offset;\n" + " p1_buf += p1_offset;\n" + " p2_buf += p2_offset;\n" + " p3_buf += p3_offset;\n" + " p0_buf += (gy * %d) + (gx << 4);\n" // stride0 + " p1_buf += (gy * %d) + (gx << 3);\n" // stride1 + " p2_buf += (gy * %d) + (gx << 2);\n" // stride2 + " p3_buf += (gy * %d) + (gx << 2);\n" // stride3 + " uint2 pY = *(__global uint2 *) p1_buf;\n" + " uint pU = *(__global uint *) p2_buf;\n" + " uint pV = *(__global uint *) p3_buf;\n" + " uint4 out;\n" + "%s" + " *(__global uint4 *) p0_buf = out;\n" + " }\n" + "}\n") + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, 
(width + 7) / 8, height, stride0, stride1, stride2, stride3, combineCode); + node->opencl_code = item; + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 7) >> 3) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (height + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following channel extractions: +// VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS0 +// VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS1 +// VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS2 +// VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS3 +// +int HafGpu_ChannelExtract_U8_U32(AgoNode * node) +{ + int status = VX_SUCCESS; + + // configuration + vx_enum kernel = node->akernel->id; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + int stride0 = node->paramList[0]->u.img.stride_in_bytes; + int stride1 = node->paramList[1]->u.img.stride_in_bytes; + + int work_group_width = 16; + int work_group_height = 4; + + char extractionCode[1024]; + if (kernel == VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS0) { + sprintf(extractionCode, + " r.s0 = amd_pack((float4)(amd_unpack0(L.s0), amd_unpack0(L.s1), amd_unpack0(L.s2), amd_unpack0(L.s3)));\n" + " r.s1 = amd_pack((float4)(amd_unpack0(L.s4), amd_unpack0(L.s5), amd_unpack0(L.s6), amd_unpack0(L.s7)));\n" + ); + } + else if (kernel == VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS1) { + sprintf(extractionCode, + " r.s0 = amd_pack((float4)(amd_unpack1(L.s0), amd_unpack1(L.s1), amd_unpack1(L.s2), amd_unpack1(L.s3)));\n" + " r.s1 = amd_pack((float4)(amd_unpack1(L.s4), amd_unpack1(L.s5), amd_unpack1(L.s6), amd_unpack1(L.s7)));\n" + ); + } + else if (kernel == VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS2) { + sprintf(extractionCode, + " r.s0 = amd_pack((float4)(amd_unpack2(L.s0), amd_unpack2(L.s1), amd_unpack2(L.s2), amd_unpack2(L.s3)));\n" + " r.s1 = amd_pack((float4)(amd_unpack2(L.s4), amd_unpack2(L.s5), amd_unpack2(L.s6), amd_unpack2(L.s7)));\n" + ); + } + else if (kernel == VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS3) { + sprintf(extractionCode, + " r.s0 = amd_pack((float4)(amd_unpack3(L.s0), amd_unpack3(L.s1), amd_unpack3(L.s2), amd_unpack3(L.s3)));\n" + " r.s1 = amd_pack((float4)(amd_unpack3(L.s4), amd_unpack3(L.s5), amd_unpack3(L.s6), amd_unpack3(L.s7)));\n" + ); + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_ChannelExtract_U8_U32 doesn't support kernel %s\n", node->akernel->name); + return -1; + } + + // kernel body + char item[8192]; + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset,\n" + " uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+3)/4, height + " p0_buf += p0_offset;\n" + " p1_buf += p1_offset;\n" + " p0_buf += (gy * %d) + (gx << 2);\n" // stride0 + 
" p1_buf += (gy * %d) + (gx << 4);\n" // stride1 + " uint8 L = *(__global uint8 *) p1_buf;\n" + " uint2 r;\n" + "%s" + " *(__global uint2 *) p0_buf = r;\n" + " }\n" + "}\n") + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, (width + 3) / 4, height, stride0, stride1, extractionCode + ); + node->opencl_code = item; + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 3) >> 2) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (height + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following format conversions: +// VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_UYVY +// VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_YUYV +// VX_KERNEL_AMD_FORMAT_CONVERT_NV12_UYVY +// VX_KERNEL_AMD_FORMAT_CONVERT_NV12_YUYV +// +int HafGpu_FormatConvert_420_422(AgoNode * node) +{ + int status = VX_SUCCESS; + + // configuration + vx_enum kernel = node->akernel->id; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + int stride0 = node->paramList[0]->u.img.stride_in_bytes; + int stride1 = node->paramList[1]->u.img.stride_in_bytes; + int stride2 = node->paramList[2]->u.img.stride_in_bytes; + int stride3 = node->paramList[3] ? node->paramList[3]->u.img.stride_in_bytes : 0; + int work_group_width = 16; + int work_group_height = 4; + + char conversionCode[1024]; + if (kernel == VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_YUYV) { + sprintf(conversionCode, + OPENCL_FORMAT( + " pY0.s0 = amd_pack((float4)(amd_unpack0(L0.s0), amd_unpack2(L0.s0), amd_unpack0(L0.s1), amd_unpack2(L0.s1)));\n" + " pY0.s1 = amd_pack((float4)(amd_unpack0(L0.s2), amd_unpack2(L0.s2), amd_unpack0(L0.s3), amd_unpack2(L0.s3)));\n" + " pY1.s0 = amd_pack((float4)(amd_unpack0(L1.s0), amd_unpack2(L1.s0), amd_unpack0(L1.s1), amd_unpack2(L1.s1)));\n" + " pY1.s1 = amd_pack((float4)(amd_unpack0(L1.s2), amd_unpack2(L1.s2), amd_unpack0(L1.s3), amd_unpack2(L1.s3)));\n" + " L0.s0 = amd_lerp(L0.s0, L1.s0, 0x01010101);\n" + " L0.s1 = amd_lerp(L0.s1, L1.s1, 0x01010101);\n" + " L0.s2 = amd_lerp(L0.s2, L1.s2, 0x01010101);\n" + " L0.s3 = amd_lerp(L0.s3, L1.s3, 0x01010101);\n" + " pU = amd_pack((float4)(amd_unpack1(L0.s0), amd_unpack1(L0.s1), amd_unpack1(L0.s2), amd_unpack1(L0.s3)));\n" + " pV = amd_pack((float4)(amd_unpack3(L0.s0), amd_unpack3(L0.s1), amd_unpack3(L0.s2), amd_unpack3(L0.s3)));\n" + )); + } + else if (kernel == VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_UYVY) { + sprintf(conversionCode, + OPENCL_FORMAT( + " pY0.s0 = amd_pack((float4)(amd_unpack1(L0.s0), amd_unpack3(L0.s0), amd_unpack1(L0.s1), amd_unpack3(L0.s1)));\n" + " pY0.s1 = amd_pack((float4)(amd_unpack1(L0.s2), amd_unpack3(L0.s2), amd_unpack1(L0.s3), amd_unpack3(L0.s3)));\n" + " pY1.s0 = amd_pack((float4)(amd_unpack1(L1.s0), amd_unpack3(L1.s0), amd_unpack1(L1.s1), amd_unpack3(L1.s1)));\n" + " pY1.s1 = amd_pack((float4)(amd_unpack1(L1.s2), amd_unpack3(L1.s2), amd_unpack1(L1.s3), amd_unpack3(L1.s3)));\n" + " L0.s0 = amd_lerp(L0.s0, L1.s0, 0x01010101);\n" + " L0.s1 = amd_lerp(L0.s1, L1.s1, 0x01010101);\n" + " L0.s2 = amd_lerp(L0.s2, L1.s2, 0x01010101);\n" + " L0.s3 = amd_lerp(L0.s3, L1.s3, 0x01010101);\n" + " 
pU = amd_pack((float4)(amd_unpack0(L0.s0), amd_unpack0(L0.s1), amd_unpack0(L0.s2), amd_unpack0(L0.s3)));\n" + " pV = amd_pack((float4)(amd_unpack2(L0.s0), amd_unpack2(L0.s1), amd_unpack2(L0.s2), amd_unpack2(L0.s3)));\n" + )); + } + else if (kernel == VX_KERNEL_AMD_FORMAT_CONVERT_NV12_YUYV) { + sprintf(conversionCode, + OPENCL_FORMAT( + " pY0.s0 = amd_pack((float4)(amd_unpack0(L0.s0), amd_unpack2(L0.s0), amd_unpack0(L0.s1), amd_unpack2(L0.s1)));\n" + " pY0.s1 = amd_pack((float4)(amd_unpack0(L0.s2), amd_unpack2(L0.s2), amd_unpack0(L0.s3), amd_unpack2(L0.s3)));\n" + " pY1.s0 = amd_pack((float4)(amd_unpack0(L1.s0), amd_unpack2(L1.s0), amd_unpack0(L1.s1), amd_unpack2(L1.s1)));\n" + " pY1.s1 = amd_pack((float4)(amd_unpack0(L1.s2), amd_unpack2(L1.s2), amd_unpack0(L1.s3), amd_unpack2(L1.s3)));\n" + " L0.s0 = amd_lerp(L0.s0, L1.s0, 0x01010101);\n" + " L0.s1 = amd_lerp(L0.s1, L1.s1, 0x01010101);\n" + " L0.s2 = amd_lerp(L0.s2, L1.s2, 0x01010101);\n" + " L0.s3 = amd_lerp(L0.s3, L1.s3, 0x01010101);\n" + " pUV.s0 = amd_pack((float4)(amd_unpack1(L0.s0), amd_unpack3(L0.s0), amd_unpack1(L0.s1), amd_unpack3(L0.s1)));\n" + " pUV.s1 = amd_pack((float4)(amd_unpack1(L0.s2), amd_unpack3(L0.s2), amd_unpack1(L0.s3), amd_unpack3(L0.s3)));\n" + )); + } + else if (kernel == VX_KERNEL_AMD_FORMAT_CONVERT_NV12_UYVY) { + sprintf(conversionCode, + OPENCL_FORMAT( + " pY0.s0 = amd_pack((float4)(amd_unpack1(L0.s0), amd_unpack3(L0.s0), amd_unpack1(L0.s1), amd_unpack3(L0.s1)));\n" + " pY0.s1 = amd_pack((float4)(amd_unpack1(L0.s2), amd_unpack3(L0.s2), amd_unpack1(L0.s3), amd_unpack3(L0.s3)));\n" + " pY1.s0 = amd_pack((float4)(amd_unpack1(L1.s0), amd_unpack3(L1.s0), amd_unpack1(L1.s1), amd_unpack3(L1.s1)));\n" + " pY1.s1 = amd_pack((float4)(amd_unpack1(L1.s2), amd_unpack3(L1.s2), amd_unpack1(L1.s3), amd_unpack3(L1.s3)));\n" + " L0.s0 = amd_lerp(L0.s0, L1.s0, 0x01010101);\n" + " L0.s1 = amd_lerp(L0.s1, L1.s1, 0x01010101);\n" + " L0.s2 = amd_lerp(L0.s2, L1.s2, 0x01010101);\n" + " L0.s3 = amd_lerp(L0.s3, L1.s3, 0x01010101);\n" + " pUV.s0 = amd_pack((float4)(amd_unpack0(L0.s0), amd_unpack2(L0.s0), amd_unpack0(L0.s1), amd_unpack2(L0.s1)));\n" + " pUV.s1 = amd_pack((float4)(amd_unpack0(L0.s2), amd_unpack2(L0.s2), amd_unpack0(L0.s3), amd_unpack2(L0.s3)));\n" + )); + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_FormatConvert_420_422 doesn't support kernel %s\n", node->akernel->name); + return -1; + } + + // kernel declaration + char item[8192]; + if ((kernel == VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_YUYV) || (kernel == VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_UYVY)) { + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset,\n" + " uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset,\n" + " uint p2_width, uint p2_height, __global uchar * p2_buf, uint p2_stride, uint p2_offset,\n" + " uint p3_width, uint p3_height, __global uchar * p3_buf, uint p3_stride, uint p3_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " p0_buf += p0_offset;\n" + " p1_buf += p1_offset;\n" + " p2_buf += p2_offset;\n" + " p3_buf += p3_offset;\n" + " p0_buf += (gy * %d) + (gx << 3);\n" // stride0 * 2 + " p1_buf += (gy * %d) + (gx << 2);\n" // stride1 + " p2_buf += (gy * %d) + (gx << 2);\n" // stride2 + " p3_buf += 
(gy * %d) + (gx << 4);\n" // stride3 * 2 + " uint4 L0 = *(__global uint4 *) p3_buf;\n" + " uint4 L1 = *(__global uint4 *)&p3_buf[%d];\n" // stride3 + " uint2 pY0, pY1; uint pU, pV;\n" + "%s" + " *(__global uint2 *) p0_buf = pY0;\n" + " *(__global uint2 *)&p0_buf[%d] = pY1;\n" // stride0 + " *(__global uint *) p1_buf = pU;\n" + " *(__global uint *) p2_buf = pV;\n" + " }\n" + "}\n") + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, (width + 7) / 8, (height + 1) / 2, stride0 * 2, stride1, stride2, stride3 * 2, stride3, conversionCode, stride0); + node->opencl_code = item; + } + else if ((kernel == VX_KERNEL_AMD_FORMAT_CONVERT_NV12_YUYV) || (kernel == VX_KERNEL_AMD_FORMAT_CONVERT_NV12_UYVY)) { + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset,\n" + " uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset,\n" + " uint p2_width, uint p2_height, __global uchar * p2_buf, uint p2_stride, uint p2_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " p0_buf += p0_offset;\n" + " p1_buf += p1_offset;\n" + " p2_buf += p2_offset;\n" + " p0_buf += (gy * %d) + (gx << 3);\n" // stride0 * 2 + " p1_buf += (gy * %d) + (gx << 3);\n" // stride1 + " p2_buf += (gy * %d) + (gx << 4);\n" // stride2 * 2 + " uint4 L0 = *(__global uint4 *) p2_buf;\n" + " uint4 L1 = *(__global uint4 *)&p2_buf[%d];\n" // stride2 + " uint2 pY0, pY1, pUV;\n" + "%s" + " *(__global uint2 *) p0_buf = pY0;\n" + " *(__global uint2 *)&p0_buf[%d] = pY1;\n" // stride0 + " *(__global uint2 *) p1_buf = pUV;\n" + " }\n" + "}\n") + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, (width + 7) / 8, (height + 1) / 2, stride0 * 2, stride1, stride2 * 2, stride2, conversionCode, stride0); + node->opencl_code = item; + } + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 7) >> 3) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (((height + 1) >> 1) + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following format conversions: +// VX_KERNEL_AMD_FORMAT_CONVERT_UV_UV12 +// VX_KERNEL_AMD_FORMAT_CONVERT_IUV_UV12 +// VX_KERNEL_AMD_FORMAT_CONVERT_UV12_IUV +// VX_KERNEL_AMD_SCALE_UP_2x2_U8_U8 +// +int HafGpu_FormatConvert_Chroma(AgoNode * node) +{ + int status = VX_SUCCESS; + + // configuration + vx_enum kernel = node->akernel->id; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + int stride0 = node->paramList[0]->u.img.stride_in_bytes; + int stride1 = node->paramList[1]->u.img.stride_in_bytes; + int stride2 = node->paramList[2] ? 
node->paramList[2]->u.img.stride_in_bytes : 0; + int work_group_width = 16; + int work_group_height = 4; + + // kernel declaration + char item[8192]; + if (kernel == VX_KERNEL_AMD_FORMAT_CONVERT_UV_UV12) { + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset,\n" + " uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset,\n" + " uint p2_width, uint p2_height, __global uchar * p2_buf, uint p2_stride, uint p2_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " p0_buf += p0_offset;\n" + " p1_buf += p1_offset;\n" + " p2_buf += p2_offset;\n" + " p0_buf += (gy * %d) + (gx << 3);\n" // stride0 * 2 + " p1_buf += (gy * %d) + (gx << 3);\n" // stride1 * 2 + " p2_buf += (gy * %d) + (gx << 3);\n" // stride2 + " uint2 L0 = *(__global uint2 *) p2_buf;\n" + " uint2 pU, pV;\n" + " pU.s0 = amd_pack((float4)(amd_unpack0(L0.s0), amd_unpack0(L0.s0), amd_unpack2(L0.s0), amd_unpack2(L0.s0)));\n" + " pU.s1 = amd_pack((float4)(amd_unpack0(L0.s1), amd_unpack0(L0.s1), amd_unpack2(L0.s1), amd_unpack2(L0.s1)));\n" + " pV.s0 = amd_pack((float4)(amd_unpack1(L0.s0), amd_unpack1(L0.s0), amd_unpack3(L0.s0), amd_unpack3(L0.s0)));\n" + " pV.s1 = amd_pack((float4)(amd_unpack1(L0.s1), amd_unpack1(L0.s1), amd_unpack3(L0.s1), amd_unpack3(L0.s1)));\n" + " *(__global uint2 *) p0_buf = pU;\n" + " *(__global uint2 *)&p0_buf[%d] = pU;\n" // stride0 + " *(__global uint2 *) p1_buf = pV;\n" + " *(__global uint2 *)&p1_buf[%d] = pV;\n" // stride1 + " }\n" + "}\n") + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, (width + 7) / 8, (height + 1) / 2, stride0 * 2, stride1 * 2, stride2, stride0, stride1); + node->opencl_code = item; + } + else if (kernel == VX_KERNEL_AMD_FORMAT_CONVERT_IUV_UV12) { + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset,\n" + " uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset,\n" + " uint p2_width, uint p2_height, __global uchar * p2_buf, uint p2_stride, uint p2_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " p0_buf += p0_offset;\n" + " p1_buf += p1_offset;\n" + " p2_buf += p2_offset;\n" + " p0_buf += (gy * %d) + (gx << 3);\n" // stride0 * 2 + " p1_buf += (gy * %d) + (gx << 3);\n" // stride1 * 2 + " p2_buf += (gy * %d) + (gx << 4);\n" // stride2 * 2 + " uint4 L0, L1;\n" + " L0 = *(__global uint4 *) p2_buf;\n" + " L1 = *(__global uint4 *) &p2_buf[%d];\n" // stride2 + " uint2 pU0, pV0, pU1, pV1;\n" + " pU0.s0 = amd_pack((float4)(amd_unpack0(L0.s0), amd_unpack2(L0.s0), amd_unpack0(L0.s1), amd_unpack2(L0.s1)));\n" + " pU0.s1 = amd_pack((float4)(amd_unpack0(L0.s2), amd_unpack2(L0.s2), amd_unpack0(L0.s3), amd_unpack2(L0.s3)));\n" + " pV0.s0 = amd_pack((float4)(amd_unpack1(L0.s0), amd_unpack3(L0.s0), amd_unpack1(L0.s1), amd_unpack3(L0.s1)));\n" + " pV0.s1 = amd_pack((float4)(amd_unpack1(L0.s2), amd_unpack3(L0.s2), amd_unpack1(L0.s3), amd_unpack3(L0.s3)));\n" + " pU1.s0 = amd_pack((float4)(amd_unpack0(L1.s0), amd_unpack2(L1.s0), 
amd_unpack0(L1.s1), amd_unpack2(L1.s1)));\n" + " pU1.s1 = amd_pack((float4)(amd_unpack0(L1.s2), amd_unpack2(L1.s2), amd_unpack0(L1.s3), amd_unpack2(L1.s3)));\n" + " pV1.s0 = amd_pack((float4)(amd_unpack1(L1.s0), amd_unpack3(L1.s0), amd_unpack1(L1.s1), amd_unpack3(L1.s1)));\n" + " pV1.s1 = amd_pack((float4)(amd_unpack1(L1.s2), amd_unpack3(L1.s2), amd_unpack1(L1.s3), amd_unpack3(L1.s3)));\n" + " *(__global uint2 *) p0_buf = pU0;\n" + " *(__global uint2 *)&p0_buf[%d] = pU1;\n" // stride0 + " *(__global uint2 *) p1_buf = pV0;\n" + " *(__global uint2 *)&p1_buf[%d] = pV1;\n" // stride1 + " }\n" + "}\n") + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, (width + 7) / 8, (height + 1) / 2, stride0 * 2, stride1 * 2, stride2 * 2, stride2, stride0, stride1); + node->opencl_code = item; + } + else if (kernel == VX_KERNEL_AMD_FORMAT_CONVERT_UV12_IUV) { + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset,\n" + " uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset,\n" + " uint p2_width, uint p2_height, __global uchar * p2_buf, uint p2_stride, uint p2_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " p0_buf += p0_offset;\n" + " p1_buf += p1_offset;\n" + " p2_buf += p2_offset;\n" + " p0_buf += (gy * %d) + (gx << 4);\n" // stride0 * 2 + " p1_buf += (gy * %d) + (gx << 3);\n" // stride1 * 2 + " p2_buf += (gy * %d) + (gx << 3);\n" // stride2 * 2 + " uint2 pU0 = *(__global uint2 *) p1_buf;\n" + " uint2 pU1 = *(__global uint2 *)&p1_buf[%d];\n" // stride1 + " uint2 pV0 = *(__global uint2 *) p2_buf;\n" + " uint2 pV1 = *(__global uint2 *)&p2_buf[%d];\n" // stride2 + " uint4 L0, L1;\n" + " L0.s0 = amd_pack((float4)(amd_unpack0(pU0.s0), amd_unpack0(pV0.s0), amd_unpack1(pU0.s0), amd_unpack1(pV0.s0)));\n" + " L0.s1 = amd_pack((float4)(amd_unpack2(pU0.s0), amd_unpack2(pV0.s0), amd_unpack3(pU0.s0), amd_unpack3(pV0.s0)));\n" + " L0.s2 = amd_pack((float4)(amd_unpack0(pU0.s1), amd_unpack0(pV0.s1), amd_unpack1(pU0.s1), amd_unpack1(pV0.s1)));\n" + " L0.s3 = amd_pack((float4)(amd_unpack2(pU0.s1), amd_unpack2(pV0.s1), amd_unpack3(pU0.s1), amd_unpack3(pV0.s1)));\n" + " L1.s0 = amd_pack((float4)(amd_unpack0(pU1.s0), amd_unpack0(pV1.s0), amd_unpack1(pU1.s0), amd_unpack1(pV1.s0)));\n" + " L1.s1 = amd_pack((float4)(amd_unpack2(pU1.s0), amd_unpack2(pV1.s0), amd_unpack3(pU1.s0), amd_unpack3(pV1.s0)));\n" + " L1.s2 = amd_pack((float4)(amd_unpack0(pU1.s1), amd_unpack0(pV1.s1), amd_unpack1(pU1.s1), amd_unpack1(pV1.s1)));\n" + " L1.s3 = amd_pack((float4)(amd_unpack2(pU1.s1), amd_unpack2(pV1.s1), amd_unpack3(pU1.s1), amd_unpack3(pV1.s1)));\n" + " *(__global uint4 *) p0_buf = L0;\n" + " *(__global uint4 *)&p0_buf[%d] = L1;\n" // stride0 + " }\n" + "}\n") + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, (width + 7)/ 8, (height + 1)/ 2, stride0 * 2, stride1 * 2, stride2 * 2, stride1, stride2, stride0); + node->opencl_code = item; + } + else if (kernel == VX_KERNEL_AMD_SCALE_UP_2x2_U8_U8) { + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset,\n" + " uint p1_width, uint p1_height, 
__global uchar * p1_buf, uint p1_stride, uint p1_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " p0_buf += p0_offset;\n" + " p1_buf += p1_offset;\n" + " p0_buf += (gy * %d) + (gx << 3);\n" // stride0 * 2 + " p1_buf += (gy * %d) + (gx << 2);\n" // stride1 + " uint L0 = *(__global uint *) p1_buf;\n" + " uint2 X2;\n" + " X2.s0 = amd_pack((float4)(amd_unpack0(L0), amd_unpack0(L0), amd_unpack1(L0), amd_unpack1(L0)));\n" + " X2.s1 = amd_pack((float4)(amd_unpack2(L0), amd_unpack2(L0), amd_unpack3(L0), amd_unpack3(L0)));\n" + " *(__global uint2 *) p0_buf = X2;\n" + " *(__global uint2 *)&p0_buf[%d] = X2;\n" // stride0 + " }\n" + "}\n") + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, (width + 7) / 8, (height + 1) / 2, stride0 * 2, stride1, stride0); + node->opencl_code = item; + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_FormatConvert_Chroma doesn't support kernel %s\n", node->akernel->name); + return -1; + } + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 7) >> 3) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (((height + 1) >> 1) + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following color conversions: +// VX_KERNEL_AMD_COLOR_CONVERT_IU_RGB +// VX_KERNEL_AMD_COLOR_CONVERT_IU_RGBX +// VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGB +// VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGBX +// VX_KERNEL_AMD_COLOR_CONVERT_IV_RGB +// VX_KERNEL_AMD_COLOR_CONVERT_IV_RGBX +// VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGB +// VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGBX +// VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGB +// VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGBX +// VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGB +// VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGBX +// VX_KERNEL_AMD_COLOR_CONVERT_RGB_IYUV +// VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV12 +// VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV21 +// VX_KERNEL_AMD_COLOR_CONVERT_RGB_UYVY +// VX_KERNEL_AMD_COLOR_CONVERT_RGB_YUYV +// VX_KERNEL_AMD_COLOR_CONVERT_RGBX_IYUV +// VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV12 +// VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV21 +// VX_KERNEL_AMD_COLOR_CONVERT_RGBX_UYVY +// VX_KERNEL_AMD_COLOR_CONVERT_RGBX_YUYV +// +int HafGpu_ColorConvert(AgoNode * node) +{ + int status = VX_SUCCESS; + + // configuration + vx_enum kernel = node->akernel->id; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + int pRGB_stride = 0, p422_stride = 0, pY_stride = 0, pU_stride = 0, pV_stride = 0, pUV_stride = 0; + int work_group_width = 16; + int work_group_height = 4; + bool isSourceRGB = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IU_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IV_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGB; + bool isSourceRGBX = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IU_RGBX || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGBX || + kernel == 
VX_KERNEL_AMD_COLOR_CONVERT_IV_RGBX || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGBX || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGBX || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGBX; + bool isSourceUYVY = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_UYVY || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_UYVY; + bool isSourceYUYV = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_YUYV || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_YUYV; + bool isSourceIYUV = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_IYUV || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_IYUV; + bool isSourceNV12 = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV12 || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV12; + bool isSourceNV21 = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV21 || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV21; + bool isDestinationRGB = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_IYUV || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV12 || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV21 || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_UYVY || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGB_YUYV; + bool isDestinationRGBX = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_IYUV || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV12 || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV21 || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_UYVY || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_RGBX_YUYV; + bool destinationHasY = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGBX || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGBX; + bool destinationHasUV12 = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGBX || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGBX; + bool destinationNoU = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IV_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IV_RGBX; + bool destinationNoV = + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IU_RGB || + kernel == VX_KERNEL_AMD_COLOR_CONVERT_IU_RGBX; + + // kernel header and reading + char item[8192]; + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "typedef uint2 U8x8;\n" + "typedef uint8 U24x8;\n" + "typedef uint8 U32x8;\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(") + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME); + node->opencl_code = item; + int argCount = 0; + if (isDestinationRGB) { + node->opencl_code += "uint pRGB_width, uint pRGB_height, __global uchar * pRGB_buf, uint pRGB_stride, uint pRGB_offset,\n"; + pRGB_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + } + else if (isDestinationRGBX) { + node->opencl_code += "uint pRGB_width, uint pRGB_height, __global uchar * pRGB_buf, uint pRGB_stride, uint pRGB_offset,\n"; + pRGB_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + } + else { + if (destinationHasY) { + node->opencl_code += "uint pY_width, uint pY_height, __global uchar * pY_buf, uint pY_stride, uint pY_offset,\n "; + pY_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + } + if (destinationHasUV12) { + node->opencl_code += "uint pUV_width, uint pUV_height, __global uchar * pUV_buf, uint pUV_stride, uint pUV_offset,\n "; + pUV_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + } + else { + if (!destinationNoU) { + node->opencl_code += "uint pU_width, 
uint pU_height, __global uchar * pU_buf, uint pU_stride, uint pU_offset,\n "; + pU_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + } + if (!destinationNoV) { + node->opencl_code += "uint pV_width, uint pV_height, __global uchar * pV_buf, uint pV_stride, uint pV_offset,\n "; + pV_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + } + } + } + if (isSourceRGB) { + pRGB_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + sprintf(item, + OPENCL_FORMAT( + "uint pRGB_width, uint pRGB_height, __global uchar * pRGB_buf, uint pRGB_stride, uint pRGB_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " pRGB_buf += pRGB_offset + (gy * %d) + (gx * 24);\n" // pRGB_stride * 2 + " U24x8 pRGB0, pRGB1;\n" + " pRGB0.s012 = *(__global uint3 *) pRGB_buf;\n" + " pRGB0.s345 = *(__global uint3 *)&pRGB_buf[12];\n" + " pRGB1.s012 = *(__global uint3 *)&pRGB_buf[%d];\n" // pRGB_stride + " pRGB1.s345 = *(__global uint3 *)&pRGB_buf[%d+12];\n" // pRGB_stride + ), (width + 7) / 8, (height + 1) / 2, pRGB_stride * 2, pRGB_stride, pRGB_stride); + node->opencl_code += item; + } + else if (isSourceRGBX) { + pRGB_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + sprintf(item, + OPENCL_FORMAT( + "uint pRGB_width, uint pRGB_height, __global uchar * pRGB_buf, uint pRGB_stride, uint pRGB_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " pRGB_buf += pRGB_offset + (gy * %d) + (gx << 5);\n" // pRGB_stride * 2 + " U32x8 pRGBX0, pRGBX1;\n" + " pRGBX0 = *(__global U32x8 *) pRGB_buf;\n" + " pRGBX1 = *(__global U32x8 *)&pRGB_buf[%d];\n" // pRGB_stride + ), (width + 7) / 8, (height + 1) / 2, pRGB_stride * 2, pRGB_stride); + node->opencl_code += item; + } + else if (isSourceUYVY || isSourceYUYV) { + p422_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + sprintf(item, + OPENCL_FORMAT( + "uint p422_width, uint p422_height, __global uchar * p422_buf, uint p422_stride, uint p422_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " p422_buf += p422_offset + (gy * %d) + (gx << 4);\n" // p422_stride * 2 + " uint4 L0, L1;\n" + " L0 = *(__global uint4 *) p422_buf;\n" + " L1 = *(__global uint4 *)&p422_buf[%d];\n" // p422_stride + ), (width + 7) / 8, (height + 1) / 2, p422_stride * 2, p422_stride); + node->opencl_code += item; + } + else if (isSourceIYUV) { + pY_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + pU_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + pV_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + sprintf(item, + OPENCL_FORMAT( + "uint pY_width, uint pY_height, __global uchar * pY_buf, uint pY_stride, uint pY_offset,\n " + "uint pU_width, uint pU_height, __global uchar * pU_buf, uint pU_stride, uint pU_offset,\n " + "uint pV_width, uint pV_height, __global uchar * pV_buf, uint pV_stride, uint pV_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " pY_buf += pY_offset + (gy * %d) + (gx << 3);\n" // pY_stride * 2 + " pU_buf += pU_offset + (gy * %d) + (gx << 2);\n" // pU_stride + " pV_buf += pV_offset + (gy * %d) + (gx << 2);\n" // pV_stride + " U8x8 pY0, pY1, pUV;\n" + " pY0 = *(__global U8x8 *) pY_buf;\n" + " pY1 = 
*(__global U8x8 *)&pY_buf[%d];\n" // pY_stride + " pUV.s0 = *(__global uint *) pU_buf;\n" + " pUV.s1 = *(__global uint *) pV_buf;\n" + ), (width + 7) / 8, (height + 1) / 2, pY_stride * 2, pU_stride, pV_stride, pY_stride); + node->opencl_code += item; + } + else { + pY_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + pUV_stride = node->paramList[argCount++]->u.img.stride_in_bytes; + sprintf(item, + OPENCL_FORMAT( + "uint pY_width, uint pY_height, __global uchar * pY_buf, uint pY_stride, uint pY_offset,\n " + "uint pUV_width, uint pUV_height, __global uchar * pUV_buf, uint pUV_stride, uint pUV_offset)\n" + "{\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " if ((gx < %d) && (gy < %d)) {\n" // (width+7)/8, (height+1)/2 + " pY_buf += pY_offset + (gy * %d) + (gx << 3);\n" // pY_stride * 2 + " pUV_buf += pUV_offset + (gy * %d) + (gx << 3);\n" // pUV_stride + " U8x8 pY0, pY1, pUV;\n" + " pY0 = *(__global U8x8 *) pY_buf;\n" + " pY1 = *(__global U8x8 *)&pY_buf[%d];\n" // pY_stride + " pUV = *(__global U8x8 *) pUV_buf;\n" + ), (width + 7) / 8, (height + 1) / 2, pY_stride * 2, pUV_stride, pY_stride); + node->opencl_code += item; + } + + // color conversion part + node->opencl_code += + " float4 f;\n"; + if (isSourceRGB || isSourceRGBX) { + if (isSourceRGB) { + if (destinationHasY) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pY0, pY1;\n" + " float3 cY = (float3)(0.2126f, 0.7152f, 0.0722f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(pRGB0.s0), amd_unpack1(pRGB0.s0), amd_unpack2(pRGB0.s0)));\n" + " f.s1 = dot(cY, (float3)(amd_unpack3(pRGB0.s0), amd_unpack0(pRGB0.s1), amd_unpack1(pRGB0.s1)));\n" + " f.s2 = dot(cY, (float3)(amd_unpack2(pRGB0.s1), amd_unpack3(pRGB0.s1), amd_unpack0(pRGB0.s2)));\n" + " f.s3 = dot(cY, (float3)(amd_unpack1(pRGB0.s2), amd_unpack2(pRGB0.s2), amd_unpack3(pRGB0.s2)));\n" + " pY0.s0 = amd_pack(f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(pRGB0.s3), amd_unpack1(pRGB0.s3), amd_unpack2(pRGB0.s3)));\n" + " f.s1 = dot(cY, (float3)(amd_unpack3(pRGB0.s3), amd_unpack0(pRGB0.s4), amd_unpack1(pRGB0.s4)));\n" + " f.s2 = dot(cY, (float3)(amd_unpack2(pRGB0.s4), amd_unpack3(pRGB0.s4), amd_unpack0(pRGB0.s5)));\n" + " f.s3 = dot(cY, (float3)(amd_unpack1(pRGB0.s5), amd_unpack2(pRGB0.s5), amd_unpack3(pRGB0.s5)));\n" + " pY0.s1 = amd_pack(f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(pRGB1.s0), amd_unpack1(pRGB1.s0), amd_unpack2(pRGB1.s0)));\n" + " f.s1 = dot(cY, (float3)(amd_unpack3(pRGB1.s0), amd_unpack0(pRGB1.s1), amd_unpack1(pRGB1.s1)));\n" + " f.s2 = dot(cY, (float3)(amd_unpack2(pRGB1.s1), amd_unpack3(pRGB1.s1), amd_unpack0(pRGB1.s2)));\n" + " f.s3 = dot(cY, (float3)(amd_unpack1(pRGB1.s2), amd_unpack2(pRGB1.s2), amd_unpack3(pRGB1.s2)));\n" + " pY1.s0 = amd_pack(f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(pRGB1.s3), amd_unpack1(pRGB1.s3), amd_unpack2(pRGB1.s3)));\n" + " f.s1 = dot(cY, (float3)(amd_unpack3(pRGB1.s3), amd_unpack0(pRGB1.s4), amd_unpack1(pRGB1.s4)));\n" + " f.s2 = dot(cY, (float3)(amd_unpack2(pRGB1.s4), amd_unpack3(pRGB1.s4), amd_unpack0(pRGB1.s5)));\n" + " f.s3 = dot(cY, (float3)(amd_unpack1(pRGB1.s5), amd_unpack2(pRGB1.s5), amd_unpack3(pRGB1.s5)));\n" + " pY1.s1 = amd_pack(f);\n" + ); + } + if (!destinationNoU) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pU0, pU1;\n" + " float3 cU = (float3)(-0.1146f, -0.3854f, 0.5f);\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(pRGB0.s0), amd_unpack1(pRGB0.s0), amd_unpack2(pRGB0.s0)));\n" + " f.s1 = dot(cU, (float3)(amd_unpack2(pRGB0.s1), amd_unpack3(pRGB0.s1), amd_unpack0(pRGB0.s2)));\n" 
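+ // Note on the chroma math in this RGB-source path: with cY/cU/cV as defined in the
+ // emitted lines here, the generated kernel computes the full-range BT.709 forward transform
+ //   Y =  0.2126*R + 0.7152*G + 0.0722*B
+ //   U = -0.1146*R - 0.3854*G + 0.5000*B + 128
+ //   V =  0.5000*R - 0.4542*G - 0.0458*B + 128
+ // pU0.s0/pU0.s1 hold U for even/odd pixels of row 0 (pU1.* for row 1); the amd_lerp()
+ // calls at the end of this block average the horizontal pair and then the two rows,
+ // i.e. the 2x2 box filter used for 4:2:0 chroma subsampling (amd_lerp averages
+ // corresponding bytes, with the 0x01010101 selector giving rounded averaging).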
+ " f.s2 = dot(cU, (float3)(amd_unpack0(pRGB0.s3), amd_unpack1(pRGB0.s3), amd_unpack2(pRGB0.s3)));\n" + " f.s3 = dot(cU, (float3)(amd_unpack2(pRGB0.s4), amd_unpack3(pRGB0.s4), amd_unpack0(pRGB0.s5)));\n" + " pU0.s0 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cU, (float3)(amd_unpack3(pRGB0.s0), amd_unpack0(pRGB0.s1), amd_unpack1(pRGB0.s1)));\n" + " f.s1 = dot(cU, (float3)(amd_unpack1(pRGB0.s2), amd_unpack2(pRGB0.s2), amd_unpack3(pRGB0.s2)));\n" + " f.s2 = dot(cU, (float3)(amd_unpack3(pRGB0.s3), amd_unpack0(pRGB0.s4), amd_unpack1(pRGB0.s4)));\n" + " f.s3 = dot(cU, (float3)(amd_unpack1(pRGB0.s5), amd_unpack2(pRGB0.s5), amd_unpack3(pRGB0.s5)));\n" + " pU0.s1 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(pRGB1.s0), amd_unpack1(pRGB1.s0), amd_unpack2(pRGB1.s0)));\n" + " f.s1 = dot(cU, (float3)(amd_unpack2(pRGB1.s1), amd_unpack3(pRGB1.s1), amd_unpack0(pRGB1.s2)));\n" + " f.s2 = dot(cU, (float3)(amd_unpack0(pRGB1.s3), amd_unpack1(pRGB1.s3), amd_unpack2(pRGB1.s3)));\n" + " f.s3 = dot(cU, (float3)(amd_unpack2(pRGB1.s4), amd_unpack3(pRGB1.s4), amd_unpack0(pRGB1.s5)));\n" + " pU1.s0 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cU, (float3)(amd_unpack3(pRGB1.s0), amd_unpack0(pRGB1.s1), amd_unpack1(pRGB1.s1)));\n" + " f.s1 = dot(cU, (float3)(amd_unpack1(pRGB1.s2), amd_unpack2(pRGB1.s2), amd_unpack3(pRGB1.s2)));\n" + " f.s2 = dot(cU, (float3)(amd_unpack3(pRGB1.s3), amd_unpack0(pRGB1.s4), amd_unpack1(pRGB1.s4)));\n" + " f.s3 = dot(cU, (float3)(amd_unpack1(pRGB1.s5), amd_unpack2(pRGB1.s5), amd_unpack3(pRGB1.s5)));\n" + " pU1.s1 = amd_pack(f + (float4)(128));\n" + " pU0.s0 = amd_lerp(pU0.s0, pU0.s1, 0x01010101u);\n" + " pU1.s0 = amd_lerp(pU1.s0, pU1.s1, 0x01010101u);\n" + " pU0.s0 = amd_lerp(pU0.s0, pU1.s0, 0x01010101u);\n" + ); + } + if (!destinationNoV) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pV0, pV1;\n" + " float3 cV = (float3)(0.5f, -0.4542f, -0.0458f);\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(pRGB0.s0), amd_unpack1(pRGB0.s0), amd_unpack2(pRGB0.s0)));\n" + " f.s1 = dot(cV, (float3)(amd_unpack2(pRGB0.s1), amd_unpack3(pRGB0.s1), amd_unpack0(pRGB0.s2)));\n" + " f.s2 = dot(cV, (float3)(amd_unpack0(pRGB0.s3), amd_unpack1(pRGB0.s3), amd_unpack2(pRGB0.s3)));\n" + " f.s3 = dot(cV, (float3)(amd_unpack2(pRGB0.s4), amd_unpack3(pRGB0.s4), amd_unpack0(pRGB0.s5)));\n" + " pV0.s0 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cV, (float3)(amd_unpack3(pRGB0.s0), amd_unpack0(pRGB0.s1), amd_unpack1(pRGB0.s1)));\n" + " f.s1 = dot(cV, (float3)(amd_unpack1(pRGB0.s2), amd_unpack2(pRGB0.s2), amd_unpack3(pRGB0.s2)));\n" + " f.s2 = dot(cV, (float3)(amd_unpack3(pRGB0.s3), amd_unpack0(pRGB0.s4), amd_unpack1(pRGB0.s4)));\n" + " f.s3 = dot(cV, (float3)(amd_unpack1(pRGB0.s5), amd_unpack2(pRGB0.s5), amd_unpack3(pRGB0.s5)));\n" + " pV0.s1 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(pRGB1.s0), amd_unpack1(pRGB1.s0), amd_unpack2(pRGB1.s0)));\n" + " f.s1 = dot(cV, (float3)(amd_unpack2(pRGB1.s1), amd_unpack3(pRGB1.s1), amd_unpack0(pRGB1.s2)));\n" + " f.s2 = dot(cV, (float3)(amd_unpack0(pRGB1.s3), amd_unpack1(pRGB1.s3), amd_unpack2(pRGB1.s3)));\n" + " f.s3 = dot(cV, (float3)(amd_unpack2(pRGB1.s4), amd_unpack3(pRGB1.s4), amd_unpack0(pRGB1.s5)));\n" + " pV1.s0 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cV, (float3)(amd_unpack3(pRGB1.s0), amd_unpack0(pRGB1.s1), amd_unpack1(pRGB1.s1)));\n" + " f.s1 = dot(cV, (float3)(amd_unpack1(pRGB1.s2), amd_unpack2(pRGB1.s2), amd_unpack3(pRGB1.s2)));\n" + " f.s2 = dot(cV, (float3)(amd_unpack3(pRGB1.s3), 
amd_unpack0(pRGB1.s4), amd_unpack1(pRGB1.s4)));\n" + " f.s3 = dot(cV, (float3)(amd_unpack1(pRGB1.s5), amd_unpack2(pRGB1.s5), amd_unpack3(pRGB1.s5)));\n" + " pV1.s1 = amd_pack(f + (float4)(128));\n" + " pV0.s0 = amd_lerp(pV0.s0, pV0.s1, 0x01010101u);\n" + " pV1.s0 = amd_lerp(pV1.s0, pV1.s1, 0x01010101u);\n" + " pV0.s0 = amd_lerp(pV0.s0, pV1.s0, 0x01010101u);\n" + ); + } + } + else if (isSourceRGBX) { + if (destinationHasY) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pY0, pY1;\n" + " float3 cY = (float3)(0.2126f, 0.7152f, 0.0722f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(pRGBX0.s0), amd_unpack1(pRGBX0.s0), amd_unpack2(pRGBX0.s0)));\n" + " f.s1 = dot(cY, (float3)(amd_unpack0(pRGBX0.s1), amd_unpack1(pRGBX0.s1), amd_unpack2(pRGBX0.s1)));\n" + " f.s2 = dot(cY, (float3)(amd_unpack0(pRGBX0.s2), amd_unpack1(pRGBX0.s2), amd_unpack2(pRGBX0.s2)));\n" + " f.s3 = dot(cY, (float3)(amd_unpack0(pRGBX0.s3), amd_unpack1(pRGBX0.s3), amd_unpack2(pRGBX0.s3)));\n" + " pY0.s0 = amd_pack(f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(pRGBX0.s4), amd_unpack1(pRGBX0.s4), amd_unpack2(pRGBX0.s4)));\n" + " f.s1 = dot(cY, (float3)(amd_unpack0(pRGBX0.s5), amd_unpack1(pRGBX0.s5), amd_unpack2(pRGBX0.s5)));\n" + " f.s2 = dot(cY, (float3)(amd_unpack0(pRGBX0.s6), amd_unpack1(pRGBX0.s6), amd_unpack2(pRGBX0.s6)));\n" + " f.s3 = dot(cY, (float3)(amd_unpack0(pRGBX0.s7), amd_unpack1(pRGBX0.s7), amd_unpack2(pRGBX0.s7)));\n" + " pY0.s1 = amd_pack(f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(pRGBX1.s0), amd_unpack1(pRGBX1.s0), amd_unpack2(pRGBX1.s0)));\n" + " f.s1 = dot(cY, (float3)(amd_unpack0(pRGBX1.s1), amd_unpack1(pRGBX1.s1), amd_unpack2(pRGBX1.s1)));\n" + " f.s2 = dot(cY, (float3)(amd_unpack0(pRGBX1.s2), amd_unpack1(pRGBX1.s2), amd_unpack2(pRGBX1.s2)));\n" + " f.s3 = dot(cY, (float3)(amd_unpack0(pRGBX1.s3), amd_unpack1(pRGBX1.s3), amd_unpack2(pRGBX1.s3)));\n" + " pY1.s0 = amd_pack(f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(pRGBX1.s4), amd_unpack1(pRGBX1.s4), amd_unpack2(pRGBX1.s4)));\n" + " f.s1 = dot(cY, (float3)(amd_unpack0(pRGBX1.s5), amd_unpack1(pRGBX1.s5), amd_unpack2(pRGBX1.s5)));\n" + " f.s2 = dot(cY, (float3)(amd_unpack0(pRGBX1.s6), amd_unpack1(pRGBX1.s6), amd_unpack2(pRGBX1.s6)));\n" + " f.s3 = dot(cY, (float3)(amd_unpack0(pRGBX1.s7), amd_unpack1(pRGBX1.s7), amd_unpack2(pRGBX1.s7)));\n" + " pY1.s1 = amd_pack(f);\n" + ); + } + if (!destinationNoU) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pU0, pU1;\n" + " float3 cU = (float3)(-0.1146f, -0.3854f, 0.5f);\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(pRGBX0.s0), amd_unpack1(pRGBX0.s0), amd_unpack2(pRGBX0.s0)));\n" + " f.s1 = dot(cU, (float3)(amd_unpack0(pRGBX0.s2), amd_unpack1(pRGBX0.s2), amd_unpack2(pRGBX0.s2)));\n" + " f.s2 = dot(cU, (float3)(amd_unpack0(pRGBX0.s4), amd_unpack1(pRGBX0.s4), amd_unpack2(pRGBX0.s4)));\n" + " f.s3 = dot(cU, (float3)(amd_unpack0(pRGBX0.s6), amd_unpack1(pRGBX0.s6), amd_unpack2(pRGBX0.s6)));\n" + " pU0.s0 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(pRGBX0.s1), amd_unpack1(pRGBX0.s1), amd_unpack2(pRGBX0.s1)));\n" + " f.s1 = dot(cU, (float3)(amd_unpack0(pRGBX0.s3), amd_unpack1(pRGBX0.s3), amd_unpack2(pRGBX0.s3)));\n" + " f.s2 = dot(cU, (float3)(amd_unpack0(pRGBX0.s5), amd_unpack1(pRGBX0.s5), amd_unpack2(pRGBX0.s5)));\n" + " f.s3 = dot(cU, (float3)(amd_unpack0(pRGBX0.s7), amd_unpack1(pRGBX0.s7), amd_unpack2(pRGBX0.s7)));\n" + " pU0.s1 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(pRGBX1.s0), amd_unpack1(pRGBX1.s0), amd_unpack2(pRGBX1.s0)));\n" + " f.s1 = dot(cU, 
(float3)(amd_unpack0(pRGBX1.s2), amd_unpack1(pRGBX1.s2), amd_unpack2(pRGBX1.s2)));\n" + " f.s2 = dot(cU, (float3)(amd_unpack0(pRGBX1.s4), amd_unpack1(pRGBX1.s4), amd_unpack2(pRGBX1.s4)));\n" + " f.s3 = dot(cU, (float3)(amd_unpack0(pRGBX1.s6), amd_unpack1(pRGBX1.s6), amd_unpack2(pRGBX1.s6)));\n" + " pU1.s0 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(pRGBX1.s1), amd_unpack1(pRGBX1.s1), amd_unpack2(pRGBX1.s1)));\n" + " f.s1 = dot(cU, (float3)(amd_unpack0(pRGBX1.s3), amd_unpack1(pRGBX1.s3), amd_unpack2(pRGBX1.s3)));\n" + " f.s2 = dot(cU, (float3)(amd_unpack0(pRGBX1.s5), amd_unpack1(pRGBX1.s5), amd_unpack2(pRGBX1.s5)));\n" + " f.s3 = dot(cU, (float3)(amd_unpack0(pRGBX1.s7), amd_unpack1(pRGBX1.s7), amd_unpack2(pRGBX1.s7)));\n" + " pU1.s1 = amd_pack(f + (float4)(128));\n" + " pU0.s0 = amd_lerp(pU0.s0, pU0.s1, 0x01010101u);\n" + " pU1.s0 = amd_lerp(pU1.s0, pU1.s1, 0x01010101u);\n" + " pU0.s0 = amd_lerp(pU1.s0, pU1.s0, 0x01010101u);\n" + ); + } + if (!destinationNoV) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pV0, pV1;\n" + " float3 cV = (float3)(0.5f, -0.4542f, -0.0458f);\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(pRGBX0.s0), amd_unpack1(pRGBX0.s0), amd_unpack2(pRGBX0.s0)));\n" + " f.s1 = dot(cV, (float3)(amd_unpack0(pRGBX0.s2), amd_unpack1(pRGBX0.s2), amd_unpack2(pRGBX0.s2)));\n" + " f.s2 = dot(cV, (float3)(amd_unpack0(pRGBX0.s4), amd_unpack1(pRGBX0.s4), amd_unpack2(pRGBX0.s4)));\n" + " f.s3 = dot(cV, (float3)(amd_unpack0(pRGBX0.s6), amd_unpack1(pRGBX0.s6), amd_unpack2(pRGBX0.s6)));\n" + " pV0.s0 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(pRGBX0.s1), amd_unpack1(pRGBX0.s1), amd_unpack2(pRGBX0.s1)));\n" + " f.s1 = dot(cV, (float3)(amd_unpack0(pRGBX0.s3), amd_unpack1(pRGBX0.s3), amd_unpack2(pRGBX0.s3)));\n" + " f.s2 = dot(cV, (float3)(amd_unpack0(pRGBX0.s5), amd_unpack1(pRGBX0.s5), amd_unpack2(pRGBX0.s5)));\n" + " f.s3 = dot(cV, (float3)(amd_unpack0(pRGBX0.s7), amd_unpack1(pRGBX0.s7), amd_unpack2(pRGBX0.s7)));\n" + " pV0.s1 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(pRGBX1.s0), amd_unpack1(pRGBX1.s0), amd_unpack2(pRGBX1.s0)));\n" + " f.s1 = dot(cV, (float3)(amd_unpack0(pRGBX1.s2), amd_unpack1(pRGBX1.s2), amd_unpack2(pRGBX1.s2)));\n" + " f.s2 = dot(cV, (float3)(amd_unpack0(pRGBX1.s4), amd_unpack1(pRGBX1.s4), amd_unpack2(pRGBX1.s4)));\n" + " f.s3 = dot(cV, (float3)(amd_unpack0(pRGBX1.s6), amd_unpack1(pRGBX1.s6), amd_unpack2(pRGBX1.s6)));\n" + " pV1.s0 = amd_pack(f + (float4)(128));\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(pRGBX1.s1), amd_unpack1(pRGBX1.s1), amd_unpack2(pRGBX1.s1)));\n" + " f.s1 = dot(cV, (float3)(amd_unpack0(pRGBX1.s3), amd_unpack1(pRGBX1.s3), amd_unpack2(pRGBX1.s3)));\n" + " f.s2 = dot(cV, (float3)(amd_unpack0(pRGBX1.s5), amd_unpack1(pRGBX1.s5), amd_unpack2(pRGBX1.s5)));\n" + " f.s3 = dot(cV, (float3)(amd_unpack0(pRGBX1.s7), amd_unpack1(pRGBX1.s7), amd_unpack2(pRGBX1.s7)));\n" + " pV1.s1 = amd_pack(f + (float4)(128));\n" + " pV0.s0 = amd_lerp(pV0.s0, pV0.s1, 0x01010101u);\n" + " pV1.s0 = amd_lerp(pV1.s0, pV1.s1, 0x01010101u);\n" + " pV0.s0 = amd_lerp(pV1.s0, pV1.s0, 0x01010101u);\n" + ); + } + } + if (destinationHasUV12) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pUV;\n" + " f.s0 = amd_unpack0(pU0.s0);\n" + " f.s1 = amd_unpack0(pV0.s0);\n" + " f.s2 = amd_unpack1(pU0.s0);\n" + " f.s3 = amd_unpack1(pV0.s0);\n" + " pUV.s0 = amd_pack(f);\n" + " f.s0 = amd_unpack2(pU0.s0);\n" + " f.s1 = amd_unpack2(pV0.s0);\n" + " f.s2 = amd_unpack3(pU0.s0);\n" + " f.s3 = amd_unpack3(pV0.s0);\n" 
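+ // The pack sequence around this comment interleaves the downsampled chroma bytes as
+ // U0 V0 U1 V1 ... U3 V3, i.e. one semi-planar UV line in the byte order used by the
+ // NV12/UV12 destination; a single UV line is produced per 2x2 block of input pixels.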
+ " pUV.s1 = amd_pack(f);\n" + ); + if (destinationHasY) { + sprintf(item, + OPENCL_FORMAT( + " pY_buf += pY_offset + (gy * %d) + (gx << 3);\n" // pY_stride * 2 + " pUV_buf += pUV_offset + (gy * %d) + (gx << 3);\n" // pUV_stride + " *(__global U8x8 *) pY_buf = pY0;\n" + " *(__global U8x8 *)&pY_buf[%d] = pY1;\n" // pY_stride + " *(__global U8x8 *) pUV_buf = pUV;\n" + ), pY_stride * 2, pUV_stride, pY_stride); + node->opencl_code += item; + } + else { + sprintf(item, + OPENCL_FORMAT( + " pUV_buf += pUV_offset + (gy * %d) + (gx << 3);\n" // pUV_stride + " *(__global U8x8 *) pUV_buf = pUV;\n" + ), pUV_stride); + node->opencl_code += item; + } + } + else if (destinationHasY) { + sprintf(item, + OPENCL_FORMAT( + " pY_buf += pY_offset + (gy * %d) + (gx << 3);\n" // pY_stride * 2 + " pU_buf += pU_offset + (gy * %d) + (gx << 2);\n" // pU_stride + " pV_buf += pV_offset + (gy * %d) + (gx << 2);\n" // pV_stride + " *(__global U8x8 *) pY_buf = pY0;\n" + " *(__global U8x8 *)&pY_buf[%d] = pY1;\n" // pY_stride + " *(__global uint *) pU_buf = pU0.s0;\n" + " *(__global uint *) pV_buf = pV0.s0;\n" + ), pY_stride * 2, pU_stride, pV_stride, pY_stride); + node->opencl_code += item; + } + else { + if (!destinationNoU) { + sprintf(item, + OPENCL_FORMAT( + " pU_buf += pU_offset + (gy * %d) + (gx << 2);\n" // pU_stride + " *(__global uint *) pU_buf = pU0.s0;\n" + ), pU_stride); + node->opencl_code += item; + } + if (!destinationNoV) { + sprintf(item, + OPENCL_FORMAT( + " pV_buf += pV_offset + (gy * %d) + (gx << 2);\n" // pV_stride + " *(__global uint *) pV_buf = pV0.s0;\n" + ), pV_stride); + node->opencl_code += item; + } + } + } + else { + if (isSourceUYVY) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pY0, pY1;\n" + " U8x8 pU0, pU1;\n" + " U8x8 pV0, pV1;\n" + " pY0.s0 = amd_pack((float4)(amd_unpack1(L0.s0), amd_unpack3(L0.s0), amd_unpack1(L0.s1), amd_unpack3(L0.s1)));\n" + " pY0.s1 = amd_pack((float4)(amd_unpack1(L0.s2), amd_unpack3(L0.s2), amd_unpack1(L0.s3), amd_unpack3(L0.s3)));\n" + " pY1.s0 = amd_pack((float4)(amd_unpack1(L1.s0), amd_unpack3(L1.s0), amd_unpack1(L1.s1), amd_unpack3(L1.s1)));\n" + " pY1.s1 = amd_pack((float4)(amd_unpack1(L1.s2), amd_unpack3(L1.s2), amd_unpack1(L1.s3), amd_unpack3(L1.s3)));\n" + " pU0.s0 = amd_pack((float4)(amd_unpack0(L0.s0), amd_unpack0(L0.s0), amd_unpack0(L0.s1), amd_unpack0(L0.s1)));\n" + " pU0.s1 = amd_pack((float4)(amd_unpack0(L0.s2), amd_unpack0(L0.s2), amd_unpack0(L0.s3), amd_unpack0(L0.s3)));\n" + " pU1.s0 = amd_pack((float4)(amd_unpack0(L1.s0), amd_unpack0(L1.s0), amd_unpack0(L1.s1), amd_unpack0(L1.s1)));\n" + " pU1.s1 = amd_pack((float4)(amd_unpack0(L1.s2), amd_unpack0(L1.s2), amd_unpack0(L1.s3), amd_unpack0(L1.s3)));\n" + " pV0.s0 = amd_pack((float4)(amd_unpack2(L0.s0), amd_unpack2(L0.s0), amd_unpack2(L0.s1), amd_unpack2(L0.s1)));\n" + " pV0.s1 = amd_pack((float4)(amd_unpack2(L0.s2), amd_unpack2(L0.s2), amd_unpack2(L0.s3), amd_unpack2(L0.s3)));\n" + " pV1.s0 = amd_pack((float4)(amd_unpack2(L1.s0), amd_unpack2(L1.s0), amd_unpack2(L1.s1), amd_unpack2(L1.s1)));\n" + " pV1.s1 = amd_pack((float4)(amd_unpack2(L1.s2), amd_unpack2(L1.s2), amd_unpack2(L1.s3), amd_unpack2(L1.s3)));\n" + ); + } + else if (isSourceYUYV) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pY0, pY1;\n" + " U8x8 pU0, pU1;\n" + " U8x8 pV0, pV1;\n" + " pY0.s0 = amd_pack((float4)(amd_unpack0(L0.s0), amd_unpack2(L0.s0), amd_unpack0(L0.s1), amd_unpack2(L0.s1)));\n" + " pY0.s1 = amd_pack((float4)(amd_unpack0(L0.s2), amd_unpack2(L0.s2), amd_unpack0(L0.s3), amd_unpack2(L0.s3)));\n" + " pY1.s0 = 
amd_pack((float4)(amd_unpack0(L1.s0), amd_unpack2(L1.s0), amd_unpack0(L1.s1), amd_unpack2(L1.s1)));\n" + " pY1.s1 = amd_pack((float4)(amd_unpack0(L1.s2), amd_unpack2(L1.s2), amd_unpack0(L1.s3), amd_unpack2(L1.s3)));\n" + " pU0.s0 = amd_pack((float4)(amd_unpack1(L0.s0), amd_unpack1(L0.s0), amd_unpack1(L0.s1), amd_unpack1(L0.s1)));\n" + " pU0.s1 = amd_pack((float4)(amd_unpack1(L0.s2), amd_unpack1(L0.s2), amd_unpack1(L0.s3), amd_unpack1(L0.s3)));\n" + " pU1.s0 = amd_pack((float4)(amd_unpack1(L1.s0), amd_unpack1(L1.s0), amd_unpack1(L1.s1), amd_unpack1(L1.s1)));\n" + " pU1.s1 = amd_pack((float4)(amd_unpack1(L1.s2), amd_unpack1(L1.s2), amd_unpack1(L1.s3), amd_unpack1(L1.s3)));\n" + " pV0.s0 = amd_pack((float4)(amd_unpack3(L0.s0), amd_unpack3(L0.s0), amd_unpack3(L0.s1), amd_unpack3(L0.s1)));\n" + " pV0.s1 = amd_pack((float4)(amd_unpack3(L0.s2), amd_unpack3(L0.s2), amd_unpack3(L0.s3), amd_unpack3(L0.s3)));\n" + " pV1.s0 = amd_pack((float4)(amd_unpack3(L1.s0), amd_unpack3(L1.s0), amd_unpack3(L1.s1), amd_unpack3(L1.s1)));\n" + " pV1.s1 = amd_pack((float4)(amd_unpack3(L1.s2), amd_unpack3(L1.s2), amd_unpack3(L1.s3), amd_unpack3(L1.s3)));\n" + ); + } + else if (isSourceIYUV) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pU0, pU1;\n" + " U8x8 pV0, pV1;\n" + " f.s0 = amd_unpack0(pUV.s0); f.s1 = f.s0;\n" + " f.s2 = amd_unpack1(pUV.s0); f.s3 = f.s2;\n" + " pU0.s0 = amd_pack(f);\n" + " f.s0 = amd_unpack2(pUV.s0); f.s1 = f.s0;\n" + " f.s2 = amd_unpack3(pUV.s0); f.s3 = f.s2;\n" + " pU0.s1 = amd_pack(f);\n" + " pU1.s0 = pU0.s0;\n" + " pU1.s1 = pU0.s1;\n" + " f.s0 = amd_unpack0(pUV.s1); f.s1 = f.s0;\n" + " f.s2 = amd_unpack1(pUV.s1); f.s3 = f.s2;\n" + " pV0.s0 = amd_pack(f);\n" + " f.s0 = amd_unpack2(pUV.s1); f.s1 = f.s0;\n" + " f.s2 = amd_unpack3(pUV.s1); f.s3 = f.s2;\n" + " pV0.s1 = amd_pack(f);\n" + " pV1.s0 = pV0.s0;\n" + " pV1.s1 = pV0.s1;\n" + ); + } + else if (isSourceNV12) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pU0, pU1;\n" + " U8x8 pV0, pV1;\n" + " f.s0 = amd_unpack0(pUV.s0); f.s1 = f.s0;\n" + " f.s2 = amd_unpack2(pUV.s0); f.s3 = f.s2;\n" + " pU0.s0 = amd_pack(f);\n" + " f.s0 = amd_unpack0(pUV.s1); f.s1 = f.s0;\n" + " f.s2 = amd_unpack2(pUV.s1); f.s3 = f.s2;\n" + " pU0.s1 = amd_pack(f);\n" + " pU1.s0 = pU0.s0;\n" + " pU1.s1 = pU0.s1;\n" + " f.s0 = amd_unpack1(pUV.s0); f.s1 = f.s0;\n" + " f.s2 = amd_unpack3(pUV.s0); f.s3 = f.s2;\n" + " pV0.s0 = amd_pack(f);\n" + " f.s0 = amd_unpack1(pUV.s1); f.s1 = f.s0;\n" + " f.s2 = amd_unpack3(pUV.s1); f.s3 = f.s2;\n" + " pV0.s1 = amd_pack(f);\n" + " pV1.s0 = pV0.s0;\n" + " pV1.s1 = pV0.s1;\n" + ); + } + else if (isSourceNV21) { + node->opencl_code += + OPENCL_FORMAT( + " U8x8 pU0, pU1;\n" + " U8x8 pV0, pV1;\n" + " f.s0 = amd_unpack1(pUV.s0); f.s1 = f.s0;\n" + " f.s2 = amd_unpack3(pUV.s0); f.s3 = f.s2;\n" + " pU0.s0 = amd_pack(f);\n" + " f.s0 = amd_unpack1(pUV.s1); f.s1 = f.s0;\n" + " f.s2 = amd_unpack3(pUV.s1); f.s3 = f.s2;\n" + " pU0.s1 = amd_pack(f);\n" + " pU1.s0 = pU0.s0;\n" + " pU1.s1 = pU0.s1;\n" + " f.s0 = amd_unpack0(pUV.s0); f.s1 = f.s0;\n" + " f.s2 = amd_unpack2(pUV.s0); f.s3 = f.s2;\n" + " pV0.s0 = amd_pack(f);\n" + " f.s0 = amd_unpack0(pUV.s1); f.s1 = f.s0;\n" + " f.s2 = amd_unpack2(pUV.s1); f.s3 = f.s2;\n" + " pV0.s1 = amd_pack(f);\n" + " pV1.s0 = pV0.s0;\n" + " pV1.s1 = pV0.s1;\n" + ); + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_ColorConvert doesn't support kernel %s\n", node->akernel->name); + return -1; + } + if (isDestinationRGB) { + sprintf(item, + OPENCL_FORMAT( + " float2 cR = (float2)( 0.0000f, 
1.5748f);\n" + " float2 cG = (float2)(-0.1873f, -0.4681f);\n" + " float2 cB = (float2)( 1.8556f, 0.0000f);\n" + " float3 yuv; U24x8 pRGB0, pRGB1;\n" + " yuv.s0 = amd_unpack0(pY0.s0); yuv.s1 = amd_unpack0(pU0.s0); yuv.s2 = amd_unpack0(pV0.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack1(pY0.s0); yuv.s1 = amd_unpack1(pU0.s0); yuv.s2 = amd_unpack1(pV0.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s3 = mad(cR.s1, yuv.s2, yuv.s0); pRGB0.s0 = amd_pack(f); f.s0 = mad(cG.s0, yuv.s1, yuv.s0); f.s0 = mad(cG.s1, yuv.s2, f.s0); f.s1 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack2(pY0.s0); yuv.s1 = amd_unpack2(pU0.s0); yuv.s2 = amd_unpack2(pV0.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s2 = mad(cR.s1, yuv.s2, yuv.s0); f.s3 = mad(cG.s0, yuv.s1, yuv.s0); f.s3 = mad(cG.s1, yuv.s2, f.s3); pRGB0.s1 = amd_pack(f); f.s0 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack3(pY0.s0); yuv.s1 = amd_unpack3(pU0.s0); yuv.s2 = amd_unpack3(pV0.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s1 = mad(cR.s1, yuv.s2, yuv.s0); f.s2 = mad(cG.s0, yuv.s1, yuv.s0); f.s2 = mad(cG.s1, yuv.s2, f.s2); f.s3 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s2 = amd_pack(f);\n" + " yuv.s0 = amd_unpack0(pY0.s1); yuv.s1 = amd_unpack0(pU0.s1); yuv.s2 = amd_unpack0(pV0.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack1(pY0.s1); yuv.s1 = amd_unpack1(pU0.s1); yuv.s2 = amd_unpack1(pV0.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s3 = mad(cR.s1, yuv.s2, yuv.s0); pRGB0.s3 = amd_pack(f); f.s0 = mad(cG.s0, yuv.s1, yuv.s0); f.s0 = mad(cG.s1, yuv.s2, f.s0); f.s1 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack2(pY0.s1); yuv.s1 = amd_unpack2(pU0.s1); yuv.s2 = amd_unpack2(pV0.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s2 = mad(cR.s1, yuv.s2, yuv.s0); f.s3 = mad(cG.s0, yuv.s1, yuv.s0); f.s3 = mad(cG.s1, yuv.s2, f.s3); pRGB0.s4 = amd_pack(f); f.s0 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack3(pY0.s1); yuv.s1 = amd_unpack3(pU0.s1); yuv.s2 = amd_unpack3(pV0.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s1 = mad(cR.s1, yuv.s2, yuv.s0); f.s2 = mad(cG.s0, yuv.s1, yuv.s0); f.s2 = mad(cG.s1, yuv.s2, f.s2); f.s3 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s5 = amd_pack(f);\n" + " yuv.s0 = amd_unpack0(pY1.s0); yuv.s1 = amd_unpack0(pU1.s0); yuv.s2 = amd_unpack0(pV1.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack1(pY1.s0); yuv.s1 = amd_unpack1(pU1.s0); yuv.s2 = amd_unpack1(pV1.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s3 = mad(cR.s1, yuv.s2, yuv.s0); pRGB1.s0 = amd_pack(f); f.s0 = mad(cG.s0, yuv.s1, yuv.s0); f.s0 = mad(cG.s1, yuv.s2, f.s0); f.s1 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack2(pY1.s0); yuv.s1 = amd_unpack2(pU1.s0); yuv.s2 = amd_unpack2(pV1.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s2 = mad(cR.s1, yuv.s2, yuv.s0); f.s3 = mad(cG.s0, yuv.s1, yuv.s0); f.s3 = mad(cG.s1, yuv.s2, f.s3); pRGB1.s1 = amd_pack(f); f.s0 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack3(pY1.s0); yuv.s1 = amd_unpack3(pU1.s0); yuv.s2 = amd_unpack3(pV1.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s1 = 
mad(cR.s1, yuv.s2, yuv.s0); f.s2 = mad(cG.s0, yuv.s1, yuv.s0); f.s2 = mad(cG.s1, yuv.s2, f.s2); f.s3 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s2 = amd_pack(f);\n" + " yuv.s0 = amd_unpack0(pY1.s1); yuv.s1 = amd_unpack0(pU1.s1); yuv.s2 = amd_unpack0(pV1.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack1(pY1.s1); yuv.s1 = amd_unpack1(pU1.s1); yuv.s2 = amd_unpack1(pV1.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s3 = mad(cR.s1, yuv.s2, yuv.s0); pRGB1.s3 = amd_pack(f); f.s0 = mad(cG.s0, yuv.s1, yuv.s0); f.s0 = mad(cG.s1, yuv.s2, f.s0); f.s1 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack2(pY1.s1); yuv.s1 = amd_unpack2(pU1.s1); yuv.s2 = amd_unpack2(pV1.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s2 = mad(cR.s1, yuv.s2, yuv.s0); f.s3 = mad(cG.s0, yuv.s1, yuv.s0); f.s3 = mad(cG.s1, yuv.s2, f.s3); pRGB1.s4 = amd_pack(f); f.s0 = mad(cB.s0, yuv.s1, yuv.s0);\n" + " yuv.s0 = amd_unpack3(pY1.s1); yuv.s1 = amd_unpack3(pU1.s1); yuv.s2 = amd_unpack3(pV1.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s1 = mad(cR.s1, yuv.s2, yuv.s0); f.s2 = mad(cG.s0, yuv.s1, yuv.s0); f.s2 = mad(cG.s1, yuv.s2, f.s2); f.s3 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s5 = amd_pack(f);\n" + " pRGB_buf += pRGB_offset + (gy * %d) + (gx * 24);\n" // pRGB_stride * 2 + " *(__global uint3 *) pRGB_buf = pRGB0.s012;\n" + " *(__global uint3 *)&pRGB_buf[12] = pRGB0.s345;\n" + " *(__global uint3 *)&pRGB_buf[%d] = pRGB1.s012;\n" // pRGB_stride + " *(__global uint3 *)&pRGB_buf[%d+12] = pRGB1.s345;\n" // pRGB_stride + ), pRGB_stride * 2, pRGB_stride, pRGB_stride); + node->opencl_code += item; + } + else if (isDestinationRGBX) { + sprintf(item, + OPENCL_FORMAT( + " float2 cR = (float2)( 0.0000f, 1.5748f);\n" + " float2 cG = (float2)(-0.1873f, -0.4681f);\n" + " float2 cB = (float2)( 1.8556f, 0.0000f);\n" + " float3 yuv; f.s3 = 255.0f; U32x8 pRGB0, pRGB1;\n" + " yuv.s0 = amd_unpack0(pY0.s0); yuv.s1 = amd_unpack0(pU0.s0); yuv.s2 = amd_unpack0(pV0.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s0 = amd_pack(f);\n" + " yuv.s0 = amd_unpack1(pY0.s0); yuv.s1 = amd_unpack1(pU0.s0); yuv.s2 = amd_unpack1(pV0.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s1 = amd_pack(f);\n" + " yuv.s0 = amd_unpack2(pY0.s0); yuv.s1 = amd_unpack2(pU0.s0); yuv.s2 = amd_unpack2(pV0.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s2 = amd_pack(f);\n" + " yuv.s0 = amd_unpack3(pY0.s0); yuv.s1 = amd_unpack3(pU0.s0); yuv.s2 = amd_unpack3(pV0.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s3 = amd_pack(f);\n" + " yuv.s0 = amd_unpack0(pY0.s1); yuv.s1 = amd_unpack0(pU0.s1); yuv.s2 = amd_unpack0(pV0.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s4 = amd_pack(f);\n" + " yuv.s0 = 
amd_unpack1(pY0.s1); yuv.s1 = amd_unpack1(pU0.s1); yuv.s2 = amd_unpack1(pV0.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s5 = amd_pack(f);\n" + " yuv.s0 = amd_unpack2(pY0.s1); yuv.s1 = amd_unpack2(pU0.s1); yuv.s2 = amd_unpack2(pV0.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s6 = amd_pack(f);\n" + " yuv.s0 = amd_unpack3(pY0.s1); yuv.s1 = amd_unpack3(pU0.s1); yuv.s2 = amd_unpack3(pV0.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB0.s7 = amd_pack(f);\n" + " yuv.s0 = amd_unpack0(pY1.s0); yuv.s1 = amd_unpack0(pU1.s0); yuv.s2 = amd_unpack0(pV1.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s0 = amd_pack(f);\n" + " yuv.s0 = amd_unpack1(pY1.s0); yuv.s1 = amd_unpack1(pU1.s0); yuv.s2 = amd_unpack1(pV1.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s1 = amd_pack(f);\n" + " yuv.s0 = amd_unpack2(pY1.s0); yuv.s1 = amd_unpack2(pU1.s0); yuv.s2 = amd_unpack2(pV1.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s2 = amd_pack(f);\n" + " yuv.s0 = amd_unpack3(pY1.s0); yuv.s1 = amd_unpack3(pU1.s0); yuv.s2 = amd_unpack3(pV1.s0); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s3 = amd_pack(f);\n" + " yuv.s0 = amd_unpack0(pY1.s1); yuv.s1 = amd_unpack0(pU1.s1); yuv.s2 = amd_unpack0(pV1.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s4 = amd_pack(f);\n" + " yuv.s0 = amd_unpack1(pY1.s1); yuv.s1 = amd_unpack1(pU1.s1); yuv.s2 = amd_unpack1(pV1.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s5 = amd_pack(f);\n" + " yuv.s0 = amd_unpack2(pY1.s1); yuv.s1 = amd_unpack2(pU1.s1); yuv.s2 = amd_unpack2(pV1.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s6 = amd_pack(f);\n" + " yuv.s0 = amd_unpack3(pY1.s1); yuv.s1 = amd_unpack3(pU1.s1); yuv.s2 = amd_unpack3(pV1.s1); yuv.s1 -= 128.0f;; yuv.s2 -= 128.0f;\n" + " f.s0 = mad(cR.s1, yuv.s2, yuv.s0); f.s1 = mad(cG.s0, yuv.s1, yuv.s0); f.s1 = mad(cG.s1, yuv.s2, f.s1); f.s2 = mad(cB.s0, yuv.s1, yuv.s0); pRGB1.s7 = amd_pack(f);\n" + " pRGB_buf += pRGB_offset + (gy * %d) + (gx << 5);\n" // pRGB_stride * 2 + " *(__global U32x8 *) pRGB_buf = pRGB0;\n" + " *(__global U32x8 *)&pRGB_buf[%d] = pRGB1;\n" // pRGB_stride + ), pRGB_stride * 2, pRGB_stride); + 
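+ // Both the RGB and RGBX destination paths above apply the full-range BT.709 inverse
+ // transform using cR/cG/cB:
+ //   R = Y + 1.5748*(V-128)
+ //   G = Y - 0.1873*(U-128) - 0.4681*(V-128)
+ //   B = Y + 1.8556*(U-128)
+ // For RGBX output, f.s3 is preset to 255.0f so amd_pack() fills the fourth (X) byte
+ // of every pixel with 255.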
node->opencl_code += item; + } + } + node->opencl_code += + " }\n" + "}\n" + ; + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 7) >> 3) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (((height + 1) >> 1) + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +#endif diff --git a/openvx/ago/ago_haf_gpu_corners.cpp b/openvx/ago/ago_haf_gpu_corners.cpp new file mode 100644 index 0000000..984546d --- /dev/null +++ b/openvx/ago/ago_haf_gpu_corners.cpp @@ -0,0 +1,366 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#include "ago_haf_gpu.h" + +#if ENABLE_OPENCL + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for following fast corner detector kernels: +// VX_KERNEL_AMD_FAST_CORNERS_XY_U8_NOSUPRESSION, VX_KERNEL_AMD_FAST_CORNERS_XY_U8_SUPRESSION, +// +int HafGpu_FastCorners_XY_U8(AgoNode * node) +{ + std::string code; + char item[8192]; + int status = VX_SUCCESS; + bool useNonMax = (node->akernel->id == VX_KERNEL_AMD_FAST_CORNERS_XY_U8_SUPRESSION); + + // configuration + AgoData * cornerList = node->paramList[0]; + AgoData * numCorners = node->paramList[1]; + AgoData * inputImg = node->paramList[2]; + AgoData * inputThr = node->paramList[3]; + int work_group_width = 16; + int work_group_height = 16; + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_work_dim = 2; + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + node->opencl_param_discard_mask = 0; + node->opencl_param_atomic_mask = (1 << 0); + node->opencl_local_buffer_usage_mask = 0; + node->opencl_local_buffer_size_in_bytes = 0; + node->opencl_scalar_array_output_sync.enable = false; + if (numCorners) { + // discard the scalar argument and inform the framework that it needs to be synched with array output numitems + node->opencl_param_discard_mask = (1 << 1); + node->opencl_scalar_array_output_sync.enable = true; + node->opencl_scalar_array_output_sync.paramIndexArray = 0; + node->opencl_scalar_array_output_sync.paramIndexScalar = 1; + } + + if (useNonMax) + { + // FAST with non-max supression + + // OpenCL work items + node->opencl_global_work[0] = (size_t) ceil((inputImg->u.img.width - 4)/14)*16; + node->opencl_global_work[1] = (size_t) ceil((inputImg->u.img.height - 4)/14)*16; + + // Pragma, data structure declarations and helper functions + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#define MASK_EARLY_EXIT 4369\n\n" //((1<<0) | (1<<4) | (1<<8) | (1<<12)) + "typedef struct {\n" + "\t int x;\n" + "\t int y;\n" + "\t float strength;\n" + "\t float scale;\n" + "\t float orientation;\n" + "\t int tracking_status;\n" + "\t float error;\n" + "} KeyPt;\n\n" + "inline int getScore(int * boundary) {\n" + "\t int strength, tmp = 0;\n" + "\t for (int i = 0; i < 16; i += 2) {\n" + "\t\t int s = min(boundary[(i + 1) & 15], boundary[(i + 2) & 15]);\n" + "\t\t s = min(s, boundary[(i + 3) & 15]);\n" + "\t\t s = min(s, boundary[(i + 4) & 15]);\n" + "\t\t s = min(s, boundary[(i + 5) & 15]);\n" + "\t\t s = min(s, boundary[(i + 6) & 15]);\n" + "\t\t s = min(s, boundary[(i + 7) & 15]);\n" + "\t\t s = min(s, boundary[(i + 8) & 15]);\n" + "\t\t tmp = max(tmp, min(s, boundary[i & 15]));\n" + "\t\t tmp = max(tmp, min(s, boundary[(i + 9) & 15]));\n" + "\t }\n" + "\t strength = -tmp;\n" + "\t for (int i = 0; i < 16; i += 2) {\n" + "\t\t int s = max(boundary[(i + 1) & 15], boundary[(i + 2) & 15]);\n" + "\t\t s = max(s, boundary[(i + 3) & 15]);\n" + "\t\t s = max(s, boundary[(i + 4) & 15]);\n" + "\t\t s = max(s, boundary[(i + 5) & 15]);\n" + "\t\t s = max(s, boundary[(i + 6) & 15]);\n" + "\t\t s = max(s, boundary[(i + 7) & 15]);\n" + "\t\t s = max(s, boundary[(i + 8) & 15]);\n" + "\t\t strength = min(strength, max(s, boundary[i & 15]));\n" + "\t\t strength = min(strength, max(s, boundary[(i + 9) & 15]));\n" + "\t }\n" + "\t return(-strength-1);\n } \n" + ) + 
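+ // getScore(), emitted above, computes the FAST-9 corner response: boundary[i] holds
+ // (center - ring[i]), so the first loop finds the largest t for which some arc of 9
+ // contiguous ring pixels is darker than the center by at least t, the second loop runs
+ // the symmetric test for brighter arcs, and the larger of the two (less one) is returned
+ // as the corner strength used by the non-max suppression further down.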
); + code = item; + + // function declaration + sprintf(item, + OPENCL_FORMAT( + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(__global char * corner_buf, uint corner_buf_offset, uint corner_capacity, uint img_width, uint img_height, __global uchar * img_buf, uint img_stride, uint img_offset, float strength_thresh)\n" + "{\n" + ) + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME); + code += item; + + sprintf(item, + OPENCL_FORMAT( + "\t int lidx = (int) get_local_id(0);\n" + "\t int lidy = (int)get_local_id(1);\n" + "\t int gidx = (int)get_group_id(0);\n" + "\t int gidy = (int)get_group_id(1);\n" + "\t int xoffset = gidx * 14 + lidx + 2;\n" + "\t int yoffset = gidy * 14 + lidy + 2;\n" + "\t __global const uchar * pTempImg = img_buf + img_offset + mad24(yoffset, (int)img_stride, xoffset);\n" + "\t __local int pLocalStrengthShare[16][16];\n" + "\t bool doCompute = true;\n" + "\t if((xoffset > (int)img_width - 3) || (yoffset > (int)img_height - 3) || (xoffset < 3) || (yoffset < 3)) {\n" + "\t\t doCompute = false;\n" + "\t\t pLocalStrengthShare[lidy][lidx] = 0;\n \t}\n" + "\t int local_strength;\n" + "\t if(doCompute) {\n" + "\t\t int boundary[16];\n" + "\t\t int pos_mask, neg_mask, offs;\n" + "\t\t int centerPixel_neg = pTempImg[0];\n" + "\t\t for(int i = 0; i < 16; i++)\n" + "\t\t\t boundary[i] = centerPixel_neg;\n" + "\t\t int centerPixel_pos = centerPixel_neg + (int)strength_thresh;\n" + "\t\t centerPixel_neg -= (int) strength_thresh;\n" + "\t\t int candp = pTempImg[3];\n" + "\t\t int candn = pTempImg[-3];\n" + "\t\t neg_mask = (candp < centerPixel_neg) | ((candn < centerPixel_neg) << 8);\n" + "\t\t pos_mask = (candp > centerPixel_pos) | ((candn > centerPixel_pos) << 8);\n" + "\t\t boundary[0] -= candp;\n" + "\t\t boundary[8] -= candn;\n" + "\t\t offs = -img_stride*3;\n" + "\t\t candp = pTempImg[offs];\n" + "\t\t candn = pTempImg[-offs];\n" + "\t\t neg_mask |= (((candp < centerPixel_neg) << 4) | ((candn < centerPixel_neg) << 12));\n" + "\t\t pos_mask |= (((candp > centerPixel_pos) << 4) | ((candn > centerPixel_pos) << 12));\n" + "\t\t boundary[4] -= candp;\n" + "\t\t boundary[12] -= candn;\n" + "\t\t if(((pos_mask | neg_mask) & MASK_EARLY_EXIT) == 0) {\n" + "\t\t\t pLocalStrengthShare[lidy][lidx] = 0;\n" + "\t\t\t doCompute = false;\n \t\t }\n" + "\t\t else {\n" + "\t\t\t offs = -img_stride*3 + 1;\n" + "\t\t\t candp = pTempImg[offs];\n" + "\t\t\t candn = pTempImg[-offs];\n" + "\t\t\t neg_mask |= (((candp < centerPixel_neg) << 3) | ((candn < centerPixel_neg) << 11));\n" + "\t\t\t pos_mask |= (((candp > centerPixel_pos) << 3) | ((candn > centerPixel_pos) << 11));\n" + "\t\t\t boundary[3] -= candp;\n" + "\t\t\t boundary[11] -= candn;\n" + "\t\t\t offs = -img_stride*3 - 1;\n" + "\t\t\t candp = pTempImg[offs];\n" + "\t\t\t candn = pTempImg[-offs];\n" + "\t\t\t neg_mask |= (((candp < centerPixel_neg) << 5) | ((candn < centerPixel_neg) << 13));\n" + "\t\t\t pos_mask |= (((candp > centerPixel_pos) << 5) | ((candn > centerPixel_pos) << 13));\n" + "\t\t\t boundary[5] -= candp;\n" + "\t\t\t boundary[13] -= candn;\n" + "\t\t\t offs = -(img_stride<<1) + 2;\n" + "\t\t\t candp = pTempImg[offs];\n" + "\t\t\t candn = pTempImg[-offs];\n" + "\t\t\t neg_mask |= (((candp < centerPixel_neg) << 2) | ((candn < centerPixel_neg) << 10));\n" + "\t\t\t pos_mask |= (((candp > centerPixel_pos) << 2) | ((candn > centerPixel_pos) << 10));\n" + "\t\t\t boundary[2] -= candp;\n" + "\t\t\t boundary[10] -= candn;\n" + "\t\t\t offs = -(img_stride<<1) - 2;\n" + "\t\t\t candp = 
pTempImg[offs];\n" + "\t\t\t candn = pTempImg[-offs];\n" + "\t\t\t neg_mask |= (((candp < centerPixel_neg) << 6) | ((candn < centerPixel_neg) << 14));\n" + "\t\t\t pos_mask |= (((candp > centerPixel_pos) << 6) | ((candn > centerPixel_pos) << 14));\n" + "\t\t\t boundary[6] -= candp;\n" + "\t\t\t boundary[14] -= candn;\n" + "\t\t\t offs = -img_stride + 3;\n" + "\t\t\t candp = pTempImg[offs];\n" + "\t\t\t candn = pTempImg[-offs];\n" + "\t\t\t neg_mask |= (((candp < centerPixel_neg) << 1) | ((candn < centerPixel_neg) << 9));\n" + "\t\t\t pos_mask |= (((candp > centerPixel_pos) << 1) | ((candn > centerPixel_pos) << 9));\n" + "\t\t\t boundary[1] -= candp;\n" + "\t\t\t boundary[9] -= candn;\n" + "\t\t\t offs = -img_stride - 3;\n" + "\t\t\t candp = pTempImg[offs];\n" + "\t\t\t candn = pTempImg[-offs];\n" + "\t\t\t neg_mask |= (((candp < centerPixel_neg) << 7) | ((candn < centerPixel_neg) << 15));\n" + "\t\t\t pos_mask |= (((candp > centerPixel_pos) << 7) | ((candn > centerPixel_pos) << 15));\n" + "\t\t\t boundary[7] -= candp;\n" + "\t\t\t boundary[15] -= candn;\n" + "\t\t\t pos_mask |= (pos_mask << 16);\n" + "\t\t\t neg_mask |= (neg_mask << 16);\n" + "\t\t\t int cornerMask = 511;\n" + "\t\t\t int isCorner = 0;\n" + "\t\t\t for (int i = 0; i < 16; i++) {\n" + "\t\t\t\t isCorner += ((pos_mask & cornerMask) == cornerMask);\n" + "\t\t\t\t isCorner += ((neg_mask & cornerMask) == cornerMask);\n" + "\t\t\t\t pos_mask >>= 1;\n" + "\t\t\t\t neg_mask >>= 1;\n\t\t\t }\n" + "\t\t\t if(isCorner == 0) {\n" + "\t\t\t\t pLocalStrengthShare[lidy][lidx] = 0;\n" + "\t\t\t\t doCompute = false;\n\t\t\t }\n" + "\t\t\t else {\n" + "\t\t\t\t local_strength = getScore(boundary);\n" + "\t\t\t\t pLocalStrengthShare[lidy][lidx] = local_strength;\n\t\t\t }\n" + "\t\t }\n\t }\n" + "\t barrier(CLK_LOCAL_MEM_FENCE);\n\n" + "\t bool writeCorner = doCompute && (local_strength >= pLocalStrengthShare[lidy-1][lidx-1]) && (local_strength >= pLocalStrengthShare[lidy-1][lidx]) && (local_strength >= pLocalStrengthShare[lidy-1][lidx+1])\n" + "\t\t\t\t\t\t && (local_strength >= pLocalStrengthShare[lidy][lidx-1]) && (local_strength > pLocalStrengthShare[lidy][lidx+1])\n" + "\t\t\t\t\t\t && (local_strength > pLocalStrengthShare[lidy+1][lidx-1]) && (local_strength > pLocalStrengthShare[lidy+1][lidx]) && (local_strength >= pLocalStrengthShare[lidy+1][lidx+1])\n" + "\t\t\t\t\t\t && (lidx > 0) && (lidy > 0) && (lidx < 15) && (lidy < 15);\n" + "\t __global int * numKeypoints = (__global int *) corner_buf;\n" + "\t __global KeyPt * keypt_list = (__global KeyPt *)(corner_buf + corner_buf_offset);\n" + "\t if(writeCorner) {\n" + "\t\t\t int old_idx = atomic_inc(numKeypoints);\n" + "\t\t if(old_idx < corner_capacity) {\n" + "\t\t\t keypt_list[old_idx].x = xoffset;\n" + "\t\t\t keypt_list[old_idx].y = yoffset;\n" + "\t\t\t keypt_list[old_idx].strength = (float) local_strength;\n" + "\t\t\t keypt_list[old_idx].scale = 0;\n" + "\t\t\t keypt_list[old_idx].orientation = 0;\n" + "\t\t\t keypt_list[old_idx].tracking_status = 1;\n" + "\t\t\t keypt_list[old_idx].error = 0;\n \t\t} \n \t}\n" + ) + ); + code += item; + code += "}\n"; + } + else + { + // FAST without non-max supression + + // OpenCL work items + node->opencl_global_work[0] = inputImg->u.img.width - 6; + node->opencl_global_work[1] = inputImg->u.img.height - 6; + + // Pragma and data structure declarations + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#define MASK_EARLY_EXIT 4369\n\n" //((1<<0) | (1<<4) | (1<<8) | (1<<12)) + "typedef struct {\n" + 
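+ // The emitted KeyPt struct mirrors the vx_keypoint_t layout (int x, y; float strength,
+ // scale, orientation; int tracking_status; float error), so detected corners can be
+ // appended directly into the output array buffer; the first int of corner_buf holds the
+ // running keypoint count, bumped with atomic_inc() in the kernel body.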
"\t int x;\n" + "\t int y;\n" + "\t float strength;\n" + "\t float scale;\n" + "\t float orientation;\n" + "\t int tracking_status;\n" + "\t float error;\n" + "} KeyPt;\n" + ) + ); + code = item; + + // function declaration + sprintf(item, + OPENCL_FORMAT( + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(__global char * corner_buf, uint corner_buf_offset, uint corner_capacity, uint img_width, uint img_height, __global uchar * img_buf, uint img_stride, uint img_offset, float strength_thresh)\n" + "{\n" + ) + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME); + code += item; + + sprintf(item, + OPENCL_FORMAT( + "\t int idx = (int) get_global_id(0) + 3;\n" + "\t int idy = (int) get_global_id(1) + 3;\n" + "\t int stride = (int) img_stride;\n" + "\t if((idx > (int)img_width - 3) || (idy > (int)img_height - 3)) return;\n" + "\t __global const uchar * pTempImg = img_buf + img_offset + mad24(idy, stride, idx);\n" + "\t int centerPixel_neg = pTempImg[0];\n" + "\t int centerPixel_pos = centerPixel_neg + (int)strength_thresh;\n" + "\t centerPixel_neg -= (int)strength_thresh;\n" + "\t int candp, candn, pos_mask, neg_mask;\n" + "\t candp = pTempImg[3];\n" + "\t candn = pTempImg[-3];\n" + "\t neg_mask = (candp < centerPixel_neg) | ((candn < centerPixel_neg) << 8);\n" // Position 0 and 8 + "\t pos_mask = (candp > centerPixel_pos) | ((candn > centerPixel_pos) << 8);\n" + "\t int offs = -stride*3;\n" + "\t candp = pTempImg[offs];\n" + "\t candn = pTempImg[-offs];\n" + "\t neg_mask |= (((candp < centerPixel_neg) << 4) | ((candn < centerPixel_neg) << 12));\n" // Position 4,12 + "\t pos_mask |= (((candp > centerPixel_pos) << 4) | ((candn > centerPixel_pos) << 12));\n" + "\t if(((pos_mask | neg_mask) & MASK_EARLY_EXIT) == 0) return;\n" // Early exit condition + "\t offs = -stride*3 + 1;\n" + "\t candp = pTempImg[offs];\n" + "\t candn = pTempImg[-offs];\n" + "\t neg_mask |= (((candp < centerPixel_neg) << 3) | ((candn < centerPixel_neg) << 11));\n" // Position 3,11 + "\t pos_mask |= (((candp > centerPixel_pos) << 3) | ((candn > centerPixel_pos) << 11));\n" + "\t offs = -stride*3 - 1;\n" + "\t candp = pTempImg[offs];\n" + "\t candn = pTempImg[-offs];\n" + "\t neg_mask |= (((candp < centerPixel_neg) << 5) | ((candn < centerPixel_neg) << 13));\n" // Position 5,13 + "\t pos_mask |= (((candp > centerPixel_pos) << 5) | ((candn > centerPixel_pos) << 13));\n" + "\t offs = -(stride << 1) + 2;\n" + "\t candp = pTempImg[offs];\n" + "\t candn = pTempImg[-offs];\n" + "\t neg_mask |= (((candp < centerPixel_neg) << 2) | ((candn < centerPixel_neg) << 10));\n" // Position 2,10 + "\t pos_mask |= (((candp > centerPixel_pos) << 2) | ((candn > centerPixel_pos) << 10));\n" + "\t offs = -(stride << 1) - 2;\n" + "\t candp = pTempImg[offs];\n" + "\t candn = pTempImg[-offs];\n" + "\t neg_mask |= (((candp < centerPixel_neg) << 6) | ((candn < centerPixel_neg) << 14));\n" // Position 6,14 + "\t pos_mask |= (((candp > centerPixel_pos) << 6) | ((candn > centerPixel_pos) << 14));\n" + "\t offs = -stride + 3;\n" + "\t candp = pTempImg[offs];\n" + "\t candn = pTempImg[-offs];\n" + "\t neg_mask |= (((candp < centerPixel_neg) << 1) | ((candn < centerPixel_neg) << 9));\n" // Position 1,9 + "\t pos_mask |= (((candp > centerPixel_pos) << 1) | ((candn > centerPixel_pos) << 9));\n" + "\t offs = -stride - 3;\n" + "\t candp = pTempImg[offs];\n" + "\t candn = pTempImg[-offs];\n" + "\t neg_mask |= (((candp < centerPixel_neg) << 7) | ((candn < centerPixel_neg) << 15));\n" // Position 7,15 + "\t pos_mask |= (((candp > 
centerPixel_pos) << 7) | ((candn > centerPixel_pos) << 15));\n" + "\t pos_mask |= (pos_mask << 16); neg_mask |= (neg_mask << 16);\n" + "\t int cornerMask = 511, isCorner = 0;\n" + "\t for(int i = 0; i < 16; i++) {\n" + "\t\t isCorner += ((pos_mask & cornerMask) == cornerMask);\n" + "\t\t isCorner += ((neg_mask & cornerMask) == cornerMask);\n" + "\t\t pos_mask >>= 1;\n" + "\t\t neg_mask >>= 1;\n \t} \n" + "\t __global int * numKeypoints = (__global int *) corner_buf;\n" + "\t __global KeyPt * keypt_list = (__global KeyPt *)(corner_buf + corner_buf_offset);\n" + "\t if(isCorner) {\n" + "\t\t\t int old_idx = atomic_inc(numKeypoints);\n" + "\t\t if(old_idx < corner_capacity) {\n" + "\t\t\t keypt_list[old_idx].x = idx;\n" + "\t\t\t keypt_list[old_idx].y = idy;\n" + "\t\t\t keypt_list[old_idx].strength = strength_thresh;\n" + "\t\t\t keypt_list[old_idx].scale = 0;\n" + "\t\t\t keypt_list[old_idx].orientation = 0;\n" + "\t\t\t keypt_list[old_idx].tracking_status = 1;\n" + "\t\t\t keypt_list[old_idx].error = 0;\n \t\t} \n \t}\n" + ) + ); + + code += item; + code += "}\n"; + } + + node->opencl_code = code; + return status; +} + +#endif diff --git a/openvx/ago/ago_haf_gpu_linear_filter.cpp b/openvx/ago/ago_haf_gpu_linear_filter.cpp new file mode 100644 index 0000000..b3b123a --- /dev/null +++ b/openvx/ago/ago_haf_gpu_linear_filter.cpp @@ -0,0 +1,1459 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_haf_gpu.h" + +#if ENABLE_OPENCL + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for LinearFilter_U8_U8, LinearFilter_S16_U8, and LinearFilter_F32_U8 +// +int HafGpu_LinearFilter_ANY_U8(AgoNode * node, vx_df_image dst_image_format, AgoData * src_filter, bool roundingMode) +{ + int status = VX_SUCCESS; + // get destination type + const char * dstRegType = "U8"; + bool dstIsS16 = false; + bool dstIsF32 = false; + float roundingBias = roundingMode ? 0.0f : -0.49999f; + if (dst_image_format == VX_DF_IMAGE_S16) { + dstRegType = "S16"; + dstIsS16 = true; + roundingBias = roundingMode ? 
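+		// Note on the bias values (inferred): the plain (int) cast used for S16 outputs truncates,
+		// so a +0.5f bias yields round-to-nearest when roundingMode is set; for U8 outputs the
+		// -0.49999f bias above appears to counteract amd_pack()'s rounding so that truncation
+		// semantics are obtained when roundingMode is not set.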
0.5f : 0.0f; + } + else if (dst_image_format == VX_DF_IMAGE_F32_AMD) { + dstRegType = "F32"; + dstIsF32 = true; + roundingBias = 0.0f; + } + else if (dst_image_format != VX_DF_IMAGE_U8) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_U8 doesn't support non-U8/S16/F32 destinations for kernel %s\n", node->akernel->name); + return -1; + } + // get filter details + bool filterCoefAreConstants = src_filter->ref.read_only; + vx_uint32 filterWidth = 0, filterHeight = 0; + float * filterCoef = nullptr; + if (src_filter->ref.type == VX_TYPE_CONVOLUTION) { + filterWidth = (vx_uint32)src_filter->u.conv.columns; + filterHeight = (vx_uint32)src_filter->u.conv.rows; + filterCoef = (float *)src_filter->reserved; + } + else if (src_filter->ref.type == VX_TYPE_MATRIX) { + filterWidth = (vx_uint32)src_filter->u.mat.columns; + filterHeight = (vx_uint32)src_filter->u.mat.rows; + filterCoef = (float *)src_filter->buffer; + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_U8 doesn't expects vx_matrix or vx_convolution object for kernel %s\n", node->akernel->name); + return -1; + } + bool clampNotNeeded = false; + bool filterCoefAreIntegers = false; + if (filterCoefAreConstants) { + float sumP = 0.0f, sumN = 0.0f; + filterCoefAreIntegers = true; + for (vx_uint32 i = 0; i < filterWidth * filterHeight; i++) { + if (floorf(filterCoef[i]) != filterCoef[i]) + filterCoefAreIntegers = false; + if (filterCoef[i] < 0.0f) sumN += filterCoef[i]; + else sumP += filterCoef[i]; + } + if (sumN*255.0f > -32767.0f && sumP*255.0f < 32766.0f) + clampNotNeeded = true; + } + + char item[1024]; + std::string code; + if (filterHeight == 1 && filterWidth > 1) { + // generate code for Mx1 filter + vx_uint32 Mdiv2 = filterWidth >> 1; if (Mdiv2 == 0) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_U8 doesn't support %dx%d filter\n", filterWidth, filterHeight); + return -1; + } + // function declaration + if (filterCoefAreConstants) { + sprintf(item, + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + , node->opencl_name, dstRegType); + } + else { + sprintf(item, + OPENCL_FORMAT( + "typedef struct { float f[%d]; } COEF_%dx1;\n" + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride, COEF_%dx1 coef) {\n" + ), filterWidth, filterWidth, node->opencl_name, dstRegType, filterWidth); + } + code = item; + + // configuration + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + vx_uint32 LMemWidth = AGO_OPENCL_WORKGROUP_SIZE_0 * 8; + vx_uint32 LMemSideAlign = (Mdiv2 < 8) ? 3 : 7; + vx_uint32 LMemSide = ((Mdiv2 + LMemSideAlign) & ~LMemSideAlign); + vx_uint32 LMemStride = LMemWidth + 2 * LMemSide; + + node->opencl_param_discard_mask = filterCoefAreConstants ? 
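+	// When the coefficients are compile-time constants they are emitted as literals, so the
+	// node's coefficient argument (bit 2 of the discard mask below) can be dropped from the
+	// generated kernel; the node also requests LMemHeight * LMemStride bytes of local memory
+	// for the cooperative tile load.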
(1 << 2) : 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + node->opencl_local_buffer_size_in_bytes = LMemHeight * LMemStride; + + // generate local memory load + code += + OPENCL_FORMAT( + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"); + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemStride, LMemHeight, LMemSide, 0, code) < 0) { + return -1; + } + + // generate computation + sprintf(item, + OPENCL_FORMAT( + " F32x8 sum; uint2 pix; float fval;\n" + " __local uint2 * lbufptr = (__local uint2 *) (lbuf + ly * %d + (lx << 3));\n" // LMemStride + ), LMemStride); + code += item; + int numQW = (LMemSide / 4) + 1; + for (int qw = 0; qw < numQW; qw++) { + bool loaded_pix = false; + for (int x = 0; x < 8; x++) { + int bytepos = qw * 8 + x; + int xpos = bytepos - LMemSide; + if (xpos >= -(int)Mdiv2 && xpos <= (7 + (int)Mdiv2)) { + bool loaded_fval = false; + for (int ix = 0; ix < 8; ix++) { + int ixpos = xpos - ix; + if (ixpos == -(int)Mdiv2) { + if (filterCoefAreConstants) { + if (filterCoef[0] == 0.0f) { + sprintf(item, " sum.s%d = 0.0f;\n", ix); + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, " sum.s%d = fval* %.12ef;\n", ix, filterCoef[0]); + } + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, " sum.s%d = fval* coef.f[ 0];\n", ix); + } + code += item; + } + else if ((ixpos > -(int)Mdiv2) && (ixpos <= (int)Mdiv2)) { + if (filterCoefAreConstants) { + if (filterCoef[ixpos + Mdiv2] != 0.0f) { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, " sum.s%d = mad(fval, %.12ef, sum.s%d);\n", ix, filterCoef[ixpos + Mdiv2], ix); + code += item; + } + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, " sum.s%d = mad(fval, coef.f[%2d], sum.s%d);\n", ix, ixpos + Mdiv2, ix); + code += item; + } + } + } + } + } + } + } + else if (filterWidth == 1) { + // generate code for Mx1 filter + vx_uint32 Ndiv2 = filterHeight >> 1; + // function declaration + if (filterCoefAreConstants) { + sprintf(item, + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + , node->opencl_name, dstRegType); + } + else { + sprintf(item, + OPENCL_FORMAT( + "typedef struct { float f[%d]; } COEF_1x%d;\n" + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride, COEF_1x%d coef) {\n" + ), filterHeight, filterHeight, node->opencl_name, dstRegType, filterHeight); + } + code = item; + + // configuration + vx_uint32 LMemWidth = AGO_OPENCL_WORKGROUP_SIZE_0 * 8; + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + + node->opencl_param_discard_mask = filterCoefAreConstants ? 
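+	// Vertical (1xN) case: the local tile adds Ndiv2 apron rows above and below and needs no
+	// left/right padding, since the filter is a single pixel wide.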
(1 << 2) : 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + node->opencl_local_buffer_size_in_bytes = (LMemHeight + 2 * Ndiv2) * LMemWidth; + + // generate local memory load + code += + OPENCL_FORMAT( + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"); + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemWidth, LMemHeight + Ndiv2 * 2, 0, Ndiv2, code) < 0) { + return -1; + } + + // generate computation + sprintf(item, + OPENCL_FORMAT( + " F32x8 sum; uint2 pix; float fval;\n" + " __local uint2 * lbufptr = (__local uint2 *) (lbuf + ly * %d + (lx << 3));\n" // LMemStride + ), LMemWidth); + code += item; + + bool first_item = true; + for (int y = 0; y < (int)filterHeight; y++) { + if (!filterCoefAreConstants || filterCoef[y] != 0.0f) { + sprintf(item, " pix = lbufptr[%d];\n", y * LMemWidth / 8); code += item; + if (filterCoefAreConstants) { + sprintf(item, " fval = %.12ef;\n", filterCoef[y]); code += item; + } + else { + sprintf(item, " fval = coef.f[%d];\n", y); code += item; + } + if (first_item) { + first_item = false; + for (int x = 0; x < 8; x++) { + sprintf(item, " sum.s%d = amd_unpack%d(pix.s%d) * fval;\n", x, x & 3, x >> 2); code += item; + } + } + else { + for (int x = 0; x < 8; x++) { + sprintf(item, " sum.s%d = mad(amd_unpack%d(pix.s%d), fval, sum.s%d);\n", x, x & 3, x >> 2, x); code += item; + } + } + } + } + } + else { + // generate code for MxN filter + vx_uint32 Ndiv2 = filterHeight >> 1; + vx_uint32 Mdiv2 = filterWidth >> 1; + + // function declaration + if (filterCoefAreConstants) { + sprintf(item, + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + , node->opencl_name, dstRegType); + } + else { + sprintf(item, + OPENCL_FORMAT( + "typedef struct { float f[%d]; } COEF_%dx%d;\n" + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride, COEF_%dx%d coef) {\n" + ), filterWidth*filterHeight, filterWidth, filterHeight, node->opencl_name, dstRegType, filterWidth, filterHeight); + } + code = item; + + // configuration + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + vx_uint32 LMemWidth = AGO_OPENCL_WORKGROUP_SIZE_0 * 8; + vx_uint32 LMemSideLR = ((Mdiv2 + 3) & ~3); + vx_uint32 LMemStride = LMemWidth + 2 * LMemSideLR; + vx_uint32 LMemSideTB = Ndiv2; + vx_uint32 LMemSize = (LMemHeight + 2 * LMemSideTB) * LMemStride; + node->opencl_param_discard_mask = filterCoefAreConstants ? 
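+	// General MxN case: the left/right apron (LMemSideLR) is rounded up to a multiple of 4,
+	// presumably so each work-item can fetch aligned uint2 quantities, and the tile covers
+	// (LMemHeight + 2*Ndiv2) rows of LMemStride bytes.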
(1 << 2) : 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + node->opencl_local_buffer_size_in_bytes = LMemSize; + + // generate local memory load + code += + OPENCL_FORMAT( + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"); + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemStride, LMemHeight + 2 * LMemSideTB, LMemSideLR, LMemSideTB, code) < 0) { + return -1; + } + + // generate computation + sprintf(item, + OPENCL_FORMAT( + " F32x8 sum = (F32x8)0.0f; uint2 pix; float fval;\n" + " __local uint2 * lbufptr = (__local uint2 *) (lbuf + ly * %d + (lx << 3));\n" // LMemStride + ), LMemStride); + code += item; + int numQW = (LMemSideLR / 4) + 1; + for (int y = 0; y < (int)filterHeight; y++) { + sprintf(item, " // filterRow = %d\n", y); code += item; + for (int qw = 0; qw < numQW; qw++) { + bool loaded_pix = false; + for (int x = 0; x < 8; x++) { + int bytepos = qw * 8 + x; + int xpos = bytepos - LMemSideLR; + if (xpos >= -(int)Mdiv2 && xpos <= (7 + (int)Mdiv2)) { + bool loaded_fval = false; + for (int ix = 0; ix < 8; ix++) { + int ixpos = xpos - ix; + if ((ixpos >= -(int)Mdiv2) && (ixpos <= (int)Mdiv2)) { + int coefPos = y * filterWidth + ixpos + Mdiv2; + if (filterCoefAreConstants) { + if (filterCoef[coefPos] != 0.0f) { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw + y*LMemStride / 8); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + if (filterCoef[coefPos] == 1.0f) sprintf(item, " sum.s%d += fval;\n", ix); + else if (filterCoef[coefPos] == -1.0f) sprintf(item, " sum.s%d -= fval;\n", ix); + else sprintf(item, " sum.s%d = mad(fval, %.12ef, sum.s%d);\n", ix, filterCoef[coefPos], ix); + code += item; + } + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw + y*LMemStride / 8); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, " sum.s%d = mad(fval, coef.f[%2d], sum.s%d);\n", ix, coefPos, ix); + code += item; + } + } + } + } + } + } + } + } + if (!filterCoefAreIntegers && roundingBias != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " sum.s0 = sum.s0 + %.12ef;\n" + " sum.s1 = sum.s1 + %.12ef;\n" + " sum.s2 = sum.s2 + %.12ef;\n" + " sum.s3 = sum.s3 + %.12ef;\n" + " sum.s4 = sum.s4 + %.12ef;\n" + " sum.s5 = sum.s5 + %.12ef;\n" + " sum.s6 = sum.s6 + %.12ef;\n" + " sum.s7 = sum.s7 + %.12ef;\n" + ), roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias); + code += item; + } + if (dstIsS16) { + if (clampNotNeeded) { + code += + OPENCL_FORMAT( + " S16x8 rv;\n" + " rv.s0 = ((int)sum.s0) & 0xffff;\n" + " rv.s0 |= ((int)sum.s1) << 16;\n" + " rv.s1 = ((int)sum.s2) & 0xffff;\n" + " rv.s1 |= ((int)sum.s3) << 16;\n" + " rv.s2 = ((int)sum.s4) & 0xffff;\n" + " rv.s2 |= ((int)sum.s5) << 16;\n" + " rv.s3 = ((int)sum.s6) & 0xffff;\n" + " rv.s3 |= ((int)sum.s7) << 16;\n" + " *r = rv;\n" + "}\n"); + } + else { + code += + OPENCL_FORMAT( + " S16x8 rv;\n" + " rv.s0 = ((int)clamp(sum.s0, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s0 |= ((int)clamp(sum.s1, -32768.0f, 32767.0f)) << 16;\n" + " rv.s1 = ((int)clamp(sum.s2, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s1 |= ((int)clamp(sum.s3, -32768.0f, 32767.0f)) << 16;\n" + " rv.s2 = 
((int)clamp(sum.s4, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s2 |= ((int)clamp(sum.s5, -32768.0f, 32767.0f)) << 16;\n" + " rv.s3 = ((int)clamp(sum.s6, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s3 |= ((int)clamp(sum.s7, -32768.0f, 32767.0f)) << 16;\n" + " *r = rv;\n" + "}\n"); + } + } + else if (dstIsF32) { + code += + " *r = sum;\n" + "}\n"; + } + else { + code += + OPENCL_FORMAT( + " U8x8 rv;\n" + " rv.s0 = amd_pack(sum.s0123);\n" + " rv.s1 = amd_pack(sum.s4567);\n" + " *r = rv;\n" + "}\n"); + } + node->opencl_code = code; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for LinearFilter_U8_S16, LinearFilter_S16_S16, and LinearFilter_F32_S16 +// +int HafGpu_LinearFilter_ANY_S16(AgoNode * node, vx_df_image dst_image_format, AgoData * src_filter, bool roundingMode) +{ + int status = VX_SUCCESS; + // get destination type + const char * dstRegType = "U8"; + bool dstIsS16 = false; + bool dstIsF32 = false; + float roundingBias = roundingMode ? 0.0f : -0.49999f; + if (dst_image_format == VX_DF_IMAGE_S16) { + dstRegType = "S16"; + dstIsS16 = true; + roundingBias = roundingMode ? 0.5f : 0.0f; + } + else if (dst_image_format == VX_DF_IMAGE_F32_AMD) { + dstRegType = "F32"; + dstIsF32 = true; + roundingBias = 0.0f; + } + else if (dst_image_format != VX_DF_IMAGE_U8) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_S16 doesn't support non-U8/S16/F32 destinations for kernel %s\n", node->akernel->name); + return -1; + } + // get filter size + bool filterCoefAreConstants = src_filter->ref.read_only; + vx_uint32 filterWidth = 0, filterHeight = 0; + float * filterCoef = nullptr; + if (src_filter->ref.type == VX_TYPE_CONVOLUTION) { + filterWidth = (vx_uint32)src_filter->u.conv.columns; + filterHeight = (vx_uint32)src_filter->u.conv.rows; + filterCoef = (float *)src_filter->reserved; + } + else if (src_filter->ref.type == VX_TYPE_MATRIX) { + filterWidth = (vx_uint32)src_filter->u.mat.columns; + filterHeight = (vx_uint32)src_filter->u.mat.rows; + filterCoef = (float *)src_filter->buffer; + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_S16 doesn't expects vx_matrix or vx_convolution object for kernel %s\n", node->akernel->name); + return -1; + } + bool filterCoefAreIntegers = false; + if (filterCoefAreConstants) { + filterCoefAreIntegers = true; + for (vx_uint32 i = 0; i < filterWidth * filterHeight; i++) { + if (floorf(filterCoef[i]) != filterCoef[i]) + filterCoefAreIntegers = false; + } + } + + if (filterHeight == 1 && filterWidth > 1) { + // generate code for Mx1 filter + vx_uint32 Mdiv2 = filterWidth >> 1; if (Mdiv2 == 0) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_S16 doesn't support %dx%d filter\n", filterWidth, filterHeight); + return -1; + } + vx_uint32 BytesPerPixel = (vx_uint32)sizeof(vx_int16); + vx_uint32 LMemSidePixelAlign = 4; + vx_uint32 BytesPerWorkItem = 8 * BytesPerPixel; + vx_uint32 BytesPerPixelShift = leftmostbit(BytesPerPixel); + vx_uint32 BytesPerWorkItemShift = leftmostbit(BytesPerWorkItem); + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + vx_uint32 LMemWidth = (AGO_OPENCL_WORKGROUP_SIZE_0 * 8) * BytesPerPixel; + vx_uint32 LMemSide = ((Mdiv2 + (LMemSidePixelAlign - 1)) & ~(LMemSidePixelAlign - 1)) * BytesPerPixel; + vx_uint32 LMemStride = LMemWidth + 2 * LMemSide; + char item[1024]; + if (filterCoefAreConstants) { + 
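+		// Illustrative: with an S16 destination and assuming node->opencl_name were, say,
+		// "LinearFilter_S16_S16", the emitted declaration would read:
+		//   void LinearFilter_S16_S16(S16x8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) { ... }
+		// i.e. each call of the generated helper produces eight horizontally adjacent output pixels in *r.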
sprintf(item, + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + , node->opencl_name, dstRegType); + } + else { + sprintf(item, + OPENCL_FORMAT( + "typedef struct { float f[%d]; } COEF_%dx1;\n" + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride, COEF_%dx1 coef) {\n" + ), filterWidth, filterWidth, node->opencl_name, dstRegType, filterWidth); + } + std::string code = item; + + node->opencl_param_discard_mask = filterCoefAreConstants ? (1 << 2) : 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + node->opencl_local_buffer_size_in_bytes = LMemHeight * LMemStride; + + // generate local memory load + code += + OPENCL_FORMAT( + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"); + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemStride, LMemHeight, LMemSide, 0, code) < 0) { + return -1; + } + + // generate computation + sprintf(item, + OPENCL_FORMAT( + " F32x8 sum; short4 pix; float fval;\n" + " __local short4 * lbufptr = (__local short4 *) (lbuf + ly * %d + (lx << 4));\n" // LMemStride + ), LMemStride); + code += item; + int numQF = 2 * (((2 * LMemSide) / BytesPerWorkItem) + 1); + for (int qf = 0; qf < numQF; qf++) { + bool loaded_pix = false; + for (int x = 0; x < 4; x++) { + int pixpos = qf * 4 + x; + int xpos = pixpos - (LMemSide / BytesPerPixel); + bool loaded_fval = false; + for (int ix = 0; ix < 8; ix++) { + int ixpos = xpos - ix; + if (ixpos == -(int)Mdiv2) { + if (filterCoefAreConstants) { + if (filterCoef[0] == 0.0f) { + if (dstIsS16) sprintf(item, " sum.s%d = 0.5f;\n", ix); + else sprintf(item, " sum.s%d = 0.0f;\n", ix); + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qf); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = (float)pix.s%d;\n", x); code += item; } + if (dstIsS16) sprintf(item, " sum.s%d = mad(fval, %.12ef, 0.5f);\n", ix, filterCoef[0]); + else sprintf(item, " sum.s%d = fval* %.12ef;\n", ix, filterCoef[0]); + } + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qf); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = (float)pix.s%d;\n", x); code += item; } + if (dstIsS16) sprintf(item, " sum.s%d = mad(fval, coef.f[ 0], 0.5f);\n", ix); + else sprintf(item, " sum.s%d = fval* coef.f[ 0];\n", ix); + } + code += item; + } + else if ((ixpos > -(int)Mdiv2) && (ixpos <= (int)Mdiv2)) { + if (filterCoefAreConstants) { + if (filterCoef[ixpos + Mdiv2] != 0.0f) { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qf); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = (float)pix.s%d;\n", x); code += item; } + sprintf(item, " sum.s%d = mad(fval, %.12ef, sum.s%d);\n", ix, filterCoef[ixpos + Mdiv2], ix); + code += item; + } + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qf); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = (float)pix.s%d;\n", x); code += item; } + sprintf(item, " sum.s%d = mad(fval, coef.f[%2d], sum.s%d);\n", ix, ixpos + Mdiv2, ix); + code += item; + } + } + } + } + } + if (!filterCoefAreIntegers && roundingBias != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " sum.s0 = sum.s0 + %.12ef;\n" + " sum.s1 = sum.s1 + %.12ef;\n" + " sum.s2 = sum.s2 + %.12ef;\n" + 
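+			// (the same rounding bias is applied to every one of the eight output lanes)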
" sum.s3 = sum.s3 + %.12ef;\n" + " sum.s4 = sum.s4 + %.12ef;\n" + " sum.s5 = sum.s5 + %.12ef;\n" + " sum.s6 = sum.s6 + %.12ef;\n" + " sum.s7 = sum.s7 + %.12ef;\n" + ), roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias); + code += item; + } + if (dstIsS16) { + code += + OPENCL_FORMAT( + " S16x8 rv;\n" + " rv.s0 = ((int)clamp(sum.s0, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s0 |= ((int)clamp(sum.s1, -32768.0f, 32767.0f)) << 16;\n" + " rv.s1 = ((int)clamp(sum.s2, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s1 |= ((int)clamp(sum.s3, -32768.0f, 32767.0f)) << 16;\n" + " rv.s2 = ((int)clamp(sum.s4, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s2 |= ((int)clamp(sum.s5, -32768.0f, 32767.0f)) << 16;\n" + " rv.s3 = ((int)clamp(sum.s6, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s3 |= ((int)clamp(sum.s7, -32768.0f, 32767.0f)) << 16;\n" + " *r = rv;\n" + "}\n"); + } + else if (dstIsF32) { + code += + " *r = sum;\n" + "}\n"; + } + else { + code += + OPENCL_FORMAT( + " U8x8 rv;\n" + " rv.s0 = amd_pack(sum.s0123);\n" + " rv.s1 = amd_pack(sum.s4567);\n" + " *r = rv;\n" + "}\n"); + } + node->opencl_code = code; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + } + else if (filterWidth == 1) { + // generate code for Mx1 filter + vx_uint32 Ndiv2 = filterHeight >> 1; + // function declaration + char item[1024]; + if (filterCoefAreConstants) { + sprintf(item, + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + , node->opencl_name, dstRegType); + } + else { + sprintf(item, + OPENCL_FORMAT( + "typedef struct { float f[%d]; } COEF_1x%d;\n" + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride, COEF_1x%d coef) {\n" + ), filterHeight, filterHeight, node->opencl_name, dstRegType, filterHeight); + } + std::string code = item; + + // configuration + vx_uint32 LMemWidth = AGO_OPENCL_WORKGROUP_SIZE_0 * 8 * 2; + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + + node->opencl_param_discard_mask = filterCoefAreConstants ? 
(1 << 2) : 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + node->opencl_local_buffer_size_in_bytes = (LMemHeight + 2 * Ndiv2) * LMemWidth; + + // generate local memory load + code += + OPENCL_FORMAT( + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"); + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemWidth, LMemHeight + Ndiv2 * 2, 0, Ndiv2, code) < 0) { + return -1; + } + + // generate computation + sprintf(item, + OPENCL_FORMAT( + " F32x8 sum; short8 pix; float fval;\n" + " __local short8 * lbufptr = (__local short8 *) (lbuf + ly * %d + (lx << 4));\n" // LMemStride + ), LMemWidth); + code += item; + bool first_item = true; + for (int y = 0; y < (int)filterHeight; y++) { + if (!filterCoefAreConstants || filterCoef[y] != 0.0f) { + sprintf(item, " pix = lbufptr[%d];\n", y * LMemWidth / 16); code += item; + if (filterCoefAreConstants) { + sprintf(item, " fval = %.12ef;\n", filterCoef[y]); code += item; + } + else { + sprintf(item, " fval = coef.f[%d];\n", y); code += item; + } + if (first_item) { + first_item = false; + for (int x = 0; x < 8; x++) { + if (dstIsS16) { + sprintf(item, " sum.s%d = mad((float)pix.s%d, fval, 0.5f);\n", x, x); code += item; + } + else { + sprintf(item, " sum.s%d = (float)pix.s%d * fval;\n", x, x); code += item; + } + } + } + else { + for (int x = 0; x < 8; x++) { + sprintf(item, " sum.s%d = mad((float)pix.s%d, fval, sum.s%d);\n", x, x, x); code += item; + } + } + } + } + if (!filterCoefAreIntegers && roundingBias != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " sum.s0 = sum.s0 + %.12ef;\n" + " sum.s1 = sum.s1 + %.12ef;\n" + " sum.s2 = sum.s2 + %.12ef;\n" + " sum.s3 = sum.s3 + %.12ef;\n" + " sum.s4 = sum.s4 + %.12ef;\n" + " sum.s5 = sum.s5 + %.12ef;\n" + " sum.s6 = sum.s6 + %.12ef;\n" + " sum.s7 = sum.s7 + %.12ef;\n" + ), roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias); + code += item; + } + if (dstIsS16) { + code += + OPENCL_FORMAT( + " S16x8 rv;\n" + " rv.s0 = ((int)clamp(sum.s0, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s0 |= ((int)clamp(sum.s1, -32768.0f, 32767.0f)) << 16;\n" + " rv.s1 = ((int)clamp(sum.s2, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s1 |= ((int)clamp(sum.s3, -32768.0f, 32767.0f)) << 16;\n" + " rv.s2 = ((int)clamp(sum.s4, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s2 |= ((int)clamp(sum.s5, -32768.0f, 32767.0f)) << 16;\n" + " rv.s3 = ((int)clamp(sum.s6, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s3 |= ((int)clamp(sum.s7, -32768.0f, 32767.0f)) << 16;\n" + " *r = rv;\n" + "}\n"); + } + else if (dstIsF32) { + code += + " *r = sum;\n" + "}\n"; + } + else { + code += + OPENCL_FORMAT( + " U8x8 rv;\n" + " rv.s0 = amd_pack(sum.s0123);\n" + " rv.s1 = amd_pack(sum.s4567);\n" + " *r = rv;\n" + "}\n"); + } + node->opencl_code = code; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + } + else { + status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + } + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for LinearFilter_U8_F32, LinearFilter_S16_F32, and LinearFilter_F32_F32 +// +int HafGpu_LinearFilter_ANY_F32(AgoNode * node, vx_df_image dst_image_format, AgoData * src_filter, bool roundingMode) +{ + int status = VX_SUCCESS; + // get destination type + const char * dstRegType = "U8"; + bool dstIsS16 = false; + bool dstIsF32 = false; + float 
roundingBias = roundingMode ? 0.0f : -0.49999f; + if (dst_image_format == VX_DF_IMAGE_S16) { + dstRegType = "S16"; + dstIsS16 = true; + roundingBias = roundingMode ? 0.5f : 0.0f; + } + else if (dst_image_format == VX_DF_IMAGE_F32_AMD) { + dstRegType = "F32"; + dstIsF32 = true; + roundingBias = 0.0f; + } + else if (dst_image_format != VX_DF_IMAGE_U8) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_F32 doesn't support non-U8/S16/F32 destinations for kernel %s\n", node->akernel->name); + return -1; + } + // get filter size + bool filterCoefAreConstants = src_filter->ref.read_only; + vx_uint32 filterWidth = 0, filterHeight = 0; + float * filterCoef = nullptr; + if (src_filter->ref.type == VX_TYPE_CONVOLUTION) { + filterWidth = (vx_uint32)src_filter->u.conv.columns; + filterHeight = (vx_uint32)src_filter->u.conv.rows; + filterCoef = (float *)src_filter->reserved; + } + else if (src_filter->ref.type == VX_TYPE_MATRIX) { + filterWidth = (vx_uint32)src_filter->u.mat.columns; + filterHeight = (vx_uint32)src_filter->u.mat.rows; + filterCoef = (float *)src_filter->buffer; + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_F32 doesn't expects vx_matrix or vx_convolution object for kernel %s\n", node->akernel->name); + return -1; + } + if (filterHeight == 1 && filterWidth > 1) { + // generate code for Mx1 filter + vx_uint32 Mdiv2 = filterWidth >> 1; if (Mdiv2 == 0) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_F32 doesn't support %dx%d filter\n", filterWidth, filterHeight); + return -1; + } + vx_uint32 BytesPerPixel = (vx_uint32)sizeof(vx_float32); + vx_uint32 LMemSidePixelAlign = 4; + vx_uint32 BytesPerWorkItem = 8 * BytesPerPixel; + vx_uint32 BytesPerPixelShift = leftmostbit(BytesPerPixel); + vx_uint32 BytesPerWorkItemShift = leftmostbit(BytesPerWorkItem); + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + vx_uint32 LMemWidth = (AGO_OPENCL_WORKGROUP_SIZE_0 * 8) * BytesPerPixel; + vx_uint32 LMemSide = ((Mdiv2 + (LMemSidePixelAlign - 1)) & ~(LMemSidePixelAlign - 1)) * BytesPerPixel; + vx_uint32 LMemStride = LMemWidth + 2 * LMemSide; + char item[1024]; + if (filterCoefAreConstants) { + sprintf(item, + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + , node->opencl_name, dstRegType); + } + else { + sprintf(item, + OPENCL_FORMAT( + "typedef struct { float f[%d]; } COEF_%dx1;\n" + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride, COEF_%dx1 coef) {\n" + ), filterWidth, filterWidth, node->opencl_name, dstRegType, filterWidth); + } + std::string code = item; + + node->opencl_param_discard_mask = filterCoefAreConstants ? 
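+	// F32 sources are fetched directly as float4 vectors (32 bytes, i.e. lx << 5, per work-item),
+	// so no amd_unpack step is needed before the multiply-accumulate below.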
(1 << 2) : 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + node->opencl_local_buffer_size_in_bytes = LMemHeight * LMemStride; + + // generate local memory load + code += + OPENCL_FORMAT( + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"); + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemStride, LMemHeight, LMemSide, 0, code) < 0) { + return -1; + } + + // generate computation + sprintf(item, + OPENCL_FORMAT( + " F32x8 sum; float4 pix;\n" + " __local float4 * lbufptr = (__local float4 *) (lbuf + ly * %d + (lx << 5));\n" // LMemStride + ), LMemStride); + code += item; + int numQF = 2 * (((2 * LMemSide) / BytesPerWorkItem) + 1); + for (int qf = 0; qf < numQF; qf++) { + bool loaded_pix = false; + for (int x = 0; x < 4; x++) { + int pixpos = qf * 4 + x; + int xpos = pixpos - (LMemSide / BytesPerPixel); + for (int ix = 0; ix < 8; ix++) { + int ixpos = xpos - ix; + if (ixpos == -(int)Mdiv2) { + if (filterCoefAreConstants) { + if (filterCoef[0] == 0.0f) { + if (dstIsS16) sprintf(item, " sum.s%d = 0.5f;\n", ix); + else sprintf(item, " sum.s%d = 0.0f;\n", ix); + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qf); code += item; } + if (dstIsS16) sprintf(item, " sum.s%d = mad(pix.s%d, %.12ef, 0.5f);\n", ix, x, filterCoef[0]); + else sprintf(item, " sum.s%d = pix.s%d* %.12ef;\n", ix, x, filterCoef[0]); + } + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qf); code += item; } + if (dstIsS16) sprintf(item, " sum.s%d = mad(pix.s%d, coef.f[ 0], 0.5f);\n", ix, x); + else sprintf(item, " sum.s%d = pix.s%d* coef.f[ 0];\n", ix, x); + } + code += item; + } + else if ((ixpos > -(int)Mdiv2) && (ixpos <= (int)Mdiv2)) { + if (filterCoefAreConstants) { + if (filterCoef[ixpos + Mdiv2] != 0.0f) { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qf); code += item; } + sprintf(item, " sum.s%d = mad(pix.s%d, %.12ef, sum.s%d);\n", ix, x, filterCoef[ixpos + Mdiv2], ix); + code += item; + } + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qf); code += item; } + sprintf(item, " sum.s%d = mad(pix.s%d, coef.f[%2d], sum.s%d);\n", ix, x, ixpos + Mdiv2, ix); + code += item; + } + } + } + } + } + if (roundingBias != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " sum.s0 = sum.s0 + %.12ef;\n" + " sum.s1 = sum.s1 + %.12ef;\n" + " sum.s2 = sum.s2 + %.12ef;\n" + " sum.s3 = sum.s3 + %.12ef;\n" + " sum.s4 = sum.s4 + %.12ef;\n" + " sum.s5 = sum.s5 + %.12ef;\n" + " sum.s6 = sum.s6 + %.12ef;\n" + " sum.s7 = sum.s7 + %.12ef;\n" + ), roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias); + code += item; + } + if (dstIsS16) { + code += + OPENCL_FORMAT( + " S16x8 rv;\n" + " rv.s0 = ((int)clamp(sum.s0, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s0 |= ((int)clamp(sum.s1, -32768.0f, 32767.0f)) << 16;\n" + " rv.s1 = ((int)clamp(sum.s2, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s1 |= ((int)clamp(sum.s3, -32768.0f, 32767.0f)) << 16;\n" + " rv.s2 = ((int)clamp(sum.s4, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s2 |= ((int)clamp(sum.s5, -32768.0f, 32767.0f)) << 16;\n" + " rv.s3 = ((int)clamp(sum.s6, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s3 |= ((int)clamp(sum.s7, -32768.0f, 32767.0f)) << 16;\n" + " *r = rv;\n" + "}\n"); + } + else if (dstIsF32) { + code += + " *r = sum;\n" + 
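+			// F32 destinations store the eight accumulated floats directly; no clamp or
+			// amd_pack conversion is required.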
"}\n"; + } + else { + code += + OPENCL_FORMAT( + " U8x8 rv;\n" + " rv.s0 = amd_pack(sum.s0123);\n" + " rv.s1 = amd_pack(sum.s4567);\n" + " *r = rv;\n" + "}\n"); + } + node->opencl_code = code; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + } + else if (filterWidth == 1) { + // generate code for Mx1 filter + vx_uint32 Ndiv2 = filterHeight >> 1; + // function declaration + char item[1024]; + if (filterCoefAreConstants) { + sprintf(item, + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + , node->opencl_name, dstRegType); + } + else { + sprintf(item, + OPENCL_FORMAT( + "typedef struct { float f[%d]; } COEF_1x%d;\n" + "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride, COEF_1x%d coef) {\n" + ), filterHeight, filterHeight, node->opencl_name, dstRegType, filterHeight); + } + std::string code = item; + + // configuration + vx_uint32 LMemWidth = AGO_OPENCL_WORKGROUP_SIZE_0 * 8 * 4; + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + + node->opencl_param_discard_mask = filterCoefAreConstants ? (1 << 2) : 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + node->opencl_local_buffer_size_in_bytes = (LMemHeight + 2 * Ndiv2) * LMemWidth; + + // generate local memory load + code += + OPENCL_FORMAT( + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"); + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemWidth, LMemHeight + Ndiv2 * 2, 0, Ndiv2, code) < 0) { + return -1; + } + + // generate computation + sprintf(item, + OPENCL_FORMAT( + " F32x8 sum; float8 pix; float fval;\n" + " __local float8 * lbufptr = (__local float8 *) (lbuf + ly * %d + (lx << 5));\n" // LMemStride + ), LMemWidth); + code += item; + bool first_item = true; + for (int y = 0; y < (int)filterHeight; y++) { + if (!filterCoefAreConstants || filterCoef[y] != 0.0f) { + sprintf(item, " pix = lbufptr[%d];\n", y * LMemWidth / 32); code += item; + if (filterCoefAreConstants) { + sprintf(item, " fval = %.12ef;\n", filterCoef[y]); code += item; + } + else { + sprintf(item, " fval = coef.f[%d];\n", y); code += item; + } + if (first_item) { + first_item = false; + for (int x = 0; x < 8; x++) { + if (dstIsS16) { + sprintf(item, " sum.s%d = mad(pix.s%d, fval, 0.5f);\n", x, x); code += item; + } + else { + sprintf(item, " sum.s%d = pix.s%d * fval;\n", x, x); code += item; + } + } + } + else { + for (int x = 0; x < 8; x++) { + sprintf(item, " sum.s%d = mad(pix.s%d, fval, sum.s%d);\n", x, x, x); code += item; + } + } + } + } + if (roundingBias != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " sum.s0 = sum.s0 + %.12ef;\n" + " sum.s1 = sum.s1 + %.12ef;\n" + " sum.s2 = sum.s2 + %.12ef;\n" + " sum.s3 = sum.s3 + %.12ef;\n" + " sum.s4 = sum.s4 + %.12ef;\n" + " sum.s5 = sum.s5 + %.12ef;\n" + " sum.s6 = sum.s6 + %.12ef;\n" + " sum.s7 = sum.s7 + %.12ef;\n" + ), roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias); + code += item; + } + if (dstIsS16) { + code += + OPENCL_FORMAT( + " S16x8 rv;\n" + " rv.s0 = ((int)clamp(sum.s0, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s0 |= ((int)clamp(sum.s1, -32768.0f, 32767.0f)) << 16;\n" + " rv.s1 = ((int)clamp(sum.s2, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s1 |= ((int)clamp(sum.s3, -32768.0f, 32767.0f)) << 16;\n" + " rv.s2 = ((int)clamp(sum.s4, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s2 |= ((int)clamp(sum.s5, 
-32768.0f, 32767.0f)) << 16;\n" + " rv.s3 = ((int)clamp(sum.s6, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s3 |= ((int)clamp(sum.s7, -32768.0f, 32767.0f)) << 16;\n" + " *r = rv;\n" + "}\n"); + } + else if (dstIsF32) { + code += + " *r = sum;\n" + "}\n"; + } + else { + code += + OPENCL_FORMAT( + " U8x8 rv;\n" + " rv.s0 = amd_pack(sum.s0123);\n" + " rv.s1 = amd_pack(sum.s4567);\n" + " *r = rv;\n" + "}\n"); + } + node->opencl_code = code; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + } + else { + status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + } + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for LinearFilter_U8x2_U8, LinearFilter_S16x2_U8, and LinearFilter_F32x2_U8 +// +int HafGpu_LinearFilter_ANYx2_U8(AgoNode * node, vx_df_image dst_image_format, AgoData * src_filter, AgoData * src_filter2, bool roundingMode) +{ + int status = VX_SUCCESS; + // get destination type + const char * dstRegType = "U8"; + bool dstIsS16 = false; + bool dstIsF32 = false; + float roundingBias = roundingMode ? 0.0f : -0.49999f; + if (dst_image_format == VX_DF_IMAGE_S16) { + dstRegType = "S16"; + dstIsS16 = true; + roundingBias = roundingMode ? 0.5f : 0.0f; + } + else if (dst_image_format == VX_DF_IMAGE_F32_AMD) { + dstRegType = "F32"; + dstIsF32 = true; + roundingBias = 0.0f; + } + else if (dst_image_format != VX_DF_IMAGE_U8) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANYx2_U8 doesn't support non-U8/S16/F32 destinations for kernel %s\n", node->akernel->name); + return -1; + } + // get filter size + bool filterCoefAreConstants = src_filter->ref.read_only; + vx_uint32 filterWidth = 0, filterHeight = 0, filter2Width = 0, filter2Height = 0; + float * filterCoef = nullptr, *filter2Coef = nullptr; + if (src_filter->ref.type == VX_TYPE_CONVOLUTION) { + filterWidth = (vx_uint32)src_filter->u.conv.columns; + filterHeight = (vx_uint32)src_filter->u.conv.rows; + filterCoef = (float *)src_filter->reserved; + filter2Width = (vx_uint32)src_filter2->u.conv.columns; + filter2Height = (vx_uint32)src_filter2->u.conv.rows; + filter2Coef = (float *)src_filter2->reserved; + } + else if (src_filter->ref.type == VX_TYPE_MATRIX) { + filterWidth = (vx_uint32)src_filter->u.mat.columns; + filterHeight = (vx_uint32)src_filter->u.mat.rows; + filterCoef = (float *)src_filter->buffer; + filter2Width = (vx_uint32)src_filter2->u.mat.columns; + filter2Height = (vx_uint32)src_filter2->u.mat.rows; + filter2Coef = (float *)src_filter2->buffer; + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANYx2_U8 doesn't expects vx_matrix or vx_convolution object for kernel %s\n", node->akernel->name); + return -1; + } + if (filterWidth != filter2Width || filterHeight != filter2Height || src_filter->ref.read_only != src_filter2->ref.read_only || src_filter->ref.type != src_filter2->ref.type) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANYx2_U8 requires both filters to have same attributes\n"); + return -1; + } + bool clampNotNeeded = false; + bool filterCoefAreIntegers = false; + if (filterCoefAreConstants) { + float sumP = 0.0f, sumN = 0.0f; + float sumP2 = 0.0f, sumN2 = 0.0f; + filterCoefAreIntegers = true; + for (vx_uint32 i = 0; i < filterWidth * filterHeight; i++) { + if (floorf(filterCoef[i]) != filter2Coef[i] || floorf(filter2Coef[i]) != filter2Coef[i]) + filterCoefAreIntegers = false; + if (filterCoef[i] < 0.0f) sumN += filterCoef[i]; + else sumP += 
filterCoef[i]; + if (filter2Coef[i] < 0.0f) sumN2 += filter2Coef[i]; + else sumP2 += filter2Coef[i]; + } + if ((sumN*255.0f > -32767.0f && sumP*255.0f < 32766.0f) && (sumN2*255.0f > -32767.0f && sumP2*255.0f < 32766.0f)) + clampNotNeeded = true; + } + + std::string code; + char item[1024]; + if (filterHeight == 1 && filterWidth > 1) { + // generate code for Mx1 filter + vx_uint32 Mdiv2 = filterWidth >> 1; if (Mdiv2 == 0) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANY_U8 doesn't support %dx%d filter\n", filterWidth, filterHeight); + return -1; + } + // function declaration + if (filterCoefAreConstants) { + sprintf(item, + "void %s(%sx8 * r1, %sx8 * r2, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + , node->opencl_name, dstRegType, dstRegType); + } + else { + sprintf(item, + OPENCL_FORMAT( + "typedef struct { float f[%d]; } COEF_%dx1;\n" + "void %s(%sx8 * r1, %sx8 * r2, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride, COEF_%dx1 coef1, COEF_%dx1 coef2) {\n" + ), filterWidth, filterWidth, node->opencl_name, dstRegType, dstRegType, filterWidth, filterWidth); + } + code = item; + + // configuration + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + vx_uint32 LMemWidth = AGO_OPENCL_WORKGROUP_SIZE_0 * 8; + vx_uint32 LMemSideAlign = (Mdiv2 < 8) ? 3 : 7; + vx_uint32 LMemSide = ((Mdiv2 + LMemSideAlign) & ~LMemSideAlign); + vx_uint32 LMemStride = LMemWidth + 2 * LMemSide; + + node->opencl_param_discard_mask = filterCoefAreConstants ? (3 << 3) : 0; + node->opencl_local_buffer_usage_mask = (1 << 2); + node->opencl_local_buffer_size_in_bytes = LMemHeight * LMemStride; + + // generate local memory load + code += + OPENCL_FORMAT( + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"); + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemStride, LMemHeight, LMemSide, 0, code) < 0) { + return -1; + } + + // generate computation + sprintf(item, + OPENCL_FORMAT( + " F32x8 sum1, sum2; uint2 pix; float fval;\n" + " __local uint2 * lbufptr = (__local uint2 *) (lbuf + ly * %d + (lx << 3));\n" // LMemStride + ), LMemStride); + code += item; + int numQW = (LMemSide / 4) + 1; + for (int qw = 0; qw < numQW; qw++) { + bool loaded_pix = false; + for (int x = 0; x < 8; x++) { + int bytepos = qw * 8 + x; + int xpos = bytepos - LMemSide; + if (xpos >= -(int)Mdiv2 && xpos <= (7 + (int)Mdiv2)) { + bool loaded_fval = false; + for (int ix = 0; ix < 8; ix++) { + int ixpos = xpos - ix; + if (ixpos == -(int)Mdiv2) { + if (filterCoefAreConstants) { + if (filterCoef[0] == 0.0f) { + sprintf(item, " sum1.s%d = 0.0f;\n", ix); + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + if (filterCoef[0] == 1.0f) { + sprintf(item, " sum1.s%d = fval;\n", ix); + } + else { + sprintf(item, " sum1.s%d = fval* %.12ef;\n", ix, filterCoef[0]); + } + } + code += item; + if (filter2Coef[0] == 0.0f) { + sprintf(item, " sum2.s%d = 0.0f;\n", ix); + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + if (filter2Coef[0] == 1.0f) { + sprintf(item, " sum2.s%d = 
fval;\n", ix); + } + else { + sprintf(item, " sum2.s%d = fval* %.12ef;\n", ix, filter2Coef[0]); + } + } + code += item; + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, + " sum1.s%d = fval* coef1.f[ 0];\n" + " sum2.s%d = fval* coef2.f[ 0];\n" + , ix, ix); + code += item; + } + } + else if ((ixpos > -(int)Mdiv2) && (ixpos <= (int)Mdiv2)) { + if (filterCoefAreConstants) { + if (filterCoef[ixpos + Mdiv2] != 0.0f) { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, " sum1.s%d = mad(fval, %.12ef, sum1.s%d);\n", ix, filterCoef[ixpos + Mdiv2], ix); + code += item; + } + if (filter2Coef[ixpos + Mdiv2] != 0.0f) { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, " sum2.s%d = mad(fval, %.12ef, sum2.s%d);\n", ix, filter2Coef[ixpos + Mdiv2], ix); + code += item; + } + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, " sum1.s%d = mad(fval, coef1.f[%2d], sum1.s%d);\n", ix, ixpos + Mdiv2, ix); + code += item; + sprintf(item, " sum2.s%d = mad(fval, coef2.f[%2d], sum2.s%d);\n", ix, ixpos + Mdiv2, ix); + code += item; + } + } + } + } + } + } + } + else { + // generate code for MxN filter + vx_uint32 Ndiv2 = filterHeight >> 1; + vx_uint32 Mdiv2 = filterWidth >> 1; + if (Mdiv2 == 0 || Ndiv2 == 0) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: HafGpu_LinearFilter_ANYx2_U8 doesn't support %dx%d filter\n", filterWidth, filterHeight); + return -1; + } + + // function declaration + if (filterCoefAreConstants) { + sprintf(item, + "void %s(%sx8 * r1, %sx8 * r2, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + , node->opencl_name, dstRegType, dstRegType); + } + else { + sprintf(item, + OPENCL_FORMAT( + "typedef struct { float f[%d]; } COEF_%dx%d;\n" + "void %s(%sx8 * r1, %sx8 * r2, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride, COEF_%dx%d coef1, COEF_%dx%d coef2) {\n" + ), filterWidth*filterHeight, filterWidth, filterHeight, node->opencl_name, dstRegType, dstRegType, filterWidth, filterHeight, filterWidth, filterHeight); + } + code = item; + + // configuration + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + vx_uint32 LMemWidth = AGO_OPENCL_WORKGROUP_SIZE_0 * 8; + vx_uint32 LMemSideLR = ((Mdiv2 + 3) & ~3); + vx_uint32 LMemStride = LMemWidth + 2 * LMemSideLR; + vx_uint32 LMemSideTB = Ndiv2; + vx_uint32 LMemSize = (LMemHeight + 2 * LMemSideTB) * LMemStride; + node->opencl_param_discard_mask = filterCoefAreConstants ? 
(3 << 3) : 0; + node->opencl_local_buffer_usage_mask = (1 << 2); + node->opencl_local_buffer_size_in_bytes = LMemSize; + + // generate local memory load + code += + OPENCL_FORMAT( + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"); + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemStride, LMemHeight + 2 * LMemSideTB, LMemSideLR, LMemSideTB, code) < 0) { + return -1; + } + + // generate computation + sprintf(item, + OPENCL_FORMAT( + " F32x8 sum1 = (F32x8)0.0f, sum2 = (F32x8)0.0f; uint2 pix; float fval;\n" + " __local uint2 * lbufptr = (__local uint2 *) (lbuf + ly * %d + (lx << 3));\n" // LMemStride + ), LMemStride); + code += item; + int numQW = (LMemSideLR / 4) + 1; + for (int y = 0; y < (int)filterHeight; y++) { + sprintf(item, " // filterRow = %d\n", y); code += item; + for (int qw = 0; qw < numQW; qw++) { + bool loaded_pix = false; + for (int x = 0; x < 8; x++) { + int bytepos = qw * 8 + x; + int xpos = bytepos - LMemSideLR; + if (xpos >= -(int)Mdiv2 && xpos <= (7 + (int)Mdiv2)) { + bool loaded_fval = false; + for (int ix = 0; ix < 8; ix++) { + int ixpos = xpos - ix; + if ((ixpos >= -(int)Mdiv2) && (ixpos <= (int)Mdiv2)) { + int coefPos = y * filterWidth + ixpos + Mdiv2; + if (filterCoefAreConstants) { + if (filterCoef[coefPos] != 0.0f) { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw + y*LMemStride / 8); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + if (filterCoef[coefPos] == 1.0f) sprintf(item, " sum1.s%d += fval;\n", ix); + else if (filterCoef[coefPos] == -1.0f) sprintf(item, " sum1.s%d -= fval;\n", ix); + else sprintf(item, " sum1.s%d = mad(fval, %.12ef, sum1.s%d);\n", ix, filterCoef[coefPos], ix); + code += item; + } + if (filter2Coef[coefPos] != 0.0f) { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw + y*LMemStride / 8); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + if (filter2Coef[coefPos] == 1.0f) sprintf(item, " sum2.s%d += fval;\n", ix); + else if (filter2Coef[coefPos] == -1.0f) sprintf(item, " sum2.s%d -= fval;\n", ix); + else sprintf(item, " sum2.s%d = mad(fval, %.12ef, sum2.s%d);\n", ix, filter2Coef[coefPos], ix); + code += item; + } + } + else { + if (!loaded_pix) { loaded_pix = true; sprintf(item, " pix = lbufptr[%d];\n", qw + y*LMemStride / 8); code += item; } + if (!loaded_fval) { loaded_fval = true; sprintf(item, " fval = amd_unpack%d(pix.s%d);\n", x & 3, x >> 2); code += item; } + sprintf(item, " sum1.s%d = mad(fval, coef1.f[%2d], sum1.s%d);\n", ix, coefPos, ix); + code += item; + sprintf(item, " sum2.s%d = mad(fval, coef2.f[%2d], sum2.s%d);\n", ix, coefPos, ix); + code += item; + } + } + } + } + } + } + } + } + if (!filterCoefAreIntegers && roundingBias != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " sum1.s0 = sum1.s0 + %.12ef;\n" + " sum1.s1 = sum1.s1 + %.12ef;\n" + " sum1.s2 = sum1.s2 + %.12ef;\n" + " sum1.s3 = sum1.s3 + %.12ef;\n" + " sum1.s4 = sum1.s4 + %.12ef;\n" + " sum1.s5 = sum1.s5 + %.12ef;\n" + " sum1.s6 = sum1.s6 + %.12ef;\n" + " sum1.s7 = sum1.s7 + %.12ef;\n" + " sum2.s0 = sum2.s0 + %.12ef;\n" + " sum2.s1 = sum2.s1 + %.12ef;\n" + " sum2.s2 = sum2.s2 + %.12ef;\n" + " sum2.s3 = sum2.s3 + %.12ef;\n" + " sum2.s4 = sum2.s4 + %.12ef;\n" + " sum2.s5 = 
sum2.s5 + %.12ef;\n" + " sum2.s6 = sum2.s6 + %.12ef;\n" + " sum2.s7 = sum2.s7 + %.12ef;\n") + , roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias + , roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias, roundingBias); + code += item; + } + if (dstIsS16) { + if (clampNotNeeded) { + code += + OPENCL_FORMAT( + " S16x8 rv;\n" + " rv.s0 = ((int)sum1.s0) & 0xffff;\n" + " rv.s0 |= ((int)sum1.s1) << 16;\n" + " rv.s1 = ((int)sum1.s2) & 0xffff;\n" + " rv.s1 |= ((int)sum1.s3) << 16;\n" + " rv.s2 = ((int)sum1.s4) & 0xffff;\n" + " rv.s2 |= ((int)sum1.s5) << 16;\n" + " rv.s3 = ((int)sum1.s6) & 0xffff;\n" + " rv.s3 |= ((int)sum1.s7) << 16;\n" + " *r1 = rv;\n" + " rv.s0 = ((int)sum2.s0) & 0xffff;\n" + " rv.s0 |= ((int)sum2.s1) << 16;\n" + " rv.s1 = ((int)sum2.s2) & 0xffff;\n" + " rv.s1 |= ((int)sum2.s3) << 16;\n" + " rv.s2 = ((int)sum2.s4) & 0xffff;\n" + " rv.s2 |= ((int)sum2.s5) << 16;\n" + " rv.s3 = ((int)sum2.s6) & 0xffff;\n" + " rv.s3 |= ((int)sum2.s7) << 16;\n" + " *r2 = rv;\n" + "}\n"); + } + else { + code += + OPENCL_FORMAT( + " S16x8 rv;\n" + " rv.s0 = ((int)clamp(sum1.s0, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s0 |= ((int)clamp(sum1.s1, -32768.0f, 32767.0f)) << 16;\n" + " rv.s1 = ((int)clamp(sum1.s2, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s1 |= ((int)clamp(sum1.s3, -32768.0f, 32767.0f)) << 16;\n" + " rv.s2 = ((int)clamp(sum1.s4, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s2 |= ((int)clamp(sum1.s5, -32768.0f, 32767.0f)) << 16;\n" + " rv.s3 = ((int)clamp(sum1.s6, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s3 |= ((int)clamp(sum1.s7, -32768.0f, 32767.0f)) << 16;\n" + " *r1 = rv;\n" + " rv.s0 = ((int)clamp(sum2.s0, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s0 |= ((int)clamp(sum2.s1, -32768.0f, 32767.0f)) << 16;\n" + " rv.s1 = ((int)clamp(sum2.s2, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s1 |= ((int)clamp(sum2.s3, -32768.0f, 32767.0f)) << 16;\n" + " rv.s2 = ((int)clamp(sum2.s4, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s2 |= ((int)clamp(sum2.s5, -32768.0f, 32767.0f)) << 16;\n" + " rv.s3 = ((int)clamp(sum2.s6, -32768.0f, 32767.0f)) & 0xffff;\n" + " rv.s3 |= ((int)clamp(sum2.s7, -32768.0f, 32767.0f)) << 16;\n" + " *r2 = rv;\n" + "}\n"); + } + } + else if (dstIsF32) { + code += + " *r1 = sum1;\n" + " *r2 = sum2;\n" + "}\n"; + } + else { + code += + OPENCL_FORMAT( + " U8x8 rv;\n" + " rv.s0 = amd_pack(sum1.s0123);\n" + " rv.s1 = amd_pack(sum1.s4567);\n" + " *r1 = rv;\n" + " rv.s0 = amd_pack(sum2.s0123);\n" + " rv.s1 = amd_pack(sum2.s4567);\n" + " *r2 = rv;\n" + "}\n"); + } + node->opencl_code += code; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + + return status; +} + +#endif diff --git a/openvx/ago/ago_haf_gpu_special_filters.cpp b/openvx/ago/ago_haf_gpu_special_filters.cpp new file mode 100644 index 0000000..eff0861 --- /dev/null +++ b/openvx/ago/ago_haf_gpu_special_filters.cpp @@ -0,0 +1,1703 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_haf_gpu.h" + +#if ENABLE_OPENCL + +#define ENABLE_FAST_MEDIAN_3x3 0 // 0:disable 1:enable fast shortcut for median 3x3 + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Useful pre-defined filters +// +static float scharrFilter_3x3_x[3][3] = { + { -3, 0, 3 }, + {-10, 0, 10 }, + { -3, 0, 3 }, +}; +static float scharrFilter_3x3_y[3][3] = { + { -3, -10, -3 }, + { 0, 0, 0 }, + { 3, 10, 3 }, +}; +static float sobelFilter_3x3_x[3][3] = { + {-1, 0, 1}, + {-2, 0, 2}, + {-1, 0, 1}, +}; +static float sobelFilter_3x3_y[3][3] = { + {-1,-2,-1}, + { 0, 0, 0}, + { 1, 2, 1}, +}; +static float sobelFilter_5x5_x[5][5] = { + {-1, -2, 0, 2, 1}, + {-4, -8, 0, 8, 4}, + {-6,-12, 0,12, 6}, + {-4, -8, 0, 8, 4}, + {-1, -2, 0, 2, 1}, +}; +static float sobelFilter_5x5_y[5][5] = { + {-1,-4, -6,-4,-1}, + {-2,-8,-12,-8,-2}, + { 0, 0, 0, 0, 0}, + { 2, 8, 12, 8, 2}, + { 1, 4, 6, 4, 1}, +}; +static float sobelFilter_7x7_x[7][7] = { + { -1, -4, -5, 0, 5, 4, 1}, + { -6, -24, -30, 0, 30, 24, 6}, + { -15, -60, -75, 0, 75, 60, 15}, + { -20, -80,-100, 0, 100, 80, 20}, + { -15, -60, -75, 0, 75, 60, 15}, + { -6, -24, -30, 0, 30, 24, 6}, + { -1, -4, -5, 0, 5, 4, 1}, +}; +static float sobelFilter_7x7_y[7][7] = { + {-1, -6,-15, -20,-15, -6,-1}, + {-4,-24,-60, -80,-60,-24,-4}, + {-5,-30,-75,-100,-75,-30,-5}, + { 0, 0, 0, 0, 0, 0, 0}, + { 5, 30, 75, 100, 75, 30, 5}, + { 4, 24, 60, 80, 60, 24, 4}, + { 1, 6, 15, 20, 15, 6, 1}, +}; + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for following non-linear filter kernels: +// VX_KERNEL_AMD_DILATE_U8_U8_3x3, VX_KERNEL_AMD_DILATE_U1_U8_3x3, +// VX_KERNEL_AMD_ERODE_U8_U8_3x3, VX_KERNEL_AMD_ERODE_U1_U8_3x3, +// VX_KERNEL_AMD_MEDIAN_U8_U8_3x3 +// +int HafGpu_NonLinearFilter_3x3_ANY_U8(AgoNode * node) +{ + int status = VX_SUCCESS; + // get destination type + const char * dstRegType = "U8"; + bool dstIsU1 = false; + if (node->paramList[0]->u.img.format == VX_DF_IMAGE_U1_AMD) { + dstRegType = "U1"; + dstIsU1 = true; + } + else if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_NonLinearFilter_3x3_ANY_U8 doesn't support non-U8/U1 destinations for kernel %s\n", node->akernel->name); + return -1; + } + // function declaration + char item[8192]; + sprintf(item, "void %s(%sx8 * r, uint x, uint y, __local uchar * lbuf, __global uchar * 
p, uint stride) {\n", node->opencl_name, dstRegType); + std::string code = item; + + // configuration + vx_uint32 LMemHeight = AGO_OPENCL_WORKGROUP_SIZE_1; + vx_uint32 LMemWidth = AGO_OPENCL_WORKGROUP_SIZE_0 * 8; + vx_uint32 LMemSideLR = 4; + vx_uint32 LMemSideTB = 1; + vx_uint32 LMemStride = LMemWidth + 2 * LMemSideLR; + vx_uint32 LMemSize = (LMemHeight + 2 * LMemSideTB) * LMemStride; + node->opencl_param_discard_mask = 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + node->opencl_local_buffer_size_in_bytes = LMemSize; + + // generate local memory load + code += + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = x >> 3;\n" + " int gy = y;\n" + " int gstride = stride;\n" + " __global uchar * gbuf = p;\n"; + if (HafGpu_Load_Local(AGO_OPENCL_WORKGROUP_SIZE_0, AGO_OPENCL_WORKGROUP_SIZE_1, LMemStride, LMemHeight + 2 * LMemSideTB, LMemSideLR, LMemSideTB, code) < 0) { + return -1; + } + + // generate computation + if (node->akernel->id == VX_KERNEL_AMD_DILATE_U8_U8_3x3 || node->akernel->id == VX_KERNEL_AMD_DILATE_U1_U8_3x3) { + sprintf(item, + OPENCL_FORMAT( + " __local uint2 * lbufptr = (__local uint2 *) (lbuf + ly * %d + (lx << 3));\n" // LMemStride + " F32x8 sum; uint4 pix; float4 val;\n" + " pix.s01 = lbufptr[0];\n" + " pix.s23 = lbufptr[1];\n" + " val.s0 = amd_unpack3(pix.s0);\n" + " val.s1 = amd_unpack0(pix.s1);\n" + " val.s2 = amd_unpack1(pix.s1);\n" + " sum.s0 = amd_max3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack2(pix.s1);\n" + " sum.s1 = amd_max3(val.s0, val.s1, val.s2);\n" + " val.s1 = amd_unpack3(pix.s1);\n" + " sum.s2 = amd_max3(val.s0, val.s1, val.s2);\n" + " val.s2 = amd_unpack0(pix.s2);\n" + " sum.s3 = amd_max3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack1(pix.s2);\n" + " sum.s4 = amd_max3(val.s0, val.s1, val.s2);\n" + " val.s1 = amd_unpack2(pix.s2);\n" + " sum.s5 = amd_max3(val.s0, val.s1, val.s2);\n" + " val.s2 = amd_unpack3(pix.s2);\n" + " sum.s6 = amd_max3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack0(pix.s3);\n" + " sum.s7 = amd_max3(val.s0, val.s1, val.s2);\n" + " pix.s01 = lbufptr[%d];\n" // LMemStride / 8 + " pix.s23 = lbufptr[%d];\n" // LMemStride / 8 + 1 + " val.s0 = amd_unpack3(pix.s0);\n" + " val.s1 = amd_unpack0(pix.s1);\n" + " val.s2 = amd_unpack1(pix.s1);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s0 = max(sum.s0, val.s3);\n" + " val.s0 = amd_unpack2(pix.s1);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s1 = max(sum.s1, val.s3);\n" + " val.s1 = amd_unpack3(pix.s1);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s2 = max(sum.s2, val.s3);\n" + " val.s2 = amd_unpack0(pix.s2);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s3 = max(sum.s3, val.s3);\n" + " val.s0 = amd_unpack1(pix.s2);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s4 = max(sum.s4, val.s3);\n" + " val.s1 = amd_unpack2(pix.s2);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s5 = max(sum.s5, val.s3);\n" + " val.s2 = amd_unpack3(pix.s2);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s6 = max(sum.s6, val.s3);\n" + " val.s0 = amd_unpack0(pix.s3);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s7 = max(sum.s7, val.s3);\n" + " pix.s01 = lbufptr[%d];\n" // 2 * LMemStride / 8 + " pix.s23 = lbufptr[%d];\n" // 2 * LMemStride / 8 + 1 + " val.s0 = amd_unpack3(pix.s0);\n" + " val.s1 = amd_unpack0(pix.s1);\n" + " val.s2 = amd_unpack1(pix.s1);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s0 = max(sum.s0, val.s3);\n" + " val.s0 = amd_unpack2(pix.s1);\n" + " val.s3 = amd_max3(val.s0, val.s1, 
val.s2); sum.s1 = max(sum.s1, val.s3);\n" + " val.s1 = amd_unpack3(pix.s1);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s2 = max(sum.s2, val.s3);\n" + " val.s2 = amd_unpack0(pix.s2);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s3 = max(sum.s3, val.s3);\n" + " val.s0 = amd_unpack1(pix.s2);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s4 = max(sum.s4, val.s3);\n" + " val.s1 = amd_unpack2(pix.s2);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s5 = max(sum.s5, val.s3);\n" + " val.s2 = amd_unpack3(pix.s2);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s6 = max(sum.s6, val.s3);\n" + " val.s0 = amd_unpack0(pix.s3);\n" + " val.s3 = amd_max3(val.s0, val.s1, val.s2); sum.s7 = max(sum.s7, val.s3);\n" + ) + , LMemStride, LMemStride / 8, LMemStride / 8 + 1, LMemStride * 2 / 8, LMemStride * 2 / 8 + 1); + code += item; + } + else if (node->akernel->id == VX_KERNEL_AMD_ERODE_U8_U8_3x3 || node->akernel->id == VX_KERNEL_AMD_ERODE_U1_U8_3x3) { + sprintf(item, + OPENCL_FORMAT( + " __local uint2 * lbufptr = (__local uint2 *) (lbuf + ly * %d + (lx << 3));\n" // LMemStride + " F32x8 sum; uint4 pix; float4 val;\n" + " pix.s01 = lbufptr[0];\n" + " pix.s23 = lbufptr[1];\n" + " val.s0 = amd_unpack3(pix.s0);\n" + " val.s1 = amd_unpack0(pix.s1);\n" + " val.s2 = amd_unpack1(pix.s1);\n" + " sum.s0 = amd_min3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack2(pix.s1);\n" + " sum.s1 = amd_min3(val.s0, val.s1, val.s2);\n" + " val.s1 = amd_unpack3(pix.s1);\n" + " sum.s2 = amd_min3(val.s0, val.s1, val.s2);\n" + " val.s2 = amd_unpack0(pix.s2);\n" + " sum.s3 = amd_min3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack1(pix.s2);\n" + " sum.s4 = amd_min3(val.s0, val.s1, val.s2);\n" + " val.s1 = amd_unpack2(pix.s2);\n" + " sum.s5 = amd_min3(val.s0, val.s1, val.s2);\n" + " val.s2 = amd_unpack3(pix.s2);\n" + " sum.s6 = amd_min3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack0(pix.s3);\n" + " sum.s7 = amd_min3(val.s0, val.s1, val.s2);\n" + " pix.s01 = lbufptr[%d];\n" // LMemStride / 8 + " pix.s23 = lbufptr[%d];\n" // LMemStride / 8 + 1 + " val.s0 = amd_unpack3(pix.s0);\n" + " val.s1 = amd_unpack0(pix.s1);\n" + " val.s2 = amd_unpack1(pix.s1);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s0 = min(sum.s0, val.s3);\n" + " val.s0 = amd_unpack2(pix.s1);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s1 = min(sum.s1, val.s3);\n" + " val.s1 = amd_unpack3(pix.s1);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s2 = min(sum.s2, val.s3);\n" + " val.s2 = amd_unpack0(pix.s2);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s3 = min(sum.s3, val.s3);\n" + " val.s0 = amd_unpack1(pix.s2);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s4 = min(sum.s4, val.s3);\n" + " val.s1 = amd_unpack2(pix.s2);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s5 = min(sum.s5, val.s3);\n" + " val.s2 = amd_unpack3(pix.s2);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s6 = min(sum.s6, val.s3);\n" + " val.s0 = amd_unpack0(pix.s3);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s7 = min(sum.s7, val.s3);\n" + " pix.s01 = lbufptr[%d];\n" // 2 * LMemStride / 8 + " pix.s23 = lbufptr[%d];\n" // 2 * LMemStride / 8 + 1 + " val.s0 = amd_unpack3(pix.s0);\n" + " val.s1 = amd_unpack0(pix.s1);\n" + " val.s2 = amd_unpack1(pix.s1);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s0 = min(sum.s0, val.s3);\n" + " val.s0 = amd_unpack2(pix.s1);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s1 = min(sum.s1, val.s3);\n" + " val.s1 = amd_unpack3(pix.s1);\n" + " val.s3 
= amd_min3(val.s0, val.s1, val.s2); sum.s2 = min(sum.s2, val.s3);\n" + " val.s2 = amd_unpack0(pix.s2);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s3 = min(sum.s3, val.s3);\n" + " val.s0 = amd_unpack1(pix.s2);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s4 = min(sum.s4, val.s3);\n" + " val.s1 = amd_unpack2(pix.s2);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s5 = min(sum.s5, val.s3);\n" + " val.s2 = amd_unpack3(pix.s2);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s6 = min(sum.s6, val.s3);\n" + " val.s0 = amd_unpack0(pix.s3);\n" + " val.s3 = amd_min3(val.s0, val.s1, val.s2); sum.s7 = min(sum.s7, val.s3);\n" + ) + , LMemStride, LMemStride / 8, LMemStride / 8 + 1, LMemStride * 2 / 8, LMemStride * 2 / 8 + 1); + code += item; + } + else if (node->akernel->id == VX_KERNEL_AMD_MEDIAN_U8_U8_3x3) { +#if ENABLE_FAST_MEDIAN_3x3 + sprintf(item, + OPENCL_FORMAT( + " __local uint2 * lbufptr = (__local uint2 *) (lbuf + ly * %d + (lx << 3));\n" // LMemStride + " F32x8 sum, tum; uint4 pix; float4 val;\n" + " pix.s01 = lbufptr[0];\n" + " pix.s23 = lbufptr[1];\n" + " val.s0 = amd_unpack3(pix.s0);\n" + " val.s1 = amd_unpack0(pix.s1);\n" + " val.s2 = amd_unpack1(pix.s1);\n" + " sum.s0 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack2(pix.s1);\n" + " sum.s1 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s1 = amd_unpack3(pix.s1);\n" + " sum.s2 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s2 = amd_unpack0(pix.s2);\n" + " sum.s3 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack1(pix.s2);\n" + " sum.s4 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s1 = amd_unpack2(pix.s2);\n" + " sum.s5 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s2 = amd_unpack3(pix.s2);\n" + " sum.s6 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack0(pix.s3);\n" + " sum.s7 = amd_median3(val.s0, val.s1, val.s2);\n" + " pix.s01 = lbufptr[%d];\n" // LMemStride / 8 + " pix.s23 = lbufptr[%d];\n" // LMemStride / 8 + 1 + " val.s0 = amd_unpack3(pix.s0);\n" + " val.s1 = amd_unpack0(pix.s1);\n" + " val.s2 = amd_unpack1(pix.s1);\n" + " tum.s0 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack2(pix.s1);\n" + " tum.s1 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s1 = amd_unpack3(pix.s1);\n" + " tum.s2 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s2 = amd_unpack0(pix.s2);\n" + " tum.s3 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack1(pix.s2);\n" + " tum.s4 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s1 = amd_unpack2(pix.s2);\n" + " tum.s5 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s2 = amd_unpack3(pix.s2);\n" + " tum.s6 = amd_median3(val.s0, val.s1, val.s2);\n" + " val.s0 = amd_unpack0(pix.s3);\n" + " tum.s7 = amd_median3(val.s0, val.s1, val.s2);\n" + " pix.s01 = lbufptr[%d];\n" // 2 * LMemStride / 8 + " pix.s23 = lbufptr[%d];\n" // 2 * LMemStride / 8 + 1 + " val.s0 = amd_unpack3(pix.s0);\n" + " val.s1 = amd_unpack0(pix.s1);\n" + " val.s2 = amd_unpack1(pix.s1);\n" + " val.s3 = amd_median3(val.s0, val.s1, val.s2); sum.s0 = amd_median3(sum.s0, tum.s0, val.s3);\n" + " val.s0 = amd_unpack2(pix.s1);\n" + " val.s3 = amd_median3(val.s0, val.s1, val.s2); sum.s1 = amd_median3(sum.s1, tum.s1, val.s3);\n" + " val.s1 = amd_unpack3(pix.s1);\n" + " val.s3 = amd_median3(val.s0, val.s1, val.s2); sum.s2 = amd_median3(sum.s2, tum.s2, val.s3);\n" + " val.s2 = amd_unpack0(pix.s2);\n" + " val.s3 = amd_median3(val.s0, val.s1, val.s2); sum.s3 = amd_median3(sum.s3, tum.s3, val.s3);\n" + " val.s0 = amd_unpack1(pix.s2);\n" + 
" val.s3 = amd_median3(val.s0, val.s1, val.s2); sum.s4 = amd_median3(sum.s4, tum.s4, val.s3);\n" + " val.s1 = amd_unpack2(pix.s2);\n" + " val.s3 = amd_median3(val.s0, val.s1, val.s2); sum.s5 = amd_median3(sum.s5, tum.s5, val.s3);\n" + " val.s2 = amd_unpack3(pix.s2);\n" + " val.s3 = amd_median3(val.s0, val.s1, val.s2); sum.s6 = amd_median3(sum.s6, tum.s6, val.s3);\n" + " val.s0 = amd_unpack0(pix.s3);\n" + " val.s3 = amd_median3(val.s0, val.s1, val.s2); sum.s7 = amd_median3(sum.s7, tum.s7, val.s3);\n" + ) + , LMemStride, LMemStride / 8, LMemStride / 8 + 1, LMemStride * 2 / 8, LMemStride * 2 / 8 + 1); + code += item; +#else + sprintf(item, + OPENCL_FORMAT( + " __local uint2 * lbufptr = (__local uint2 *) (lbuf + ly * %d + (lx << 3));\n" // LMemStride + " F32x8 sum;\n" + " float4 val0, val1, val2, valz;\n" + " uint4 pix0, pix1, pix2;\n" + " pix0.s01 = lbufptr[0];\n" + " pix0.s23 = lbufptr[1];\n" + " pix1.s01 = lbufptr[%d];\n" // LMemStride / 8 + " pix1.s23 = lbufptr[%d];\n" // LMemStride / 8 + 1 + " pix2.s01 = lbufptr[%d];\n" // 2 * LMemStride / 8 + " pix2.s23 = lbufptr[%d];\n" // 2 * LMemStride / 8 + 1 + ), LMemStride, LMemStride / 8, LMemStride / 8 + 1, LMemStride * 2 / 8, LMemStride * 2 / 8 + 1); + code += item; + code += + OPENCL_FORMAT( + " // pixel 0\n" + " valz.s0 = amd_unpack3(pix0.s0);\n" + " valz.s1 = amd_unpack0(pix0.s1);\n" + " valz.s2 = amd_unpack1(pix0.s1);\n" + " val0.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val0.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val0.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack3(pix1.s0);\n" + " valz.s1 = amd_unpack0(pix1.s1);\n" + " valz.s2 = amd_unpack1(pix1.s1);\n" + " val1.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val1.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val1.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack3(pix2.s0);\n" + " valz.s1 = amd_unpack0(pix2.s1);\n" + " valz.s2 = amd_unpack1(pix2.s1);\n" + " val2.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val2.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val2.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_max3 (val0.s0, val1.s0, val2.s0);\n" + " valz.s1 = amd_median3(val0.s1, val1.s1, val2.s1);\n" + " valz.s2 = amd_min3 (val0.s2, val1.s2, val2.s2);\n" + " sum.s0 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " // pixel 1\n" + " valz.s0 = amd_unpack0(pix0.s1);\n" + " valz.s1 = amd_unpack1(pix0.s1);\n" + " valz.s2 = amd_unpack2(pix0.s1);\n" + " val0.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val0.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val0.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack0(pix1.s1);\n" + " valz.s1 = amd_unpack1(pix1.s1);\n" + " valz.s2 = amd_unpack2(pix1.s1);\n" + " val1.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val1.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val1.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack0(pix2.s1);\n" + " valz.s1 = amd_unpack1(pix2.s1);\n" + " valz.s2 = amd_unpack2(pix2.s1);\n" + " val2.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val2.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val2.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_max3 (val0.s0, val1.s0, val2.s0);\n" + " valz.s1 = amd_median3(val0.s1, val1.s1, val2.s1);\n" + " valz.s2 = amd_min3 (val0.s2, val1.s2, val2.s2);\n" + " sum.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " // pixel 2\n" + " valz.s0 = amd_unpack1(pix0.s1);\n" + " valz.s1 = amd_unpack2(pix0.s1);\n" + " valz.s2 = 
amd_unpack3(pix0.s1);\n" + " val0.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val0.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val0.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack1(pix1.s1);\n" + " valz.s1 = amd_unpack2(pix1.s1);\n" + " valz.s2 = amd_unpack3(pix1.s1);\n" + " val1.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val1.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val1.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack1(pix2.s1);\n" + " valz.s1 = amd_unpack2(pix2.s1);\n" + " valz.s2 = amd_unpack3(pix2.s1);\n" + " val2.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val2.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val2.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_max3 (val0.s0, val1.s0, val2.s0);\n" + " valz.s1 = amd_median3(val0.s1, val1.s1, val2.s1);\n" + " valz.s2 = amd_min3 (val0.s2, val1.s2, val2.s2);\n" + " sum.s2 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " // pixel 3\n" + " valz.s0 = amd_unpack2(pix0.s1);\n" + " valz.s1 = amd_unpack3(pix0.s1);\n" + " valz.s2 = amd_unpack0(pix0.s2);\n" + " val0.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val0.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val0.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack2(pix1.s1);\n" + " valz.s1 = amd_unpack3(pix1.s1);\n" + " valz.s2 = amd_unpack0(pix1.s2);\n" + " val1.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val1.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val1.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack2(pix2.s1);\n" + " valz.s1 = amd_unpack3(pix2.s1);\n" + " valz.s2 = amd_unpack0(pix2.s2);\n" + " val2.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val2.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val2.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_max3 (val0.s0, val1.s0, val2.s0);\n" + " valz.s1 = amd_median3(val0.s1, val1.s1, val2.s1);\n" + " valz.s2 = amd_min3 (val0.s2, val1.s2, val2.s2);\n" + " sum.s3 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " // pixel 4\n" + " valz.s0 = amd_unpack3(pix0.s1);\n" + " valz.s1 = amd_unpack0(pix0.s2);\n" + " valz.s2 = amd_unpack1(pix0.s2);\n" + " val0.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val0.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val0.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack3(pix1.s1);\n" + " valz.s1 = amd_unpack0(pix1.s2);\n" + " valz.s2 = amd_unpack1(pix1.s2);\n" + " val1.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val1.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val1.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack3(pix2.s1);\n" + " valz.s1 = amd_unpack0(pix2.s2);\n" + " valz.s2 = amd_unpack1(pix2.s2);\n" + " val2.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val2.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val2.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_max3 (val0.s0, val1.s0, val2.s0);\n" + " valz.s1 = amd_median3(val0.s1, val1.s1, val2.s1);\n" + " valz.s2 = amd_min3 (val0.s2, val1.s2, val2.s2);\n" + " sum.s4 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " // pixel 5\n" + " valz.s0 = amd_unpack0(pix0.s2);\n" + " valz.s1 = amd_unpack1(pix0.s2);\n" + " valz.s2 = amd_unpack2(pix0.s2);\n" + " val0.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val0.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val0.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack0(pix1.s2);\n" + " valz.s1 = amd_unpack1(pix1.s2);\n" + " 
valz.s2 = amd_unpack2(pix1.s2);\n" + " val1.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val1.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val1.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack0(pix2.s2);\n" + " valz.s1 = amd_unpack1(pix2.s2);\n" + " valz.s2 = amd_unpack2(pix2.s2);\n" + " val2.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val2.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val2.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_max3 (val0.s0, val1.s0, val2.s0);\n" + " valz.s1 = amd_median3(val0.s1, val1.s1, val2.s1);\n" + " valz.s2 = amd_min3 (val0.s2, val1.s2, val2.s2);\n" + " sum.s5 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " // pixel 6\n" + " valz.s0 = amd_unpack1(pix0.s2);\n" + " valz.s1 = amd_unpack2(pix0.s2);\n" + " valz.s2 = amd_unpack3(pix0.s2);\n" + " val0.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val0.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val0.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack1(pix1.s2);\n" + " valz.s1 = amd_unpack2(pix1.s2);\n" + " valz.s2 = amd_unpack3(pix1.s2);\n" + " val1.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val1.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val1.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack1(pix2.s2);\n" + " valz.s1 = amd_unpack2(pix2.s2);\n" + " valz.s2 = amd_unpack3(pix2.s2);\n" + " val2.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val2.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val2.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_max3 (val0.s0, val1.s0, val2.s0);\n" + " valz.s1 = amd_median3(val0.s1, val1.s1, val2.s1);\n" + " valz.s2 = amd_min3 (val0.s2, val1.s2, val2.s2);\n" + " sum.s6 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " // pixel 7\n" + " valz.s0 = amd_unpack2(pix0.s2);\n" + " valz.s1 = amd_unpack3(pix0.s2);\n" + " valz.s2 = amd_unpack0(pix0.s3);\n" + " val0.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val0.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val0.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack2(pix1.s2);\n" + " valz.s1 = amd_unpack3(pix1.s2);\n" + " valz.s2 = amd_unpack0(pix1.s3);\n" + " val1.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val1.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val1.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_unpack2(pix2.s2);\n" + " valz.s1 = amd_unpack3(pix2.s2);\n" + " valz.s2 = amd_unpack0(pix2.s3);\n" + " val2.s0 = amd_min3 (valz.s0, valz.s1, valz.s2);\n" + " val2.s1 = amd_median3(valz.s0, valz.s1, valz.s2);\n" + " val2.s2 = amd_max3 (valz.s0, valz.s1, valz.s2);\n" + " valz.s0 = amd_max3 (val0.s0, val1.s0, val2.s0);\n" + " valz.s1 = amd_median3(val0.s1, val1.s1, val2.s1);\n" + " valz.s2 = amd_min3 (val0.s2, val1.s2, val2.s2);\n" + " sum.s7 = amd_median3(valz.s0, valz.s1, valz.s2);\n"); +#endif + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_NonLinearFilter_3x3_ANY_U8 doesn't support kernel %s\n", node->akernel->name); + return -1; + } + + if (dstIsU1) { + code += + OPENCL_FORMAT( + " U8x8 rv;\n" + " rv.s0 = amd_pack(sum.s0123);\n" + " rv.s1 = amd_pack(sum.s4567);\n" + " Convert_U1_U8(r, rv);\n" + "}\n"); + } + else { + code += + OPENCL_FORMAT( + " U8x8 rv;\n" + " rv.s0 = amd_pack(sum.s0123);\n" + " rv.s1 = amd_pack(sum.s4567);\n" + " *r = rv;\n" + "}\n"); + } + + node->opencl_code = code; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + + return status; +} + 
+////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for following non-linear filter kernels: +// VX_KERNEL_AMD_DILATE_U8_U1_3x3, VX_KERNEL_AMD_DILATE_U1_U1_3x3, +// VX_KERNEL_AMD_ERODE_U8_U1_3x3, VX_KERNEL_AMD_ERODE_U1_U1_3x3, +// +int HafGpu_NonLinearFilter_3x3_ANY_U1(AgoNode * node) +{ + int status = VX_SUCCESS; + // get destination type + const char * dstRegType = "U8"; + bool dstIsU1 = false; + if (node->paramList[0]->u.img.format == VX_DF_IMAGE_U1_AMD) { + dstRegType = "U1"; + dstIsU1 = true; + } + else if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_NonLinearFilter_3x3_ANY_U1 doesn't support non-U8/U1 destinations for kernel %s\n", node->akernel->name); + return -1; + } + // function declaration + char item[8192]; + sprintf(item, "void %s(%sx8 * r, uint x, uint y, __global uchar * p, uint stride) {\n", node->opencl_name, dstRegType); + std::string code = item; + + // configuration + int stride = node->paramList[1]->u.img.stride_in_bytes; + node->opencl_param_discard_mask = 0; + node->opencl_local_buffer_usage_mask = 0; + node->opencl_local_buffer_size_in_bytes = 0; + + // generate computation + if (node->akernel->id == VX_KERNEL_AMD_DILATE_U8_U1_3x3 || + node->akernel->id == VX_KERNEL_AMD_DILATE_U1_U1_3x3 || + node->akernel->id == VX_KERNEL_AMD_ERODE_U8_U1_3x3 || + node->akernel->id == VX_KERNEL_AMD_ERODE_U1_U1_3x3) { + int op = (node->akernel->id == VX_KERNEL_AMD_DILATE_U8_U1_3x3 || node->akernel->id == VX_KERNEL_AMD_DILATE_U1_U1_3x3) ? '|' : '&'; + sprintf(item, + // TBD: this code segment uses risky 32-bit loads without 32-bit alignment + // it works great on our hardware though it doesn't follow OpenCL rules + OPENCL_FORMAT( + " x = (x >> 3) - 1;\n" + " p += y * %d + x;\n" // stride + " uint L0 = *(__global uint *)&p[-%d];\n" // stride + " uint L1 = *(__global uint *) p;\n" + " uint L2 = *(__global uint *)&p[%d];\n" // stride + " L0 = L0 %c (L0 >> 1) %c (L0 << 1);\n" // op, op + " L1 = L1 %c (L1 >> 1) %c (L1 << 1);\n" // op, op + " L2 = L2 %c (L2 >> 1) %c (L2 << 1);\n" // op, op + " L0 = L0 %c L1 %c L2;\n" // op, op + " L0 = L0 >> 8;\n" + ), stride, stride, stride, op, op, op, op, op, op, op, op); + code += item; + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_NonLinearFilter_3x3_ANY_U1 doesn't support kernel %s\n", node->akernel->name); + return -1; + } + + if (dstIsU1) { + code += + OPENCL_FORMAT( + " *r = (U1x8)L0;\n" + "}\n"); + } + else { + code += + OPENCL_FORMAT( + " Convert_U8_U1(r, (U1x8)L0);\n" + "}\n"); + } + + node->opencl_code = code; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following special case Sobel filter kernels: +// VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GX +// VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GY +// VX_KERNEL_AMD_SOBEL_S16S16_U8_3x3_GXY +// VX_KERNEL_AMD_SOBEL_MAGNITUDE_PHASE_S16U8_U8_3x3 +// VX_KERNEL_AMD_SOBEL_MAGNITUDE_S16_U8_3x3 +// VX_KERNEL_AMD_SOBEL_PHASE_U8_U8_3x3 +// +int HafGpu_SobelSpecialCases(AgoNode * node) +{ + int status = VX_SUCCESS; + + if (node->akernel->id == VX_KERNEL_AMD_SOBEL_S16S16_U8_3x3_GXY) { + AgoData filterGX, filterGY; + filterGX.ref.type = VX_TYPE_MATRIX; filterGX.u.mat.type = VX_TYPE_FLOAT32; filterGX.u.mat.columns = filterGX.u.mat.rows = 3; filterGX.buffer = (vx_uint8 
*)&sobelFilter_3x3_x[0][0]; filterGX.ref.read_only = true; + filterGY.ref.type = VX_TYPE_MATRIX; filterGY.u.mat.type = VX_TYPE_FLOAT32; filterGY.u.mat.columns = filterGY.u.mat.rows = 3; filterGY.buffer = (vx_uint8 *)&sobelFilter_3x3_y[0][0]; filterGY.ref.read_only = true; + status = HafGpu_LinearFilter_ANYx2_U8(node, VX_DF_IMAGE_S16, &filterGX, &filterGY, false); + } + else if (node->akernel->id == VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GX) { + AgoData filterGX; + filterGX.ref.type = VX_TYPE_MATRIX; filterGX.u.mat.type = VX_TYPE_FLOAT32; filterGX.u.mat.columns = filterGX.u.mat.rows = 3; filterGX.buffer = (vx_uint8 *)&sobelFilter_3x3_x[0][0]; filterGX.ref.read_only = true; + status = HafGpu_LinearFilter_ANY_U8(node, VX_DF_IMAGE_S16, &filterGX, false); + } + else if (node->akernel->id == VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GY) { + AgoData filterGY; + filterGY.ref.type = VX_TYPE_MATRIX; filterGY.u.mat.type = VX_TYPE_FLOAT32; filterGY.u.mat.columns = filterGY.u.mat.rows = 3; filterGY.buffer = (vx_uint8 *)&sobelFilter_3x3_y[0][0]; filterGY.ref.read_only = true; + status = HafGpu_LinearFilter_ANY_U8(node, VX_DF_IMAGE_S16, &filterGY, false); + } + else { + // for other special cases + // re-use LinearFilter_ANYx2_U8 for computing GX & GY + char opencl_name[VX_MAX_KERNEL_NAME]; + strcpy(opencl_name, node->opencl_name); + sprintf(node->opencl_name, "%s_GXY", opencl_name); + AgoData filterGX, filterGY; + filterGX.ref.type = VX_TYPE_MATRIX; filterGX.u.mat.type = VX_TYPE_FLOAT32; filterGX.u.mat.columns = filterGX.u.mat.rows = 3; filterGX.buffer = (vx_uint8 *)&sobelFilter_3x3_x[0][0]; filterGX.ref.read_only = true; + filterGY.ref.type = VX_TYPE_MATRIX; filterGY.u.mat.type = VX_TYPE_FLOAT32; filterGY.u.mat.columns = filterGY.u.mat.rows = 3; filterGY.buffer = (vx_uint8 *)&sobelFilter_3x3_y[0][0]; filterGY.ref.read_only = true; + status = HafGpu_LinearFilter_ANYx2_U8(node, VX_DF_IMAGE_S16, &filterGX, &filterGY, false); + strcpy(node->opencl_name, opencl_name); + if (status) { + return status; + } + + // actual function using pre-defined functions + char item[8192]; + sprintf(item, OPENCL_FORMAT( + "#define Magnitude_S16_S16S16 Magnitude_S16_S16S16_%s\n" + "#define Phase_U8_S16S16 Phase_U8_S16S16_%s\n" + "void Magnitude_S16_S16S16 (S16x8 * p0, S16x8 p1, S16x8 p2)\n" + "{\n" + " S16x8 r;\n" + " float2 f;\n" + " f.s0 = (float)((((int)(p1.s0)) << 16) >> 16); f.s1 = (float)((((int)(p2.s0)) << 16) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s0 = (uint)(f.s0);\n" + " f.s0 = (float)(( (int)(p1.s0)) >> 16); f.s1 = (float)(( (int)(p2.s0)) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s0 |= (uint)(f.s0) << 16;\n" + " f.s0 = (float)((((int)(p1.s1)) << 16) >> 16); f.s1 = (float)((((int)(p2.s1)) << 16) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s1 = (uint)(f.s0);\n" + " f.s0 = (float)(( (int)(p1.s1)) >> 16); f.s1 = (float)(( (int)(p2.s1)) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s1 |= (uint)(f.s0) << 16;\n" + " f.s0 = (float)((((int)(p1.s2)) << 16) >> 16); f.s1 = (float)((((int)(p2.s2)) << 16) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s2 = (uint)(f.s0);\n" + " f.s0 = (float)(( (int)(p1.s2)) >> 16); f.s1 = (float)(( (int)(p2.s2)) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); 
f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s2 |= (uint)(f.s0) << 16;\n" + " f.s0 = (float)((((int)(p1.s3)) << 16) >> 16); f.s1 = (float)((((int)(p2.s3)) << 16) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s3 = (uint)(f.s0);\n" + " f.s0 = (float)(( (int)(p1.s3)) >> 16); f.s1 = (float)(( (int)(p2.s3)) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s3 |= (uint)(f.s0) << 16;\n" + " *p0 = r;\n" + "}\n" + "\n" + "void Phase_U8_S16S16 (U8x8 * p0, S16x8 p1, S16x8 p2)\n" + "{\n" + " U8x8 r;\n" + " float2 f; float4 p4;\n" + " f.s0 = (float)((((int)(p1.s0)) << 16) >> 16); f.s1 = (float)((((int)(p2.s0)) << 16) >> 16); p4.s0 = atan2pi(f.s1, f.s0); p4.s0 += (p4.s0 < 0.0) ? 2.0f : 0.0; p4.s0 *= 128.0f;\n" + " f.s0 = (float)(( (int)(p1.s0)) >> 16); f.s1 = (float)(( (int)(p2.s0)) >> 16); p4.s1 = atan2pi(f.s1, f.s0); p4.s1 += (p4.s1 < 0.0) ? 2.0f : 0.0; p4.s1 *= 128.0f;\n" + " f.s0 = (float)((((int)(p1.s1)) << 16) >> 16); f.s1 = (float)((((int)(p2.s1)) << 16) >> 16); p4.s2 = atan2pi(f.s1, f.s0); p4.s2 += (p4.s2 < 0.0) ? 2.0f : 0.0; p4.s2 *= 128.0f;\n" + " f.s0 = (float)(( (int)(p1.s1)) >> 16); f.s1 = (float)(( (int)(p2.s1)) >> 16); p4.s3 = atan2pi(f.s1, f.s0); p4.s3 += (p4.s3 < 0.0) ? 2.0f : 0.0; p4.s3 *= 128.0f;\n" + " p4 = select(p4, (float4) 0.0f, p4 > 255.5f);\n" + " r.s0 = amd_pack(p4);\n" + " f.s0 = (float)((((int)(p1.s2)) << 16) >> 16); f.s1 = (float)((((int)(p2.s2)) << 16) >> 16); p4.s0 = atan2pi(f.s1, f.s0); p4.s0 += (p4.s0 < 0.0) ? 2.0f : 0.0; p4.s0 *= 128.0f;\n" + " f.s0 = (float)(( (int)(p1.s2)) >> 16); f.s1 = (float)(( (int)(p2.s2)) >> 16); p4.s1 = atan2pi(f.s1, f.s0); p4.s1 += (p4.s1 < 0.0) ? 2.0f : 0.0; p4.s1 *= 128.0f;\n" + " f.s0 = (float)((((int)(p1.s3)) << 16) >> 16); f.s1 = (float)((((int)(p2.s3)) << 16) >> 16); p4.s2 = atan2pi(f.s1, f.s0); p4.s2 += (p4.s2 < 0.0) ? 2.0f : 0.0; p4.s2 *= 128.0f;\n" + " f.s0 = (float)(( (int)(p1.s3)) >> 16); f.s1 = (float)(( (int)(p2.s3)) >> 16); p4.s3 = atan2pi(f.s1, f.s0); p4.s3 += (p4.s3 < 0.0) ? 
2.0f : 0.0; p4.s3 *= 128.0f;\n" + " p4 = select(p4, (float4) 0.0f, p4 > 255.5f);\n" + " r.s1 = amd_pack(p4);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name, node->opencl_name); + node->opencl_code += item; + if (node->akernel->id == VX_KERNEL_AMD_SOBEL_MAGNITUDE_PHASE_S16U8_U8_3x3) { + sprintf(item, + OPENCL_FORMAT( + "void %s(S16x8 * mag, U8x8 * phase, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + " S16x8 gx, gy;\n" + " %s_GXY(&gx, &gy, x, y, lbuf, p, stride); // LinearFilter_ANYx2_U8\n" + " Magnitude_S16_S16S16(mag, gx, gy);\n" + " Phase_U8_S16S16(phase, gx, gy);\n" + "}\n" + ), node->opencl_name, node->opencl_name); + node->opencl_param_discard_mask = 0; + node->opencl_local_buffer_usage_mask = (2 << 1); + } + else if (node->akernel->id == VX_KERNEL_AMD_SOBEL_MAGNITUDE_S16_U8_3x3) { + sprintf(item, + OPENCL_FORMAT( + "void %s(S16x8 * mag, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + " S16x8 gx, gy;\n" + " %s_GXY(&gx, &gy, x, y, lbuf, p, stride); // LinearFilter_ANYx2_U8\n" + " Magnitude_S16_S16S16(mag, gx, gy);\n" + "}\n" + ), node->opencl_name, node->opencl_name); + node->opencl_param_discard_mask = 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + } + else if (node->akernel->id == VX_KERNEL_AMD_SOBEL_PHASE_U8_U8_3x3) { + sprintf(item, + OPENCL_FORMAT( + "void %s(U8x8 * phase, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + " S16x8 gx, gy;\n" + " %s_GXY(&gx, &gy, x, y, lbuf, p, stride); // LinearFilter_ANYx2_U8\n" + " Phase_U8_S16S16(phase, gx, gy);\n" + "}\n" + ), node->opencl_name, node->opencl_name); + node->opencl_param_discard_mask = 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_SobelSpecialCases doesn't support kernel %s\n", node->akernel->name); + return -1; + } + node->opencl_code += item; + node->opencl_code += OPENCL_FORMAT( + "#undef Magnitude_S16_S16S16\n" + "#undef Phase_U8_S16S16\n" + ); + } + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following canny sobel filter kernels: +// VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L1NORM +// VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L2NORM +// VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L1NORM +// VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L2NORM +// VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L1NORM +// VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L2NORM +// +int HafGpu_CannySobelFilters(AgoNode * node) +{ + int status = VX_SUCCESS; + + // re-use LinearFilter_ANYx2_U8 for computing GX & GY + char opencl_name[VX_MAX_KERNEL_NAME]; + strcpy(opencl_name, node->opencl_name); + sprintf(node->opencl_name, "%s_GXY", opencl_name); + AgoData filterGX, filterGY; + filterGX.ref.type = VX_TYPE_MATRIX; filterGX.u.mat.type = VX_TYPE_FLOAT32; filterGX.ref.read_only = true; + filterGY.ref.type = VX_TYPE_MATRIX; filterGY.u.mat.type = VX_TYPE_FLOAT32; filterGY.ref.read_only = true; + int N = 0; + if (node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L1NORM || node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L2NORM) { + filterGX.u.mat.columns = filterGX.u.mat.rows = 3; filterGX.buffer = (vx_uint8 *)&sobelFilter_3x3_x[0][0]; + filterGY.u.mat.columns = filterGY.u.mat.rows = 3; filterGY.buffer = (vx_uint8 *)&sobelFilter_3x3_y[0][0]; + N = 3; + } + else if (node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L1NORM || node->akernel->id == 
VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L2NORM) { + filterGX.u.mat.columns = filterGX.u.mat.rows = 5; filterGX.buffer = (vx_uint8 *)&sobelFilter_5x5_x[0][0]; + filterGY.u.mat.columns = filterGY.u.mat.rows = 5; filterGY.buffer = (vx_uint8 *)&sobelFilter_5x5_y[0][0]; + N = 5; + } + else if (node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L1NORM || node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L2NORM) { + filterGX.u.mat.columns = filterGX.u.mat.rows = 7; filterGX.buffer = (vx_uint8 *)&sobelFilter_7x7_x[0][0]; + filterGY.u.mat.columns = filterGY.u.mat.rows = 7; filterGY.buffer = (vx_uint8 *)&sobelFilter_7x7_y[0][0]; + N = 7; + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_CannySobelFilters doesn't support kernel %s\n", node->akernel->name); + return -1; + } + status = HafGpu_LinearFilter_ANYx2_U8(node, VX_DF_IMAGE_F32_AMD, &filterGX, &filterGY, false); + strcpy(node->opencl_name, opencl_name); + if (status) { + return status; + } + node->opencl_param_discard_mask = 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + + // actual function using pre-defined functions + char item[8192]; + if (node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L1NORM || + node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L1NORM || + node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L1NORM) + { // L1NORM + sprintf(item, + OPENCL_FORMAT( + "uint CannyMagPhase(float gx, float gy) {\n" + " float dx = fabs(gx), dy = fabs(gy);\n" + " float dr = amd_min3((dx + dy)%s, 16383.0f, 16383.0f);\n" // magnitude /= 2 for gradient_size = 7 + " float d1 = dx * 0.4142135623730950488016887242097f;\n" + " float d2 = dx * 2.4142135623730950488016887242097f;\n" + " uint mp = select(1u, 3u, (gx * gy) < 0.0f);\n" + " mp = select(mp, 0u, dy <= d1);\n" + " mp = select(mp, 2u, dy >= d2);\n" + " mp += (((uint)dr) << 2);\n" + " return mp;\n" + "}\n") + , node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L1NORM ? "*0.5f" : ""); + node->opencl_code += item; + } + else + { // L2NORM + sprintf(item, + OPENCL_FORMAT( + "uint CannyMagPhase(float gx, float gy) {\n" + " float dx = fabs(gx), dy = fabs(gy);\n" + " float dr = amd_min3(native_sqrt(mad(gy, gy, gx * gx)%s), 16383.0f, 16383.0f);\n" // magnitude /= 2 for gradient_size = 7 + " float d1 = dx * 0.4142135623730950488016887242097f;\n" + " float d2 = dx * 2.4142135623730950488016887242097f;\n" + " uint mp = select(1u, 3u, (gx * gy) < 0.0f);\n" + " mp = select(mp, 0u, dy <= d1);\n" + " mp = select(mp, 2u, dy >= d2);\n" + " mp += (((uint)dr) << 2);\n" + " return mp;\n" + "}\n") + , node->akernel->id == VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L2NORM ? "*0.5f" : ""); + node->opencl_code += item; + } + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + sprintf(item, + OPENCL_FORMAT( + "void %s(U16x8 * magphase, uint x, uint y, __local uchar * lbuf, __global uchar * p, uint stride) {\n" + " F32x8 gx, gy;\n" + " %s_GXY(&gx, &gy, x, y, lbuf, p, stride); // LinearFilter_ANYx2_U8\n" + " uint mask = select(0xffffu, 0u, y < %d); mask = select(0u, mask, y < %d);\n" // (N >> 1), height - (N >> 1) + " U16x8 r; uint mp;\n" + " mp = CannyMagPhase(gx.s0, gy.s0) & mask; mp = select(mp, 0u, x < %du); r.s0 = mp;\n" // (N>>1)-0 + " mp = CannyMagPhase(gx.s1, gy.s1) & mask; mp = select(mp, 0u, x < %du); r.s0 |= (mp << 16);\n" // (N>>1)-1 + " mp = CannyMagPhase(gx.s2, gy.s2) & mask; mp = select(mp, 0u, x < %du); r.s1 = mp;\n" // (N > 5) ? 
(N>>1)-2 : 0 + " mp = CannyMagPhase(gx.s3, gy.s3) & mask; r.s1 |= (mp << 16);\n" // + " mp = CannyMagPhase(gx.s4, gy.s4) & mask; r.s2 = mp;\n" // + " mp = CannyMagPhase(gx.s5, gy.s5) & mask; mp = select(0u, mp, x < %du); r.s2 |= (mp << 16);\n" // width-(N>>1)-5 + " mp = CannyMagPhase(gx.s6, gy.s6) & mask; mp = select(0u, mp, x < %du); r.s3 = mp;\n" // width-(N>>1)-6 + " mp = CannyMagPhase(gx.s7, gy.s7) & mask; mp = select(0u, mp, x < %du); r.s3 |= (mp << 16);\n" // width-(N>>1)-7 + " *magphase = r;\n" + "}\n" + ) + , node->opencl_name, node->opencl_name, (N >> 1), height - (N >> 1), (N >> 1) - 0, (N >> 1) - 1, (N > 5) ? ((N >> 1) - 2) : 0, width - (N >> 1) - 5, width - (N >> 1) - 6, width - (N >> 1) - 7); + node->opencl_code += item; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following canny non-max supression filter kernels: +// VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8_U16_3x3 +// VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8XY_U16_3x3 +// +int HafGpu_CannySuppThreshold(AgoNode * node) +{ + int status = VX_SUCCESS; + // configuration + int work_group_width = 16; + int work_group_height = 16; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + + // local memory usage + int LMemSideLR = 4; + int LMemStride = work_group_width * (4 * 2) + 2 * 2 * 2; + int LMemSize = LMemStride * (work_group_height + 2); + + // kernel declaration + char item[8192]; + const char * xyarg = ""; + int ioffset = 1; + if (node->akernel->id == VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8XY_U16_3x3) { + xyarg = "__global char * p1_buf, uint p1_offset, uint p1_count, "; + ioffset = 2; + } + else if (node->akernel->id != VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8_U16_3x3) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_CannySuppThreshold doesn't support kernel %s\n", node->akernel->name); + return -1; + } + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset, %suint p2_width, uint p2_height, __global uchar * p2_buf, uint p2_stride, uint p2_offset, uint2 p3)\n" // xyarg + "{\n" + " __local uchar lbuf[%d];\n" // LMemSize + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " bool valid = (gx < %d) && (gy < %d);\n" // (width+3)/4, height + " p0_buf += p0_offset + (gy * p0_stride) + (gx << 2);\n" + " p2_buf += p2_offset;\n" + " int gstride = p2_stride;\n" + " __global uchar * gbuf = p2_buf;\n" + ) + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, xyarg, LMemSize, (width + 3) / 4, height); + node->opencl_code = item; + // load U16 into local + if (HafGpu_Load_Local(work_group_width, work_group_height, LMemStride, work_group_height + 2, 2 * 2, 1, node->opencl_code) < 0) { + return -1; + } + // load U16 pixels from local and perform non-max supression + vx_uint32 gradient_size = node->paramList[ioffset+2] ? 
node->paramList[ioffset+2]->u.scalar.u.u : 3; + sprintf(item, + OPENCL_FORMAT( + " __local uchar * lbuf_ptr = lbuf + ly * %d + (lx << 3);\n" // LMemStride + " uint4 L0 = vload4(0, (__local uint *) lbuf_ptr);\n" + " uint4 L1 = vload4(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride + " uint4 L2 = vload4(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride * 2 + " uint3 NA, NB, NC; uint T, M1, M2; uint4 M;\n" + " NA.s0 = L0.s0 >> 18 ; NA.s1 = L1.s0 >> 18 ; NA.s2 = L2.s0 >> 18 ;\n" + " NB.s0 = amd_bfe(L0.s1, 2, 14); NB.s1 = amd_bfe(L1.s1, 2, 14); NB.s2 = amd_bfe(L2.s1, 2, 14);\n" + " NC.s0 = L0.s1 >> 18 ; NC.s1 = L1.s1 >> 18 ; NC.s2 = L2.s1 >> 18 ;\n" + " T = amd_bfe(L1.s1, 0, 2); M1 = select(NA.s1, NA.s0, T > 0); M1 = select(M1, NB.s0, T > 1); M1 = select(M1, NA.s2, T > 2); M2 = select(NC.s1, NC.s2+1, T > 0); M2 = select(M2, NB.s2, T > 1); M2 = select(M2, NC.s0+1, T > 2); M.s0 = select(0u, NB.s1, NB.s1 > M1); M.s0 = select(0u, M.s0, NB.s1 >= M2);\n" + " NA.s0 = amd_bfe(L0.s2, 2, 14); NA.s1 = amd_bfe(L1.s2, 2, 14); NA.s2 = amd_bfe(L2.s2, 2, 14);\n" + " T = amd_bfe(L1.s1, 16, 2); M1 = select(NB.s1, NB.s0, T > 0); M1 = select(M1, NC.s0, T > 1); M1 = select(M1, NB.s2, T > 2); M2 = select(NA.s1, NA.s2+1, T > 0); M2 = select(M2, NC.s2, T > 1); M2 = select(M2, NA.s0+1, T > 2); M.s1 = select(0u, NC.s1, NC.s1 > M1); M.s1 = select(0u, M.s1, NC.s1 >= M2);\n" + " NB.s0 = L0.s2 >> 18 ; NB.s1 = L1.s2 >> 18 ; NB.s2 = L2.s2 >> 18 ;\n" + " T = amd_bfe(L1.s2, 0, 2); M1 = select(NC.s1, NC.s0, T > 0); M1 = select(M1, NA.s0, T > 1); M1 = select(M1, NC.s2, T > 2); M2 = select(NB.s1, NB.s2+1, T > 0); M2 = select(M2, NA.s2, T > 1); M2 = select(M2, NB.s0+1, T > 2); M.s2 = select(0u, NA.s1, NA.s1 > M1); M.s2 = select(0u, M.s2, NA.s1 >= M2);\n" + " NC.s0 = amd_bfe(L0.s3, 2, 14); NC.s1 = amd_bfe(L1.s3, 2, 14); NC.s2 = amd_bfe(L2.s3, 2, 14);\n" + " T = amd_bfe(L1.s2, 16, 2); M1 = select(NA.s1, NA.s0, T > 0); M1 = select(M1, NB.s0, T > 1); M1 = select(M1, NA.s2, T > 2); M2 = select(NC.s1, NC.s2+1, T > 0); M2 = select(M2, NB.s2, T > 1); M2 = select(M2, NC.s0+1, T > 2); M.s3 = select(0u, NB.s1, NB.s1 > M1); M.s3 = select(0u, M.s3, NB.s1 >= M2);\n" + " uint mask = select(0u, 0xffffffffu, gx < %du); mask = select(0u, mask, gy < %du);\n" // (width+3)/4, height + " M.s0 &= mask;\n" + " M.s1 &= mask;\n" + " M.s2 &= mask;\n" + " M.s3 &= mask;\n" + " uint4 P;\n" + "%s" // THRESHOLD /= 2 when gradient_size = 7 + " P.s0 = select( 0u, 127u, M.s0 > p3.s0);\n" + " P.s1 = select( 0u, 127u, M.s1 > p3.s0);\n" + " P.s2 = select( 0u, 127u, M.s2 > p3.s0);\n" + " P.s3 = select( 0u, 127u, M.s3 > p3.s0);\n" + " P.s0 = select(P.s0, 255u, M.s0 > p3.s1);\n" + " P.s1 = select(P.s1, 255u, M.s1 > p3.s1);\n" + " P.s2 = select(P.s2, 255u, M.s2 > p3.s1);\n" + " P.s3 = select(P.s3, 255u, M.s3 > p3.s1);\n" + " uint p0 = P.s0;\n" + " p0 += P.s1 << 8;\n" + " p0 += P.s2 << 16;\n" + " p0 += P.s3 << 24;\n" + " if (valid) *(__global uint *)p0_buf = p0;\n" + ) + , LMemStride, LMemStride, LMemStride * 2, (width + 3) / 4, height, (gradient_size == 7) ? 
" p3.s0 = p3.s0 >> 1; p3.s1 = p3.s1 >> 1;\n" : ""); + node->opencl_code += item; + if (node->akernel->id == VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8XY_U16_3x3) { + node->opencl_code += + OPENCL_FORMAT( + " if (valid) {\n" + " uint stack_icount;\n" + " stack_icount = select(0u, 1u, P.s0 == 255u);\n" + " stack_icount += select(0u, 1u, P.s1 == 255u);\n" + " stack_icount += select(0u, 1u, P.s2 == 255u);\n" + " stack_icount += select(0u, 1u, P.s3 == 255u);\n" + " if (stack_icount > 0) {\n" + " uint pos = atomic_add((__global uint *)p1_buf, stack_icount);\n" + " __global uint * p1_buf_ptr = (__global uint *)&p1_buf[p1_offset];\n" + " uint xyloc = (gy << 16) + (gx << 2);\n" + " if(pos < p1_count && P.s0 == 255u) p1_buf_ptr[pos++] = xyloc;\n" + " if(pos < p1_count && P.s1 == 255u) p1_buf_ptr[pos++] = xyloc+1;\n" + " if(pos < p1_count && P.s2 == 255u) p1_buf_ptr[pos++] = xyloc+2;\n" + " if(pos < p1_count && P.s3 == 255u) p1_buf_ptr[pos++] = xyloc+3;\n" + " }\n" + " }\n" + ); + } + node->opencl_code += "}\n"; + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_param_discard_mask = 1 << 4; + node->opencl_param_atomic_mask = (ioffset > 1) ? (1 << 1) : 0; + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 3) >> 2) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (height + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following harris sobel filter kernels: +// VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_3x3 +// VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_5x5 +// VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_7x7 +// +int HafGpu_HarrisSobelFilters(AgoNode * node) +{ + int status = VX_SUCCESS; + // configuration + int N = 0; + if (node->akernel->id == VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_3x3) N = 3; + else if (node->akernel->id == VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_5x5) N = 5; + else if (node->akernel->id == VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_7x7) N = 7; + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_HarrisSobelFilters doesn't support kernel %s\n", node->akernel->name); + return -1; + } + int work_group_width = 16; + int work_group_height = 16; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + + // use completely separate kernel + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 7) >> 3) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (height + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + // headers + char item[8192]; + node->opencl_code = + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "typedef float8 F32x8;\n" + ); + + // re-use LinearFilter_ANYx2_U8 for computing GX & GY + sprintf(node->opencl_name, "LinearFilter_ANYx2_U8"); + AgoData filterGX, filterGY; + filterGX.ref.type = VX_TYPE_MATRIX; filterGX.u.mat.type = VX_TYPE_FLOAT32; filterGX.ref.read_only = true; + filterGY.ref.type = VX_TYPE_MATRIX; 
filterGY.u.mat.type = VX_TYPE_FLOAT32; filterGY.ref.read_only = true; + if (N == 3) { + filterGX.u.mat.columns = filterGX.u.mat.rows = 3; filterGX.buffer = (vx_uint8 *)&sobelFilter_3x3_x[0][0]; + filterGY.u.mat.columns = filterGY.u.mat.rows = 3; filterGY.buffer = (vx_uint8 *)&sobelFilter_3x3_y[0][0]; + } + else if (N == 5) { + filterGX.u.mat.columns = filterGX.u.mat.rows = 5; filterGX.buffer = (vx_uint8 *)&sobelFilter_5x5_x[0][0]; + filterGY.u.mat.columns = filterGY.u.mat.rows = 5; filterGY.buffer = (vx_uint8 *)&sobelFilter_5x5_y[0][0]; + } + else if (N == 7) { + filterGX.u.mat.columns = filterGX.u.mat.rows = 7; filterGX.buffer = (vx_uint8 *)&sobelFilter_7x7_x[0][0]; + filterGY.u.mat.columns = filterGY.u.mat.rows = 7; filterGY.buffer = (vx_uint8 *)&sobelFilter_7x7_y[0][0]; + } + status = HafGpu_LinearFilter_ANYx2_U8(node, VX_DF_IMAGE_F32_AMD, &filterGX, &filterGY, false); + if (status) { + return status; + } + + // kernel body + sprintf(item, + OPENCL_FORMAT( + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset, uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset)\n" + "{\n" + " uint x = get_global_id(0) << 3;\n" + " uint y = get_global_id(1);\n" + " __local uchar lbuf[%d];\n" + " F32x8 gx, gy;\n" + " LinearFilter_ANYx2_U8(&gx, &gy, x, y, lbuf, p1_buf + p1_offset, p1_stride); // LinearFilter_ANYx2_U8\n" + " if ((x < %d) && (y < %d)) {\n" // width, height + " p0_buf += p0_offset + y * p0_stride + (x << 2);\n" + " vstore8(gx * gx, 0, (__global float *)&p0_buf[0]);\n" + " vstore8(gx * gy, 0, (__global float *)&p0_buf[%d]);\n" // width * 4 + " vstore8(gy * gy, 0, (__global float *)&p0_buf[%d]);\n" // width * 4 * 2 + " }\n" + "}\n" + ) + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, node->opencl_local_buffer_size_in_bytes, width, height, width * 4, width * 4 * 2); + node->opencl_code += item; + + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_param_discard_mask = 0; + node->opencl_local_buffer_usage_mask = (1 << 1); + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following harris score filter kernels: +// VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_3x3 +// VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_5x5 +// VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_7x7 +// +int HafGpu_HarrisScoreFilters(AgoNode * node) +{ + int status = VX_SUCCESS; + // configuration + int N = 0; + if (node->akernel->id == VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_3x3) N = 3; + else if (node->akernel->id == VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_5x5) N = 5; + else if (node->akernel->id == VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_7x7) N = 7; + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_HarrisScoreFilters doesn't support kernel %s\n", node->akernel->name); + return -1; + } + int work_group_width = 16; + int work_group_height = 16; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + vx_float32 sensitivity = node->paramList[2]->u.scalar.u.f; + vx_float32 strength_threshold = node->paramList[3]->u.scalar.u.f; + vx_int32 gradient_size = node->paramList[4]->u.scalar.u.i; + vx_float32 normFactor = 255.0f * (1 << (gradient_size - 1)) * N; + + // local memory usage + int kO = (N >> 1) & 1; + int LMemSideLR = (N >> 1) + kO; + int LMemStride = (16 * 4 + LMemSideLR * 2) * 4; + int LMemSize = LMemStride * (16 + N 
- 1); + + // kernel declaration + char item[8192]; + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "typedef float8 F32x8;\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset, uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset, float p2, float p3)\n" + "{\n" + " __local uchar lbuf[%d];\n" + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " int gstride = p1_stride;\n" + " p0_buf += p0_offset + (gy * p0_stride) + (gx << 4); p1_buf += p1_offset;\n" + ) + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, LMemSize); + node->opencl_code = item; + + for (int component = 0; component < 3; component++) { + // load component into LDS + if (component == 0) { + sprintf(item, " __global uchar * gbuf = p1_buf; __local uchar * lbuf_ptr; float2 v2;\n"); + } + else { + sprintf(item, + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " gbuf = p1_buf + %d;\n" // width * 4 * component + , width * 4 * component); + } + node->opencl_code += item; + if (HafGpu_Load_Local(work_group_width, work_group_height, LMemStride, 16 + N - 1, LMemSideLR * 4, (N >> 1), node->opencl_code) < 0) { + return -1; + } + // horizontal sum + sprintf(item, + " float4 sum%d;\n" // component + " lbuf_ptr = &lbuf[ly * %d + (lx << 4)];\n" // LMemStride + , component, LMemStride); + node->opencl_code += item; + for (int i = 0; i < 2 + (N >> 1); i++) { + if (kO) { + sprintf(item, " v2 = vload2(0, (__local float *)&lbuf_ptr[%d]);\n", (i * 2 + kO) * 4); + } + else { + sprintf(item, " v2 = *(__local float2 *)&lbuf_ptr[%d];\n", (i * 2) * 4); + } + node->opencl_code += item; + for (int k = i*2; k < i*2+2; k++) { + for (int j = max(k-N+1,0); j <= min(k,3); j++) { + sprintf(item, " sum%d.s%d %c= v2.s%d;\n", component, j, (k == j) ? ' ' : '+', k & 1); + node->opencl_code += item; + } + } + } + sprintf(item, " *(__local float4 *)lbuf_ptr = sum%d;\n", component); + node->opencl_code += item; + sprintf(item, " if (ly < %d) {\n", N - 1); + node->opencl_code += item; + for (int i = 0; i < 2 + (N >> 1); i++) { + if (kO) { + sprintf(item, " v2 = vload2(0, (__local float *)&lbuf_ptr[%d]);\n", LMemStride * work_group_height + (i * 2 + kO) * 4); + } + else { + sprintf(item, " v2 = *(__local float2 *)&lbuf_ptr[%d];\n", LMemStride * work_group_height + (i * 2) * 4); + } + node->opencl_code += item; + for (int k = i * 2; k < i * 2 + 2; k++) { + for (int j = max(k-N+1, 0); j <= min(k, 3); j++) { + sprintf(item, " sum%d.s%d %c= v2.s%d;\n", component, j, (k == j) ? 
' ' : '+', k & 1); + node->opencl_code += item; + } + } + } + sprintf(item, + " *(__local float4 *)&lbuf_ptr[%d] = sum%d;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + , LMemStride * work_group_height, component); + node->opencl_code += item; + // vertical sum + sprintf(item, " sum%d = *(__local float4 *)lbuf_ptr;\n", component); + node->opencl_code += item; + for (int i = 1; i < N; i++) { + sprintf(item, " sum%d += *(__local float4 *)&lbuf_ptr[%d];\n", component, i * LMemStride); + node->opencl_code += item; + } + } + + int border = (gradient_size >> 1) + (N >> 1); + sprintf(item, + OPENCL_FORMAT( + " gx = gx << 2;\n" + " if ((gx < %d) && (gy < %d)) {\n" // width, height + " float4 score = (float4)0.0f;\n" + " if ((gy >= %d) && (gy < %d)) {\n" // border, height - border + " score = sum0 * sum2 - sum1 * sum1;\n" + " sum0 += sum2;\n" + " sum0 *= sum0;\n" + " score = mad(sum0, (float4)-p2, score);\n" + " score *= (float4)%.12ef;\n" // (1/normFactor)^4 + " score = select((float4)0.0f, score, score > (float4)p3);\n" + " score.s0 = select(score.s0, 0.0f, gx < %d);\n" // border + " score.s1 = select(score.s1, 0.0f, gx < %d);\n" // border-1 + " score.s2 = select(score.s2, 0.0f, gx < %d);\n" // border-2 + " score.s3 = select(score.s3, 0.0f, gx < %d);\n" // border-3 + " score.s0 = select(score.s0, 0.0f, gx > %d);\n" // width-1-border + " score.s1 = select(score.s1, 0.0f, gx > %d);\n" // width-2-border + " score.s2 = select(score.s2, 0.0f, gx > %d);\n" // width-3-border + " score.s3 = select(score.s3, 0.0f, gx > %d);\n" // width-4-border + " }\n" + " *(__global float4 *)p0_buf = score;\n" + " }\n" + "}\n" + ), + width, height, border, height - border, (float)(1.0 / (normFactor*normFactor*normFactor*normFactor)), + border, border - 1, border - 2, border - 3, width - 1 - border, width - 2 - border, width - 3 - border, width - 4 - border); + node->opencl_code += item; + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_param_discard_mask = (1 << 4); + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 3) >> 2) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (height + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following non-max supression filter kernels: +// VX_KERNEL_AMD_NON_MAX_SUPP_XY_ANY_3x3 +// +int HafGpu_NonMaxSupp_XY_ANY_3x3(AgoNode * node) +{ + int status = VX_SUCCESS; + // configuration + int work_group_width = 16; + int work_group_height = 16; + int width = node->paramList[1]->u.img.width; + int height = node->paramList[1]->u.img.height; + + // local memory usage + int LMemSideLR = 1 * 4; + int LMemSideTB = 1; + int LMemStride = work_group_width * 2 * 4 + LMemSideLR * 2; + int LMemSize = LMemStride * (work_group_height + 2); + + // kernel declaration + char item[8192]; + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(__global char * p0_buf, uint p0_offset, uint p0_count, uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset)\n" + "{\n" + " 
int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " int gstride = p1_stride;\n" + " __global uchar * gbuf = p1_buf + p1_offset;\n" + " __local uchar lbuf[%d];\n" // LMemSize + ) + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, LMemSize); + node->opencl_code = item; + // load into local + if (HafGpu_Load_Local(work_group_width, work_group_height, LMemStride, work_group_height + 2, LMemSideLR, LMemSideTB, node->opencl_code) < 0) { + return -1; + } + // load pixels from local and perform non-max supression + sprintf(item, + OPENCL_FORMAT( + " __local uchar * lbuf_ptr = lbuf + ly * %d + (lx << 3);\n" // LMemStride + " float4 L0 = vload4(0, (__local float *) lbuf_ptr);\n" + " float4 L1 = vload4(0, (__local float *)&lbuf_ptr[%d]);\n" // LMemStride + " float4 L2 = vload4(0, (__local float *)&lbuf_ptr[%d]);\n" // LMemStride * 2 + " float2 T = L1.s12;\n" + " T.s0 = select(0.0f, T.s0, T.s0 >= L1.s0);\n" + " T.s0 = select(0.0f, T.s0, T.s0 > L1.s2);\n" + " T.s0 = select(0.0f, T.s0, T.s0 >= L0.s0);\n" + " T.s0 = select(0.0f, T.s0, T.s0 >= L0.s1);\n" + " T.s0 = select(0.0f, T.s0, T.s0 >= L0.s2);\n" + " T.s0 = select(0.0f, T.s0, T.s0 > L2.s0);\n" + " T.s0 = select(0.0f, T.s0, T.s0 > L2.s1);\n" + " T.s0 = select(0.0f, T.s0, T.s0 > L2.s2);\n" + " T.s1 = select(0.0f, T.s1, T.s1 >= L1.s1);\n" + " T.s1 = select(0.0f, T.s1, T.s1 > L1.s3);\n" + " T.s1 = select(0.0f, T.s1, T.s1 >= L0.s1);\n" + " T.s1 = select(0.0f, T.s1, T.s1 >= L0.s2);\n" + " T.s1 = select(0.0f, T.s1, T.s1 >= L0.s3);\n" + " T.s1 = select(0.0f, T.s1, T.s1 > L2.s1);\n" + " T.s1 = select(0.0f, T.s1, T.s1 > L2.s2);\n" + " T.s1 = select(0.0f, T.s1, T.s1 > L2.s3);\n" + " T.s0 = select(0.0f, T.s0, gx < %d);\n" // (width+1)/2 + " T.s1 = select(0.0f, T.s1, gx < %d);\n" // width/2 + " T.s0 = select(0.0f, T.s0, gy < %d);\n" // height + " T.s1 = select(0.0f, T.s1, gy < %d);\n" // height + " gx = gx + gx + select(0, 1, T.s1 > 0.0f);\n" + " T.s0 = select(T.s0, T.s1, T.s1 > 0.0f);\n" + " if (T.s0 > 0.0f) {\n" + " uint pos = atomic_inc((__global uint *)p0_buf);\n" + " if(pos < p0_count) {\n" + " *(__global uint2 *)&p0_buf[p0_offset + (pos << 3)] = (uint2)(gx | (gy << 16), as_uint(T.s0));\n" + " }\n" + " }\n" + "}\n" + ) + , LMemStride, LMemStride, LMemStride * 2, (width + 1) / 2, width / 2, height, height); + node->opencl_code += item; + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_param_discard_mask = 0; + node->opencl_param_atomic_mask = (1 << 0); + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 1)/2) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (height + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following half scale gaussian filters: +// VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_3x3 +// VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_5x5 +// +int HafGpu_ScaleGaussianHalf(AgoNode * node) +{ + int status = VX_SUCCESS; + // configuration + int work_group_width = 16; + int work_group_height = 16; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + int N = 0; + if 
(node->akernel->id == VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_3x3) { + N = 3; + } + else if (node->akernel->id == VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_5x5) { + N = 5; + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_ScaleGaussianHalf doesn't support kernel %s\n", node->akernel->name); + return -1; + } + + // local memory usage + int LMemSideLR = ((N >> 1) + 3) & ~3; + int LMemSideTB = (N >> 1); + int LMemStride = work_group_width * 8 + LMemSideLR * 2; + int LMemSize = LMemStride * (work_group_height * 2 - 1 + LMemSideTB * 2); + + // kernel declaration + char item[8192]; + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset, uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint p1_offset)\n" + "{\n" + " __local uchar lbuf[%d];\n" // LMemSize + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " p0_buf += p0_offset + (gy * p0_stride) + (gx << 2);\n" + " int gstride = p1_stride;\n" + " __global uchar * gbuf = p1_buf + p1_offset + (((gy - ly) << 1) + 1) * gstride + ((gx - lx) << 3);\n" + " bool valid = ((gx < %d) && (gy < %d)) ? true : false;\n" // (width+3)/4, height + " gx = lx; gy = ly;\n" + ) + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, LMemSize, (width + 3) / 4, height); + node->opencl_code = item; + // load input image into local + if (HafGpu_Load_Local(work_group_width, work_group_height, LMemStride, work_group_height * 2 - 1 + LMemSideTB * 2, LMemSideLR, LMemSideTB, node->opencl_code) < 0) { + return -1; + } + // perform filtering + if (N == 3) { + sprintf(item, + OPENCL_FORMAT( + " __local uchar * lbuf_ptr = lbuf + ly * %d + (lx << 3);\n" // LMemStride * 2 + " uint3 L0 = vload3(0, (__local uint *)&lbuf_ptr[4]);\n" + " uint3 L1 = vload3(0, (__local uint *)&lbuf_ptr[%d+4]);\n" // LMemStride + " uint3 L2 = vload3(0, (__local uint *)&lbuf_ptr[%d+4]);\n" // LMemStride * 2 + " float4 sum; float v;\n" + " v = amd_unpack0(L0.s0); v = mad(amd_unpack0(L1.s0), 2.0f, v); v += amd_unpack0(L2.s0); sum.s0 = v;\n" + " v = amd_unpack1(L0.s0); v = mad(amd_unpack1(L1.s0), 2.0f, v); v += amd_unpack1(L2.s0); sum.s0 = mad(v, 2.0f, sum.s0);\n" + " v = amd_unpack2(L0.s0); v = mad(amd_unpack2(L1.s0), 2.0f, v); v += amd_unpack2(L2.s0); sum.s1 = v; sum.s0 += v;\n" + " v = amd_unpack3(L0.s0); v = mad(amd_unpack3(L1.s0), 2.0f, v); v += amd_unpack3(L2.s0); sum.s1 = mad(v, 2.0f, sum.s1);\n" + " v = amd_unpack0(L0.s1); v = mad(amd_unpack0(L1.s1), 2.0f, v); v += amd_unpack0(L2.s1); sum.s2 = v; sum.s1 += v;\n" + " v = amd_unpack1(L0.s1); v = mad(amd_unpack1(L1.s1), 2.0f, v); v += amd_unpack1(L2.s1); sum.s2 = mad(v, 2.0f, sum.s2);\n" + " v = amd_unpack2(L0.s1); v = mad(amd_unpack2(L1.s1), 2.0f, v); v += amd_unpack2(L2.s1); sum.s3 = v; sum.s2 += v;\n" + " v = amd_unpack3(L0.s1); v = mad(amd_unpack3(L1.s1), 2.0f, v); v += amd_unpack3(L2.s1); sum.s3 = mad(v, 2.0f, sum.s3);\n" + " v = amd_unpack0(L0.s2); v = mad(amd_unpack0(L1.s2), 2.0f, v); v += amd_unpack0(L2.s2); sum.s3 += v;\n" + " sum = sum * (float4)0.0625f;\n" + " if (valid) {;\n" + " *(__global uint *)p0_buf = amd_pack(sum);\n" + " }\n" + "}\n" + ) + , LMemStride * 2, LMemStride, LMemStride * 2); + node->opencl_code += item; + } + else if (N == 5) { + 
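+		/* The OpenCL source emitted below for the 5x5 case implements a separable
+		   1-4-6-4-1 binomial filter: each work-item first forms horizontal sums
+		   (two 16-bit sums packed per uint in LDS), then after the barrier combines
+		   five consecutive row sums vertically with the same weights. The final
+		   0.00390625f scale is 1/256 = 1/(16*16), the sum of the 5x5 binomial
+		   weights, and input rows and columns are sampled at every other position
+		   to produce the half-scale output. */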
sprintf(item, + OPENCL_FORMAT( + " __local uchar * lbuf_ptr = lbuf + ly * %d + (lx << 3);\n" // LMemStride + " float4 sum; float v;\n" + " uint4 L0 = vload4(0, (__local uint *) lbuf_ptr);\n" + " v = amd_unpack3(L0.s0); sum.s0 = v;\n" + " v = amd_unpack0(L0.s1); sum.s0 = mad(v, 4.0f, sum.s0);\n" + " v = amd_unpack1(L0.s1); sum.s0 = mad(v, 6.0f, sum.s0); sum.s1 = v;\n" + " v = amd_unpack2(L0.s1); sum.s0 = mad(v, 4.0f, sum.s0); sum.s1 = mad(v, 4.0f, sum.s1);\n" + " v = amd_unpack3(L0.s1); sum.s0 += v; sum.s1 = mad(v, 6.0f, sum.s1); sum.s2 = v;\n" + " v = amd_unpack0(L0.s2); sum.s1 = mad(v, 4.0f, sum.s1); sum.s2 = mad(v, 4.0f, sum.s2);\n" + " v = amd_unpack1(L0.s2); sum.s1 += v; sum.s2 = mad(v, 6.0f, sum.s2); sum.s3 = v;\n" + " v = amd_unpack2(L0.s2); sum.s2 = mad(v, 4.0f, sum.s2); sum.s3 = mad(v, 4.0f, sum.s3);\n" + " v = amd_unpack3(L0.s2); sum.s2 += v; sum.s3 = mad(v, 6.0f, sum.s3);\n" + " v = amd_unpack0(L0.s3); sum.s3 = mad(v, 4.0f, sum.s3);\n" + " v = amd_unpack1(L0.s3); sum.s3 += v;\n" + " L0.s0 = (uint)sum.s0 + (((uint)sum.s1) << 16);\n" + " L0.s1 = (uint)sum.s2 + (((uint)sum.s3) << 16);\n" + " *(__local uint2 *)lbuf_ptr = L0.s01;\n" + " L0 = vload4(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride*16 + " v = amd_unpack3(L0.s0); sum.s0 = v;\n" + " v = amd_unpack0(L0.s1); sum.s0 = mad(v, 4.0f, sum.s0);\n" + " v = amd_unpack1(L0.s1); sum.s0 = mad(v, 6.0f, sum.s0); sum.s1 = v;\n" + " v = amd_unpack2(L0.s1); sum.s0 = mad(v, 4.0f, sum.s0); sum.s1 = mad(v, 4.0f, sum.s1);\n" + " v = amd_unpack3(L0.s1); sum.s0 += v; sum.s1 = mad(v, 6.0f, sum.s1); sum.s2 = v;\n" + " v = amd_unpack0(L0.s2); sum.s1 = mad(v, 4.0f, sum.s1); sum.s2 = mad(v, 4.0f, sum.s2);\n" + " v = amd_unpack1(L0.s2); sum.s1 += v; sum.s2 = mad(v, 6.0f, sum.s2); sum.s3 = v;\n" + " v = amd_unpack2(L0.s2); sum.s2 = mad(v, 4.0f, sum.s2); sum.s3 = mad(v, 4.0f, sum.s3);\n" + " v = amd_unpack3(L0.s2); sum.s2 += v; sum.s3 = mad(v, 6.0f, sum.s3);\n" + " v = amd_unpack0(L0.s3); sum.s3 = mad(v, 4.0f, sum.s3);\n" + " v = amd_unpack1(L0.s3); sum.s3 += v;\n" + " L0.s0 = (uint)sum.s0 + (((uint)sum.s1) << 16);\n" + " L0.s1 = (uint)sum.s2 + (((uint)sum.s3) << 16);\n" + " *(__local uint2 *)&lbuf_ptr[%d] = L0.s01;\n" // LMemStride*16 + " if (ly < 3) {\n" + " L0 = vload4(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride*32 + " v = amd_unpack3(L0.s0); sum.s0 = v;\n" + " v = amd_unpack0(L0.s1); sum.s0 = mad(v, 4.0f, sum.s0);\n" + " v = amd_unpack1(L0.s1); sum.s0 = mad(v, 6.0f, sum.s0); sum.s1 = v;\n" + " v = amd_unpack2(L0.s1); sum.s0 = mad(v, 4.0f, sum.s0); sum.s1 = mad(v, 4.0f, sum.s1);\n" + " v = amd_unpack3(L0.s1); sum.s0 += v; sum.s1 = mad(v, 6.0f, sum.s1); sum.s2 = v;\n" + " v = amd_unpack0(L0.s2); sum.s1 = mad(v, 4.0f, sum.s1); sum.s2 = mad(v, 4.0f, sum.s2);\n" + " v = amd_unpack1(L0.s2); sum.s1 += v; sum.s2 = mad(v, 6.0f, sum.s2); sum.s3 = v;\n" + " v = amd_unpack2(L0.s2); sum.s2 = mad(v, 4.0f, sum.s2); sum.s3 = mad(v, 4.0f, sum.s3);\n" + " v = amd_unpack3(L0.s2); sum.s2 += v; sum.s3 = mad(v, 6.0f, sum.s3);\n" + " v = amd_unpack0(L0.s3); sum.s3 = mad(v, 4.0f, sum.s3);\n" + " v = amd_unpack1(L0.s3); sum.s3 += v;\n" + " L0.s0 = (uint)sum.s0 + (((uint)sum.s1) << 16);\n" + " L0.s1 = (uint)sum.s2 + (((uint)sum.s3) << 16);\n" + " *(__local uint2 *)&lbuf_ptr[%d] = L0.s01;\n" // LMemStride*32 + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " lbuf_ptr += ly * %d;\n" // LMemStride + " L0.s01 = vload2(0, (__local uint *) lbuf_ptr);\n" + " sum.s0 = (float)(L0.s0 & 0xffff); sum.s1 = (float)(L0.s0 >> 16); sum.s2 = (float)(L0.s1 & 0xffff); sum.s3 = 
(float)(L0.s1 >> 16);\n" + " L0.s01 = vload2(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride + " sum.s0 = mad((float)(L0.s0 & 0xffff), 4.0f, sum.s0); sum.s1 = mad((float)(L0.s0 >> 16), 4.0f, sum.s1); sum.s2 = mad((float)(L0.s1 & 0xffff), 4.0f, sum.s2); sum.s3 = mad((float)(L0.s1 >> 16), 4.0f, sum.s3);\n" + " L0.s01 = vload2(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride * 2 + " sum.s0 = mad((float)(L0.s0 & 0xffff), 6.0f, sum.s0); sum.s1 = mad((float)(L0.s0 >> 16), 6.0f, sum.s1); sum.s2 = mad((float)(L0.s1 & 0xffff), 6.0f, sum.s2); sum.s3 = mad((float)(L0.s1 >> 16), 6.0f, sum.s3);\n" + " L0.s01 = vload2(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride * 3 + " sum.s0 = mad((float)(L0.s0 & 0xffff), 4.0f, sum.s0); sum.s1 = mad((float)(L0.s0 >> 16), 4.0f, sum.s1); sum.s2 = mad((float)(L0.s1 & 0xffff), 4.0f, sum.s2); sum.s3 = mad((float)(L0.s1 >> 16), 4.0f, sum.s3);\n" + " L0.s01 = vload2(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride * 4 + " sum.s0 += (float)(L0.s0 & 0xffff); sum.s1 += (float)(L0.s0 >> 16); sum.s2 += (float)(L0.s1 & 0xffff); sum.s3 += (float)(L0.s1 >> 16);\n" + " sum = sum * (float4)0.00390625f;\n" + " if (valid) {;\n" + " *(__global uint *)p0_buf = amd_pack(sum);\n" + " }\n" + "}\n" + ) + , LMemStride, LMemStride * 16, LMemStride * 16, LMemStride * 32, LMemStride * 32, LMemStride, LMemStride, LMemStride * 2, LMemStride * 3, LMemStride * 4); + node->opencl_code += item; + } + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_param_discard_mask = 0; + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 3) >> 2) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (height + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; + node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Generate OpenCL code for the following gaussian scale filters: +// VX_KERNEL_AMD_SCALE_GAUSSIAN_ORB_U8_U8_5x5 (interpolation = VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) +// +int HafGpu_ScaleGaussianOrb(AgoNode * node, vx_interpolation_type_e interpolation) +{ + int status = VX_SUCCESS; + // configuration + int work_group_width = 16; + int work_group_height = 16; + int width = node->paramList[0]->u.img.width; + int height = node->paramList[0]->u.img.height; + float xscale = (float)node->paramList[1]->u.img.width / (float)width, xoffset = xscale * 0.5f; + float yscale = (float)node->paramList[1]->u.img.height / (float)height, yoffset = yscale * 0.5f; + int N = 0; + if (node->akernel->id == VX_KERNEL_AMD_SCALE_GAUSSIAN_ORB_U8_U8_5x5) { + N = 5; + } + else { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: HafGpu_ScaleGaussian doesn't support kernel %s\n", node->akernel->name); + return -1; + } + + // local memory usage + int LMemStride = 128; + int LMemHeight = 19 + N - 1; + int LMemSize = LMemStride * LMemHeight; + + // kernel declaration + char item[8192]; + sprintf(item, + OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\n" + "void %s(uint p0_width, uint p0_height, __global uchar * p0_buf, uint p0_stride, uint p0_offset, uint p1_width, uint p1_height, __global uchar * p1_buf, uint p1_stride, uint 
p1_offset)\n" + "{\n" + " __local uchar lbuf[%d];\n" // LMemSize + " int lx = get_local_id(0);\n" + " int ly = get_local_id(1);\n" + " int gx = get_global_id(0);\n" + " int gy = get_global_id(1);\n" + " bool outputValid = ((gx < %d) && (gy < %d)) ? true : false;\n" // (width+3)/4, height + " p0_buf += p0_offset + (gy * p0_stride) + (gx << 2);\n" + " int gstride = p1_stride;\n" + " float fx = mad((float)(gx - lx), %.12ef, %.12ef);\n" // xscale * 4, xoffset + " float fy = mad((float)(gy - ly), %.12ef, %.12ef);\n" // yscale, yoffset + " gx = (uint)fx; fx -= (float)gx;\n" + " gy = (uint)fy; fy -= (float)gy;\n" + " gx = gx - 2 + 4;\n" + " gy = gy - 2;\n" + " uint lxalign = gx & 3;\n" + " __global uchar * gbuf = p1_buf + p1_offset + (gx & ~3) + gy * gstride;\n" + " gx = lx; gy = ly;\n" + ) + , work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME, LMemSize, (width + 3) / 4, height, xscale * 4.0f, xoffset, yscale, yoffset); + node->opencl_code = item; + // load input image into local + if (HafGpu_Load_Local(work_group_width, work_group_height, LMemStride, LMemHeight, 4, 0, node->opencl_code) < 0) { + return -1; + } + // perform filtering + if (N == 5) { + sprintf(item, + OPENCL_FORMAT( + " __local uchar * lbuf_ptr = lbuf + ly * %d;\n" // LMemStride + " float flx = mad((float)(lx << 2), %.12ef, fx + (float)lxalign);\n" // xscale + " uint2 L0, isum; float fsum; uint ilx;\n" + " ilx = (uint)flx; L0 = vload2(0, (__local uint *)&lbuf_ptr[ilx & ~3]); L0.s0 = amd_bytealign(L0.s1, L0.s0, ilx); L0.s1 = amd_bytealign(L0.s1, L0.s1, ilx);\n" + " fsum = amd_unpack0(L0.s0); fsum = mad(amd_unpack1(L0.s0), 4.0f, fsum); fsum = mad(amd_unpack2(L0.s0), 6.0f, fsum); fsum = mad(amd_unpack3(L0.s0), 4.0f, fsum); fsum += amd_unpack0(L0.s1);\n" + " isum.s0 = (uint)fsum;\n" + " ilx = (uint)(flx + %.12ef); L0 = vload2(0, (__local uint *)&lbuf_ptr[ilx & ~3]); L0.s0 = amd_bytealign(L0.s1, L0.s0, ilx); L0.s1 = amd_bytealign(L0.s1, L0.s1, ilx);\n" // xscale + " fsum = amd_unpack0(L0.s0); fsum = mad(amd_unpack1(L0.s0), 4.0f, fsum); fsum = mad(amd_unpack2(L0.s0), 6.0f, fsum); fsum = mad(amd_unpack3(L0.s0), 4.0f, fsum); fsum += amd_unpack0(L0.s1);\n" + " isum.s0 |= (((uint)fsum) << 16);\n" + " ilx = (uint)(flx + %.12ef); L0 = vload2(0, (__local uint *)&lbuf_ptr[ilx & ~3]); L0.s0 = amd_bytealign(L0.s1, L0.s0, ilx); L0.s1 = amd_bytealign(L0.s1, L0.s1, ilx);\n" // xscale * 2 + " fsum = amd_unpack0(L0.s0); fsum = mad(amd_unpack1(L0.s0), 4.0f, fsum); fsum = mad(amd_unpack2(L0.s0), 6.0f, fsum); fsum = mad(amd_unpack3(L0.s0), 4.0f, fsum); fsum += amd_unpack0(L0.s1);\n" + " isum.s1 = (uint)fsum;\n" + " ilx = (uint)(flx + %.12ef); L0 = vload2(0, (__local uint *)&lbuf_ptr[ilx & ~3]); L0.s0 = amd_bytealign(L0.s1, L0.s0, ilx); L0.s1 = amd_bytealign(L0.s1, L0.s1, ilx);\n" // xscale * 3 + " fsum = amd_unpack0(L0.s0); fsum = mad(amd_unpack1(L0.s0), 4.0f, fsum); fsum = mad(amd_unpack2(L0.s0), 6.0f, fsum); fsum = mad(amd_unpack3(L0.s0), 4.0f, fsum); fsum += amd_unpack0(L0.s1);\n" + " isum.s1 |= (((uint)fsum) << 16);\n" + " ((__local uint2 *)lbuf_ptr)[lx] = isum;\n" + " if (ly < 7) {\n" + " lbuf_ptr += %d;\n" // LMemStride * 16 + " ilx = (uint)flx; L0 = vload2(0, (__local uint *)&lbuf_ptr[ilx & ~3]); L0.s0 = amd_bytealign(L0.s1, L0.s0, ilx); L0.s1 = amd_bytealign(L0.s1, L0.s1, ilx);\n" + " fsum = amd_unpack0(L0.s0); fsum = mad(amd_unpack1(L0.s0), 4.0f, fsum); fsum = mad(amd_unpack2(L0.s0), 6.0f, fsum); fsum = mad(amd_unpack3(L0.s0), 4.0f, fsum); fsum += amd_unpack0(L0.s1);\n" + " isum.s0 = (uint)fsum;\n" + " ilx = (uint)(flx + %.12ef); L0 = 
vload2(0, (__local uint *)&lbuf_ptr[ilx & ~3]); L0.s0 = amd_bytealign(L0.s1, L0.s0, ilx); L0.s1 = amd_bytealign(L0.s1, L0.s1, ilx);\n" // xscale + " fsum = amd_unpack0(L0.s0); fsum = mad(amd_unpack1(L0.s0), 4.0f, fsum); fsum = mad(amd_unpack2(L0.s0), 6.0f, fsum); fsum = mad(amd_unpack3(L0.s0), 4.0f, fsum); fsum += amd_unpack0(L0.s1);\n" + " isum.s0 |= (((uint)fsum) << 16);\n" + " ilx = (uint)(flx + %.12ef); L0 = vload2(0, (__local uint *)&lbuf_ptr[ilx & ~3]); L0.s0 = amd_bytealign(L0.s1, L0.s0, ilx); L0.s1 = amd_bytealign(L0.s1, L0.s1, ilx);\n" // xscale * 2 + " fsum = amd_unpack0(L0.s0); fsum = mad(amd_unpack1(L0.s0), 4.0f, fsum); fsum = mad(amd_unpack2(L0.s0), 6.0f, fsum); fsum = mad(amd_unpack3(L0.s0), 4.0f, fsum); fsum += amd_unpack0(L0.s1);\n" + " isum.s1 = (uint)fsum;\n" + " ilx = (uint)(flx + %.12ef); L0 = vload2(0, (__local uint *)&lbuf_ptr[ilx & ~3]); L0.s0 = amd_bytealign(L0.s1, L0.s0, ilx); L0.s1 = amd_bytealign(L0.s1, L0.s1, ilx);\n" // xscale * 3 + " fsum = amd_unpack0(L0.s0); fsum = mad(amd_unpack1(L0.s0), 4.0f, fsum); fsum = mad(amd_unpack2(L0.s0), 6.0f, fsum); fsum = mad(amd_unpack3(L0.s0), 4.0f, fsum); fsum += amd_unpack0(L0.s1);\n" + " isum.s1 |= (((uint)fsum) << 16);\n" + " ((__local uint2 *)lbuf_ptr)[lx] = isum;\n" + " lbuf_ptr -= %d;\n" // LMemStride * 16 + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " float fly = fy + (float)ly * %.12ef; float4 sum;\n" // yscale + " lbuf_ptr = lbuf + (uint)fly * %d + (lx << 3);\n" // LMemStride + " L0 = vload2(0, (__local uint *) lbuf_ptr);\n" + " sum.s0 = (float)(L0.s0 & 0xffff); sum.s1 = (float)(L0.s0 >> 16); sum.s2 = (float)(L0.s1 & 0xffff); sum.s3 = (float)(L0.s1 >> 16);\n" + " L0 = vload2(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride + " sum.s0 = mad((float)(L0.s0 & 0xffff), 4.0f, sum.s0); sum.s1 = mad((float)(L0.s0 >> 16), 4.0f, sum.s1); sum.s2 = mad((float)(L0.s1 & 0xffff), 4.0f, sum.s2); sum.s3 = mad((float)(L0.s1 >> 16), 4.0f, sum.s3);\n" + " L0 = vload2(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride * 2 + " sum.s0 = mad((float)(L0.s0 & 0xffff), 6.0f, sum.s0); sum.s1 = mad((float)(L0.s0 >> 16), 6.0f, sum.s1); sum.s2 = mad((float)(L0.s1 & 0xffff), 6.0f, sum.s2); sum.s3 = mad((float)(L0.s1 >> 16), 6.0f, sum.s3);\n" + " L0 = vload2(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride * 3 + " sum.s0 = mad((float)(L0.s0 & 0xffff), 4.0f, sum.s0); sum.s1 = mad((float)(L0.s0 >> 16), 4.0f, sum.s1); sum.s2 = mad((float)(L0.s1 & 0xffff), 4.0f, sum.s2); sum.s3 = mad((float)(L0.s1 >> 16), 4.0f, sum.s3);\n" + " L0 = vload2(0, (__local uint *)&lbuf_ptr[%d]);\n" // LMemStride * 4 + " sum.s0 += (float)(L0.s0 & 0xffff); sum.s1 += (float)(L0.s0 >> 16); sum.s2 += (float)(L0.s1 & 0xffff); sum.s3 += (float)(L0.s1 >> 16);\n" + " sum = sum * (float4)0.00390625f;\n" + " if (outputValid) {;\n" + " *(__global uint *)p0_buf = amd_pack(sum);\n" + " }\n" + "}\n" + ) + , LMemStride, xscale, xscale, xscale * 2.0f, xscale * 3.0f, + LMemStride * 16, xscale, xscale * 2.0f, xscale * 3.0f, LMemStride * 16, + yscale, LMemStride, LMemStride, LMemStride * 2, LMemStride * 3, LMemStride * 4); + node->opencl_code += item; + } + + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_param_discard_mask = 0; + node->opencl_work_dim = 2; + node->opencl_global_work[0] = (((width + 3) >> 2) + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (height + work_group_height - 1) & ~(work_group_height - 1); + node->opencl_global_work[2] = 0; + node->opencl_local_work[0] = work_group_width; 
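+	/* Launch configuration: each work-item stores one packed uint (4 U8 output
+	   pixels), so the global size covers (width + 3) / 4 items horizontally and
+	   height items vertically, each rounded up to a multiple of the 16x16
+	   work-group size with (x + wg - 1) & ~(wg - 1), which is valid because the
+	   work-group dimensions are powers of two. */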
+ node->opencl_local_work[1] = work_group_height; + node->opencl_local_work[2] = 0; + + return status; +} + +#endif diff --git a/openvx/ago/ago_interface.cpp b/openvx/ago/ago_interface.cpp new file mode 100644 index 0000000..430c600 --- /dev/null +++ b/openvx/ago/ago_interface.cpp @@ -0,0 +1,2050 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +static DWORD WINAPI agoGraphThreadFunction(LPVOID graph_) +{ + AgoGraph * graph = (AgoGraph *)graph_; + while (WaitForSingleObject(graph->hSemToThread, INFINITE) == WAIT_OBJECT_0) { + if (graph->threadThreadTerminationState) + break; + + // execute graph + graph->status = agoProcessGraph(graph); + + // inform caller + graph->threadExecuteCount++; + ReleaseSemaphore(graph->hSemFromThread, 1, nullptr); + } + // inform caller about termination + graph->threadThreadTerminationState = 2; + ReleaseSemaphore(graph->hSemFromThread, 1, nullptr); + return 0; +} + +AgoContext * agoCreateContext() +{ + CAgoLockGlobalContext lock; + + // check if CPU hardware supports + bool isHardwareSupported = agoIsCpuHardwareSupported(); + if (!isHardwareSupported) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: Unsupported CPU\n"); + return NULL; + } + + // create context and initialize + AgoContext * acontext = new AgoContext; + if (agoPublishKernels(acontext)) { + delete acontext; + acontext = NULL; + } + if (acontext) { + agoResetReference(&acontext->ref, VX_TYPE_CONTEXT, acontext, NULL); + acontext->ref.external_count++; + // initialize thread config + char textBuffer[1024]; + if (agoGetEnvironmentVariable("AGO_THREAD_CONFIG", textBuffer, sizeof(textBuffer))) { + acontext->thread_config = atoi(textBuffer); + } + } + return (AgoContext *)acontext; +} + +int agoReleaseContext(AgoContext * acontext) +{ + CAgoLockGlobalContext lock; + + if (!agoIsValidContext(acontext)) + return -1; + + EnterCriticalSection(&acontext->cs); + acontext->ref.external_count--; + if (acontext->ref.external_count == 0) { + // release all the resources + LeaveCriticalSection(&acontext->cs); + delete acontext; + } + else { + LeaveCriticalSection(&acontext->cs); + } + return 0; +} + +AgoGraph * agoCreateGraph(AgoContext * acontext) +{ + AgoGraph * agraph = new AgoGraph; + if (!agraph) { + return nullptr; + } + + // initialize + agoResetReference(&agraph->ref, VX_TYPE_GRAPH, acontext, NULL); + agraph->attr_affinity = acontext->attr_affinity; + char textBuffer[256]; + if 
(agoGetEnvironmentVariable("VX_GRAPH_ATTRIBUTE_AMD_OPTIMIZER_FLAGS", textBuffer, sizeof(textBuffer))) { + if (sscanf(textBuffer, "%i", &agraph->optimizer_flags) == 1) { + agoAddLogEntry(&agraph->ref, VX_SUCCESS, "DEBUG: VX_GRAPH_ATTRIBUTE_AMD_OPTIMIZER_FLAGS = 0x%08x\n", agraph->optimizer_flags); + } + } + + { // link graph to the context + CAgoLock lock(acontext->cs); + agoAddGraph(&acontext->graphList, agraph); + agraph->ref.external_count++; + } + + if (acontext->thread_config & 1) { + // create semaphore and thread for graph scheduling: limit 1000 pending requests + agraph->hSemToThread = CreateSemaphore(nullptr, 0, 1000, nullptr); + agraph->hSemFromThread = CreateSemaphore(nullptr, 0, 1000, nullptr); + if (agraph->hSemToThread == NULL || agraph->hSemFromThread == NULL) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: CreateSemaphore() failed\n"); + agoReleaseGraph(agraph); + return nullptr; + } + agraph->hThread = CreateThread(NULL, 0, agoGraphThreadFunction, agraph, 0, NULL); +#if _WIN32 // TBD: need to enable this check for non-windows platforms + if (agraph->hThread == NULL) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: CreateThread() failed\n"); + agoReleaseGraph(agraph); + return nullptr; + } +#if _DEBUG + agoAddLogEntry(&agraph->ref, VX_SUCCESS, "OK: enabled graph scheduling in separate threads\n"); +#endif +#endif + } + + return (AgoGraph *)agraph; +} + +int agoReleaseGraph(AgoGraph * agraph) +{ + CAgoLock lock(agraph->ref.context->cs); + + int status = 0; + agraph->ref.external_count--; + if (agraph->ref.external_count == 0) { + EnterCriticalSection(&agraph->cs); + // stop graph thread + if (agraph->hThread) { + if (agraph->hThread) { + agraph->threadThreadTerminationState = 1; + ReleaseSemaphore(agraph->hSemToThread, 1, nullptr); + while (agraph->threadThreadTerminationState == 1) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + CloseHandle(agraph->hThread); + } + if (agraph->hSemToThread) { + CloseHandle(agraph->hSemToThread); + } + if (agraph->hSemFromThread) { + CloseHandle(agraph->hSemFromThread); + } + } + // deinitialize the graph + for (AgoNode * node = agraph->nodeList.head; node; node = node->next) + { + status = agoShutdownNode(node); + if (status) { + break; + } + } + if (!status) { + // remove graph from context + if (agoRemoveGraph(&agraph->ref.context->graphList, agraph) != agraph) { + status = -1; + LeaveCriticalSection(&agraph->cs); + } + else { +#if ENABLE_OPENCL + // Releasing the command queue for the graph because it is not needed + agoGpuOclReleaseGraph(agraph); +#endif + LeaveCriticalSection(&agraph->cs); + // move graph to garbage list + agraph->next = agraph->ref.context->graph_garbage_list; + agraph->ref.context->graph_garbage_list = agraph; + } + } + else { + LeaveCriticalSection(&agraph->cs); + } + } + + return status; +} + +int agoOptimizeGraph(AgoGraph * agraph) +{ + if (!agraph->status) { + CAgoLock lock(agraph->cs); + CAgoLock lock2(agraph->ref.context->cs); + + // run DRAMA graph optimizer + agraph->status = agoOptimizeDrama(agraph); + } + + return agraph->status; +} + +int agoWriteGraph(AgoGraph * agraph, AgoReference * * ref, int num_ref, FILE * fp, const char * comment) +{ + CAgoLock lock(agraph->cs); + CAgoLock lock2(agraph->ref.context->cs); + +#if ENABLE_DEBUG_MESSAGES + agoOptimizeDramaMarkDataUsage(agraph); +#endif + + bool * imported = new bool[agraph->ref.context->num_active_modules + 1]; + for (vx_uint32 i = 0; i < agraph->ref.context->num_active_modules; i++) + imported[i] = false; + fprintf(fp, "# ago 
graph dump BEGIN [%s]\n", comment ? comment : ""); + for (auto aus = agraph->ref.context->userStructList.begin(); aus != agraph->ref.context->userStructList.end(); aus++) { + if (aus->importing_module_index_plus1) { + if (!imported[aus->importing_module_index_plus1 - 1]) { + fprintf(fp, "import %s\n", agraph->ref.context->modules[aus->importing_module_index_plus1 - 1].module_name); + imported[aus->importing_module_index_plus1 - 1] = true; + } + } + else { + if (!aus->name.length()) { + vx_char name[64]; + sprintf(name, "AUTO-USER-STRUCT!%03d!", aus->id - VX_TYPE_USER_STRUCT_START + 1); + aus->name = name; + } + fprintf(fp, "type %s userstruct:" VX_FMT_SIZE "\n", aus->name.c_str(), aus->size); + } + } + for (AgoKernel * akernel = agraph->ref.context->kernelList.head; akernel; akernel = akernel->next) { + if (akernel->flags & AGO_KERNEL_FLAG_GROUP_USER) { + if (akernel->importing_module_index_plus1) { + if (!imported[akernel->importing_module_index_plus1 - 1]) { + fprintf(fp, "import %s\n", agraph->ref.context->modules[akernel->importing_module_index_plus1 - 1].module_name); + imported[akernel->importing_module_index_plus1 - 1] = true; + } + } + } + } + for (AgoData * adata = agraph->ref.context->dataList.head; adata; adata = adata->next) { + // check if data is part of specified ref[] arguments + int index = -1; + for (int i = 0; i < num_ref; i++) { + if (adata == (AgoData *)ref[i]) { + index = i; + break; + } + } + // output data statements for non ref[] and non internal generated data objects + if (index < 0 && adata->name.length() > 0 && adata->name[0] != '!' && !adata->parent) { + char desc[1024] = "*ERROR*"; + agoGetDescriptionFromData(agraph->ref.context, desc, adata); + fprintf(fp, "data %s = %s", adata->name.length() ? adata->name.c_str() : "*UNKNOWN*", desc); +#if ENABLE_DEBUG_MESSAGES + if (adata->inputUsageCount | adata->outputUsageCount | adata->inoutUsageCount) + fprintf(fp, " #usageCount[%d,%d,%d]", adata->inputUsageCount, adata->outputUsageCount, adata->inoutUsageCount); +#endif + fprintf(fp, "\n"); + } + } + for (AgoData * adata = agraph->dataList.head; adata; adata = adata->next) { + // check if data is part of specified ref[] arguments + int index = -1; + for (int i = 0; i < num_ref; i++) { + if (adata == (AgoData *)ref[i]) { + index = i; + break; + } + } + // output data statements for non ref[] and non internal generated data objects + if (index < 0 && adata->name.length() > 0 && adata->name[0] != '!' && !adata->parent) { + char desc[1024] = "*ERROR*"; + agoGetDescriptionFromData(agraph->ref.context, desc, adata); + fprintf(fp, "data %s = %s", adata->name.length() ? 
adata->name.c_str() : "*UNKNOWN*", desc); +#if ENABLE_DEBUG_MESSAGES + if (adata->inputUsageCount | adata->outputUsageCount | adata->inoutUsageCount) + fprintf(fp, " #usageCount[%d,%d,%d]", adata->inputUsageCount, adata->outputUsageCount, adata->inoutUsageCount); + fprintf(fp, " #(virtual)"); +#endif + fprintf(fp, "\n"); + } + } + for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) { + fprintf(fp, "node %s", anode->akernel->name); + vx_uint32 paramCount = anode->paramCount; + while (paramCount > 0 && !anode->paramList[paramCount - 1]) + paramCount--; + for (vx_uint32 i = 0; i < paramCount; i++) { + AgoData * data = anode->paramList[i]; + if (!data) { + fprintf(fp, " null"); + } + else { + // check if data is part of specified ref[] arguments, if so use $1..$N in output + int index = -1; + for (int i = 0; i < num_ref; i++) { + if (data == (AgoData *)ref[i]) { + index = i; + break; + } + } + if (index >= 0) { + fprintf(fp, " $%d", index + 1); + } + else { + char name[1024]; + agoGetDataName(name, data); + if (name[0]) { + fprintf(fp, " %s", name); + } + else { + char desc[1024]; + agoGetDescriptionFromData(agraph->ref.context, desc, data); + fprintf(fp, " %s", desc); + } + } + } + } + if (anode->attr_border_mode.mode == VX_BORDER_MODE_REPLICATE) fprintf(fp, " attr:BORDER_MODE:REPLICATE"); + else if (anode->attr_border_mode.mode == VX_BORDER_MODE_CONSTANT) fprintf(fp, " attr:BORDER_MODE:CONSTANT,0x%08x", anode->attr_border_mode.constant_value); + if (anode->attr_affinity.device_type) { + fprintf(fp, " attr:AFFINITY:%s", (anode->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU) ? "GPU" : "CPU"); + if (anode->attr_affinity.device_info) + fprintf(fp, "%d", anode->attr_affinity.device_info); + if (anode->attr_affinity.group) + fprintf(fp, ",%d", anode->attr_affinity.group); + } +#if _DEBUG || ENABLE_DEBUG_MESSAGES + fprintf(fp, " #L%d", anode->hierarchical_level); +#endif + fprintf(fp, "\n"); + } + fprintf(fp, "# ago graph dump END [%s]\n", comment ? 
comment : ""); + fflush(fp); + delete[] imported; + + return 0; +} + +static const char * agoReadLine(char * line, int size, const char * str) +{ + if (!str || !*str) + return NULL; + line[0] = 0; size -= 2; + for (int i = 0; i < size; i++) { + char c = line[i] = *str++; + if (c == 0) { + str--; + break; + } + else if (c == '\n') { + line[i + 1] = 0; + break; + } + } + return str; +} + +static void agoUpdateLine(char * line, std::vector< std::pair< std::string, std::string > >& vars, std::string localPrefix) +{ + char lineOriginal[2048]; strcpy(lineOriginal, line); + int ki = 0; + for (int i = 0; lineOriginal[i]; i++, ki++) { + line[ki] = lineOriginal[i]; + if (lineOriginal[i] == '$' && lineOriginal[i + 1] >= 'A' && lineOriginal[i + 1] <= 'Z') { + // get variable name + char * s = &lineOriginal[i + 1]; + int k = 1; + for (; (s[k] >= 'A' && s[k] <= 'Z') || (s[k] >= 'a' && s[k] <= 'z') || (s[k] >= '0' && s[k] <= '9') || s[k] == '_'; k++) + ; + // search variable name + for (std::vector< std::pair< std::string, std::string > >::iterator it = vars.begin(); it != vars.end(); ++it) { + if (!strncmp(it->first.c_str(), s, k)) { + strcpy(&line[ki], it->second.c_str()); + ki = (int)strlen(line) - 1; + i += k; + break; + } + } + } + else if (lineOriginal[i] == '$' && lineOriginal[i + 1] == '!') { + strcpy(&line[ki], localPrefix.c_str()); + ki = (int)strlen(line) - 1; + line[++ki] = '!'; + i += 1; + } + } + line[ki] = 0; +} + +static void agoUpdateN(char * output, char * input, int N, int Nchar) +{ + int ki = 0; + for (int i = 0; input[i]; i++, ki++) { + output[ki] = input[i]; + if (input[i] == '{') { + // get variable name + char * s = &input[i + 1]; + int k = 0; + int index = 0, v = 0, op = '+'; + for (; (s[k] >= '0' && s[k] <= '9') || (Nchar && s[k] == Nchar) || (s[k] == '+') || (s[k] == '-'); k++) { + if (s[k] == Nchar) v = N; + else if (s[k] == '+' || s[k] == '-') { + index += (op == '+') ? v : -v; + op = s[k]; + v = 0; + } + else v = v * 10 + s[k] - '0'; + } + index += (op == '+') ? 
v : -v; + if (s[k] == '}') { + // replace $[expr] with index + sprintf(&output[ki], "%d", index); + ki = (int)strlen(output) - 1; + i += k + 1; + } + } + } + output[ki] = 0; +} + +static void agoReadGraphFromStringInternal(AgoGraph * agraph, AgoReference * * ref, int num_ref, ago_data_registry_callback_f callback_f, void * callback_obj, const char * str, vx_int32 dumpToConsole, std::vector< std::pair< std::string, std::string > >& vars, std::string localPrefix) +{ + vx_context context = agraph->ref.context; + std::vector< std::pair< std::string, std::string > > aliases; + // set default values to for/if constructs + vx_int32 Nbegin = 0, Nend = 0, Nstep = 1, Nchar = '\0', forConstruct = 0; + vx_uint32 ifdepth = 0, ifcur = 0, ifall = 0; + // process one line at a time + char line[2048]; + for (int lineno = 1; (str = agoReadLine(line, sizeof(line) - 16, str)) != NULL; lineno++) + { + int N = (int)strlen(line); + while (N > 0 && (line[N - 1] == '\r' || line[N - 1] == '\n')) + line[--N] = 0; + if (dumpToConsole) agoAddLogEntry(NULL, VX_SUCCESS, "%s\n", line); + while (N > 0 && line[N - 1] == '\\') { + int pos = N - 1; + if (!(str = agoReadLine(line + pos, sizeof(line) - 16 - pos, str))) break; + N = (int)strlen(line); + while (N > 0 && (line[N - 1] == '\r' || line[N - 1] == '\n')) + line[--N] = 0; + if (dumpToConsole) agoAddLogEntry(NULL, VX_SUCCESS, "%s\n", line+pos); + lineno++; + } + agoUpdateLine(line, vars, localPrefix); + char lineCopy[sizeof(line)]; strcpy(lineCopy, line); + char * s = strstr(line, "#"); + if (s) { *s = 0; N = (int)strlen(line); } + char * argv[64] = { 0 }; + int narg = 0; + for (s = line; narg < 64;) + { + while (*s && (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n')) s++; + if (!*s) break; + argv[narg++] = s; + while (*s && !(*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n')) s++; + if (*s) *s++ = 0; + else break; + } + // process for construct + if (!forConstruct) { + // reset for-loop parameters to single iteration + Nbegin = 0, Nend = 0, Nstep = 1, Nchar = '\0'; + if (narg == 4 && !strcmp(argv[0], "for") && !strcmp(argv[2], "in") && strlen(argv[1]) == 1 && (argv[1][0] >= 'a' && argv[1][0] <= 'z')) { + // set for-loop parameters + Nchar = argv[1][0]; + char range[128]; agoUpdateN(range, argv[3], 0, '\0'); + if (sscanf(range, "%d:%d,%d", &Nbegin, &Nend, &Nstep) < 2 || Nstep <= 0) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid for syntax: should be 'for i in :[,]'\n>>>> %s\n", lineno, lineCopy); + agraph->status = -1; + break; + } + forConstruct = 1; + continue; + } + } + else if (narg == 1 && !strcmp(argv[0], "endfor")) { + // reset for-loop parameters to single iteration + Nbegin = 0, Nend = 0, Nstep = 1, Nchar = '\0'; + forConstruct = 0; + continue; + } + if (narg == 4 && (!strcmp(argv[0], "if") || !strcmp(argv[0], "elseif"))) { + char expr1[128]; agoUpdateN(expr1, argv[1], 0, '\0'); + char expr2[128]; agoUpdateN(expr2, argv[3], 0, '\0'); + int value1 = atoi(expr1); + int value2 = atoi(expr2); + bool result = false; + if (!strcmp(argv[2], "==")) result = (value1 == value2); + else if (!strcmp(argv[2], "!=")) result = (value1 != value2); + else if (!strcmp(argv[2], "<=")) result = (value1 <= value2); + else if (!strcmp(argv[2], ">=")) result = (value1 >= value2); + else if (!strcmp(argv[2], "<")) result = (value1 < value2); + else if (!strcmp(argv[2], ">")) result = (value1 > value2); + else { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid if-command syntax: should be '[else]if 
==|!=|<|>|<=|>= '\n>>>> %s\n", lineno, lineCopy); + agraph->status = -1; + break; + } + if (!strcmp(argv[0], "if")) { + // increase the depth and mark result in lowest bit (0:true, 1:false) + ifdepth++; + ifcur <<= 1; + ifall <<= 1; + if (!result) { + ifcur += 1; + ifall += 1; + } + } + else { + // if previously if/elseif resulted in true, mark result as false + if (!(ifall & 1)) + result = false; + // set lowest bit of both ifcur and ifall + if (result) { + ifcur &= ~1; + ifall &= ~1; + } + else { + ifcur |= 1; + } + } + continue; + } + else if (narg == 1 && !strcmp(argv[0], "else")) { + if (ifdepth == 0) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: found else without matching if statement'\n>>>> %s\n", lineno, lineCopy); + agraph->status = -1; + break; + } + ifcur = (ifcur & ~1) | !(ifall & 1); + continue; + } + else if (narg == 1 && !strcmp(argv[0], "endif")) { + if (ifdepth == 0) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: found endif without matching if statement'\n>>>> %s\n", lineno, lineCopy); + agraph->status = -1; + break; + } + ifdepth--; + ifcur >>= 1; + ifall >>= 1; + continue; + } + // check skip if earlier conditional statements required to do so + if (ifcur) + continue; + // process command with optional for-command-prefix support + for (int N = Nbegin; N <= Nend; N += Nstep) { + // create arguments with {N} expression substitution + char argBuf[2048] = { 0 }, *arg[64] = { 0 }; + for (int i = 0, j = 0; i < narg; i++) { + arg[i] = argBuf + j; + agoUpdateN(arg[i], argv[i], N, Nchar); + j += (int)strlen(arg[i]) + 1; + } + // process the actual commands + if (narg == 4 && !strcmp(arg[0], "data") && !strcmp(arg[2], "=")) { + // create new AgoData and add it to the dataList + AgoData * data = agoCreateDataFromDescription(context, agraph, arg[3], false); + if (!data) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: data type not supported\n>>>> %s\n", lineno, lineCopy); + agraph->status = -1; + break; + } + data->name = arg[1]; + agoAddData(data->isVirtual ? &agraph->dataList : &context->dataList, data); + // if data has children (e.g., pyramid, delay, image), add them too + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + if (data->children[i]) { + for (vx_uint32 j = 0; j < data->children[i]->numChildren; j++) { + if (data->children[i]->children[j]) { + agoAddData(data->isVirtual ? &agraph->dataList : &context->dataList, data->children[i]->children[j]); + } + } + agoAddData(data->isVirtual ? 
&agraph->dataList : &context->dataList, data->children[i]); + } + } + } + // inform application about data -- ignore this for scalar strings + if (callback_f && !(data->ref.type == VX_TYPE_SCALAR && data->u.scalar.type == VX_TYPE_STRING_AMD)) { + // skip till ':' + const char * param = arg[3]; + for (; *param && *param != ':'; param++) + ; + if (*param == ':') { + // still till another ':' + for (param++; *param && *param != ':'; param++) + ; + if (*param == ':') { + param++; + // invoke the application callback with object name and parameter strings + data->ref.external_count++; + callback_f(callback_obj, &data->ref, data->name.c_str(), param); + } + } + } + } + else if ((narg >= 3 && !strcmp(arg[0], "node")) || (narg >= 3 && !strcmp(arg[0], "macro")) || (narg >= 2 && !strcmp(arg[0], "file"))) { + std::string localSuffix = "!"; + AgoKernel * akernel = NULL; + AgoNode * node = NULL; + char * str_subgraph = NULL; + AgoReference * ref_subgraph[AGO_MAX_PARAMS] = { 0 }; + if (!strcmp(arg[0], "node")) { + if (!(akernel = agoFindKernelByName(context, arg[1]))) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: kernel not supported\n>>>> %s\n", lineno, lineCopy); + agraph->status = -1; + break; + } + // create a new AgoNode and add it to the nodeList + node = agoCreateNode(agraph, akernel); + } + else if (!strcmp(arg[0], "macro")) { + for (auto it = context->macros.begin(); it != context->macros.end(); ++it) { + if (!strcmp(it->name, arg[1])) { + localSuffix += it->name; + str_subgraph = it->text; + break; + } + } + if (!str_subgraph) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: unable to find macro '%s'\n>>>> %s\n", lineno, arg[1], lineCopy); + agraph->status = -1; + break; + } + } + else { + FILE * fp = fopen(arg[1], "rb"); + if (!fp) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: unable to open '%s'\n>>>> %s\n", lineno, arg[1], lineCopy); + agraph->status = -1; + break; + } + fseek(fp, 0L, SEEK_END); long size = ftell(fp); fseek(fp, 0L, SEEK_SET); + str_subgraph = (char *)calloc(1, size + 1); if (!str_subgraph) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "FATAL: calloc(1,%d) failed\n", (int)size + 1); + agraph->status = -1; + break; + } + (void)fread(str_subgraph, sizeof(char), size, fp); + fclose(fp); + // update suffix + const char * name = arg[1]; + for (char *p = arg[1]; *p; p++) { + if (*p == '/' || *p == '\\' || *p == ':') + name = p + 1; + else if (*p == '.') *p = '\0'; + } + localSuffix += name; + } + // look through all parameters + for (int p = 0; p < narg - 2; p++) + { + if (node && strncmp(arg[2 + p], "attr:", 5) == 0) { + if (!strncmp(&arg[2 + p][5], "BORDER_MODE:", 12)) { + if (!strcmp(&arg[2 + p][17], "UNDEFINED")) { + node->attr_border_mode.mode = VX_BORDER_MODE_UNDEFINED; + node->attr_border_mode.constant_value = 0; + } + else if (!strcmp(&arg[2 + p][17], "REPLICATE")) { + node->attr_border_mode.mode = VX_BORDER_MODE_REPLICATE; + node->attr_border_mode.constant_value = 0; + } + else if (!strncmp(&arg[2 + p][17], "CONSTANT,", 9)) { + node->attr_border_mode.mode = VX_BORDER_MODE_CONSTANT; + node->attr_border_mode.constant_value = 0; + (void)sscanf(&arg[2 + p][17 + 9], "%i", &node->attr_border_mode.constant_value); + } + else { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid/unsupported border mode attribute -- arg#%d\n>>>> %s\n", lineno, p, lineCopy); + agraph->status = -1; + break; + } + } + else if (!strncmp(&arg[2 + p][5], "AFFINITY:", 9)) { + vx_uint32 
group = 0; + char device[64] = "CPU"; + const char * szGroup = strstr(&arg[2 + p][14], ","); + if (szGroup) { + group = atoi(&szGroup[1]); + } + node->attr_affinity.group = group; + (void)sscanf(&arg[2 + p][14], "%s", device); + if (!strncmp(device, "CPU", 3)) { + node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_CPU; + node->attr_affinity.device_info = atoi(&device[3]); + } + else if (!strncmp(device, "GPU", 3)) { + node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_GPU; + node->attr_affinity.device_info = atoi(&device[3]); + } + else { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid/unsupported affinity attribute -- arg#%d\n>>>> %s\n", lineno, p, lineCopy); + agraph->status = -1; + break; + } + } + } + else if (!strcmp(arg[0], "file") && strncmp(arg[2 + p], "/def-var:", 9) == 0) { + char command[256]; sprintf(command, "def-var %s\n", &arg[2 + p][9]); + char * equal = strstr(command, "="); + if (!equal) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid def-var syntax: expected /def-var:=\n>>>> %s\n", lineno, lineCopy); + agraph->status = -1; + break; + } + *equal = ' '; + agoReadGraphFromStringInternal(agraph, ref, num_ref, callback_f, callback_obj, command, 0, vars, localPrefix); + if (agraph->status) + break; + } + else { + AgoData * data = NULL; + if (arg[2 + p][0] == '$') { + int index = atoi(&arg[2 + p][1]) - 1; + if (index >= 0 && index < num_ref) { + data = (AgoData *)ref[index]; + } + } + else if (strcmp(arg[2 + p], "null") != 0) { + char name[128]; strcpy(name, arg[2 + p]); + // check if there is an name alias + for (std::vector< std::pair< std::string, std::string > >::iterator it = aliases.begin(); it != aliases.end(); ++it) { + if (!strcmp(it->first.c_str(), name)) { + strcpy(name, it->second.c_str()); + if (name[0] == '$') { + int index = atoi(&name[1]) - 1; + if (index >= 0 && index < num_ref) { + data = (AgoData *)ref[index]; + } + } + break; + } + } + // get data object + if (!data) { + data = agoFindDataByName(context, agraph, name); + } + if (!data) { + // create new AgoData and add it to the dataList + data = agoCreateDataFromDescription(context, agraph, name, false); + if (!data) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: data type not supported -- arg#%d\n>>>> %s\n", lineno, p, lineCopy); + agraph->status = -1; + break; + } + agoAddData(&agraph->dataList, data); + // if data has children (e.g., pyramid), add them too + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + if (data->children[i]) { + char childname[256]; + sprintf(childname, "%s[%d]", data->name.c_str(), i); + data->children[i]->name = childname; + agoAddData(&agraph->dataList, data->children[i]); + } + } + } + } + } + if (data) { + if (node) { + node->paramList[p] = data; + // check if specified data type is correct + // NOTE: kernel can specify to ignore this checking by setting argType[] to ZERO + if (akernel->argType[p] && (akernel->argType[p] != data->ref.type)) { + char type_expected_buf[64], type_specified_buf[64]; + const char * type_expected = agoEnum2Name(akernel->argType[p]); + const char * type_specified = agoEnum2Name(data->ref.type); + if (!type_expected) { sprintf(type_expected_buf, "0x%08x", akernel->argType[p]); type_expected = type_expected_buf; } + if (!type_specified) { sprintf(type_specified_buf, "0x%08x", data->ref.type); type_specified = type_specified_buf; } + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: data type %s 
expected -- arg#%d has %s\n>>>> %s\n", lineno, type_expected, p, type_specified, lineCopy); + agraph->status = -1; + break; + } + } + else if (p < AGO_MAX_PARAMS) + ref_subgraph[p] = &data->ref; + else { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: number of arguments exceeded MAX(%d)\n>>>> %s\n", lineno, AGO_MAX_PARAMS, lineCopy); + agraph->status = -1; + break; + } + } + } + } + if (str_subgraph && !agraph->status) { + agoReadGraphFromStringInternal(agraph, ref_subgraph, narg - 2, callback_f, callback_obj, str_subgraph, (dumpToConsole > 0) ? dumpToConsole - 1 : vx_false_e, vars, localPrefix + localSuffix); + } + if (str_subgraph && !strcmp(arg[0], "file")) + free(str_subgraph); + if (agraph->status) + break; + } + else if (narg == 2 && !strcmp(arg[0], "import")) { + char * module_name = arg[1]; + if (agoLoadModule(context, module_name)) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: import: unable to load module: %s\n", module_name); + agraph->status = -1; + break; + } + } + else if (narg == 3 && !strcmp(arg[0], "type") && !strncmp(arg[2], "userstruct:", 11)) { + vx_enum user_struct_id = 0; + char * name = arg[1]; + if (agoGetUserStructSize(context, name) > 0) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: type: name already in-use: %s\n", name); + agraph->status = -1; + break; + } + vx_size size = atoi(&arg[2][11]); + if (agoAddUserStruct(context, size, name) == VX_TYPE_INVALID) { + agraph->status = -1; + } + } + else if (narg == 2 && !strcmp(arg[0], "def-macro")) { + char macro_name[256]; strncpy(macro_name, arg[1], sizeof(macro_name)); + const char * str_begin = str; + const char * str_end = str; + for (; (str = agoReadLine(line, sizeof(line)-16, str)) != NULL; lineno++) { + if (dumpToConsole) agoAddLogEntry(NULL, VX_SUCCESS, "%s", line); + agoUpdateLine(line, vars, localPrefix); + char word[256]; + if (sscanf(line, "%s", word) == 1 && !strcmp(word, "endmacro")) + break; + str_end = str; + } + if (!str) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: incomplete macro definition: %s\n>>>> %s\n", lineno, macro_name, lineCopy); + agraph->status = -1; + break; + } + else { + for (auto it = context->macros.begin(); it != context->macros.end(); ++it) { + if (!strcmp(it->name, macro_name)) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: macro already exists: %s\n>>>> %s\n", lineno, macro_name, lineCopy); + agraph->status = -1; + break; + } + } + if (agraph->status) + break; + else { + MacroData macro; + macro.text = macro.text_allocated = (char *)calloc(1, str_end - str_begin + 1); + strncpy(macro.name, macro_name, sizeof(macro.name) - 1); + strncpy(macro.text, str_begin, str_end - str_begin); + context->macros.push_back(macro); + } + } + } + else if ((narg == 2 || narg == 3) && (!strcmp(arg[0], "def-var") || !strcmp(arg[0], "def-var-default"))) { + for (vx_uint32 i = 0; arg[1][i]; i++) { + char c = arg[1][i]; + if (!(i == 0 && c >= 'A' && c <= 'Z') && !(i > 0 && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_'))) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: variable names can only have [A-Z][A-Za-z0-9_]* format: %s\n>>>> %s\n", lineno, arg[1], lineCopy); + agraph->status = -1; + break; + } + } + if (agraph->status) + break; + bool found = false; + for (std::vector< std::pair< std::string, std::string > >::iterator it = vars.begin(); it != vars.end(); ++it) { + if (!strcmp(it->first.c_str(), arg[1])) { + found = true; + if 
(!strcmp(arg[0], "def-var")) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: variable already exists: %s\n>>>> %s\n", lineno, arg[1], lineCopy); + agraph->status = -1; + } + break; + } + } + if (agraph->status) + break; + else if (!found) { + char value[2048]; + if (narg == 2) { + value[0] = 0; + } + else { + strcpy(value, arg[2]); + agoEvaluateIntegerExpression(value); + } + if ((!strncmp(value, "WIDTH(", 6) || !strncmp(value, "HEIGHT(", 7) || !strncmp(value, "FORMAT(", 7)) && value[strlen(value) - 1] == ')') { + char * name = strstr(value, "(") + 1; value[strlen(value) - 1] = 0; + AgoData * pdata = agoFindDataByName(context, agraph, name); + if (!pdata && name[0] == '$' && name[1] >= '1' && name[1] <= '9') { + int v = atoi(&name[1]) - 1; + if (v < num_ref) + pdata = (AgoData *)ref[v]; + if (!pdata) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: specified argument is not available: %s\n>>>> %s\n", lineno, arg[2], lineCopy); + agraph->status = -1; + break; + } + } + if (!pdata || (pdata->ref.type != VX_TYPE_IMAGE && pdata->ref.type != VX_TYPE_PYRAMID)) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid data name specified: %s\n>>>> %s\n", lineno, name, lineCopy); + agraph->status = -1; + break; + } + if (!strncmp(value, "WIDTH(", 6)) { + int v = 0; + if (pdata->ref.type == VX_TYPE_IMAGE) v = pdata->u.img.width; + else if (pdata->ref.type == VX_TYPE_PYRAMID) v = pdata->u.pyr.width; + sprintf(value, "%d", v); + } + else if (!strncmp(value, "HEIGHT(", 7)) { + int v = 0; + if (pdata->ref.type == VX_TYPE_IMAGE) v = pdata->u.img.height; + else if (pdata->ref.type == VX_TYPE_PYRAMID) v = pdata->u.pyr.height; + sprintf(value, "%d", v); + } + else if (!strncmp(value, "FORMAT(", 7)) { + vx_df_image v = VX_DF_IMAGE_U8; + if (pdata->ref.type == VX_TYPE_IMAGE) v = pdata->u.img.format; + else if (pdata->ref.type == VX_TYPE_PYRAMID) v = pdata->u.pyr.format; + sprintf(value, "%4.4s", FORMAT_STR(v)); + } + } + vars.push_back(std::pair< std::string, std::string >(arg[1], value)); + // special AGO flags + if (!strcmp(arg[1], "AgoOptimizerFlags") && value[0] >= '0' && value[0] <= '9') { + agraph->optimizer_flags = atoi(value); + break; + } + } + } + else if (narg == 2 && !strcmp(arg[0], "affinity")) { + if (!strcmp(arg[1], "GPU")) { + agraph->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_GPU; + } + else if (!strcmp(arg[1], "CPU")) { + agraph->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_CPU; + } + else { + agraph->attr_affinity.device_type = 0; + } + } + else if (narg == 3 && !strcmp(arg[0], "alias")) { + for (vx_uint32 i = 0; arg[1][i]; i++) { + char c = arg[1][i]; + if (!(i == 0 && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) && !(i > 0 && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_'))) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: alias names can only have [A-Z][A-Za-z0-9_]* format: %s\n>>>> %s\n", lineno, arg[1], lineCopy); + agraph->status = -1; + break; + } + } + if (agraph->status) + break; + char name1[128]; agoUpdateN(name1, arg[1], 0, '\0'); + char name2[128]; agoUpdateN(name2, arg[2], 0, '\0'); + for (std::vector< std::pair< std::string, std::string > >::iterator it = aliases.begin(); it != aliases.end(); ++it) { + if (!strcmp(it->first.c_str(), name1)) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: alias already exists: %s\n>>>> %s\n", lineno, name1, lineCopy); + agraph->status = -1; + 
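+						/* a duplicate alias is a hard error: stop scanning and abort parsing below */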
break; + } + } + if (agraph->status) + break; + aliases.push_back(std::pair< std::string, std::string >(name1, name2)); + } + else if (narg > 0 && !strcmp(arg[0], "set-args")) { + if (narg - 1 > num_ref) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: set-args: number of argument (%d) exceeded internal buffer (%d)\n>>>> %s\n", lineno, narg - 1, num_ref, lineCopy); + agraph->status = -1; + break; + } + // clear all previous arguments + for (int i = 0; i < num_ref; i++) { + // TBD handle memory leaks + ref[i] = NULL; + } + for (int j = 0; j < narg - 1; j++) { + // create new AgoData and add it to the dataList + AgoData * data = agoCreateDataFromDescription(context, agraph, arg[j + 1], false); + if (!data) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: set-args: invalid object description: %s\n>>>> %s\n", lineno, arg[j + 1], lineCopy); + agraph->status = -1; + break; + } + agoAddData(data->isVirtual ? &agraph->dataList : &agraph->ref.context->dataList, data); + // if data has children (e.g., pyramid), add them too + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + if (data->children[i]) { + agoAddData(data->isVirtual ? &agraph->dataList : &agraph->ref.context->dataList, data->children[i]); + } + } + } + ref[j] = &data->ref; + } + if (agraph->status) + break; + } + else if (narg == 3 && !strcmp(arg[0], "directive")) { + agraph->status = -1; + char name1[128]; agoUpdateN(name1, arg[1], 0, '\0'); + char name2[128]; agoUpdateN(name2, arg[2], 0, '\0'); + AgoData * data = agoFindDataByName(context, agraph, name1); + if (data) { + vx_enum directive = agoName2Enum(name2); + if (!directive) { + (void)sscanf(name2, "%i", &directive); + } + if (directive) { + agraph->status = agoDirective((vx_reference)data, directive); + } + } + if (agraph->status) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid object or directive: directive %s %s\n>>>> %s\n", lineno, name1, name2, lineCopy); + break; + } + } + else if (narg >= 1 && !strcmp(arg[0], "exit")) { + break; + } + else if (narg > 0) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: syntax error\n>>>> %s\n", lineno, lineCopy); + agraph->status = -1; + break; + } + } + if (agraph->status) + break; + } +} + +int agoReadGraph(AgoGraph * agraph, AgoReference * * ref, int num_ref, ago_data_registry_callback_f callback_f, void * callback_obj, FILE * fp, vx_int32 dumpToConsole) +{ + if (!agraph) return -1; + vx_context context = agraph->ref.context; + CAgoLock lock(agraph->cs); + CAgoLock lock2(context->cs); + + // read the whole file into a local buffer + long cur = ftell(fp); fseek(fp, 0L, SEEK_END); + long end = ftell(fp); fseek(fp, cur, SEEK_SET); + long size = end - cur; if (size < 1) return agraph->status; + char * str = (char *)calloc(1, size + 1); if (!str) return -1; + (void)fread(str, sizeof(char), size, fp); + + // read the graph from file + std::vector< std::pair< std::string, std::string > > vars; + agoReadGraphFromStringInternal(agraph, ref, num_ref, callback_f, callback_obj, str, dumpToConsole, vars, "L"); + free(str); + + // mark the scope of all virtual data to graph + for (AgoData * data = agraph->dataList.head; data; data = data->next) { + data->ref.scope = &agraph->ref; + } + // check if graph is a valid graph + if (agraph->status == VX_SUCCESS) { + agraph->status = agoVerifyGraph(agraph); + if (agraph->status) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: agoVerifyGraph() => %d 
(failed)\n", agraph->status); + } + } + return agraph->status; +} + +int agoReadGraphFromString(AgoGraph * agraph, AgoReference * * ref, int num_ref, ago_data_registry_callback_f callback_f, void * callback_obj, char * str, vx_int32 dumpToConsole) +{ + if (!agraph) return -1; + vx_context context = agraph->ref.context; + CAgoLock lock(agraph->cs); + CAgoLock lock2(context->cs); + + // read the graph from string + std::vector< std::pair< std::string, std::string > > vars; + agoReadGraphFromStringInternal(agraph, ref, num_ref, callback_f, callback_obj, str, dumpToConsole, vars, "L"); + + // mark the scope of all virtual data to graph + for (AgoData * data = agraph->dataList.head; data; data = data->next) { + data->ref.scope = &agraph->ref; + } + // check if graph is a valid graph + if (agraph->status == VX_SUCCESS) { + agraph->status = agoVerifyGraph(agraph); + if (agraph->status) { + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: agoVerifyGraph() => %d (failed)\n", agraph->status); + } + } + return agraph->status; +} + +int agoLoadModule(AgoContext * context, const char * module) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidContext(context)) { + CAgoLock lock(context->cs); + status = VX_ERROR_INVALID_PARAMETERS; + char filePath[1024]; sprintf(filePath, SHARED_LIBRARY_PREFIX "%s" SHARED_LIBRARY_EXTENSION, module); + ago_module hmodule = agoOpenModule(filePath); + if (hmodule == NULL) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: Unable to load module %s\n", filePath); + } + else { + vx_publish_kernels_f publish_kernels_f = (vx_publish_kernels_f)agoGetFunctionAddress(hmodule, "vxPublishKernels"); + if (!publish_kernels_f) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: vxPublishKernels symbol missing in %s\n", filePath); + } + else { + vx_uint32 count = context->kernelList.count; + context->importing_module_index_plus1 = context->num_active_modules + 1; + status = publish_kernels_f(context); + context->importing_module_index_plus1 = 0; + if (status == VX_SUCCESS) { + agoAddLogEntry(&context->ref, VX_SUCCESS, "OK: loaded %d kernels from %s\n", context->kernelList.count - count, filePath); + ModuleData data; + strncpy(data.module_name, module, sizeof(data.module_name)-1); + strncpy(data.module_path, filePath, sizeof(data.module_path) - 1); + data.hmodule = hmodule; + context->modules.push_back(data); + context->num_active_modules++; + } + else { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: vxPublishKernels => %d (failed) -- %s\n", status, filePath); + } + } + } + } + return status; +} + +vx_status agoVerifyNode(AgoNode * node) +{ + AgoGraph * graph = (AgoGraph *)node->ref.scope; + AgoKernel * kernel = node->akernel; + vx_status status = VX_SUCCESS; + + // check if node has required arguments and initialize data required for further graph processing + node->hierarchical_level = 0; + for (vx_uint32 arg = 0; arg < AGO_MAX_PARAMS; arg++) { + AgoData * data = node->paramList[arg]; + if (!data || (arg >= node->paramCount)) { + if (((kernel->argConfig[arg] & AGO_KERNEL_ARG_OPTIONAL_FLAG) == 0) && ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) != 0)) { + agoAddLogEntry(&kernel->ref, VX_ERROR_NOT_SUFFICIENT, "ERROR: agoVerifyGraph: kernel %s: missing argument#%d\n", kernel->name, arg); + return VX_ERROR_NOT_SUFFICIENT; + } + } + else if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == 0) { + agoAddLogEntry(&kernel->ref, VX_ERROR_NOT_SUFFICIENT, "ERROR: agoVerifyGraph: 
kernel %s: unexpected argument#%d\n", kernel->name, arg); + return VX_ERROR_NOT_SUFFICIENT; + } + if (data) { + data->hierarchical_level = 0; + // reset meta data of the node for output argument processing + if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == AGO_KERNEL_ARG_OUTPUT_FLAG) { + vx_meta_format meta = &node->metaList[arg]; + meta->type = VX_TYPE_META_FORMAT; + meta->data.ref.magic = AGO_MAGIC_VALID; + meta->data.ref.context = node->ref.context; + meta->data.ref.scope = node->ref.scope; + meta->data.ref.type = kernel->argType[arg]; + meta->data.ref.external_count = 1; + if (data->ref.type == VX_TYPE_IMAGE) { + meta->data.u.img.rect_valid.start_x = 0; + meta->data.u.img.rect_valid.start_y = 0; + meta->data.u.img.rect_valid.end_x = INT_MAX; + meta->data.u.img.rect_valid.end_y = INT_MAX; + } + else if (data->ref.type == VX_TYPE_PYRAMID) { + meta->data.u.pyr.rect_valid.start_x = 0; + meta->data.u.pyr.rect_valid.start_y = 0; + meta->data.u.pyr.rect_valid.end_x = INT_MAX; + meta->data.u.pyr.rect_valid.end_y = INT_MAX; + } + } + } + } + + // check if node arguments are valid + if (kernel->func) { + // validate arguments for built-in kernel functions + vx_status status = kernel->func(node, ago_kernel_cmd_validate); + if (status) { + agoAddLogEntry(&kernel->ref, status, "ERROR: agoVerifyGraph: kernel %s: ago_kernel_cmd_validate failed (%d)\n", kernel->name, status); + return status; + } + } + else { + // check if node input arguments are valid + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + if (node->paramList[arg]) { + if (kernel->argConfig[arg] & AGO_KERNEL_ARG_INPUT_FLAG) { + if (kernel->input_validate_f) { + vx_status status = kernel->input_validate_f(node, arg); + if (status) { + agoAddLogEntry(&kernel->ref, status, "ERROR: agoVerifyGraph: kernel %s: input_validate failed (%d) for argument#%d\n", kernel->name, status, arg); + return status; + } + } + } + } + } + // check if node output arguments are valid + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + AgoData * data = node->paramList[arg]; + if (data) { + if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == AGO_KERNEL_ARG_OUTPUT_FLAG) { + if (kernel->output_validate_f) { + vx_meta_format meta = &node->metaList[arg]; + vx_status status = kernel->output_validate_f(node, arg, meta); + if (status) { + agoAddLogEntry(&kernel->ref, status, "ERROR: agoVerifyGraph: kernel %s: output_validate failed (%d) for argument#%d\n", kernel->name, status, arg); + return status; + } + } + } + } + } + } + // check if node output arguments are valid + node->rect_valid.start_x = 0; + node->rect_valid.start_y = 0; + node->rect_valid.end_x = INT_MAX; + node->rect_valid.end_y = INT_MAX; + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + AgoData * data = node->paramList[arg]; + if (data) { + if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == AGO_KERNEL_ARG_OUTPUT_FLAG) { + vx_meta_format meta = &node->metaList[arg]; + if (kernel->argType[arg] && (meta->data.ref.type != kernel->argType[arg])) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: output argument type mismatch for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_TYPE; + } + else if (meta->data.ref.type == VX_TYPE_IMAGE) { + bool updated = false; + if (data->isVirtual) { + // update format/width/height if not specified + if (data->u.img.format == VX_DF_IMAGE_VIRT) { + data->u.img.format = 
meta->data.u.img.format; + updated = true; + } + if (data->u.img.width == 0) { + data->u.img.width = meta->data.u.img.width; + updated = true; + } + if (data->u.img.height == 0) { + data->u.img.height = meta->data.u.img.height; + updated = true; + } + } + // make sure that the data come from output validator matches with object + if (data->u.img.format != meta->data.u.img.format) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_FORMAT, "ERROR: agoVerifyGraph: kernel %s: invalid format for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_FORMAT; + } + else if (data->u.img.width != meta->data.u.img.width || data->u.img.height != meta->data.u.img.height) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_DIMENSION, "ERROR: agoVerifyGraph: kernel %s: invalid dimension for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_DIMENSION; + } + // re-initialize, if updated + if (updated) { + char desc[64]; sprintf(desc, "image-virtual:%4.4s,%d,%d", FORMAT_STR(data->u.img.format), data->u.img.width, data->u.img.height); + data->isNotFullyConfigured = vx_true_e; + if (agoGetDataFromDescription(graph->ref.context, graph, data, desc)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoVerifyGraph: agoVerifyGraph update failed for virtual-image: %4.4s %dx%d\n", FORMAT_STR(data->u.img.format), data->u.img.width, data->u.img.height); + return -1; + } + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&graph->dataList, data->children[i]); + } + } + data->isNotFullyConfigured = vx_false_e; + } + // update valid rectangle + data->u.img.rect_valid = meta->data.u.img.rect_valid; + if (data->u.img.rect_valid.end_x == INT_MAX) + data->u.img.rect_valid.end_x = data->u.img.width; + if (data->u.img.rect_valid.end_y == INT_MAX) + data->u.img.rect_valid.end_y = data->u.img.height; + node->rect_valid.start_x = max(node->rect_valid.start_x, data->u.img.rect_valid.start_x); + node->rect_valid.start_y = max(node->rect_valid.start_y, data->u.img.rect_valid.start_y); + node->rect_valid.end_x = min(node->rect_valid.end_x, data->u.img.rect_valid.end_x); + node->rect_valid.end_y = min(node->rect_valid.end_y, data->u.img.rect_valid.end_y); + // check for VX_IMAGE_ATTRIBUTE_AMD_ENABLE_USER_BUFFER_OPENCL attribute + if (meta->data.u.img.enableUserBufferOpenCL) { + // supports only virtual images with single color plane and without ROI + if (!data->isVirtual || data->u.img.planes != 1 || data->u.img.isROI) { + agoAddLogEntry(&kernel->ref, VX_ERROR_NOT_SUPPORTED, "ERROR: agoVerifyGraph: kernel %s: VX_IMAGE_ATTRIBUTE_AMD_ENABLE_USER_BUFFER_OPENCL is not supported for argument#%d\n", kernel->name, arg); + return VX_ERROR_NOT_SUPPORTED; + } + data->u.img.enableUserBufferOpenCL = meta->data.u.img.enableUserBufferOpenCL; + } + } + else if (meta->data.ref.type == VX_TYPE_PYRAMID) { + bool updated = false; + if (data->isVirtual) { + // update format/width/height if not specified + if (data->u.pyr.format == VX_DF_IMAGE_VIRT) { + data->u.pyr.format = meta->data.u.pyr.format; + updated = true; + } + if (data->u.pyr.width == 0) { + data->u.pyr.width = meta->data.u.pyr.width; + updated = true; + } + if (data->u.pyr.height == 0) { + data->u.pyr.height = meta->data.u.pyr.height; + updated = true; + } + } + // make sure that the data come from output validator matches with object + if (data->u.pyr.levels != meta->data.u.pyr.levels || data->u.pyr.scale != meta->data.u.pyr.scale) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_VALUE, "ERROR: agoVerifyGraph: kernel %s: invalid 
value for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_VALUE; + } + else if (data->u.pyr.format != meta->data.u.pyr.format) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_FORMAT, "ERROR: agoVerifyGraph: kernel %s: invalid format for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_FORMAT; + } + else if (data->u.pyr.width != meta->data.u.pyr.width || data->u.pyr.height != meta->data.u.pyr.height) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_DIMENSION, "ERROR: agoVerifyGraph: kernel %s: invalid dimension for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_DIMENSION; + } + // re-initialize, if updated + if (updated) { + char scale[64], desc[64]; + if (data->u.pyr.scale == VX_SCALE_PYRAMID_HALF) sprintf(scale, "HALF"); + else if (data->u.pyr.scale == VX_SCALE_PYRAMID_ORB) sprintf(scale, "ORB"); + else sprintf(scale, "%g", data->u.pyr.scale); + sprintf(desc, "pyramid-virtual:%4.4s,%d,%d," VX_FMT_SIZE ",%s", FORMAT_STR(data->u.pyr.format), data->u.pyr.width, data->u.pyr.height, data->u.pyr.levels, scale); + data->isNotFullyConfigured = vx_true_e; + if (agoGetDataFromDescription(graph->ref.context, graph, data, desc)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoVerifyGraph: agoVerifyGraph update failed for %s\n", desc); + return -1; + } + if (data) { + agoAddData(&graph->dataList, data); + // add the children too + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&graph->dataList, data->children[i]); + for (vx_uint32 j = 0; j < data->children[i]->numChildren; j++) { + if (data->children[i]->children[j]) { + agoAddData(&graph->dataList, data->children[i]->children[j]); + } + } + } + } + data->isNotFullyConfigured = vx_false_e; + } + // update valid rectangle + data->u.pyr.rect_valid = meta->data.u.pyr.rect_valid; + if (data->u.pyr.rect_valid.end_x == INT_MAX) + data->u.pyr.rect_valid.end_x = data->u.pyr.width; + if (data->u.pyr.rect_valid.end_y == INT_MAX) + data->u.pyr.rect_valid.end_y = data->u.pyr.height; + node->rect_valid.start_x = max(node->rect_valid.start_x, data->u.pyr.rect_valid.start_x); + node->rect_valid.start_y = max(node->rect_valid.start_y, data->u.pyr.rect_valid.start_y); + node->rect_valid.end_x = min(node->rect_valid.end_x, data->u.pyr.rect_valid.end_x); + node->rect_valid.end_y = min(node->rect_valid.end_y, data->u.pyr.rect_valid.end_y); + // propagate valid rectangle to all images inside the pyramid + for (vx_uint32 i = 0; i < data->numChildren; i++) { + AgoData * img = data->children[i]; + if (img) { + vx_float32 xscale = (vx_float32)img->u.img.width / (vx_float32)data->u.pyr.width; + vx_float32 yscale = (vx_float32)img->u.img.height / (vx_float32)data->u.pyr.height; + img->u.img.rect_valid.start_x = (vx_uint32)ceilf(data->u.pyr.rect_valid.start_x * xscale); + img->u.img.rect_valid.start_y = (vx_uint32)ceilf(data->u.pyr.rect_valid.start_y * yscale); + img->u.img.rect_valid.end_x = (vx_uint32)floorf(data->u.pyr.rect_valid.end_x * xscale); + img->u.img.rect_valid.end_y = (vx_uint32)floorf(data->u.pyr.rect_valid.end_y * yscale); + } + } + } + else if (meta->data.ref.type == VX_TYPE_ARRAY) { + bool updated = false; + if (data->isVirtual) { + // update itemtype/capacity if not specified + if (data->u.arr.itemtype == VX_TYPE_INVALID) { + data->u.arr.itemtype = meta->data.u.arr.itemtype; + updated = true; + } + if (data->u.arr.capacity == 0) { + data->u.arr.capacity = meta->data.u.arr.capacity; + updated = true; + } + } + // make sure that the data come from output validator matches with object + if 
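The valid-rectangle bookkeeping in this hunk reduces to a rectangle intersection: start coordinates take the maximum and end coordinates the minimum across all output arguments. Below is a minimal sketch of that operation on a plain vx_rectangle_t; the helper name is hypothetical.

```cpp
#include <algorithm>
#include <VX/vx.h>

// Shrink the node's running valid region by one output's valid rectangle,
// mirroring the per-argument max/min updates performed during node verification.
static void intersectValidRect(vx_rectangle_t & node_rect, const vx_rectangle_t & data_rect)
{
    node_rect.start_x = std::max(node_rect.start_x, data_rect.start_x);
    node_rect.start_y = std::max(node_rect.start_y, data_rect.start_y);
    node_rect.end_x   = std::min(node_rect.end_x,   data_rect.end_x);
    node_rect.end_y   = std::min(node_rect.end_y,   data_rect.end_y);
}
```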
(data->u.arr.itemtype != meta->data.u.arr.itemtype) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: invalid type for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_TYPE; + } + else if (!data->u.arr.capacity || (meta->data.u.arr.capacity && meta->data.u.arr.capacity > data->u.arr.capacity)) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_DIMENSION, "ERROR: agoVerifyGraph: kernel %s: invalid dimension for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_DIMENSION; + } + if (updated) { + data->isNotFullyConfigured = vx_true_e; + char desc[64]; sprintf(desc, "array-virtual:%u,%u", data->u.arr.itemtype, (vx_uint32)data->u.arr.capacity); + if (agoGetDataFromDescription(graph->ref.context, graph, data, desc)) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: agoVerifyGraph: agoVerifyGraph update failed for %s\n", desc); + return -1; + } + data->isNotFullyConfigured = vx_false_e; + } + } + else if (meta->data.ref.type == VX_TYPE_SCALAR) { + // make sure that the data come from output validator matches with object + if (data->u.scalar.type != meta->data.u.scalar.type) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: invalid type for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_TYPE; + } + } + else if (meta->data.ref.type == VX_TYPE_DISTRIBUTION) { + // nothing to do + } + else if (meta->data.ref.type == VX_TYPE_LUT) { + // nothing to do + } + else if (meta->data.ref.type == AGO_TYPE_CANNY_STACK) { + // nothing to do + } + else if (meta->data.ref.type == AGO_TYPE_MINMAXLOC_DATA) { + // nothing to do + } + else if (meta->data.ref.type == AGO_TYPE_MEANSTDDEV_DATA) { + // nothing to do + } + else if (kernel->argType[arg]) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: invalid type for argument#%d\n", kernel->name, arg); + return VX_ERROR_INVALID_TYPE; + } + } + else if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) { +#if 0 // TBD: disabled temporarily as a quick workaround for bidirectional buffer issue + // virtual objects can not be used as bidirectional arguments + if (data->isVirtual) { + agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_PARAMETERS, "ERROR: agoVerifyGraph: kernel %s: bidirectional argument shouldn't be virtual for argument#%d (%s)\n", kernel->name, arg, data->name); + return VX_ERROR_INVALID_PARAMETERS; + } +#endif + } + } + } + + return status; +} + +int agoVerifyGraph(AgoGraph * graph) +{ + // compute node hierarchy in the graph: this takes care of + // - single writers + // - no loops + vx_status status = agoOptimizeDramaComputeGraphHierarchy(graph); + if (status) { + return status; + } + agoOptimizeDramaSortGraphHierarchy(graph); + + // initialize valid region every input image/pyramid to its full region + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data) { + if (data->ref.type == VX_TYPE_IMAGE) { + data->u.img.rect_valid.start_x = 0; + data->u.img.rect_valid.start_y = 0; + data->u.img.rect_valid.end_x = data->u.img.width; + data->u.img.rect_valid.end_y = data->u.img.height; + } + else if (data->ref.type == VX_TYPE_PYRAMID) { + data->u.pyr.rect_valid.start_x = 0; + data->u.pyr.rect_valid.start_y = 0; + data->u.pyr.rect_valid.end_x = data->u.pyr.width; + data->u.pyr.rect_valid.end_y = 
data->u.pyr.height; + } + } + } + } + + //////////////////////////////////////////////// + // validate node arguments + //////////////////////////////////////////////// + graph->detectedInvalidNode = false; + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + status = agoVerifyNode(node); + if (status) { + return status; + } + } + + // compute node hierarchy in the graph: this takes care of + // - single writers + // - no loops + status = agoOptimizeDramaComputeGraphHierarchy(graph); + + return status; +} + +int agoInitializeGraph(AgoGraph * graph) +{ + for (AgoNode * node = graph->nodeList.head; node; node = node->next) + { + AgoKernel * kernel = node->akernel; + vx_status status = VX_SUCCESS; + if (kernel->func) { + status = kernel->func(node, ago_kernel_cmd_initialize); + } + else if (kernel->initialize_f) { + status = kernel->initialize_f(node, (vx_reference *)node->paramList, node->paramCount); + } + if (status) { + return status; + } + else { + if (node->localDataSize > 0 && node->localDataPtr == nullptr) { + if (node->localDataPtr_allocated) + delete[] node->localDataPtr_allocated; + node->localDataPtr = node->localDataPtr_allocated = (vx_uint8 *)agoAllocMemory(node->localDataSize); + if (!node->localDataPtr) { + return VX_ERROR_NO_MEMORY; + } + memset(node->localDataPtr, 0, node->localDataSize); + } + node->initialized = true; + // keep a copy of paramList into paramListForAgeDelay + memcpy(node->paramListForAgeDelay, node->paramList, sizeof(node->paramListForAgeDelay)); + } + } + return VX_SUCCESS; +} + +#if ENABLE_OPENCL +static int agoWaitForNodesCompletion(AgoGraph * graph) +{ + int status = VX_SUCCESS; + if (!graph->opencl_nodeListQueued.empty()) { + for (vx_size i = 0; i < graph->opencl_nodeListQueued.size(); i++) { + AgoNode * node = graph->opencl_nodeListQueued[i]; + if (node->supernode) { + if (!node->supernode->launched || agoGpuOclSuperNodeWait(graph, node->supernode) < 0) { + return VX_FAILURE; + } + agoPerfCaptureStop(&node->perf); + for (size_t index = 0; index < node->supernode->nodeList.size(); index++) { + AgoNode * anode = node->supernode->nodeList[index]; + // node callback + if (anode->callback) { + vx_action action = anode->callback(anode); + if (action == VX_ACTION_ABANDON) { + status = VX_ERROR_GRAPH_ABANDONED; + break; + } + } + } + } + else { + if (agoGpuOclSingleNodeWait(graph, node) < 0) { + return VX_FAILURE; + } + agoPerfCaptureStop(&node->perf); + // node callback + if (node->callback) { + vx_action action = node->callback(node); + if (action == VX_ACTION_ABANDON) { + status = VX_ERROR_GRAPH_ABANDONED; + break; + } + } + } + } + graph->opencl_nodeListQueued.clear(); + } + return status; +} + +static int agoDataSyncFromGpuToCpu(AgoGraph * graph, AgoNode * node, AgoData * dataToSync) +{ + if (dataToSync->opencl_buffer && !(dataToSync->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) { + if (node->flags & AGO_KERNEL_FLAG_DEVICE_GPU) { + if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT)) { + int64_t stime = agoGetClockCounter(); + if (dataToSync->ref.type == VX_TYPE_LUT) { + size_t origin[3] = { 0, 0, 0 }; + size_t region[3] = { 256, 1, 1 }; + cl_int err = clEnqueueWriteImage(graph->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, origin, region, 256, 0, dataToSync->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err); + return -1; + } + } + else { + vx_size size = dataToSync->size; + if 
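agoInitializeGraph above implements the node-local scratch memory contract: a kernel's initialize step only states how much memory it needs, and the framework then allocates and zero-fills localDataPtr before the first execution. The sketch below is a hypothetical built-in-kernel initialize handler written under that assumption; the field names follow the internal AgoNode structure and the 1 KB figure is arbitrary.

```cpp
#include "ago_internal.h"   // assumed include for AgoNode and vx_status

// Hypothetical initialize handler: request scratch space and let the
// framework allocate and zero it in agoInitializeGraph.
static vx_status exampleKernelInitialize(AgoNode * node)
{
    node->localDataSize = 1024;    // bytes of per-node scratch memory needed
    node->localDataPtr  = nullptr; // framework will allocate and memset() it
    return VX_SUCCESS;
}
```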
(dataToSync->ref.type == VX_TYPE_ARRAY) { + // transfer only valid data + size = dataToSync->u.arr.itemsize * dataToSync->u.arr.numitems; + } + if (size > 0) { + cl_int err = clEnqueueWriteBuffer(graph->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->opencl_buffer_offset, size, dataToSync->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d\n", err); + return -1; + } + } + } + dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.buffer_write += etime - stime; + } + } + else { + if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL)) { + int64_t stime = agoGetClockCounter(); + if (dataToSync->ref.type == VX_TYPE_LUT) { + size_t origin[3] = { 0, 0, 0 }; + size_t region[3] = { 256, 1, 1 }; + cl_int err = clEnqueueReadImage(graph->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, origin, region, 256, 0, dataToSync->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueReadImage(lut) => %d\n", err); + return -1; + } + } + else { + vx_size size = dataToSync->size; + if (dataToSync->ref.type == VX_TYPE_ARRAY) { + // transfer only region that has valid data + size = dataToSync->u.arr.numitems * dataToSync->u.arr.itemsize; + } + if (size > 0) { + cl_int err = clEnqueueReadBuffer(graph->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->opencl_buffer_offset, size, dataToSync->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueReadBuffer() => %d\n", err); + return -1; + } + } + } + dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.buffer_read += etime - stime; + } + } + } + return 0; +} +#endif + +int agoExecuteGraph(AgoGraph * graph) +{ + if (graph->detectedInvalidNode) + return VX_FAILURE; + else if (!graph->nodeList.head) + return VX_SUCCESS; + int status = VX_SUCCESS; + + agoPerfCaptureStart(&graph->perf); + + // update delay slots + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + for (vx_uint32 arg = 0; arg < node->paramCount; arg++) { + AgoData * data = node->paramList[arg]; + if (data && agoIsPartOfDelay(data)) { + // get the trace to delay object from original node parameter without vxAgeDelay changes + int siblingTrace[AGO_MAX_DEPTH_FROM_DELAY_OBJECT], siblingTraceCount = 0; + AgoData * delay = agoGetSiblingTraceToDelay(node->paramListForAgeDelay[arg], siblingTrace, siblingTraceCount); + if (delay) { + // get the data + data = agoGetDataFromSiblingTrace(delay, siblingTrace, siblingTraceCount); + if (data) { + // update the node parameter + node->paramList[arg] = data; + } + else return VX_FAILURE; + } + else return VX_FAILURE; + } + } + } +#if ENABLE_OPENCL + for (AgoSuperNode * supernode = graph->supernodeList; supernode; supernode = supernode->next) { + for (size_t arg = 0; arg < supernode->dataList.size(); arg++) { + AgoData * data = supernode->dataList[arg]; + if (data && agoIsPartOfDelay(data)) { + // get the trace to delay object from original node parameter without vxAgeDelay changes + int siblingTrace[AGO_MAX_DEPTH_FROM_DELAY_OBJECT], siblingTraceCount = 0; + AgoData * delay = agoGetSiblingTraceToDelay(supernode->dataListForAgeDelay[arg], siblingTrace, siblingTraceCount); + if (delay) { + // get the data + data = agoGetDataFromSiblingTrace(delay, siblingTrace, siblingTraceCount); + if (data) { + // update the supernode parameter + 
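The synchronization logic above is driven by the buffer_sync_flags bit-field (the AGO_BUFFER_SYNC_FLAG_* values defined later in ago_internal.h): writers mark a buffer dirty, and agoDataSyncFromGpuToCpu copies it at most once before setting the SYNCHED bit. Below is a condensed sketch of that protocol with hypothetical helper names, assuming ago_internal.h; it is an illustration, not an additional code path.

```cpp
#include "ago_internal.h"   // assumed include for AgoData and the sync flag bits

// A CPU-side write replaces any previous dirty state with DIRTY_BY_NODE,
// as the executor does after running a CPU kernel.
static void markWrittenByCpuNode(AgoData * data)
{
    data->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK;
    data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE;
}

// A GPU consumer needs an upload only while the buffer is dirty on the host
// side and has not been synchronized yet.
static bool needsUploadBeforeGpuUse(const AgoData * data)
{
    vx_uint32 flags = data->buffer_sync_flags;
    return (flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT)) != 0
        && (flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED) == 0;
}
```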
supernode->dataList[arg] = data; + } + else return VX_FAILURE; + } + else return VX_FAILURE; + } + } + } +#endif + +#if ENABLE_OPENCL + // clear opencl_buffer for all virtual images with enableUserBufferOpenCL == true + for (AgoData * data = graph->dataList.head; data; data = data->next) { + if (data->ref.type == VX_TYPE_IMAGE && data->u.img.enableUserBufferOpenCL) { + data->opencl_buffer = nullptr; + data->opencl_buffer_offset = 0; + } + } +#endif + // mark that none of the supernode has been launched + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (node->supernode) { + node->supernode->launched = false; + } + } +#if ENABLE_OPENCL + graph->opencl_nodeListQueued.clear(); + vx_uint32 nodeLaunchHierarchicalLevel = 0; + memset(&graph->opencl_perf, 0, sizeof(graph->opencl_perf)); +#endif + // execute one nodes in one hierarchical level at a time + for (auto enode = graph->nodeList.head; enode;) { + // get snode..enode with next hierarchical_level + auto hierarchical_level = enode->hierarchical_level; + auto snode = enode; enode = enode->next; + while (enode && enode->hierarchical_level == hierarchical_level) + enode = enode->next; +#if ENABLE_OPENCL + // process GPU nodes at current hierarchical level + for (auto node = snode; node != enode; node = node->next) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU) { + bool launched = true; + agoPerfCaptureStart(&node->perf); + if (!node->supernode) { + // launch the single node + if (agoGpuOclSingleNodeLaunch(graph, node) < 0) { + return -1; + } + } + else if (!node->supernode->launched) { + // launch the super node + if (agoGpuOclSuperNodeLaunch(graph, node->supernode) < 0) { + return -1; + } + node->supernode->launched = true; + } + else { + launched = false; + } + if (launched) { + graph->opencl_nodeListQueued.push_back(node); + if (nodeLaunchHierarchicalLevel == 0) { + nodeLaunchHierarchicalLevel = node->hierarchical_level; + } + } + } + } +#endif + // process CPU nodes at current hierarchical level + for (auto node = snode; node != enode; node = node->next) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_CPU) { +#if ENABLE_OPENCL + if (nodeLaunchHierarchicalLevel > 0 && nodeLaunchHierarchicalLevel < node->hierarchical_level) { + status = agoWaitForNodesCompletion(graph); + if (status != VX_SUCCESS) + return status; + nodeLaunchHierarchicalLevel = 0; + } + // make sure that all input buffers are synched + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data && (node->parameters[i].direction == VX_INPUT || node->parameters[i].direction == VX_BIDIRECTIONAL)) { + auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? 
data->u.img.roiMasterImage : data; + status = agoDataSyncFromGpuToCpu(graph, node, dataToSync); + for (vx_uint32 j = 0; !status && j < dataToSync->numChildren; j++) { + AgoData * jdata = dataToSync->children[j]; + if (jdata) + status = agoDataSyncFromGpuToCpu(graph, node, jdata); + } + if (status) + return status; + } + } +#endif + // execute node + agoPerfCaptureStart(&node->perf); + AgoKernel * kernel = node->akernel; + status = VX_SUCCESS; + if (kernel->func) { + status = kernel->func(node, ago_kernel_cmd_execute); + if (status == AGO_ERROR_KERNEL_NOT_IMPLEMENTED) + status = VX_ERROR_NOT_IMPLEMENTED; + } + else if (kernel->kernel_f) { + status = kernel->kernel_f(node, (vx_reference *)node->paramList, node->paramCount); + } + if (status) { + return status; + } + agoPerfCaptureStop(&node->perf); +#if ENABLE_OPENCL + // mark that node outputs are dirty + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data && data->opencl_buffer && !data->u.img.enableUserBufferOpenCL && + (node->parameters[i].direction == VX_OUTPUT || node->parameters[i].direction == VX_BIDIRECTIONAL)) + { + auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? data->u.img.roiMasterImage : data; + dataToSync->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK; + dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE; + } + } +#endif + // node callback + if (node->callback) { + vx_action action = node->callback(node); + if (action == VX_ACTION_ABANDON) { + return VX_ERROR_GRAPH_ABANDONED; + } + } + } + } + } +#if ENABLE_OPENCL + if (nodeLaunchHierarchicalLevel > 0) { + status = agoWaitForNodesCompletion(graph); + if (status != VX_SUCCESS) + return status; + } + graph->opencl_perf_total.kernel_enqueue += graph->opencl_perf.kernel_enqueue; + graph->opencl_perf_total.kernel_wait += graph->opencl_perf.kernel_wait; + graph->opencl_perf_total.buffer_read += graph->opencl_perf.buffer_read; + graph->opencl_perf_total.buffer_write += graph->opencl_perf.buffer_write; +#endif + agoPerfCaptureStop(&graph->perf); + return status; +} + +vx_status agoDirective(vx_reference reference, vx_enum directive) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidReference(reference)) { + vx_context context = reference->context; + if (agoIsValidContext(context)) { + CAgoLock lock(context->cs); + status = VX_SUCCESS; + switch (directive) + { + case VX_DIRECTIVE_ENABLE_LOGGING: + reference->enable_logging = true; + break; + case VX_DIRECTIVE_DISABLE_LOGGING: + reference->enable_logging = false; + break; + case VX_DIRECTIVE_AMD_READ_ONLY: + if (reference->type == VX_TYPE_CONVOLUTION || reference->type == VX_TYPE_MATRIX) { + if (((AgoData *)reference)->buffer) { + reference->read_only = true; + } + else { + status = VX_ERROR_NOT_SUPPORTED; + } + } + else { + status = VX_ERROR_NOT_SUPPORTED; + } + break; +#if ENABLE_OPENCL + case VX_DIRECTIVE_AMD_COPY_TO_OPENCL: + status = VX_ERROR_NOT_SUPPORTED; + if (reference->context->opencl_cmdq) { + auto data = (AgoData *)reference; + auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? 
data->u.img.roiMasterImage : data; + if (dataToSync->ref.type == VX_TYPE_LUT) { + if (dataToSync->opencl_buffer) { + size_t origin[3] = { 0, 0, 0 }; + size_t region[3] = { 256, 1, 1 }; + cl_int err = clEnqueueWriteImage(dataToSync->ref.context->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, origin, region, 256, 0, dataToSync->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err); + return VX_FAILURE; + } + dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + status = VX_SUCCESS; + } + } + else if (dataToSync->ref.type == VX_TYPE_IMAGE && dataToSync->numChildren > 0) { + for (vx_uint32 plane = 0; plane < dataToSync->numChildren; plane++) { + AgoData * img = dataToSync->children[plane]; + if (img && img->opencl_buffer) { + cl_int err = clEnqueueWriteBuffer(img->ref.context->opencl_cmdq, img->opencl_buffer, CL_TRUE, img->opencl_buffer_offset, img->size, img->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d\n", err); + return VX_FAILURE; + } + img->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + status = VX_SUCCESS; + } + } + } + else { + if (dataToSync->opencl_buffer) { + vx_size size = dataToSync->size; + if (dataToSync->ref.type == VX_TYPE_ARRAY) { + // transfer only valid data + size = dataToSync->u.arr.itemsize * dataToSync->u.arr.numitems; + } + if (size > 0) { + cl_int err = clEnqueueWriteBuffer(dataToSync->ref.context->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->opencl_buffer_offset, size, dataToSync->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d\n", err); + return VX_FAILURE; + } + } + dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + status = VX_SUCCESS; + } + } + } + break; +#endif + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +vx_status agoGraphDumpPerformanceProfile(AgoGraph * graph, const char * fileName) +{ + bool use_stdout = true; + FILE * fp = stdout; + if (fileName && strcmp(fileName, "stdout") != 0) { + use_stdout = false; + fp = fopen(fileName, "w"); + if (!fp) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: unable to create: %s\n", fileName); + return VX_FAILURE; + } + } + fprintf(fp, " COUNT,tmp(ms),avg(ms),min(ms),max(ms),DEV,KERNEL\n"); + int64_t freq = agoGetClockFrequency(); + float factor = 1000.0f / (float)freq; // to convert clock counter to ms + if (graph->perf.num > 0) { + fprintf(fp, "%6d,%7.3f,%7.3f,%7.3f,%7.3f,%s,%s\n", + (int)graph->perf.num, (float)graph->perf.tmp * factor, + (float)graph->perf.sum * factor / (float)graph->perf.num, + (float)graph->perf.min * factor, (float)graph->perf.max * factor, + graph->attr_affinity.device_type == AGO_TARGET_AFFINITY_GPU ? "GPU" : "CPU", + "GRAPH"); + } + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (node->perf.num > 0) { + fprintf(fp, "%6d,%7.3f,%7.3f,%7.3f,%7.3f,%s,%s\n", + (int)node->perf.num, (float)node->perf.tmp * factor, + (float)node->perf.sum * factor / (float)node->perf.num, + (float)node->perf.min * factor, (float)node->perf.max * factor, + node->attr_affinity.device_type == AGO_TARGET_AFFINITY_GPU ? 
"GPU" : "CPU", + node->akernel->name); + } + } + fflush(fp); + if (!use_stdout) { + fclose(fp); + } + return VX_SUCCESS; +} + +int agoProcessGraph(AgoGraph * graph) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidGraph(graph)) { + CAgoLock lock(graph->cs); + + // make sure that graph is verified + status = VX_SUCCESS; + if (!graph->verified) { + status = vxVerifyGraph(graph); + } + + // execute graph if possible + if (status == VX_SUCCESS) { + status = VX_FAILURE; + if (graph->verified && graph->isReadyToExecute) { + status = agoExecuteGraph(graph); + } + } + } + return status; +} + +int agoScheduleGraph(AgoGraph * graph) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidGraph(graph)) { + status = VX_SUCCESS; + graph->threadScheduleCount++; + if (graph->hThread) { + if (!graph->verified) { + // make sure to verify the graph in master thread + CAgoLock lock(graph->cs); + status = vxVerifyGraph(graph); + } + if (status == VX_SUCCESS) { + // inform graph thread to execute + if (!ReleaseSemaphore(graph->hSemToThread, 1, nullptr)) { + status = VX_ERROR_NO_RESOURCES; + } + } + } + else { + status = agoProcessGraph(graph); + } + } + return status; +} + +int agoWaitGraph(AgoGraph * graph) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidGraph(graph)) { + status = VX_SUCCESS; + graph->threadWaitCount++; + if (graph->hThread) { + while (graph->threadExecuteCount != graph->threadScheduleCount) { + if (WaitForSingleObject(graph->hSemFromThread, INFINITE) != WAIT_OBJECT_0) { + status = VX_FAILURE; + break; + } + } + } + if (status == VX_SUCCESS) { + status = graph->status; + } + } + return status; +} diff --git a/openvx/ago/ago_internal.h b/openvx/ago/ago_internal.h new file mode 100644 index 0000000..f02577e --- /dev/null +++ b/openvx/ago/ago_internal.h @@ -0,0 +1,781 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#ifndef __ago_internal_h__ +#define __ago_internal_h__ + +#include "ago_platform.h" +#include "ago_kernels.h" +#include "ago_haf_cpu.h" +#include "vx_ext_amd.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// configuration flags and constants +// + +// version +#define AGO_VERSION "0.9.0" + +// debug configuration +#define ENABLE_DEBUG_MESSAGES 0 // 0:disable 1:enable +#define SHOW_DEBUG_HIERARCHICAL_LEVELS 0 // 0:disable 1:enable debug hierarchical levels +#define ENABLE_LOG_MESSAGES_DEFAULT true // default logging directive VX_DIRECTIVE_ENABLE_LOGGING + +// fused OpenCL kernel workgroup size +#define AGO_OPENCL_WORKGROUP_SIZE_0 16 // workgroup_size[0] +#define AGO_OPENCL_WORKGROUP_SIZE_1 16 // workgroup_size[1] +#define AGO_OPENCL_WORKGROUP_SIZE_2 1 // workgroup_size[2] + +// Flag to enable BMI2 instructions in the primitives +#define USE_BMI2 0 + +// AGO configuration +#define USE_AGO_CANNY_SOBEL_SUPP_THRESHOLD 0 // 0:seperate-sobel-and-nonmaxsupression 1:combine-sobel-and-nonmaxsupression +#define AGO_MEMORY_ALLOC_EXTRA_PADDING 64 // extra bytes to the left and right of buffer allocations +#define AGO_MAX_DEPTH_FROM_DELAY_OBJECT 4 // number of levels from delay object to low-level object + +// AGO internal error codes for debug +#define AGO_SUCCESS 0 // operation is successful +#define AGO_ERROR_FEATURE_NOT_IMPLEMENTED -1 // TBD: this needs to be set to -ve number +#define AGO_ERROR_KERNEL_NOT_IMPLEMENTED -1 // TBD: this needs to be set to -ve number +#define AGO_ERROR_HAFCPU_NOT_IMPLEMENTED -1 // TBD: this needs to be set to -ve number + +// AGO kernel flags that are part of kernel configuration +#define AGO_KERNEL_FLAG_GROUP_MASK 0x000f // kernel group mask +#define AGO_KERNEL_FLAG_GROUP_AMDLL 0x0000 // kernel group: AMD low-level kernels +#define AGO_KERNEL_FLAG_GROUP_OVX10 0x0001 // kernel group: OpenVX 1.0 built-in kernels +#define AGO_KERNEL_FLAG_GROUP_USER 0x0002 // kernel group: User kernels +#define AGO_KERNEL_FLAG_DEVICE_MASK 0x00f0 // kernel device mask +#define AGO_KERNEL_FLAG_DEVICE_CPU 0x0010 // kernel device: CPU (shall be same as AGO_TARGET_AFFINITY_CPU) +#define AGO_KERNEL_FLAG_DEVICE_GPU 0x0020 // kernel device: GPU (shall be same as AGO_TARGET_AFFINITY_GPU) +#define AGO_KERNEL_FLAG_GPU_INTEG_MASK 0x0f00 // kernel GPU integration type mask +#define AGO_KERNEL_FLAG_GPU_INTEG_NONE 0x0000 // kernel GPU integration: no integration needed +#define AGO_KERNEL_FLAG_GPU_INTEG_FULL 0x0100 // kernel GPU integration: full OpenCL kernel supplied +#define AGO_KERNEL_FLAG_GPU_INTEG_M2R 0x0200 // kernel GPU integration: need OpenCL kernel generation (MEM2REG) +#define AGO_KERNEL_FLAG_GPU_INTEG_R2R 0x0400 // kernel GPU integration: need OpenCL kernel generation (REG2REG) +#define AGO_KERNEL_FLAG_SUBGRAPH 0x1000 // kernel is a subgraph + +// AGO default target priority +#if ENABLE_OPENCL +#define AGO_KERNEL_TARGET_DEFAULT AGO_KERNEL_FLAG_DEVICE_GPU // pick CPU or GPU +#else +#define AGO_KERNEL_TARGET_DEFAULT AGO_KERNEL_FLAG_DEVICE_CPU // pick CPU or GPU +#endif + +// AGO kernel argument flags +#define AGO_KERNEL_ARG_INPUT_FLAG 0x01 // argument is input +#define AGO_KERNEL_ARG_OUTPUT_FLAG 0x02 // argument is output +#define AGO_KERNEL_ARG_OPTIONAL_FLAG 0x04 // argument is optional + +// AGO kernel operation type info +#define AGO_KERNEL_OP_TYPE_UNKNOWN 0 // unknown +#define AGO_KERNEL_OP_TYPE_ELEMENT_WISE 1 // element wise operation +#define AGO_KERNEL_OP_TYPE_FIXED_NEIGHBORS 2 // filtering operation with 
fixed neighborhood + +// AGO magic code +#define AGO_MAGIC_VALID 0xC001C0DE // magic code: reference is valid +#define AGO_MAGIC_INVALID 0xC0FFC0DE // magic code: reference is invalid + +// AGO limites +#define AGO_MAX_CONVOLUTION_DIM 9 // maximum size of convolution matrix +#define AGO_OPTICALFLOWPYRLK_MAX_DIM 15 // maximum size of opticalflow block size + +// AGO remap data precision +#define AGO_REMAP_FRACTIONAL_BITS 3 // number of fractional bits in re-map locations +#define AGO_REMAP_CONSTANT_BORDER_VALUE 0xffff // corrdinate value indicating out of border for constant fills + +// AGO buffer sync flags +#define AGO_BUFFER_SYNC_FLAG_DIRTY_MASK 0x0000000f // dirty bit mask +#define AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT 0x00000001 // buffer dirty by user +#define AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE 0x00000002 // buffer dirty by node +#define AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL 0x00000004 // OpenCL buffer dirty by node +#define AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED 0x00000008 // OpenCL buffer has been synced + +// AGO graph optimizer +#define AGO_GRAPH_OPTIMIZER_FLAG_NO_DIVIDE 0x00000001 // don't run drama divide +#define AGO_GRAPH_OPTIMIZER_FLAG_NO_REMOVE_COPY_NODES 0x00000002 // don't remove unnecessary copy operations +#define AGO_GRAPH_OPTIMIZER_FLAG_NO_REMOVE_UNUSED_OUTPUTS 0x00000004 // don't remove nodes with unused outputs +#define AGO_GRAPH_OPTIMIZER_FLAG_NO_NODE_MERGE 0x00000008 // don't perform node merge +#define AGO_GRAPH_OPTIMIZER_FLAG_NO_CONVERT_8BIT_TO_1BIT 0x00000010 // don't convert 8-bit images to 1-bit images +#define AGO_GRAPH_OPTIMIZER_FLAG_NO_SUPERNODE_MERGE 0x00000020 // don't merge supernodes +#define AGO_GRAPH_OPTIMIZER_FLAGS_DEFAULT 0x00000000 // default options + +#if ENABLE_OPENCL +// bit-fields of opencl_type +#define NODE_OPENCL_TYPE_REG2REG 1 // register to register +#define NODE_OPENCL_TYPE_MEM2REG 2 // memory to register +#define NODE_OPENCL_TYPE_NEED_IMGSIZE 8 // need image size as argument for memory operation +#define NODE_OPENCL_TYPE_FULL_KERNEL 16 // node is a single kernel +#define NODE_OPENCL_TYPE_ATOMIC 32 // argument has atomic output +// additional bit-fields for dataFlags[] +#define DATA_OPENCL_FLAG_BUFFER (1 << 8) // marks that the data is a buffer +#define DATA_OPENCL_FLAG_NEED_LOAD_R2R (1 << 9) // marks that the data needs to load for REG2REG +#define DATA_OPENCL_FLAG_NEED_LOAD_M2R (1 << 10) // marks that the data needs to load for MEM2REG +#define DATA_OPENCL_FLAG_NEED_LOCAL (1 << 11) // marks that the data needs to load into local buffer +#define DATA_OPENCL_FLAG_DISCARD_PARAM (1 << 12) // marks that the data needs to be discarded +// kernel name +#define NODE_OPENCL_KERNEL_NAME "OpenVX_kernel" +// opencl related constants +#define DATA_OPENCL_ARRAY_OFFSET 16 // first 16 bytes of array buffer will be used for numitems +// opencl configuration flags +#define CONFIG_OPENCL_USE_1_2 0x0001 // use OpenCL 1.2 +#define CONFIG_OPENCL_SVM_MASK 0x00F0 // OpenCL SVM flags mask +#define CONFIG_OPENCL_SVM_ENABLE 0x0010 // use OpenCL SVM +#define CONFIG_OPENCL_SVM_AS_FGS 0x0020 // use OpenCL SVM as fine grain system +#define CONFIG_OPENCL_SVM_AS_CLMEM 0x0040 // use OpenCL SVM as cl_mem +#endif + +// thread scheduling configuration +#define CONFIG_THREAD_DEFAULT 1 // 0:disable 1:enable separate threads for graph scheduling + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// helpful macros +// +#define dimof(x) (sizeof(x)/sizeof(x[0])) +#define FORMAT_STR(fmt) ((const char *)&(fmt)) +#if 
ENABLE_DEBUG_MESSAGES +#define debug_printf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define debug_printf(fmt, ...) +#endif +// ALIGN16 - aligns data to 16 multiple +#define ALIGN16(x) ((((size_t)(x))+15)&~15) + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// ago data types +// +#define AgoReference _vx_reference +#define AgoContext _vx_context +#define AgoGraph _vx_graph +#define AgoKernel _vx_kernel +#define AgoNode _vx_node +#define AgoParameter _vx_parameter +#define AgoMetaFormat _vx_meta_format +typedef enum { + ago_kernel_cmd_execute, + ago_kernel_cmd_validate, + ago_kernel_cmd_get_image_plane_nonusage, + ago_kernel_cmd_initialize, + ago_kernel_cmd_shutdown, + ago_kernel_cmd_query_target_support, +#if ENABLE_OPENCL + ago_kernel_cmd_opencl_codegen, +#endif +} AgoKernelCommand; +struct AgoNode; +struct AgoContext; +struct AgoData; +struct AgoReference { + void * dispatchTbl; // dispatch table to support Installable Client Driver (ICD) loader + vx_uint32 magic; // shall be always be AGO_MAGIC + vx_enum type; // object type + AgoContext * context; // context + AgoReference * scope; // scope parent -- for virtual objects, this will be graph + vx_uint32 external_count; // user usage count -- can't be free when > 0, can't be access when == 0 + vx_uint32 internal_count; // framework usage count -- can't be free when > 0 + vx_uint32 read_count; // number of times object has been read + vx_uint32 write_count; // number of times object has been written + bool hint_serialize; // serialize hint + bool enable_logging; // enable logging + bool read_only; // read only + vx_status status; // error status +public: + AgoReference(); + ~AgoReference(); +}; +struct AgoConfigDelay { + vx_enum type; + vx_int32 age; + vx_uint32 count; +}; +struct AgoConfigArray { + vx_enum itemtype; + vx_size numitems; + vx_size capacity; + vx_size itemsize; +}; +struct AgoConfigConvolution { + vx_size rows; + vx_size columns; + vx_uint32 shift; + bool is_separable; +}; +struct AgoConfigDistribution { + vx_size numbins; + vx_int32 offset; + vx_uint32 range; + vx_uint32 window; +}; +struct AgoConfigImage { + vx_uint32 width; + vx_uint32 height; + vx_df_image format; + vx_uint32 stride_in_bytes; + vx_size pixel_size_in_bits; + vx_size components; + vx_size planes; + vx_bool isVirtual; + vx_bool isUniform; + vx_size uniform[4]; + vx_bool isROI; + vx_rectangle_t rect_roi; + vx_rectangle_t rect_valid; + AgoData * roiMasterImage; + vx_bool hasMinMax; + vx_int32 minValue; + vx_int32 maxValue; + vx_color_space_e color_space; + vx_channel_range_e channel_range; + vx_uint32 x_scale_factor_is_2; // will be 0 or 1 + vx_uint32 y_scale_factor_is_2; // will be 0 or 1 + vx_bool enableUserBufferOpenCL; +}; +struct AgoConfigLut { + vx_enum type; + vx_size count; +}; +struct AgoConfigMatrix { + vx_enum type; + vx_size columns; + vx_size rows; + vx_size itemsize; +}; +struct AgoConfigPyramid { + vx_uint32 width; + vx_uint32 height; + vx_df_image format; + vx_float32 scale; + vx_size levels; + vx_bool isVirtual; + vx_rectangle_t rect_valid; +}; +struct AgoConfigRemap { + vx_uint32 src_width; + vx_uint32 src_height; + vx_uint32 dst_width; + vx_uint32 dst_height; + vx_uint32 remap_fractional_bits; +}; +struct AgoConfigScalar { + vx_enum type; + union { + vx_enum e; + vx_float32 f; + vx_int32 i; + vx_uint32 u; + vx_df_image df; + vx_size s; + vx_int64 i64; + vx_uint64 u64; + vx_float64 f64; + } u; + vx_size itemsize; +}; +struct AgoConfigThreshold { + vx_enum thresh_type; + 
vx_enum data_type; + vx_int32 threshold_lower, threshold_upper; + vx_int32 true_value, false_value; +}; +struct AgoConfigCannyStack { + vx_uint32 count; + vx_uint32 stackTop; +}; +struct AgoConfigScaleMatrix { + vx_float32 xscale; + vx_float32 yscale; + vx_float32 xoffset; + vx_float32 yoffset; +}; +struct AgoTargetAffinityInfo_ { // NOTE: make sure that this data structure is identical to AgoTargetAffinityInfo in vx_amd_ext.h + vx_uint32 device_type; + vx_uint32 device_info; + vx_uint32 group; + vx_uint32 reserved; +}; +struct MappedData { + void * ptr; + vx_enum usage; + bool used_external_ptr; + vx_size stride; +}; +struct AgoData { + AgoReference ref; + AgoData * next; + std::string name; + union { + AgoConfigDelay delay; + AgoConfigArray arr; + AgoConfigConvolution conv; + AgoConfigDistribution dist; + AgoConfigImage img; + AgoConfigLut lut; + AgoConfigMatrix mat; + AgoConfigPyramid pyr; + AgoConfigRemap remap; + AgoConfigScalar scalar; + AgoConfigThreshold thr; + AgoConfigCannyStack cannystack; + AgoConfigScaleMatrix scalemat; + } u; + vx_delta_rectangle_t delta; + vx_size size; + vx_import_type_e import_type; + vx_uint8 * buffer; + vx_uint8 * buffer_allocated; + vx_uint8 * reserved; + vx_uint8 * reserved_allocated; + vx_uint32 buffer_sync_flags; +#if ENABLE_OPENCL + cl_mem opencl_buffer; + cl_mem opencl_buffer_allocated; +#endif + vx_uint8 * opencl_svm_buffer; + vx_uint8 * opencl_svm_buffer_allocated; + vx_uint32 opencl_buffer_offset; + vx_bool isVirtual; + vx_bool isDelayed; + vx_bool isNotFullyConfigured; + vx_bool isInitialized; + vx_int32 siblingIndex; + vx_uint32 numChildren; + AgoData ** children; + AgoData * parent; + vx_uint32 inputUsageCount, outputUsageCount, inoutUsageCount; + std::list mapped; + vx_uint32 hierarchical_level; + vx_uint32 hierarchical_life_start; + vx_uint32 hierarchical_life_end; +public: + AgoData(); + ~AgoData(); +}; +struct AgoDataList { + vx_uint32 count; + AgoData * head; + AgoData * tail; + AgoData * trash; +}; +struct AgoMetaFormat { + vx_enum type; + AgoData data; +}; +struct AgoParameter { + AgoReference ref; + AgoReference * scope; + vx_uint32 index; + vx_direction_e direction; + vx_enum type; + vx_parameter_state_e state; +public: + AgoParameter(); + ~AgoParameter(); +}; +struct AgoKernel { + AgoReference ref; + AgoKernel * next; + vx_enum id; + vx_char name[VX_MAX_KERNEL_NAME]; + vx_uint32 flags; + int(*func)(AgoNode * node, AgoKernelCommand cmd); + vx_uint32 argCount; + vx_uint8 argConfig[AGO_MAX_PARAMS]; + vx_enum argType[AGO_MAX_PARAMS]; + vx_uint8 kernOpType; + vx_uint8 kernOpInfo; + AgoParameter parameters[AGO_MAX_PARAMS]; + vx_size localDataSize; + vx_uint8 * localDataPtr; + bool external_kernel; + bool finalized; + vx_kernel_f kernel_f; + vx_kernel_input_validate_f input_validate_f; + vx_kernel_output_validate_f output_validate_f; + vx_kernel_initialize_f initialize_f; + vx_kernel_deinitialize_f deinitialize_f; + amd_kernel_query_target_support_f query_target_support_f; + amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f; + amd_kernel_node_regen_callback_f regen_callback_f; + vx_uint32 importing_module_index_plus1; +public: + AgoKernel(); + ~AgoKernel(); +}; +struct AgoSuperNodeDataInfo { + vx_uint32 data_type_flags; + bool needed_as_a_kernel_argument; + vx_uint32 argument_usage[3]; // VX_INPUT, VX_OUTPUT, VX_BIDIRECTIONAL + vx_uint32 local_buffer_size_in_bytes; +}; +struct AgoSuperNode { + AgoSuperNode * next; + vx_uint32 group; + vx_uint32 width; + vx_uint32 height; + std::vector nodeList; + std::vector dataList; + 
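AgoData keeps every type-specific configuration in the single union 'u' and relies on ref.type as the discriminator, so access throughout the implementation follows a check-then-access pattern. Below is a small hypothetical helper showing that idiom, assuming ago_internal.h is available.

```cpp
#include "ago_internal.h"   // assumed include for AgoData and the VX_TYPE_* values

// ref.type tells which member of the 'u' union is live; only images and
// pyramids carry a width here, everything else reports 0.
static vx_uint32 exampleObjectWidth(const AgoData * data)
{
    if (data->ref.type == VX_TYPE_IMAGE)   return data->u.img.width;
    if (data->ref.type == VX_TYPE_PYRAMID) return data->u.pyr.width;
    return 0;
}
```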
std::vector dataListForAgeDelay; + std::vector dataInfo; + std::string opencl_code; + bool launched; + bool isGpuOclSuperNode; +#if ENABLE_OPENCL + cl_command_queue opencl_cmdq; + cl_program opencl_program; + cl_kernel opencl_kernel; + cl_event opencl_event; + size_t opencl_global_work[2]; +#endif + vx_status status; + vx_perf_t perf; +public: + AgoSuperNode(); + ~AgoSuperNode(); +}; +struct AgoNode { + AgoReference ref; + AgoNode * next; + AgoKernel * akernel; + vx_uint32 flags; + vx_border_mode_t attr_border_mode; + AgoTargetAffinityInfo_ attr_affinity; + vx_size localDataSize; + vx_uint8 * localDataPtr; + vx_uint8 * localDataPtr_allocated; + vx_uint32 paramCount; + AgoData * paramList[AGO_MAX_PARAMS]; + AgoData * paramListForAgeDelay[AGO_MAX_PARAMS]; + AgoParameter parameters[AGO_MAX_PARAMS]; + AgoMetaFormat metaList[AGO_MAX_PARAMS]; + vx_int32 funcExchange[AGO_MAX_PARAMS]; + vx_nodecomplete_f callback; + AgoSuperNode * supernode; + bool initialized; + vx_rectangle_t rect_valid; + vx_uint32 target_support_flags; + vx_uint32 hierarchical_level; + vx_status status; + vx_perf_t perf; +#if ENABLE_OPENCL + vx_uint32 opencl_type; + char opencl_name[VX_MAX_KERNEL_NAME]; + std::string opencl_code; + std::string opencl_build_options; + struct { bool enable; int paramIndexScalar; int paramIndexArray; } opencl_scalar_array_output_sync; + vx_uint32 opencl_param_mem2reg_mask; + vx_uint32 opencl_param_discard_mask; + vx_uint32 opencl_param_atomic_mask; + vx_uint32 opencl_local_buffer_usage_mask; + vx_uint32 opencl_local_buffer_size_in_bytes; + vx_uint32 opencl_work_dim; + size_t opencl_global_work[3]; + size_t opencl_local_work[3]; + vx_uint32 opencl_compute_work_multiplier; + vx_uint32 opencl_compute_work_param_index; + vx_uint32 opencl_output_array_param_index_plus1; + cl_program opencl_program; + cl_kernel opencl_kernel; + cl_event opencl_event; +#endif +public: + AgoNode(); + ~AgoNode(); +}; +struct AgoUserStruct { + vx_enum id; + vx_size size; + std::string name; + vx_uint32 importing_module_index_plus1; +}; +struct AgoKernelList { + vx_uint32 count; + AgoKernel * head; + AgoKernel * tail; +}; +struct AgoNodeList { + vx_uint32 count; + AgoNode * head; + AgoNode * tail; + AgoNode * trash; +}; +struct AgoGraph { + AgoReference ref; + AgoGraph * next; + CRITICAL_SECTION cs; + HANDLE hThread, hSemToThread, hSemFromThread; + vx_int32 threadScheduleCount, threadExecuteCount, threadWaitCount, threadThreadTerminationState; + AgoDataList dataList; + AgoNodeList nodeList; + vx_bool isReadyToExecute; + bool detectedInvalidNode; + vx_int32 status; + vx_perf_t perf; + struct AgoGraphPerfInternalInfo_ { // shall be identical to AgoGraphPerfInternalInfo in amd_ext_amd.h + vx_uint64 kernel_enqueue; + vx_uint64 kernel_wait; + vx_uint64 buffer_read; + vx_uint64 buffer_write; + }; + AgoGraphPerfInternalInfo_ opencl_perf, opencl_perf_total; + vx_uint32 virtualDataGenerationCount; + vx_uint32 optimizer_flags; + bool verified; + std::vector parameters; +#if ENABLE_OPENCL + std::vector opencl_nodeListQueued; + AgoSuperNode * supernodeList; + cl_command_queue opencl_cmdq; + cl_device_id opencl_device; +#endif + AgoTargetAffinityInfo_ attr_affinity; +public: + AgoGraph(); + ~AgoGraph(); +}; +struct AgoGraphList { + vx_uint32 count; + AgoGraph * head; + AgoGraph * tail; +}; +struct ModuleData { + char module_name[256]; + char module_path[1024]; + ago_module hmodule; +}; +struct MacroData { + char name[256]; + char * text; + char * text_allocated; +}; +struct AgoContext { + AgoReference ref; + vx_uint64 perfNormFactor; + 
CRITICAL_SECTION cs; + AgoKernelList kernelList; + AgoDataList dataList; + AgoGraphList graphList; + std::vector userStructList; + vx_uint32 dataGenerationCount; + vx_enum nextUserStructId; + vx_uint32 num_active_modules; + vx_uint32 num_active_references; + vx_border_mode_t immediate_border_mode; + vx_log_callback_f callback_log; + vx_bool callback_reentrant; + vx_uint32 thread_config; + vx_char extensions[256]; + std::vector modules; + std::vector macros; + std::vector merge_rules; + vx_uint32 importing_module_index_plus1; + AgoData * graph_garbage_data; + AgoNode * graph_garbage_node; + AgoGraph * graph_garbage_list; +#if ENABLE_OPENCL + bool opencl_context_imported; + cl_context opencl_context; + cl_command_queue opencl_cmdq; + vx_uint32 opencl_config_flags; + char opencl_extensions[1024]; + cl_device_svm_capabilities opencl_svmcaps; + cl_uint opencl_num_devices; + cl_device_id opencl_device_list[16]; + char opencl_build_options[256]; +#endif + AgoTargetAffinityInfo_ attr_affinity; +public: + AgoContext(); + ~AgoContext(); +}; +struct AgoAllocInfo { + void * allocated; + vx_size requested_size; + vx_int32 retain_count; + vx_int32 allocate_id; +}; + +struct _vx_array { AgoData d; }; +struct _vx_convolution { AgoData d; }; +struct _vx_delay { AgoData d; }; +struct _vx_distribution { AgoData d; }; +struct _vx_image { AgoData d; }; +struct _vx_lut { AgoData d; }; +struct _vx_matrix { AgoData d; }; +struct _vx_pyramid { AgoData d; }; +struct _vx_remap { AgoData d; }; +struct _vx_scalar { AgoData d; }; +struct _vx_threshold { AgoData d; }; + +// framework +void * agoAllocMemory(vx_size size); +void agoRetainMemory(void * mem); +void agoReleaseMemory(void * mem); +int agoChannelEnum2Index(vx_enum channel); +const char * agoEnum2Name(vx_enum e); +size_t agoType2Size(vx_context context, vx_enum type); +vx_enum agoName2Enum(const char * name); +void agoResetReference(AgoReference * ref, vx_enum type, vx_context context, vx_reference scope); +void agoAddData(AgoDataList * dataList, AgoData * data); +void agoAddNode(AgoNodeList * nodeList, AgoNode * node); +void agoAddKernel(AgoKernelList * kernelList, AgoKernel * kernel); +void agoAddGraph(AgoGraphList * graphList, AgoGraph * graph); +vx_enum agoAddUserStruct(AgoContext * acontext, vx_size size, vx_char * name); +AgoGraph * agoRemoveGraph(AgoGraphList * list, AgoGraph * item); +int agoRemoveNode(AgoNodeList * nodeList, AgoNode * node, bool moveToTrash); +int agoShutdownNode(AgoNode * node); +int agoRemoveData(AgoDataList * list, AgoData * item, AgoData ** trash); +AgoKernel * agoRemoveKernel(AgoKernelList * list, AgoKernel * item); +void agoRemoveDataInGraph(AgoGraph * agraph, AgoData * data); +void agoReplaceDataInGraph(AgoGraph * agraph, AgoData * dataFind, AgoData * dataReplace); +void agoResetDataList(AgoDataList * dataList); +void agoResetNodeList(AgoNodeList * nodeList); +void agoResetKernelList(AgoKernelList * kernelList); +vx_size agoGetUserStructSize(AgoContext * acontext, vx_char * name); +vx_size agoGetUserStructSize(AgoContext * acontext, vx_enum id); +vx_enum agoGetUserStructType(AgoContext * acontext, vx_char * name); +const char * agoGetUserStructName(AgoContext * acontext, vx_enum id); +AgoKernel * agoFindKernelByEnum(AgoContext * acontext, vx_enum kernel_id); +AgoKernel * agoFindKernelByName(AgoContext * acontext, const vx_char * name); +AgoData * agoFindDataByName(AgoContext * acontext, AgoGraph * agraph, vx_char * name); +void agoMarkChildrenAsPartOfDelay(AgoData * adata); +bool agoIsPartOfDelay(AgoData * adata); +AgoData * 
agoGetSiblingTraceToDelay(AgoData * data, int trace[], int& traceCount); +AgoData * agoGetDataFromSiblingTrace(AgoData * data, int trace[], int traceCount); +void agoGetDescriptionFromData(AgoContext * acontext, char * desc, AgoData * data); +int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData * data, const char * desc); +AgoData * agoCreateDataFromDescription(AgoContext * acontext, AgoGraph * agraph, const char * desc, bool isForExternalUse); +void agoGenerateDataName(AgoContext * acontext, const char * postfix, std::string& name); +void agoGenerateVirtualDataName(AgoGraph * agraph, const char * postfix, std::string& name); +int agoGetImageComponentsAndPlanes(vx_df_image format, vx_size * pComponents, vx_size * pPlanes, vx_size * pPixelSizeInBits, vx_color_space_e * pColorSpace, vx_channel_range_e * pChannelRange); +int agoGetImagePlaneFormat(vx_df_image format, vx_uint32 width, vx_uint32 height, vx_uint32 plane, vx_df_image *pFormat, vx_uint32 * pWidth, vx_uint32 * pHeight); +void agoGetDataName(vx_char * name, AgoData * data); +int agoAllocData(AgoData * data); +void agoRetainData(AgoGraph * graph, AgoData * data, bool isForExternalUse); +int agoReleaseData(AgoData * data, bool isForExternalUse); +int agoReleaseKernel(AgoKernel * kernel, bool isForExternalUse); +AgoNode * agoCreateNode(AgoGraph * graph, AgoKernel * kernel); +AgoNode * agoCreateNode(AgoGraph * graph, vx_enum kernel_id); +int agoReleaseNode(AgoNode * node); +vx_status agoVerifyNode(AgoNode * node); +// sanity checks +int agoDataSanityCheckAndUpdate(AgoData * data); +bool agoIsValidReference(AgoReference * ref); +bool agoIsValidContext(AgoContext * context); +bool agoIsValidGraph(AgoGraph * graph); +bool agoIsValidKernel(AgoKernel * kernel); +bool agoIsValidNode(AgoNode * node); +bool agoIsValidParameter(AgoParameter * parameter); +bool agoIsValidData(AgoData * data, vx_enum type); +// kernels +int agoPublishKernels(AgoContext * acontext); +// drama +int agoOptimizeDrama(AgoGraph * agraph); +void agoOptimizeDramaMarkDataUsage(AgoGraph * agraph); +int agoOptimizeDramaComputeGraphHierarchy(AgoGraph * graph); +void agoOptimizeDramaSortGraphHierarchy(AgoGraph * graph); +int agoOptimizeDramaCheckArgs(AgoGraph * agraph); +int agoOptimizeDramaDivide(AgoGraph * agraph); +int agoOptimizeDramaRemove(AgoGraph * agraph); +int agoOptimizeDramaAnalyze(AgoGraph * agraph); +int agoOptimizeDramaMerge(AgoGraph * agraph); +int agoOptimizeDramaAlloc(AgoGraph * agraph); +// import +void agoImportKernelConfig(AgoKernel * kernel, vx_kernel vxkernel); +void agoImportNodeConfig(AgoNode * node, vx_node vxnode); +void agoImportDataConfig(AgoData * data, vx_reference vxref, AgoGraph * graph); +// string processing +void agoEvaluateIntegerExpression(char * expr); +// performance +void agoPerfCaptureReset(vx_perf_t * perf); +void agoPerfCaptureStart(vx_perf_t * perf); +void agoPerfCaptureStop(vx_perf_t * perf); +void agoPerfCopyNormalize(AgoContext * context, vx_perf_t * perfDst, vx_perf_t * perfSrc); +// log +void agoAddLogEntry(AgoReference * ref, vx_status status, const char *message, ...); +#if ENABLE_OPENCL +// OpenCL +int agoGpuOclDataSetBufferAsKernelArg(AgoData * data, cl_kernel opencl_kernel, vx_uint32 kernelArgIndex, vx_uint32 group); +int agoGpuOclReleaseContext(AgoContext * context); +int agoGpuOclReleaseGraph(AgoGraph * graph); +int agoGpuOclReleaseSuperNode(AgoSuperNode * supernode); +int agoGpuOclReleaseData(AgoData * data); +int agoGpuOclCreateContext(AgoContext * context, cl_context opencl_context); +int 
agoGpuOclAllocBuffer(AgoData * data); +int agoGpuOclAllocBuffers(AgoGraph * graph, AgoNode * node); +int agoGpuOclSuperNodeMerge(AgoGraph * graph, AgoSuperNode * supernode, AgoNode * node); +int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode); +int agoGpuOclSuperNodeLaunch(AgoGraph * graph, AgoSuperNode * supernode); +int agoGpuOclSuperNodeWait(AgoGraph * graph, AgoSuperNode * supernode); +int agoGpuOclSingleNodeFinalize(AgoGraph * graph, AgoNode * node); +int agoGpuOclSingleNodeLaunch(AgoGraph * graph, AgoNode * node); +int agoGpuOclSingleNodeWait(AgoGraph * graph, AgoNode * node); +#endif + +/////////////////////////////////////////////////////////// +// high-level functions +extern "C" typedef void (VX_CALLBACK * ago_data_registry_callback_f) (void * obj, vx_reference ref, const char * name, const char * app_params); +AgoContext * agoCreateContext(); +AgoGraph * agoCreateGraph(AgoContext * acontext); +int agoReleaseGraph(AgoGraph * agraph); +int agoReleaseContext(AgoContext * acontext); +int agoVerifyGraph(AgoGraph * agraph); +int agoOptimizeGraph(AgoGraph * agraph); +int agoInitializeGraph(AgoGraph * agraph); +int agoShutdownGraph(AgoGraph * graph); +int agoExecuteGraph(AgoGraph * agraph); +// scheduling +int agoProcessGraph(AgoGraph * agraph); +int agoScheduleGraph(AgoGraph * agraph); +int agoWaitGraph(AgoGraph * agraph); +int agoWriteGraph(AgoGraph * agraph, AgoReference * * ref, int num_ref, FILE * fp, const char * comment); +int agoReadGraph(AgoGraph * agraph, AgoReference * * ref, int num_ref, ago_data_registry_callback_f callback_f, void * callback_obj, FILE * fp, vx_int32 dumpToConsole); +int agoReadGraphFromString(AgoGraph * agraph, AgoReference * * ref, int num_ref, ago_data_registry_callback_f callback_f, void * callback_obj, char * str, vx_int32 dumpToConsole); +int agoLoadModule(AgoContext * context, const char * module); +vx_status agoGraphDumpPerformanceProfile(AgoGraph * graph, const char * fileName); +vx_status agoDirective(vx_reference reference, vx_enum directive); + +/////////////////////////////////////////////////////////// +// locks +void agoLockGlobalContext(); +void agoUnlockGlobalContext(); +class CAgoLockGlobalContext { +public: + CAgoLockGlobalContext() { agoLockGlobalContext(); } + ~CAgoLockGlobalContext() { agoUnlockGlobalContext(); } +}; +class CAgoLock { +public: + CAgoLock(CRITICAL_SECTION& cs) { m_cs = &cs; EnterCriticalSection(m_cs); } + ~CAgoLock() { LeaveCriticalSection(m_cs); } +private: + CRITICAL_SECTION * m_cs; +}; + +inline int leftmostbit(unsigned int n) { + int pos = 31; + while (pos >= 0 && !(n & (1 << pos))) + pos--; + return pos; +} + +#endif // __ago_internal_h__ diff --git a/openvx/ago/ago_kernel_api.cpp b/openvx/ago/ago_kernel_api.cpp new file mode 100644 index 0000000..6f189e6 --- /dev/null +++ b/openvx/ago/ago_kernel_api.cpp @@ -0,0 +1,17346 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" +#include "ago_kernel_api.h" +#include "ago_haf_gpu.h" + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Local Utility Functions +// +static int ValidateArguments_Img_1IN(AgoNode * node, vx_df_image fmtIn) +{ + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != fmtIn) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + return VX_SUCCESS; +} +static int ValidateArguments_Img_1OUT(AgoNode * node, vx_df_image fmtOut) +{ + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != fmtOut) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut; + meta->data.u.img.rect_valid.start_x = 0; + meta->data.u.img.rect_valid.start_y = 0; + meta->data.u.img.rect_valid.end_x = (int)width - 1; + meta->data.u.img.rect_valid.end_y = (int)height - 1; + return VX_SUCCESS; +} +static int ValidateArguments_Img_1OUT_1IN(AgoNode * node, vx_df_image fmtOut, vx_df_image fmtIn, bool bShrinkValidRegion = false, int shrinkValidRegion_x = 0, int shrinkValidRegion_y = 0) +{ + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != fmtIn) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[1]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[1]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[1]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[1]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + } + return VX_SUCCESS; +} +static int ValidateArguments_Img_2OUT_1IN(AgoNode * node, vx_df_image fmtOut1, vx_df_image 
fmtOut2, vx_df_image fmtIn, bool bShrinkValidRegion = false, int shrinkValidRegion_x = 0, int shrinkValidRegion_y = 0) +{ + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != fmtIn) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut1; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[2]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[2]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[2]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[2]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + } + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut2; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[2]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[2]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[2]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[2]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + } + return VX_SUCCESS; +} +static int ValidateArguments_Img_3OUT_1IN(AgoNode * node, vx_df_image fmtOut1, vx_df_image fmtOut2, vx_df_image fmtOut3, vx_df_image fmtIn, bool bShrinkValidRegion = false, int shrinkValidRegion_x = 0, int shrinkValidRegion_y = 0) +{ + // validate parameters + vx_uint32 width = node->paramList[3]->u.img.width; + vx_uint32 height = node->paramList[3]->u.img.height; + if (node->paramList[3]->u.img.format != fmtIn) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut1; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[3]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[3]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = 
max((int)node->paramList[3]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[3]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + } + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut2; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[3]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[3]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[3]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[3]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + } + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut3; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[3]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[3]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[3]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[3]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + } + return VX_SUCCESS; +} +static int ValidateArguments_Img_4OUT_1IN(AgoNode * node, vx_df_image fmtOut1, vx_df_image fmtOut2, vx_df_image fmtOut3, vx_df_image fmtOut4, vx_df_image fmtIn, bool bShrinkValidRegion = false, int shrinkValidRegion_x = 0, int shrinkValidRegion_y = 0) +{ + // validate parameters + vx_uint32 width = node->paramList[4]->u.img.width; + vx_uint32 height = node->paramList[4]->u.img.height; + if (node->paramList[4]->u.img.format != fmtIn) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut1; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[4]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = 
min(node->paramList[4]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[4]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[4]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[4]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[4]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[4]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[4]->u.img.rect_valid.end_y; + } + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut2; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[4]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[4]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[4]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[4]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[4]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[4]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[4]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[4]->u.img.rect_valid.end_y; + } + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut3; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[4]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[4]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[4]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[4]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[4]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[4]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[4]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[4]->u.img.rect_valid.end_y; + } + meta = &node->metaList[3]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut4; + if (bShrinkValidRegion) + { + meta->data.u.img.rect_valid.start_x = min(node->paramList[4]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[4]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[4]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[4]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + } + else + { + meta->data.u.img.rect_valid.start_x = node->paramList[4]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[4]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = 
node->paramList[4]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[4]->u.img.rect_valid.end_y; + } + return VX_SUCCESS; +} +static int ValidateArguments_Img_1OUT_1IN_S(AgoNode * node, vx_df_image fmtOut, vx_df_image fmtIn, vx_enum scalarType, bool bShrinkValidRegion = false, int shrinkValidRegion_x = 0, int shrinkValidRegion_y = 0) +{ + int status = ValidateArguments_Img_1OUT_1IN(node, fmtOut, fmtIn, bShrinkValidRegion, shrinkValidRegion_x, shrinkValidRegion_y); + if (!status) { + if (node->paramList[2]->u.scalar.type != scalarType) + return VX_ERROR_INVALID_TYPE; + } + return status; +} +static int ValidateArguments_Img_1OUT_1IN_2S(AgoNode * node, vx_df_image fmtOut, vx_df_image fmtIn, vx_enum scalarType, vx_enum scalarType2, bool bShrinkValidRegion = false, int shrinkValidRegion_x = 0, int shrinkValidRegion_y = 0) +{ + int status = ValidateArguments_Img_1OUT_1IN(node, fmtOut, fmtIn, bShrinkValidRegion, shrinkValidRegion_x, shrinkValidRegion_y); + if (!status) { + if (node->paramList[2]->u.scalar.type != scalarType || node->paramList[3]->u.scalar.type != scalarType2) + return VX_ERROR_INVALID_TYPE; + } + return status; +} +static int ValidateArguments_Img_1OUT_1IN_3S(AgoNode * node, vx_df_image fmtOut, vx_df_image fmtIn, vx_enum scalarType, vx_enum scalarType2, vx_enum scalarType3, bool bShrinkValidRegion = false, int shrinkValidRegion_x = 0, int shrinkValidRegion_y = 0) +{ + int status = ValidateArguments_Img_1OUT_1IN(node, fmtOut, fmtIn, bShrinkValidRegion, shrinkValidRegion_x, shrinkValidRegion_y); + if (!status) { + if (node->paramList[2]->u.scalar.type != scalarType || node->paramList[3]->u.scalar.type != scalarType2 || node->paramList[4]->u.scalar.type != scalarType3) + return VX_ERROR_INVALID_TYPE; + } + return status; +} +static int ValidateArguments_Img_1OUT_2IN(AgoNode * node, vx_df_image fmtOut, vx_df_image fmtIn1, vx_df_image fmtIn2) +{ + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != fmtIn1 || node->paramList[2]->u.img.format != fmtIn2) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[2]->u.img.width || height != node->paramList[2]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut; + meta->data.u.img.rect_valid.start_x = max(node->paramList[1]->u.img.rect_valid.start_x, node->paramList[2]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[1]->u.img.rect_valid.start_y, node->paramList[2]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[1]->u.img.rect_valid.end_x, node->paramList[2]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[1]->u.img.rect_valid.end_y, node->paramList[2]->u.img.rect_valid.end_y); + return VX_SUCCESS; +} +static int ValidateArguments_Img_1OUT_2IN_S(AgoNode * node, vx_df_image fmtOut, vx_df_image fmtIn1, vx_df_image fmtIn2, vx_enum scalarType) +{ + int status = ValidateArguments_Img_1OUT_2IN(node, fmtOut, fmtIn1, fmtIn2); + if (!status) { + if (node->paramList[3]->u.scalar.type != scalarType) + return VX_ERROR_INVALID_TYPE; + } + return status; +} +static int ValidateArguments_Img_1OUT_3IN(AgoNode * node, vx_df_image fmtOut, vx_df_image 
fmtIn1, vx_df_image fmtIn2, vx_df_image fmtIn3) +{ + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != fmtIn1 || node->paramList[2]->u.img.format != fmtIn2 || + node->paramList[3]->u.img.format != fmtIn3) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[2]->u.img.width || height != node->paramList[2]->u.img.height || + width != node->paramList[3]->u.img.width || height != node->paramList[3]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut; + meta->data.u.img.rect_valid.start_x = max(node->paramList[1]->u.img.rect_valid.start_x, max(node->paramList[2]->u.img.rect_valid.start_x, node->paramList[3]->u.img.rect_valid.start_x)); + meta->data.u.img.rect_valid.start_y = max(node->paramList[1]->u.img.rect_valid.start_y, max(node->paramList[2]->u.img.rect_valid.start_y, node->paramList[3]->u.img.rect_valid.start_y)); + meta->data.u.img.rect_valid.end_x = min(node->paramList[1]->u.img.rect_valid.end_x, min(node->paramList[2]->u.img.rect_valid.end_x, node->paramList[3]->u.img.rect_valid.end_x)); + meta->data.u.img.rect_valid.end_y = min(node->paramList[1]->u.img.rect_valid.end_y, min(node->paramList[2]->u.img.rect_valid.end_y, node->paramList[3]->u.img.rect_valid.end_y)); + return VX_SUCCESS; +} +static int ValidateArguments_Img_1OUT_4IN(AgoNode * node, vx_df_image fmtOut, vx_df_image fmtIn1, vx_df_image fmtIn2, vx_df_image fmtIn3, vx_df_image fmtIn4) +{ + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != fmtIn1 || node->paramList[2]->u.img.format != fmtIn2 || + node->paramList[3]->u.img.format != fmtIn3 || node->paramList[4]->u.img.format != fmtIn4) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[2]->u.img.width || height != node->paramList[2]->u.img.height || + width != node->paramList[3]->u.img.width || height != node->paramList[3]->u.img.height || + width != node->paramList[4]->u.img.width || height != node->paramList[4]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = fmtOut; + meta->data.u.img.rect_valid.start_x = max(node->paramList[1]->u.img.rect_valid.start_x, max(node->paramList[2]->u.img.rect_valid.start_x, max(node->paramList[3]->u.img.rect_valid.start_x, node->paramList[4]->u.img.rect_valid.start_x))); + meta->data.u.img.rect_valid.start_y = max(node->paramList[1]->u.img.rect_valid.start_y, max(node->paramList[2]->u.img.rect_valid.start_y, max(node->paramList[3]->u.img.rect_valid.start_y, node->paramList[4]->u.img.rect_valid.start_y))); + meta->data.u.img.rect_valid.end_x = min(node->paramList[1]->u.img.rect_valid.end_x, min(node->paramList[2]->u.img.rect_valid.end_x, min(node->paramList[3]->u.img.rect_valid.end_x, node->paramList[4]->u.img.rect_valid.end_x))); + meta->data.u.img.rect_valid.end_y = min(node->paramList[1]->u.img.rect_valid.end_y, min(node->paramList[2]->u.img.rect_valid.end_y, min(node->paramList[3]->u.img.rect_valid.end_y, 
node->paramList[4]->u.img.rect_valid.end_y))); + return VX_SUCCESS; +} +static int ValidateArguments_CannySuppThreshold_U8(AgoNode * node, vx_df_image fmtIn, int shrinkValidRegion_x, int shrinkValidRegion_y) +{ + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != fmtIn) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + if (node->paramList[2]->u.thr.thresh_type != VX_THRESHOLD_TYPE_RANGE || + (node->paramList[2]->u.thr.data_type != VX_TYPE_UINT8 && node->paramList[2]->u.thr.data_type != VX_TYPE_UINT16 && node->paramList[2]->u.thr.data_type != VX_TYPE_INT16)) + return VX_ERROR_INVALID_TYPE; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = min(node->paramList[1]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[1]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[1]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[1]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + return VX_SUCCESS; +} +static int ValidateArguments_CannySuppThreshold_U8XY(AgoNode * node, vx_df_image fmtIn, int shrinkValidRegion_x, int shrinkValidRegion_y) +{ + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != fmtIn) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + if (node->paramList[3]->u.thr.thresh_type != VX_THRESHOLD_TYPE_RANGE || + (node->paramList[3]->u.thr.data_type != VX_TYPE_UINT8 && node->paramList[3]->u.thr.data_type != VX_TYPE_UINT16 && node->paramList[3]->u.thr.data_type != VX_TYPE_INT16)) + return VX_ERROR_INVALID_TYPE; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = min(node->paramList[2]->u.img.rect_valid.start_x + shrinkValidRegion_x, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[2]->u.img.rect_valid.start_y + shrinkValidRegion_y, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[2]->u.img.rect_valid.end_x - shrinkValidRegion_x, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[2]->u.img.rect_valid.end_y - shrinkValidRegion_y, 0); + return VX_SUCCESS; +} +static int ValidateArguments_OpticalFlowPyrLK_XY_XY(AgoNode * node) +{ + AgoData * oldPyr = node->paramList[1]; + AgoData * newPyr = node->paramList[2]; + AgoData * oldXY = node->paramList[3]; + AgoData * newXYest = node->paramList[4]; + AgoData * termination = node->paramList[5]; + AgoData * epsilon = node->paramList[6]; + AgoData * num_iterations = node->paramList[7]; + AgoData * use_initial_estimate = node->paramList[8]; + if (oldXY->u.arr.itemtype != VX_TYPE_KEYPOINT || newXYest->u.arr.itemtype != VX_TYPE_KEYPOINT || + termination->u.scalar.type != VX_TYPE_ENUM || epsilon->u.scalar.type != VX_TYPE_FLOAT32 || + num_iterations->u.scalar.type != VX_TYPE_UINT32 || use_initial_estimate->u.scalar.type 
!= VX_TYPE_BOOL) + return VX_ERROR_INVALID_TYPE; + else if (oldPyr->u.pyr.format != VX_DF_IMAGE_U8 || newPyr->u.pyr.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!oldPyr->u.pyr.width || !oldPyr->u.pyr.height || !newPyr->u.pyr.width || !newPyr->u.pyr.height || + oldPyr->u.pyr.width != newPyr->u.pyr.width || oldPyr->u.pyr.height != newPyr->u.pyr.height || + !oldXY->u.arr.capacity || !newXYest->u.arr.capacity || oldXY->u.arr.capacity != newXYest->u.arr.capacity) + return VX_ERROR_INVALID_DIMENSION; + else if (termination->u.scalar.u.e != VX_TERM_CRITERIA_ITERATIONS && termination->u.scalar.u.e != VX_TERM_CRITERIA_EPSILON && termination->u.scalar.u.e != VX_TERM_CRITERIA_BOTH) + return VX_ERROR_INVALID_VALUE; + else if (oldPyr->u.pyr.scale != newPyr->u.pyr.scale || oldPyr->u.pyr.levels != newPyr->u.pyr.levels) + return VX_ERROR_INVALID_VALUE; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_KEYPOINT; + meta->data.u.arr.capacity = oldXY->u.arr.capacity; + return VX_SUCCESS; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// OpenVX 1.0 built-in kernels +// +int ovxKernel_Invalid(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: invalid kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_ColorConvert(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_COLOR_CONVERT_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + vx_df_image srcfmt = node->paramList[0]->u.img.format; + if (srcfmt != VX_DF_IMAGE_RGB && srcfmt != VX_DF_IMAGE_RGBX && srcfmt != VX_DF_IMAGE_NV12 && srcfmt != VX_DF_IMAGE_NV21 && + srcfmt != VX_DF_IMAGE_IYUV && srcfmt != VX_DF_IMAGE_YUYV && srcfmt != VX_DF_IMAGE_UYVY) + return VX_ERROR_INVALID_FORMAT; + if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + vx_df_image dstfmt = node->paramList[1]->u.img.format; + if (dstfmt == VX_DF_IMAGE_VIRT) + return VX_ERROR_INVALID_FORMAT; + // set output image size is same as input image + vx_meta_format meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = dstfmt; + meta->data.u.img.rect_valid.start_x = node->paramList[0]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[0]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[0]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[0]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | 
AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_ChannelExtract(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_CHANNEL_COPY_U8_U8 kernel for extracting from planar + // use VX_KERNEL_AMD_CHANNEL_EXTRACT_* kernels for extracting from interleaved + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + vx_df_image srcfmt = node->paramList[0]->u.img.format; + if (srcfmt != VX_DF_IMAGE_RGB && srcfmt != VX_DF_IMAGE_RGBX && srcfmt != VX_DF_IMAGE_NV12 && srcfmt != VX_DF_IMAGE_NV21 && + srcfmt != VX_DF_IMAGE_IYUV && srcfmt != VX_DF_IMAGE_YUYV && srcfmt != VX_DF_IMAGE_UYVY && srcfmt != VX_DF_IMAGE_YUV4) + return VX_ERROR_INVALID_FORMAT; + if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + vx_enum channel = node->paramList[1]->u.scalar.u.e; + int channel_index = agoChannelEnum2Index(channel); + int max_channel_index = (srcfmt == VX_DF_IMAGE_RGBX) ? 3 : 2; + if (node->paramList[1]->u.scalar.type != VX_TYPE_ENUM || channel_index < 0 || channel_index > max_channel_index) + return VX_ERROR_INVALID_VALUE; + // set output image size is same as input image + vx_meta_format meta = &node->metaList[2]; + vx_uint32 x_scale_factor_is_2 = 0, y_scale_factor_is_2 = 0; + if (channel_index > 0) { + if (node->paramList[0]->numChildren > 0) { + x_scale_factor_is_2 = node->paramList[0]->children[1]->u.img.x_scale_factor_is_2; + y_scale_factor_is_2 = node->paramList[0]->children[1]->u.img.y_scale_factor_is_2; + } + else if (srcfmt == VX_DF_IMAGE_YUYV || srcfmt == VX_DF_IMAGE_UYVY) { + x_scale_factor_is_2 = 1; + } + } + meta->data.u.img.width = width >> x_scale_factor_is_2; + meta->data.u.img.height = height >> y_scale_factor_is_2; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[0]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[0]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_y = node->paramList[0]->u.img.rect_valid.end_y; + meta->data.u.img.rect_valid.end_x = node->paramList[0]->u.img.rect_valid.end_x; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_get_image_plane_nonusage) { + status = VX_SUCCESS; + if (node->funcExchange[0] == 0) { + // mark that planes other than the specified channel are not used on input image + vx_enum channel = node->paramList[1]->u.scalar.u.e; + int channel_index = agoChannelEnum2Index(channel); + for (vx_uint32 plane = 0; plane < node->paramList[0]->numChildren; plane++) + node->funcExchange[1 + plane] = (plane != channel_index) ? 
1 : 0; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_ChannelCombine(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_CHANNEL_COPY_U8_U8 kernel for combining into planar + // use VX_KERNEL_AMD_CHANNEL_COMBINE_* kernels for combining into interleaved + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 chroma_x_scale_factor_is_2 = 0, chroma_y_scale_factor_is_2 = 0; + vx_df_image dstfmt = node->paramList[4]->u.img.format; + if (dstfmt == VX_DF_IMAGE_IYUV || dstfmt == VX_DF_IMAGE_NV12 || dstfmt == VX_DF_IMAGE_NV21) + chroma_x_scale_factor_is_2 = chroma_y_scale_factor_is_2 = 1; + else if (dstfmt == VX_DF_IMAGE_YUYV || dstfmt == VX_DF_IMAGE_UYVY) + chroma_x_scale_factor_is_2 = 1; + else if (dstfmt != VX_DF_IMAGE_RGB && dstfmt != VX_DF_IMAGE_RGBX && dstfmt != VX_DF_IMAGE_YUV4) + return VX_ERROR_INVALID_FORMAT; + vx_uint32 planeCount = 2; + if (node->paramList[2]) planeCount++; + else if (node->paramList[3]) planeCount++; + if ((!node->paramList[2] && node->paramList[3]) || (planeCount != (node->paramList[4]->numChildren == 4 ? 4 : 3))) + return VX_ERROR_INVALID_PARAMETERS; + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + for (vx_uint32 plane = 1; plane < planeCount; plane++) { + if (node->paramList[plane]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + if (((node->paramList[plane]->u.img.width << chroma_x_scale_factor_is_2) != width) || + ((node->paramList[plane]->u.img.height << chroma_y_scale_factor_is_2) != height)) + return VX_ERROR_INVALID_DIMENSION; + } + // set output image size is same as input image + vx_meta_format meta = &node->metaList[4]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = dstfmt; + meta->data.u.img.rect_valid.start_x = node->paramList[0]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[0]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[0]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[0]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Sobel3x3(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_SOBEL_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = 
node->paramList[0]->u.img.height; + vx_df_image srcfmt = node->paramList[0]->u.img.format; + if (srcfmt != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_S16; + meta->data.u.img.rect_valid.start_x = min(node->paramList[0]->u.img.rect_valid.start_x + 1, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[0]->u.img.rect_valid.start_y + 1, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[0]->u.img.rect_valid.end_x - 1, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[0]->u.img.rect_valid.end_y - 1, 0); + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_S16; + meta->data.u.img.rect_valid.start_x = min(node->paramList[0]->u.img.rect_valid.start_x + 1, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[0]->u.img.rect_valid.start_y + 1, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[0]->u.img.rect_valid.end_x - 1, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[0]->u.img.rect_valid.end_y - 1, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Magnitude(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_MAGNITUDE_S16_S16S16 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_S16 || node->paramList[1]->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_S16; + meta->data.u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | 
AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Phase(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_PHASE_U8_S16S16 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_S16 || node->paramList[1]->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_ScaleImage(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_SCALE_IMAGE_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[0]->u.img.width || !node->paramList[0]->u.img.height || !node->paramList[1]->u.img.width || !node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_ENUM) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[2]->u.scalar.u.e != VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR && + node->paramList[2]->u.scalar.u.e != VX_INTERPOLATION_TYPE_BILINEAR && + node->paramList[2]->u.scalar.u.e != VX_INTERPOLATION_TYPE_AREA) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = node->paramList[1]->u.img.width; + meta->data.u.img.height = node->paramList[1]->u.img.height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + // set the valid region + vx_float32 widthOut = (vx_float32)node->paramList[1]->u.img.width; + vx_float32 widthIn = (vx_float32)node->paramList[0]->u.img.width; + vx_float32 heightOut = (vx_float32)node->paramList[1]->u.img.height; + vx_float32 heightIn = 
(vx_float32)node->paramList[0]->u.img.height; + meta->data.u.img.rect_valid.start_x = (vx_uint32)(((node->paramList[0]->u.img.rect_valid.start_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.start_y = (vx_uint32)(((node->paramList[0]->u.img.rect_valid.start_y + 0.5f) * heightOut / heightIn) - 0.5f); + meta->data.u.img.rect_valid.end_x = (vx_uint32)(((node->paramList[0]->u.img.rect_valid.end_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.end_y = (vx_uint32)(((node->paramList[0]->u.img.rect_valid.end_y + 0.5f) * heightOut / heightIn) - 0.5f); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_TableLookup(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_LUT_U8_U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[1]->u.lut.type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[0]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[0]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[0]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[0]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Histogram(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_HISTOGRAM_DATA_U8 kernel to get histogram of full/sub-image + // use VX_KERNEL_AMD_HISTOGRAM_MERGE_DATA_DATA kernel if sub-images are scheduled on multi-core + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | 
AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_EqualizeHistogram(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_HISTOGRAM_DATA_U8 kernel to get histogram of full/sub-image + // use VX_KERNEL_AMD_EQUALIZE_DATA_DATA kernel to generate lut from histogram(s) for equalization + // use VX_KERNEL_AMD_LUT_U8_U8 kernels to equalize full/sub-images + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[0]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[0]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[0]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[0]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_AbsDiff(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_ABS_DIFF_U8_U8U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + vx_df_image format = node->paramList[0]->u.img.format; + if ((format != VX_DF_IMAGE_U8 && format != VX_DF_IMAGE_S16) || node->paramList[1]->u.img.format != format) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = format; + meta->data.u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + 
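+ // query_target_support: set target_support_flags to AGO_KERNEL_FLAG_SUBGRAPH plus the CPU target and, when built with ENABLE_OPENCL, the GPU target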
else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_MeanStdDev(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_MEAN_STD_DEV_DATA_U8 kernel to get sum and sum of squares on full/sub-images + // use VX_KERNEL_AMD_MEAN_STD_DEV_MERGE_DATA_DATA kernel to get mean and std-dev + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set scaler output types to FLOAT32 + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_FLOAT32; + meta = &node->metaList[2]; + meta->data.u.scalar.type = VX_TYPE_FLOAT32; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Threshold(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_THRESHOLD_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + if (node->paramList[1]->u.thr.data_type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_TYPE; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[0]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[0]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[0]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[0]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_IntegralImage(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_INTEGRAL_IMAGE_U32_U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = 
node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U32; + meta->data.u.img.rect_valid.start_x = node->paramList[0]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[0]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[0]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[0]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Dilate3x3(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_DILATE_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = min(node->paramList[0]->u.img.rect_valid.start_x + 1, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[0]->u.img.rect_valid.start_y + 1, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[0]->u.img.rect_valid.end_x - 1, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[0]->u.img.rect_valid.end_y - 1, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Erode3x3(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_ERODE_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + 
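// a 3x3 erosion window invalidates a one-pixel border, so the valid region set below shrinks by one pixel on each side (clamped to the image bounds) +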
meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = min(node->paramList[0]->u.img.rect_valid.start_x + 1, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[0]->u.img.rect_valid.start_y + 1, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[0]->u.img.rect_valid.end_x - 1, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[0]->u.img.rect_valid.end_y - 1, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Median3x3(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_MEDIAN_U8_U8_3x3 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = min(node->paramList[0]->u.img.rect_valid.start_x + 1, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[0]->u.img.rect_valid.start_y + 1, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[0]->u.img.rect_valid.end_x - 1, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[0]->u.img.rect_valid.end_y - 1, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Box3x3(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_BOX_U8_U8_3x3 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + // set output image valid region + meta->data.u.img.rect_valid.start_x = min(node->paramList[0]->u.img.rect_valid.start_x + 1, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[0]->u.img.rect_valid.start_y + 1, height); + meta->data.u.img.rect_valid.end_x = 
max((int)node->paramList[0]->u.img.rect_valid.end_x - 1, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[0]->u.img.rect_valid.end_y - 1, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Gaussian3x3(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_GAUSSIAN_U8_U8_3x3 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = min(node->paramList[0]->u.img.rect_valid.start_x + 1, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[0]->u.img.rect_valid.start_y + 1, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[0]->u.img.rect_valid.end_x - 1, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[0]->u.img.rect_valid.end_y - 1, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_CustomConvolution(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_LINEAR_FILTER_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (!node->paramList[1]->u.conv.columns || !node->paramList[1]->u.conv.rows) + return VX_ERROR_INVALID_DIMENSION; + vx_df_image_e dstfmt = VX_DF_IMAGE_S16; + if (node->paramList[2]->u.img.format == VX_DF_IMAGE_U8) + dstfmt = VX_DF_IMAGE_U8; + // set output image sizes are same as input image size + int M = (int) node->paramList[1]->u.conv.columns >> 1; + int N = (int) node->paramList[1]->u.conv.rows >> 1; + vx_meta_format meta; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = dstfmt; + meta->data.u.img.rect_valid.start_x = min(node->paramList[0]->u.img.rect_valid.start_x + M, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[0]->u.img.rect_valid.start_y + N, height); + 
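// shrink the right and bottom edges of the valid region by the same half-kernel margins M and N, clamping at zero +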
meta->data.u.img.rect_valid.end_x = max((int)node->paramList[0]->u.img.rect_valid.end_x - M, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[0]->u.img.rect_valid.end_y - N, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_GaussianPyramid(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_SCALE_GAUSSIAN_* kernels recursively on each plane of the pyramid + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + AgoData * image = node->paramList[0]; + vx_uint32 width = image->u.img.width; + vx_uint32 height = image->u.img.height; + vx_enum format = image->u.img.format; + vx_float32 scale = node->paramList[1]->u.pyr.scale; + vx_size levels = node->paramList[1]->u.pyr.levels; + if (format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (scale != VX_SCALE_PYRAMID_HALF && scale != VX_SCALE_PYRAMID_ORB) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.pyr.width = width; + meta->data.u.pyr.height = height; + meta->data.u.pyr.format = format; + meta->data.u.pyr.levels = levels; + meta->data.u.pyr.scale = scale; + meta->data.u.pyr.rect_valid.start_x = image->u.img.rect_valid.start_x; + meta->data.u.pyr.rect_valid.start_y = image->u.img.rect_valid.start_y; + meta->data.u.pyr.rect_valid.end_x = image->u.img.rect_valid.end_x; + meta->data.u.pyr.rect_valid.end_y = image->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Accumulate(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_ADD_S16_S16U8_SAT kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[1]->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // Update the valid region + node->paramList[0]->u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + node->paramList[0]->u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + node->paramList[0]->u.img.rect_valid.end_x = 
min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + node->paramList[0]->u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_AccumulateWeighted(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_ACCUMULATE_WEIGHTED_U8_U8U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[2]->u.img.width || height != node->paramList[2]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[1]->u.scalar.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[1]->u.scalar.u.f < 0.0f || node->paramList[1]->u.scalar.u.f > 1.0f) + return VX_ERROR_INVALID_VALUE; + // Update the valid region using the accumulator image (paramList[2]) + node->paramList[0]->u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[2]->u.img.rect_valid.start_x); + node->paramList[0]->u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[2]->u.img.rect_valid.start_y); + node->paramList[0]->u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[2]->u.img.rect_valid.end_x); + node->paramList[0]->u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[2]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_AccumulateSquare(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_ADD_SQUARED_S16_S16U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[2]->u.img.width || height != node->paramList[2]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[1]->u.scalar.type != VX_TYPE_UINT32) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[1]->u.scalar.u.u > 15) + return VX_ERROR_INVALID_VALUE; + // Update the valid region + 
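// the accumulator image (paramList[2]) keeps the intersection of the input and accumulator valid regions +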
node->paramList[0]->u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[2]->u.img.rect_valid.start_x); + node->paramList[0]->u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[2]->u.img.rect_valid.start_y); + node->paramList[0]->u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[2]->u.img.rect_valid.end_x); + node->paramList[0]->u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[2]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_MinMaxLoc(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8/S16 kernels to find min & max of full/sub-images + // use VX_KERNEL_AMD_MIN_MAX_LOC_MERGE_DATA_DATA kernel if sub-images are used to find min & max + // use VX_KERNEL_AMD_MIN_MAX_LOC_MERGE_DATA_U8/S16DATA_* kernels to find min & max of full/sub-images depending on configuration + // use VX_KERNEL_AMD_MIN_MAX_LOC_MERGE_DATA_DATA kernel if Loc/Count is used on sub-images + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 && node->paramList[0]->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output data info + node->metaList[1].data.u.scalar.type = (node->paramList[0]->u.img.format == VX_DF_IMAGE_U8) ? VX_TYPE_UINT8 : VX_TYPE_INT16; + node->metaList[2].data.u.scalar.type = (node->paramList[0]->u.img.format == VX_DF_IMAGE_U8) ? 
VX_TYPE_UINT8 : VX_TYPE_INT16; + node->metaList[3].data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + node->metaList[3].data.u.arr.capacity = 0; + node->metaList[4].data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + node->metaList[4].data.u.arr.capacity = 0; + node->metaList[5].data.u.scalar.type = VX_TYPE_UINT32; + node->metaList[6].data.u.scalar.type = VX_TYPE_UINT32; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_ConvertDepth(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_COLOR_DEPTH_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 && node->paramList[0]->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_ENUM || node->paramList[3]->u.scalar.type != VX_TYPE_INT32) + return VX_ERROR_INVALID_TYPE; + else if ((node->paramList[2]->u.scalar.u.e != VX_CONVERT_POLICY_WRAP && node->paramList[2]->u.scalar.u.e != VX_CONVERT_POLICY_SATURATE) || + (node->paramList[3]->u.scalar.u.i < 0 || node->paramList[3]->u.scalar.u.i >= 8)) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = (node->paramList[0]->u.img.format == VX_DF_IMAGE_U8) ? 
VX_DF_IMAGE_S16 : VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[0]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[0]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[0]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[0]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_CannyEdgeDetector(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: Alternative#1: (1st level performance optimization) + // use VX_KERNEL_AMD_CANNY_SOBEL_U16_* or kernels to compute sobel magnitude + // use VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8XY_U16_3x3 kernel to threshold + // use VX_KERNEL_AMD_CANNY_EDGE_TRACE_U8_U8XY kernel to trace the edges + // Alternative#2: (2nd level performance optimization) + // use VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_* or kernels to compute sobel, non-max supression, and threshold + // use VX_KERNEL_AMD_CANNY_EDGE_TRACE_U8_U8XY kernel to trace the edges + // Alternative#3: (3rd level performance optimization) + // use VX_KERNEL_AMD_CANNY_SOBEL_U16_* or kernels to compute sobel magnitude + // use VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8_U16_3x3 kernel to threshold + // use VX_KERNEL_AMD_CANNY_EDGE_TRACE_U8_U8 kernel to trace the edges + // Alternative#4: (4th level performance optimization) + // use VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_* or kernels to compute sobel, non-max supression, and threshold + // use VX_KERNEL_AMD_CANNY_EDGE_TRACE_U8_U8 kernel to trace the edges + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if ((node->paramList[1]->u.thr.data_type != VX_TYPE_UINT8 && node->paramList[1]->u.thr.data_type != VX_TYPE_UINT16 && node->paramList[1]->u.thr.data_type != VX_TYPE_INT16) || + node->paramList[1]->u.thr.thresh_type != VX_THRESHOLD_TYPE_RANGE || + node->paramList[2]->u.scalar.type != VX_TYPE_INT32 || + node->paramList[3]->u.scalar.type != VX_TYPE_ENUM) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[3]->u.scalar.u.e != VX_NORM_L1 && node->paramList[3]->u.scalar.u.e != VX_NORM_L2) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + int N = node->paramList[2]->u.scalar.u.i >> 1; + vx_meta_format meta; + meta = &node->metaList[4]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = min(node->paramList[0]->u.img.rect_valid.start_x + N + 1, width); // N rows invalidated by filtering and one extra for non-max supression + meta->data.u.img.rect_valid.start_y = min(node->paramList[0]->u.img.rect_valid.start_y + N + 1, height); + meta->data.u.img.rect_valid.end_x = 
max((int)node->paramList[0]->u.img.rect_valid.end_x - N - 1, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[0]->u.img.rect_valid.end_y - N - 1, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_And(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_VX_KERNEL_AMD_AND_U8_U8U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Or(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_VX_KERNEL_AMD_OR_U8_U8U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + 
meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Xor(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_VX_KERNEL_AMD_XOR_U8_U8U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Not(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_VX_KERNEL_AMD_NOT_U8_U8U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = 
max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Multiply(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_VX_KERNEL_AMD_MULTIPLY_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if ((node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 && node->paramList[0]->u.img.format != VX_DF_IMAGE_S16 && + node->paramList[0]->u.img.format != VX_DF_IMAGE_RGB && node->paramList[0]->u.img.format != VX_DF_IMAGE_RGBX) || + (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 && node->paramList[1]->u.img.format != VX_DF_IMAGE_S16)) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_FLOAT32 || + node->paramList[3]->u.scalar.type != VX_TYPE_ENUM || + node->paramList[4]->u.scalar.type != VX_TYPE_ENUM) + return VX_ERROR_INVALID_TYPE; + else if ((node->paramList[3]->u.scalar.u.e != VX_CONVERT_POLICY_WRAP && node->paramList[3]->u.scalar.u.e != VX_CONVERT_POLICY_SATURATE) || + (node->paramList[4]->u.scalar.u.e != VX_ROUND_POLICY_TO_ZERO && node->paramList[4]->u.scalar.u.e != VX_ROUND_POLICY_TO_NEAREST_EVEN)) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_df_image dstfmt = VX_DF_IMAGE_VIRT; + if (node->paramList[0]->u.img.format == VX_DF_IMAGE_U8 && node->paramList[1]->u.img.format == VX_DF_IMAGE_U8) + dstfmt = (node->paramList[5]->u.img.format == VX_DF_IMAGE_U8) ? 
VX_DF_IMAGE_U8 : VX_DF_IMAGE_S16; + else if (node->paramList[0]->u.img.format == VX_DF_IMAGE_S16 || node->paramList[1]->u.img.format == VX_DF_IMAGE_S16) + dstfmt = VX_DF_IMAGE_S16; + else if (node->paramList[1]->u.img.format == VX_DF_IMAGE_U8 && (node->paramList[0]->u.img.format == VX_DF_IMAGE_RGB || node->paramList[0]->u.img.format == VX_DF_IMAGE_RGBX)) + dstfmt = node->paramList[0]->u.img.format; + else + return VX_ERROR_INVALID_FORMAT; + vx_meta_format meta; + meta = &node->metaList[5]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = dstfmt; + meta->data.u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Add(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_VX_KERNEL_AMD_ADD_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if ((node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 && node->paramList[0]->u.img.format != VX_DF_IMAGE_S16) || + (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 && node->paramList[1]->u.img.format != VX_DF_IMAGE_S16)) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_ENUM) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[2]->u.scalar.u.e != VX_CONVERT_POLICY_WRAP && node->paramList[2]->u.scalar.u.e != VX_CONVERT_POLICY_SATURATE) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_df_image dstfmt = VX_DF_IMAGE_S16; + if (node->paramList[0]->u.img.format == VX_DF_IMAGE_U8 && + node->paramList[1]->u.img.format == VX_DF_IMAGE_U8 && + node->paramList[3]->u.img.format == VX_DF_IMAGE_U8) + dstfmt = VX_DF_IMAGE_U8; + vx_meta_format meta; + meta = &node->metaList[3]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = dstfmt; + meta->data.u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = 
min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Subtract(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_VX_KERNEL_AMD_SUB_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if ((node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 && node->paramList[0]->u.img.format != VX_DF_IMAGE_S16) || + (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 && node->paramList[1]->u.img.format != VX_DF_IMAGE_S16)) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_ENUM) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[2]->u.scalar.u.e != VX_CONVERT_POLICY_WRAP && node->paramList[2]->u.scalar.u.e != VX_CONVERT_POLICY_SATURATE) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_df_image dstfmt = VX_DF_IMAGE_S16; + if (node->paramList[0]->u.img.format == VX_DF_IMAGE_U8 && + node->paramList[1]->u.img.format == VX_DF_IMAGE_U8 && + node->paramList[3]->u.img.format == VX_DF_IMAGE_U8) + dstfmt = VX_DF_IMAGE_U8; + vx_meta_format meta; + meta = &node->metaList[3]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = dstfmt; + meta->data.u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + meta->data.u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + meta->data.u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + meta->data.u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_WarpAffine(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_VX_KERNEL_AMD_WARP_AFFINE_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return 
VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[1]->u.mat.type != VX_TYPE_FLOAT32 || node->paramList[1]->u.mat.columns != 2 || node->paramList[1]->u.mat.rows != 3) + return VX_ERROR_INVALID_FORMAT; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_ENUM) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[2]->u.scalar.u.e != VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR && node->paramList[2]->u.scalar.u.e != VX_INTERPOLATION_TYPE_BILINEAR) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[3]; + meta->data.u.img.width = node->paramList[3]->u.img.width; + meta->data.u.img.height = node->paramList[3]->u.img.height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_WarpPerspective(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_VX_KERNEL_AMD_WARP_PERSPECTIVE_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[1]->u.mat.type != VX_TYPE_FLOAT32 || node->paramList[1]->u.mat.columns != 3 || node->paramList[1]->u.mat.rows != 3) + return VX_ERROR_INVALID_FORMAT; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_ENUM) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[2]->u.scalar.u.e != VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR && node->paramList[2]->u.scalar.u.e != VX_INTERPOLATION_TYPE_BILINEAR) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[3]; + meta->data.u.img.width = node->paramList[3]->u.img.width; + meta->data.u.img.height = node->paramList[3]->u.img.height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_HarrisCorners(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_HARRIS_SOBEL_* kernels to compute Gx^2, Gx*Gy, Gy^2 + // use VX_KERNEL_AMD_HARRIS_SCORE_* kernels to compute Vc + // use VX_KERNEL_AMD_HARRIS_MERGE_SORT_AND_PICK_XY_HVC kernel for final step + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return 
VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[1]->u.scalar.type != VX_TYPE_FLOAT32 || + node->paramList[2]->u.scalar.type != VX_TYPE_FLOAT32 || node->paramList[3]->u.scalar.type != VX_TYPE_FLOAT32 || + node->paramList[4]->u.scalar.type != VX_TYPE_INT32 || node->paramList[5]->u.scalar.type != VX_TYPE_INT32) + return VX_ERROR_INVALID_TYPE; + else if (!(node->paramList[4]->u.scalar.u.i & 1) || node->paramList[4]->u.scalar.u.i < 3 || node->paramList[4]->u.scalar.u.i > 7 || + !(node->paramList[5]->u.scalar.u.i & 1) || node->paramList[5]->u.scalar.u.i < 3 || node->paramList[5]->u.scalar.u.i > 7) + return VX_ERROR_INVALID_VALUE; + // set output data info + node->metaList[6].data.u.arr.itemtype = VX_TYPE_KEYPOINT; + node->metaList[6].data.u.arr.capacity = 0; + node->metaList[7].data.u.scalar.type = VX_TYPE_SIZE; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_FastCorners(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_FAST_CORNERS_XY_U8_* kernels at full/sub-image level + // use VX_KERNEL_AMD_FAST_CORNER_MERGE_XY_XY kernel if sub-images are used + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[1]->u.scalar.type != VX_TYPE_FLOAT32 || node->paramList[2]->u.scalar.type != VX_TYPE_BOOL) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[2]->u.scalar.u.i < 0 || node->paramList[2]->u.scalar.u.i > 1) + return VX_ERROR_INVALID_VALUE; + // set output data info + node->metaList[3].data.u.arr.itemtype = VX_TYPE_KEYPOINT; + node->metaList[3].data.u.arr.capacity = 0; + node->metaList[4].data.u.scalar.type = VX_TYPE_SIZE; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_OpticalFlowPyrLK(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_OPTICAL_FLOW_PYR_LK_XY_XY_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.pyr.width; + vx_uint32 height = node->paramList[0]->u.pyr.height; + if (node->paramList[0]->u.pyr.format != VX_DF_IMAGE_U8 || node->paramList[1]->u.pyr.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.pyr.width || height != node->paramList[1]->u.pyr.height || + 
node->paramList[0]->u.pyr.levels != node->paramList[1]->u.pyr.levels || node->paramList[0]->u.pyr.scale != node->paramList[1]->u.pyr.scale || + !node->paramList[2]->u.arr.capacity || node->paramList[2]->u.arr.capacity != node->paramList[3]->u.arr.capacity) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.arr.itemtype != VX_TYPE_KEYPOINT || node->paramList[3]->u.arr.itemtype != VX_TYPE_KEYPOINT) + return VX_ERROR_INVALID_FORMAT; + else if (node->paramList[5]->u.scalar.type != VX_TYPE_ENUM || + node->paramList[6]->u.scalar.type != VX_TYPE_FLOAT32 || + node->paramList[7]->u.scalar.type != VX_TYPE_UINT32 || + node->paramList[8]->u.scalar.type != VX_TYPE_BOOL || + node->paramList[9]->u.scalar.type != VX_TYPE_SIZE) + return VX_ERROR_INVALID_TYPE; + else if ((node->paramList[5]->u.scalar.u.e != VX_TERM_CRITERIA_ITERATIONS && + node->paramList[5]->u.scalar.u.e != VX_TERM_CRITERIA_EPSILON && + node->paramList[5]->u.scalar.u.e != VX_TERM_CRITERIA_BOTH) || + node->paramList[9]->u.scalar.u.s > AGO_OPTICALFLOWPYRLK_MAX_DIM) + return VX_ERROR_INVALID_VALUE; + // set output data info + node->metaList[4].data.u.arr.itemtype = VX_TYPE_KEYPOINT; + node->metaList[4].data.u.arr.capacity = node->paramList[2]->u.arr.capacity; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Remap(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_REMAP_U8_U8_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 && node->paramList[0]->u.img.format != VX_DF_IMAGE_RGB && node->paramList[0]->u.img.format != VX_DF_IMAGE_RGBX) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.remap.src_width || height != node->paramList[1]->u.remap.src_height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_ENUM) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[2]->u.scalar.u.e != VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR && node->paramList[2]->u.scalar.u.e != VX_INTERPOLATION_TYPE_BILINEAR) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[3]; + meta->data.u.img.width = node->paramList[1]->u.remap.dst_width; + meta->data.u.img.height = node->paramList[1]->u.remap.dst_height; + if (node->paramList[3]->u.img.format == VX_DF_IMAGE_VIRT || node->paramList[3]->u.img.format == node->paramList[0]->u.img.format) + meta->data.u.img.format = node->paramList[0]->u.img.format; + else if (node->paramList[3]->u.img.format == VX_DF_IMAGE_RGB && node->paramList[0]->u.img.format == VX_DF_IMAGE_RGBX) + meta->data.u.img.format = node->paramList[3]->u.img.format; + else + return VX_ERROR_INVALID_FORMAT; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == 
ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_HalfScaleGaussian(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_INT32) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[2]->u.scalar.u.i != 3 && node->paramList[2]->u.scalar.u.i != 5) + return VX_ERROR_INVALID_VALUE; + // set output image sizes are same as input image size + vx_meta_format meta; + int N = node->paramList[2]->u.scalar.u.i >> 1; + meta = &node->metaList[1]; + meta->data.u.img.width = (width + 1) >> 1; + meta->data.u.img.height = (height + 1) >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = ((node->paramList[0]->u.img.rect_valid.start_x + 1) >> 1) + N; + meta->data.u.img.rect_valid.start_y = ((node->paramList[0]->u.img.rect_valid.start_y + 1) >> 1) + N; + meta->data.u.img.rect_valid.end_x = ((node->paramList[0]->u.img.rect_valid.end_x + 1) >> 1) - N; + meta->data.u.img.rect_valid.end_y = ((node->paramList[0]->u.img.rect_valid.end_y + 1) >> 1) - N; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +#if ENABLE_OPENCL +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Local OpenCL Codegen Functions +// +static void agoCodeGenOpenCL_Threshold_U8_U8_Binary(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef Threshold_U8_U8_Binary_\n" + "#define Threshold_U8_U8_Binary_\n" + "void Threshold_U8_U8_Binary(U8x8 * p0, U8x8 p1, uint p2)\n" + "{\n" + " U8x8 r;\n" + " float4 thr = (float4)amd_unpack0(p2);\n" + " r.s0 = amd_pack((amd_unpack(p1.s0) - thr) * (float4)256.0f);\n" + " r.s1 = amd_pack((amd_unpack(p1.s1) - thr) * (float4)256.0f);\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_Threshold_U8_U8_Range(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef Threshold_U8_U8_Range_\n" + "#define Threshold_U8_U8_Range_\n" + "void Threshold_U8_U8_Range(U8x8 * p0, U8x8 p1, uint2 p2)\n" + "{\n" + " U8x8 r;\n" + " float4 thr0 = (float4)(amd_unpack0(p2.s0) - 1.0f);\n" + " float4 thr1 = (float4)(amd_unpack0(p2.s1) + 1.0f);\n" + " float4 pix0 = amd_unpack(p1.s0);\n" + " float4 pix1 = amd_unpack(p1.s1);\n" + " r.s0 = amd_pack((pix0 - thr0) * (float4)256.0f);\n" + " r.s0 &= amd_pack((thr1 - pix0) * (float4)256.0f);\n" + " r.s1 = amd_pack((pix1 - thr0) * (float4)256.0f);\n" + " r.s1 &= amd_pack((thr1 - pix1) * (float4)256.0f);\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + 
); +} +static void agoCodeGenOpenCL_Add_S16_S16U8_Sat(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef Add_S16_S16U8_Sat_\n" + "#define Add_S16_S16U8_Sat_\n" + "void Add_S16_S16U8_Sat(S16x8 * p0, S16x8 p1, U8x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (int)(clamp((float)(((int)(p1.s0) << 16) >> 16) + amd_unpack0(p2.s0), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s0 |= (int)(clamp((float)( (int)(p1.s0) >> 16) + amd_unpack1(p2.s0), -32768.0f, 32767.0f)) << 16;\n" + " r.s1 = (int)(clamp((float)(((int)(p1.s1) << 16) >> 16) + amd_unpack2(p2.s0), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s1 |= (int)(clamp((float)( (int)(p1.s1) >> 16) + amd_unpack3(p2.s0), -32768.0f, 32767.0f)) << 16;\n" + " r.s2 = (int)(clamp((float)(((int)(p1.s2) << 16) >> 16) + amd_unpack0(p2.s1), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s2 |= (int)(clamp((float)( (int)(p1.s2) >> 16) + amd_unpack1(p2.s1), -32768.0f, 32767.0f)) << 16;\n" + " r.s3 = (int)(clamp((float)(((int)(p1.s3) << 16) >> 16) + amd_unpack2(p2.s1), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s3 |= (int)(clamp((float)( (int)(p1.s3) >> 16) + amd_unpack3(p2.s1), -32768.0f, 32767.0f)) << 16;\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_AbsDiff_S16_S16S16_Sat(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef AbsDiff_S16_S16S16_Sat_\n" + "#define AbsDiff_S16_S16S16_Sat_\n" + "void AbsDiff_S16_S16S16_Sat(S16x8 * p0, S16x8 p1, S16x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = min(abs_diff((((int)(p1.s0) << 16) >> 16), (((int)(p2.s0) << 16) >> 16)), 32767u);\n" + " r.s0 |= min(abs_diff(( (int)(p1.s0) >> 16), ( (int)(p2.s0) >> 16)), 32767u) << 16;\n" + " r.s1 = min(abs_diff((((int)(p1.s1) << 16) >> 16), (((int)(p2.s1) << 16) >> 16)), 32767u);\n" + " r.s1 |= min(abs_diff(( (int)(p1.s1) >> 16), ( (int)(p2.s1) >> 16)), 32767u) << 16;\n" + " r.s2 = min(abs_diff((((int)(p1.s2) << 16) >> 16), (((int)(p2.s2) << 16) >> 16)), 32767u);\n" + " r.s2 |= min(abs_diff(( (int)(p1.s2) >> 16), ( (int)(p2.s2) >> 16)), 32767u) << 16;\n" + " r.s3 = min(abs_diff((((int)(p1.s3) << 16) >> 16), (((int)(p2.s3) << 16) >> 16)), 32767u);\n" + " r.s3 |= min(abs_diff(( (int)(p1.s3) >> 16), ( (int)(p2.s3) >> 16)), 32767u) << 16;\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ChannelExtract_U8_U24_Pos0(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ChannelExtract_U8_U24_Pos0_\n" + "#define ChannelExtract_U8_U24_Pos0_\n" + "void ChannelExtract_U8_U24_Pos0(U8x8 * p0, U24x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack0(p1.s0), amd_unpack3(p1.s0), amd_unpack2(p1.s1), amd_unpack1(p1.s2)));\n" + " r.s1 = amd_pack((float4)(amd_unpack0(p1.s3), amd_unpack3(p1.s3), amd_unpack2(p1.s4), amd_unpack1(p1.s5)));\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ChannelExtract_U8_U24_Pos1(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ChannelExtract_U8_U24_Pos1_\n" + "#define ChannelExtract_U8_U24_Pos1_\n" + "void ChannelExtract_U8_U24_Pos1(U8x8 * p0, U24x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack1(p1.s0), amd_unpack0(p1.s1), amd_unpack3(p1.s1), amd_unpack2(p1.s2)));\n" + " r.s1 = amd_pack((float4)(amd_unpack1(p1.s3), amd_unpack0(p1.s4), amd_unpack3(p1.s4), amd_unpack2(p1.s5)));\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ChannelExtract_U8_U24_Pos2(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef 
ChannelExtract_U8_U24_Pos2_\n" + "#define ChannelExtract_U8_U24_Pos2_\n" + "void ChannelExtract_U8_U24_Pos2(U8x8 * p0, U24x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack2(p1.s0), amd_unpack1(p1.s1), amd_unpack0(p1.s2), amd_unpack3(p1.s2)));\n" + " r.s1 = amd_pack((float4)(amd_unpack2(p1.s3), amd_unpack1(p1.s4), amd_unpack0(p1.s5), amd_unpack3(p1.s5)));\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos0(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ChannelExtract_U8_U32_Pos0_\n" + "#define ChannelExtract_U8_U32_Pos0_\n" + "void ChannelExtract_U8_U32_Pos0(U8x8 * p0, U32x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack0(p1.s0), amd_unpack0(p1.s1), amd_unpack0(p1.s2), amd_unpack0(p1.s3)));\n" + " r.s1 = amd_pack((float4)(amd_unpack0(p1.s4), amd_unpack0(p1.s5), amd_unpack0(p1.s6), amd_unpack0(p1.s7)));\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos1(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ChannelExtract_U8_U32_Pos1_\n" + "#define ChannelExtract_U8_U32_Pos1_\n" + "void ChannelExtract_U8_U32_Pos1(U8x8 * p0, U32x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack1(p1.s0), amd_unpack1(p1.s1), amd_unpack1(p1.s2), amd_unpack1(p1.s3)));\n" + " r.s1 = amd_pack((float4)(amd_unpack1(p1.s4), amd_unpack1(p1.s5), amd_unpack1(p1.s6), amd_unpack1(p1.s7)));\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos2(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ChannelExtract_U8_U32_Pos2_\n" + "#define ChannelExtract_U8_U32_Pos2_\n" + "void ChannelExtract_U8_U32_Pos2(U8x8 * p0, U32x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack2(p1.s0), amd_unpack2(p1.s1), amd_unpack2(p1.s2), amd_unpack2(p1.s3)));\n" + " r.s1 = amd_pack((float4)(amd_unpack2(p1.s4), amd_unpack2(p1.s5), amd_unpack2(p1.s6), amd_unpack2(p1.s7)));\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos3(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ChannelExtract_U8_U32_Pos3_\n" + "#define ChannelExtract_U8_U32_Pos3_\n" + "void ChannelExtract_U8_U32_Pos3(U8x8 * p0, U32x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack3(p1.s0), amd_unpack3(p1.s1), amd_unpack3(p1.s2), amd_unpack3(p1.s3)));\n" + " r.s1 = amd_pack((float4)(amd_unpack3(p1.s4), amd_unpack3(p1.s5), amd_unpack3(p1.s6), amd_unpack3(p1.s7)));\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ColorConvert_Y_RGB(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ColorConvert_Y_RGB_\n" + "#define ColorConvert_Y_RGB_\n" + "void ColorConvert_Y_RGB(U8x8 * p0, U24x8 p1)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " float3 cY = (float3)(0.2126f, 0.7152f, 0.0722f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(p1.s0), amd_unpack1(p1.s0), amd_unpack2(p1.s0))); \n" + " f.s1 = dot(cY, (float3)(amd_unpack3(p1.s0), amd_unpack0(p1.s1), amd_unpack1(p1.s1))); \n" + " f.s2 = dot(cY, (float3)(amd_unpack2(p1.s1), amd_unpack3(p1.s1), amd_unpack0(p1.s2))); \n" + " f.s3 = dot(cY, (float3)(amd_unpack1(p1.s2), amd_unpack2(p1.s2), amd_unpack3(p1.s2))); \n" + " r.s0 = amd_pack(f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(p1.s3), amd_unpack1(p1.s3), amd_unpack2(p1.s3))); \n" + " f.s1 = dot(cY, (float3)(amd_unpack3(p1.s3), amd_unpack0(p1.s4), 
amd_unpack1(p1.s4))); \n" + " f.s2 = dot(cY, (float3)(amd_unpack2(p1.s4), amd_unpack3(p1.s4), amd_unpack0(p1.s5))); \n" + " f.s3 = dot(cY, (float3)(amd_unpack1(p1.s5), amd_unpack2(p1.s5), amd_unpack3(p1.s5))); \n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ColorConvert_U_RGB(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ColorConvert_U_RGB_\n" + "#define ColorConvert_U_RGB_\n" + "void ColorConvert_U_RGB(U8x8 * p0, U24x8 p1)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " float3 cU = (float3)(-0.1146f, -0.3854f, 0.5f);\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(p1.s0), amd_unpack1(p1.s0), amd_unpack2(p1.s0))) + 128.0f; \n" + " f.s1 = dot(cU, (float3)(amd_unpack3(p1.s0), amd_unpack0(p1.s1), amd_unpack1(p1.s1))) + 128.0f; \n" + " f.s2 = dot(cU, (float3)(amd_unpack2(p1.s1), amd_unpack3(p1.s1), amd_unpack0(p1.s2))) + 128.0f; \n" + " f.s3 = dot(cU, (float3)(amd_unpack1(p1.s2), amd_unpack2(p1.s2), amd_unpack3(p1.s2))) + 128.0f; \n" + " r.s0 = amd_pack(f);\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(p1.s3), amd_unpack1(p1.s3), amd_unpack2(p1.s3))) + 128.0f; \n" + " f.s1 = dot(cU, (float3)(amd_unpack3(p1.s3), amd_unpack0(p1.s4), amd_unpack1(p1.s4))) + 128.0f; \n" + " f.s2 = dot(cU, (float3)(amd_unpack2(p1.s4), amd_unpack3(p1.s4), amd_unpack0(p1.s5))) + 128.0f; \n" + " f.s3 = dot(cU, (float3)(amd_unpack1(p1.s5), amd_unpack2(p1.s5), amd_unpack3(p1.s5))) + 128.0f; \n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ColorConvert_V_RGB(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ColorConvert_V_RGB_\n" + "#define ColorConvert_V_RGB_\n" + "void ColorConvert_V_RGB(U8x8 * p0, U24x8 p1)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " float3 cV = (float3)(0.5f, -0.4542f, -0.0458f);\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(p1.s0), amd_unpack1(p1.s0), amd_unpack2(p1.s0))) + 128.0f; \n" + " f.s1 = dot(cV, (float3)(amd_unpack3(p1.s0), amd_unpack0(p1.s1), amd_unpack1(p1.s1))) + 128.0f; \n" + " f.s2 = dot(cV, (float3)(amd_unpack2(p1.s1), amd_unpack3(p1.s1), amd_unpack0(p1.s2))) + 128.0f; \n" + " f.s3 = dot(cV, (float3)(amd_unpack1(p1.s2), amd_unpack2(p1.s2), amd_unpack3(p1.s2))) + 128.0f; \n" + " r.s0 = amd_pack(f);\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(p1.s3), amd_unpack1(p1.s3), amd_unpack2(p1.s3))) + 128.0f; \n" + " f.s1 = dot(cV, (float3)(amd_unpack3(p1.s3), amd_unpack0(p1.s4), amd_unpack1(p1.s4))) + 128.0f; \n" + " f.s2 = dot(cV, (float3)(amd_unpack2(p1.s4), amd_unpack3(p1.s4), amd_unpack0(p1.s5))) + 128.0f; \n" + " f.s3 = dot(cV, (float3)(amd_unpack1(p1.s5), amd_unpack2(p1.s5), amd_unpack3(p1.s5))) + 128.0f; \n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ColorConvert_Y_RGBX(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ColorConvert_Y_RGBX_\n" + "#define ColorConvert_Y_RGBX_\n" + "void ColorConvert_Y_RGBX(U8x8 * p0, U32x8 p1)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " float3 cY = (float3)(0.2126f, 0.7152f, 0.0722f);\n" + " f.s0 = dot(cY, (float3)(amd_unpack0(p1.s0), amd_unpack1(p1.s0), amd_unpack2(p1.s0))); \n" + " f.s1 = dot(cY, (float3)(amd_unpack0(p1.s1), amd_unpack1(p1.s1), amd_unpack2(p1.s1))); \n" + " f.s2 = dot(cY, (float3)(amd_unpack0(p1.s2), amd_unpack1(p1.s2), amd_unpack2(p1.s2))); \n" + " f.s3 = dot(cY, (float3)(amd_unpack0(p1.s3), amd_unpack1(p1.s3), amd_unpack2(p1.s3))); \n" + " r.s0 = amd_pack(f);\n" + " f.s0 = dot(cY, 
(float3)(amd_unpack0(p1.s4), amd_unpack1(p1.s4), amd_unpack2(p1.s4))); \n" + " f.s1 = dot(cY, (float3)(amd_unpack0(p1.s5), amd_unpack1(p1.s5), amd_unpack2(p1.s5))); \n" + " f.s2 = dot(cY, (float3)(amd_unpack0(p1.s6), amd_unpack1(p1.s6), amd_unpack2(p1.s6))); \n" + " f.s3 = dot(cY, (float3)(amd_unpack0(p1.s7), amd_unpack1(p1.s7), amd_unpack2(p1.s7))); \n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ColorConvert_U_RGBX(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ColorConvert_U_RGBX_\n" + "#define ColorConvert_U_RGBX_\n" + "void ColorConvert_U_RGBX(U8x8 * p0, U32x8 p1)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " float3 cU = (float3)(-0.1146f, -0.3854f, 0.5f);\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(p1.s0), amd_unpack1(p1.s0), amd_unpack2(p1.s0))) + 128.0f; \n" + " f.s1 = dot(cU, (float3)(amd_unpack0(p1.s1), amd_unpack1(p1.s1), amd_unpack2(p1.s1))) + 128.0f; \n" + " f.s2 = dot(cU, (float3)(amd_unpack0(p1.s2), amd_unpack1(p1.s2), amd_unpack2(p1.s2))) + 128.0f; \n" + " f.s3 = dot(cU, (float3)(amd_unpack0(p1.s3), amd_unpack1(p1.s3), amd_unpack2(p1.s3))) + 128.0f; \n" + " r.s0 = amd_pack(f);\n" + " f.s0 = dot(cU, (float3)(amd_unpack0(p1.s4), amd_unpack1(p1.s4), amd_unpack2(p1.s4))) + 128.0f; \n" + " f.s1 = dot(cU, (float3)(amd_unpack0(p1.s5), amd_unpack1(p1.s5), amd_unpack2(p1.s5))) + 128.0f; \n" + " f.s2 = dot(cU, (float3)(amd_unpack0(p1.s6), amd_unpack1(p1.s6), amd_unpack2(p1.s6))) + 128.0f; \n" + " f.s3 = dot(cU, (float3)(amd_unpack0(p1.s7), amd_unpack1(p1.s7), amd_unpack2(p1.s7))) + 128.0f; \n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ColorConvert_V_RGBX(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ColorConvert_V_RGBX_\n" + "#define ColorConvert_V_RGBX_\n" + "void ColorConvert_V_RGBX(U8x8 * p0, U32x8 p1)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " float3 cV = (float3)(0.5f, -0.4542f, -0.0458f);\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(p1.s0), amd_unpack1(p1.s0), amd_unpack2(p1.s0))) + 128.0f; \n" + " f.s1 = dot(cV, (float3)(amd_unpack0(p1.s1), amd_unpack1(p1.s1), amd_unpack2(p1.s1))) + 128.0f; \n" + " f.s2 = dot(cV, (float3)(amd_unpack0(p1.s2), amd_unpack1(p1.s2), amd_unpack2(p1.s2))) + 128.0f; \n" + " f.s3 = dot(cV, (float3)(amd_unpack0(p1.s3), amd_unpack1(p1.s3), amd_unpack2(p1.s3))) + 128.0f; \n" + " r.s0 = amd_pack(f);\n" + " f.s0 = dot(cV, (float3)(amd_unpack0(p1.s4), amd_unpack1(p1.s4), amd_unpack2(p1.s4))) + 128.0f; \n" + " f.s1 = dot(cV, (float3)(amd_unpack0(p1.s5), amd_unpack1(p1.s5), amd_unpack2(p1.s5))) + 128.0f; \n" + " f.s2 = dot(cV, (float3)(amd_unpack0(p1.s6), amd_unpack1(p1.s6), amd_unpack2(p1.s6))) + 128.0f; \n" + " f.s3 = dot(cV, (float3)(amd_unpack0(p1.s7), amd_unpack1(p1.s7), amd_unpack2(p1.s7))) + 128.0f; \n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_BilinearSample(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef BilinearSample_\n" + "#define BilinearSample_\n" + "float BilinearSample(__global uchar *p, uint ystride, uint xstride, float fy0, float fy1, int x, float fx0, float fx1)\n" + "{\n" + " float4 f;\n" + " p += x;\n" + " f.s0 = amd_unpack0((uint)p[0]);\n" + " f.s1 = amd_unpack0((uint)p[xstride]);\n" + " p += ystride;\n" + " f.s2 = amd_unpack0((uint)p[0]);\n" + " f.s3 = amd_unpack0((uint)p[xstride]);\n" + " f.s0 = mad(f.s0, fx0, f.s1 * fx1);\n" + " f.s2 = mad(f.s2, fx0, f.s3 * fx1);\n" + " f.s0 = 
mad(f.s0, fy0, f.s2 * fy1);\n" + " return f.s0;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_BilinearSampleFXY(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef BilinearSampleFXY_\n" + "#define BilinearSampleFXY_\n" + "float BilinearSampleFXY(__global uchar *p, uint stride, float sx, float sy)\n" + "{\n" + " float fx0, fx1, fy0, fy1, ii; uint x, y;\n" + " fx1 = fract(sx, &ii); fx0 = 1.0f - fx1; x = (uint)ii;\n" + " fy1 = fract(sy, &ii); fy0 = 1.0f - fy1; y = (uint)ii;\n" + " p += mad24(stride, y, x);\n" + " return BilinearSample(p, stride, 1, fy0, fy1, 0, fx0, fx1);\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_BilinearSampleFXYConstantForRemap(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef BilinearSampleFXYConstantForRemap_\n" + "#define BilinearSampleFXYConstantForRemap_\n" + "float BilinearSampleFXYConstantForRemap(__global uchar *p, uint stride, uint width, uint height, float sx, float sy, uint borderValue)\n" + "{\n" + " float fx0, fx1, fy0, fy1, ii; int x, y;\n" + " fx1 = fract(sx, &ii); fx0 = 1.0f - fx1; x = (int)floor(sx);\n" + " fy1 = fract(sy, &ii); fy0 = 1.0f - fy1; y = (int)floor(sy);\n" + " if (((uint)x) < width - 1 && ((uint)y) < height - 1) {\n" + " p += y*stride;\n" + " return BilinearSample(p, stride, 1, fy0, fy1, x, fx0, fx1);\n" + " }\n" + " else {\n" + " return amd_unpack0(borderValue);\n" + " }\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_SampleWithConstBorder(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef SampleWithConstBorder_\n" + "#define SampleWithConstBorder_\n" + "uint SampleWithConstBorder(__global uchar *p, int x, int y, uint width, uint height, uint stride, uint borderValue)\n" + "{\n" + " uint pixelValue = borderValue;\n" + " if (x >= 0 && y >= 0 && x < width && y < height) {\n" + " pixelValue = p[y*stride + x];\n" + " }\n" + " return pixelValue;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_BilinearSampleWithConstBorder(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef BilinearSampleWithConstBorder_\n" + "#define BilinearSampleWithConstBorder_\n" + "float BilinearSampleWithConstBorder(__global uchar *p, int x, int y, uint width, uint height, uint stride, float fx0, float fx1, float fy0, float fy1, uint borderValue)\n" + "{\n" + " float4 f;\n" + " f.s0 = amd_unpack0(SampleWithConstBorder(p, x, y, width, height, stride, borderValue));\n" + " f.s1 = amd_unpack0(SampleWithConstBorder(p, x + 1, y, width, height, stride, borderValue));\n" + " f.s2 = amd_unpack0(SampleWithConstBorder(p, x, y + 1, width, height, stride, borderValue));\n" + " f.s3 = amd_unpack0(SampleWithConstBorder(p, x + 1, y + 1, width, height, stride, borderValue));\n" + " f.s0 = mad(f.s0, fx0, f.s1 * fx1);\n" + " f.s2 = mad(f.s2, fx0, f.s3 * fx1);\n" + " f.s0 = mad(f.s0, fy0, f.s2 * fy1);\n" + " return f.s0;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_BilinearSampleFXYConstant(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef BilinearSampleFXYConstant_\n" + "#define BilinearSampleFXYConstant_\n" + "float BilinearSampleFXYConstant(__global uchar *p, uint stride, uint width, uint height, float sx, float sy, uint borderValue)\n" + "{\n" + " float fx0, fx1, fy0, fy1, ii; int x, y;\n" + " fx1 = fract(sx, &ii); fx0 = 1.0f - fx1; x = (int)ii;\n" + " fy1 = fract(sy, &ii); fy0 = 1.0f - fy1; y = (int)ii;\n" + " if (((uint)x) < width && ((uint)y) < height) {\n" + " p += y*stride;\n" + " return 
BilinearSample(p, stride, 1, fy0, fy1, x, fx0, fx1);\n" + " }\n" + " else {\n" + " return BilinearSampleWithConstBorder(p, x, y, width, height, stride, fx0, fx1, fy0, fy1, borderValue);\n" + " }\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ClampPixelCoordinatesToBorder(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ClampPixelCoordinatesToBorder_\n" + "#define ClampPixelCoordinatesToBorder_\n" + "uint2 ClampPixelCoordinatesToBorder(float f, uint limit, uint stride)\n" + "{\n" + " uint2 vstride;\n" + " vstride.s0 = select((uint)f, 0u, f < 0);\n" + " vstride.s1 = select(stride, 0u, f < 0);\n" + " vstride.s0 = select(vstride.s0, limit, f >= limit);\n" + " vstride.s1 = select(vstride.s1, 0u, f >= limit);\n" + " return vstride;\n" + "}\n" + "#endif\n" + ); +} +static void agoCodeGenOpenCL_ScaleImage_U8_U8_Bilinear(std::string& opencl_code) +{ + opencl_code += OPENCL_FORMAT( + "#ifndef ScaleImage_U8_U8_Bilinear_\n" + "#define ScaleImage_U8_U8_Bilinear_\n" + "void ScaleImage_U8_U8_Bilinear(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, float4 scaleInfo)\n" + "{\n" + " U8x8 rv;\n" + " float fx, fy, fint, frac, fy0, fy1;\n" + " float4 f;\n" + " fy = mad((float)y, scaleInfo.s1, scaleInfo.s3);\n" + " fy0 = floor(fy); fy1 = fy - fy0; fy0 = 1.0f - fy1;\n" + " p += mul24((uint)fy, stride);\n" + " fx = mad((float)x, scaleInfo.s0, scaleInfo.s2); fint = floor(fx); frac = fx - fint; f.s0 = BilinearSample(p, stride, 1, fy0, fy1, (int)fint, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; fint = floor(fx); frac = fx - fint; f.s1 = BilinearSample(p, stride, 1, fy0, fy1, (int)fint, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; fint = floor(fx); frac = fx - fint; f.s2 = BilinearSample(p, stride, 1, fy0, fy1, (int)fint, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; fint = floor(fx); frac = fx - fint; f.s3 = BilinearSample(p, stride, 1, fy0, fy1, (int)fint, 1.0f - frac, frac);\n" + " rv.s0 = amd_pack(f);\n" + " fx += scaleInfo.s0; fint = floor(fx); frac = fx - fint; f.s0 = BilinearSample(p, stride, 1, fy0, fy1, (int)fint, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; fint = floor(fx); frac = fx - fint; f.s1 = BilinearSample(p, stride, 1, fy0, fy1, (int)fint, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; fint = floor(fx); frac = fx - fint; f.s2 = BilinearSample(p, stride, 1, fy0, fy1, (int)fint, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; fint = floor(fx); frac = fx - fint; f.s3 = BilinearSample(p, stride, 1, fy0, fy1, (int)fint, 1.0f - frac, frac);\n" + " rv.s1 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + "#endif\n" + ); +} +#endif + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// AMD low-level kernels +// +int agoKernel_Set00_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + if (HafCpu_MemSet_U8(node->paramList[0]->size, node->paramList[0]->buffer, 0x00)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT(node, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * p0)\n" + "{\n" + " *p0 = (U8x8)(0);\n" + "}\n" + ), 
node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_SetFF_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + if (HafCpu_MemSet_U8(node->paramList[0]->size, node->paramList[0]->buffer, 0xFF)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT(node, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0)\n" + "{\n" + " *p0 = (U8x8)(0xffffffff);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Not_U8_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Not_U8_U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1)\n" + "{\n" + " *p0 = ~p1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Not_U8_U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Not_U8_U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = 
NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p1);\n" + " *p0 = ~r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Not_U1_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Not_U1_U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p1);\n" + " *p0 = ~r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Not_U1_U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Not_U1_U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1)\n" + "{\n" + " *p0 = ~p1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Lut_U8_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iLut = node->paramList[2]; + if (HafCpu_Lut_U8_U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, 
oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, iLut->buffer)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, __read_only image1d_t lut)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " f.s0 = read_imagef(lut, amd_unpack0(p1.s0)).s0 * 255.0f;\n" + " f.s1 = read_imagef(lut, amd_unpack1(p1.s0)).s0 * 255.0f;\n" + " f.s2 = read_imagef(lut, amd_unpack2(p1.s0)).s0 * 255.0f;\n" + " f.s3 = read_imagef(lut, amd_unpack3(p1.s0)).s0 * 255.0f;\n" + " r.s0 = amd_pack(f);\n" + " f.s0 = read_imagef(lut, amd_unpack0(p1.s1)).s0 * 255.0f;\n" + " f.s1 = read_imagef(lut, amd_unpack1(p1.s1)).s0 * 255.0f;\n" + " f.s2 = read_imagef(lut, amd_unpack2(p1.s1)).s0 * 255.0f;\n" + " f.s3 = read_imagef(lut, amd_unpack3(p1.s1)).s0 * 255.0f;\n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Threshold_U8_U8_Binary(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iThr = node->paramList[2]; + if (HafCpu_Threshold_U8_U8_Binary(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, iThr->u.thr.threshold_lower)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + if (!(status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8))) { + if (node->paramList[2]->u.thr.thresh_type != VX_THRESHOLD_TYPE_BINARY || node->paramList[2]->u.thr.data_type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_TYPE; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Threshold_U8_U8_Binary(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s Threshold_U8_U8_Binary\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Threshold_U8_U8_Range(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iThr = node->paramList[2]; + if (HafCpu_Threshold_U8_U8_Range(oImg->u.img.width, 
oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + if (!(status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8))) { + if (node->paramList[2]->u.thr.thresh_type != VX_THRESHOLD_TYPE_RANGE || node->paramList[2]->u.thr.data_type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_TYPE; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Threshold_U8_U8_Range(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s Threshold_U8_U8_Range\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Threshold_U1_U8_Binary(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iThr = node->paramList[2]; + if (HafCpu_Threshold_U1_U8_Binary(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, iThr->u.thr.threshold_lower)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + if (!(status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8))) { + if (node->paramList[2]->u.thr.thresh_type != VX_THRESHOLD_TYPE_BINARY || node->paramList[2]->u.thr.data_type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_TYPE; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Threshold_U8_U8_Binary(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, uint p2)\n" + "{\n" + " U8x8 r1;\n" + " Threshold_U8_U8_Binary(&r1, p1, p2);\n" + " Convert_U1_U8(p0, r1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Threshold_U1_U8_Range(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iThr = node->paramList[2]; + if (HafCpu_Threshold_U1_U8_Range(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) { + status = VX_FAILURE; + } + } + else if (cmd == 
ago_kernel_cmd_validate) { + if (!(status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8))) { + if (node->paramList[2]->u.thr.thresh_type != VX_THRESHOLD_TYPE_RANGE || node->paramList[2]->u.thr.data_type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_TYPE; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Threshold_U8_U8_Range(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, uint2 p2)\n" + "{\n" + " U8x8 r1;\n" + " Threshold_U8_U8_Range(&r1, p1, p2);\n" + " Convert_U1_U8(p0, r1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ThresholdNot_U8_U8_Binary(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iThr = node->paramList[2]; + if (HafCpu_ThresholdNot_U8_U8_Binary(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, iThr->u.thr.threshold_lower)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + if (!(status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8))) { + if (node->paramList[2]->u.thr.thresh_type != VX_THRESHOLD_TYPE_BINARY || node->paramList[2]->u.thr.data_type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_TYPE; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Threshold_U8_U8_Binary(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, uint p2)\n" + "{\n" + " U8x8 r1;\n" + " Threshold_U8_U8_Binary(&r1, p1, p2);\n" + " *p0 = ~r1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ThresholdNot_U8_U8_Range(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iThr = node->paramList[2]; + if (HafCpu_ThresholdNot_U8_U8_Range(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + if (!(status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8))) { + if 
(node->paramList[2]->u.thr.thresh_type != VX_THRESHOLD_TYPE_RANGE || node->paramList[2]->u.thr.data_type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_TYPE; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Threshold_U8_U8_Range(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, uint2 p2)\n" + "{\n" + " U8x8 r1;\n" + " Threshold_U8_U8_Range(&r1, p1, p2);\n" + " *p0 = ~r1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ThresholdNot_U1_U8_Binary(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iThr = node->paramList[2]; + if (HafCpu_ThresholdNot_U1_U8_Binary(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, iThr->u.thr.threshold_lower)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + if (!(status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8))) { + if (node->paramList[2]->u.thr.thresh_type != VX_THRESHOLD_TYPE_BINARY || node->paramList[2]->u.thr.data_type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_TYPE; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Threshold_U8_U8_Binary(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, uint p2)\n" + "{\n" + " U8x8 r1;\n" + " Threshold_U8_U8_Binary(&r1, p1, p2);\n" + " Convert_U1_U8(p0, ~r1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ThresholdNot_U1_U8_Range(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iThr = node->paramList[2]; + if (HafCpu_ThresholdNot_U1_U8_Range(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + if (!(status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8))) { + if (node->paramList[2]->u.thr.thresh_type != VX_THRESHOLD_TYPE_RANGE || node->paramList[2]->u.thr.data_type != 
VX_TYPE_UINT8) + return VX_ERROR_INVALID_TYPE; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Threshold_U8_U8_Range(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, uint2 p2)\n" + "{\n" + " U8x8 r1;\n" + " Threshold_U8_U8_Range(&r1, p1, p2);\n" + " Convert_U1_U8(p0, ~r1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorDepth_U8_S16_Wrap(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_int32 shift = node->paramList[2]->u.scalar.u.i; + if (HafCpu_ColorDepth_U8_S16_Wrap(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg->buffer, iImg->u.img.stride_in_bytes, shift)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN_S(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_S16, VX_TYPE_INT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, S16x8 p1, uint p2)\n" + "{\n" + " U8x8 r;\n" + " p2 += 16;\n" + " r.s0 = ((((int)p1.s0) << 16) >> p2) & 0xff;\n" + " r.s0 |= ((((int)p1.s0) >> p2) & 0xff) << 8;\n" + " r.s0 |= (((((int)p1.s1) << 16) >> p2) & 0xff) << 16;\n" + " r.s0 |= ((((int)p1.s1) >> p2) & 0xff) << 24;\n" + " r.s1 = ((((int)p1.s2) << 16) >> p2) & 0xff;\n" + " r.s1 |= ((((int)p1.s2) >> p2) & 0xff) << 8;\n" + " r.s1 |= (((((int)p1.s3) << 16) >> p2) & 0xff) << 16;\n" + " r.s1 |= ((((int)p1.s3) >> p2) & 0xff) << 24;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorDepth_U8_S16_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_int32 shift = node->paramList[2]->u.scalar.u.i; + if (HafCpu_ColorDepth_U8_S16_Sat(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg->buffer, iImg->u.img.stride_in_bytes, shift)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN_S(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_S16, VX_TYPE_INT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { 
+ status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, S16x8 p1, uint p2)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " p2 += 16;\n" + " f.s0 = (float)((((int)p1.s0) << 16) >> p2);\n" + " f.s1 = (float)( ((int)p1.s0) >> p2);\n" + " f.s2 = (float)((((int)p1.s1) << 16) >> p2);\n" + " f.s3 = (float)( ((int)p1.s1) >> p2);\n" + " r.s0 = amd_pack(f);\n" + " f.s0 = (float)((((int)p1.s2) << 16) >> p2);\n" + " f.s1 = (float)( ((int)p1.s2) >> p2);\n" + " f.s2 = (float)((((int)p1.s3) << 16) >> p2);\n" + " f.s3 = (float)( ((int)p1.s3) >> p2);\n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorDepth_S16_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_int32 shift = node->paramList[2]->u.scalar.u.i; + if (HafCpu_ColorDepth_S16_U8(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, shift)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_TYPE_INT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1, uint p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (p1.s0 & 0x000000ff) << p2 ;\n" + " r.s0 |= (p1.s0 & 0x0000ff00) << ( 8+p2);\n" + " r.s1 = (p1.s0 & 0x00ff0000) >> (16-p2);\n" + " r.s1 |= (p1.s0 & 0xff000000) >> ( 8-p2);\n" + " r.s2 = (p1.s1 & 0x000000ff) << p2 ;\n" + " r.s2 |= (p1.s1 & 0x0000ff00) << ( 8+p2);\n" + " r.s3 = (p1.s1 & 0x00ff0000) >> (16-p2);\n" + " r.s3 |= (p1.s1 & 0xff000000) >> ( 8-p2);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Add_U8_U8U8_Wrap(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Add_U8_U8U8_Wrap(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, 
VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = (p1.s0 + p2.s0 ) & 0x000000ff;\n" + " r.s0 |= (p1.s0 + (p2.s0 & 0x0000ff00)) & 0x0000ff00;\n" + " r.s0 |= (p1.s0 + (p2.s0 & 0x00ff0000)) & 0x00ff0000;\n" + " r.s0 |= (p1.s0 + (p2.s0 & 0xff000000)) & 0xff000000;\n" + " r.s1 = (p1.s1 + p2.s1 ) & 0x000000ff;\n" + " r.s1 |= (p1.s1 + (p2.s1 & 0x0000ff00)) & 0x0000ff00;\n" + " r.s1 |= (p1.s1 + (p2.s1 & 0x00ff0000)) & 0x00ff0000;\n" + " r.s1 |= (p1.s1 + (p2.s1 & 0xff000000)) & 0xff000000;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Add_U8_U8U8_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Add_U8_U8U8_Sat(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack(amd_unpack(p1.s0) + amd_unpack(p2.s0));\n" + " r.s1 = amd_pack(amd_unpack(p1.s1) + amd_unpack(p2.s1));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sub_U8_U8U8_Wrap(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Sub_U8_U8U8_Wrap(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == 
ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = (p1.s0 - p2.s0 ) & 0x000000ff;\n" + " r.s0 |= (p1.s0 - (p2.s0 & 0x0000ff00)) & 0x0000ff00;\n" + " r.s0 |= (p1.s0 - (p2.s0 & 0x00ff0000)) & 0x00ff0000;\n" + " r.s0 |= (p1.s0 - (p2.s0 & 0xff000000)) & 0xff000000;\n" + " r.s1 = (p1.s1 - p2.s1 ) & 0x000000ff;\n" + " r.s1 |= (p1.s1 - (p2.s1 & 0x0000ff00)) & 0x0000ff00;\n" + " r.s1 |= (p1.s1 - (p2.s1 & 0x00ff0000)) & 0x00ff0000;\n" + " r.s1 |= (p1.s1 - (p2.s1 & 0xff000000)) & 0xff000000;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sub_U8_U8U8_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Sub_U8_U8U8_Sat(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack(amd_unpack(p1.s0) - amd_unpack(p2.s0));\n" + " r.s1 = amd_pack(amd_unpack(p1.s1) - amd_unpack(p2.s1));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_U8_U8U8_Wrap_Trunc(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_U8_U8U8_Wrap_Trunc(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + 
status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2, float p3)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = ((int)(p3 * amd_unpack0(p1.s0) * amd_unpack0(p2.s0)) & 0x000000ff) ;\n" + " r.s0 |= ((int)(p3 * amd_unpack1(p1.s0) * amd_unpack1(p2.s0)) & 0x000000ff) << 8;\n" + " r.s0 |= ((int)(p3 * amd_unpack2(p1.s0) * amd_unpack2(p2.s0)) & 0x000000ff) << 16;\n" + " r.s0 |= ((int)(p3 * amd_unpack3(p1.s0) * amd_unpack3(p2.s0)) ) << 24;\n" + " r.s1 = ((int)(p3 * amd_unpack0(p1.s1) * amd_unpack0(p2.s1)) & 0x000000ff) ;\n" + " r.s1 |= ((int)(p3 * amd_unpack1(p1.s1) * amd_unpack1(p2.s1)) & 0x000000ff) << 8;\n" + " r.s1 |= ((int)(p3 * amd_unpack2(p1.s1) * amd_unpack2(p2.s1)) & 0x000000ff) << 16;\n" + " r.s1 |= ((int)(p3 * amd_unpack3(p1.s1) * amd_unpack3(p2.s1)) ) << 24;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_U8_U8U8_Wrap_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_U8_U8U8_Wrap_Round(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2, float p3)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = ((int)(p3 * amd_unpack0(p1.s0) * amd_unpack0(p2.s0) + (0.5f - 0.00006103515625f)) & 0x000000ff) ;\n" + " r.s0 |= ((int)(p3 * amd_unpack1(p1.s0) * amd_unpack1(p2.s0) + (0.5f - 0.00006103515625f)) & 0x000000ff) << 8;\n" + " r.s0 |= ((int)(p3 * amd_unpack2(p1.s0) * amd_unpack2(p2.s0) + (0.5f - 0.00006103515625f)) & 0x000000ff) << 16;\n" + " r.s0 |= ((int)(p3 * amd_unpack3(p1.s0) * amd_unpack3(p2.s0) + (0.5f - 0.00006103515625f)) ) << 24;\n" + " r.s1 = ((int)(p3 * amd_unpack0(p1.s1) * amd_unpack0(p2.s1) + (0.5f - 0.00006103515625f)) & 0x000000ff) ;\n" + " r.s1 |= ((int)(p3 * amd_unpack1(p1.s1) * amd_unpack1(p2.s1) + (0.5f - 0.00006103515625f)) & 0x000000ff) << 8;\n" + " r.s1 |= ((int)(p3 * amd_unpack2(p1.s1) * amd_unpack2(p2.s1) + (0.5f - 0.00006103515625f)) & 0x000000ff) << 16;\n" + " r.s1 |= ((int)(p3 * amd_unpack3(p1.s1) * amd_unpack3(p2.s1) + (0.5f - 0.00006103515625f)) ) << 24;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == 
ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_U8_U8U8_Sat_Trunc(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_U8_U8U8_Sat_Trunc(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2, float p3)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " f.s0 = p3 * amd_unpack0(p1.s0) * amd_unpack0(p2.s0) - (0.5f - 0.00006103515625f);\n" + " f.s1 = p3 * amd_unpack1(p1.s0) * amd_unpack1(p2.s0) - (0.5f - 0.00006103515625f);\n" + " f.s2 = p3 * amd_unpack2(p1.s0) * amd_unpack2(p2.s0) - (0.5f - 0.00006103515625f);\n" + " f.s3 = p3 * amd_unpack3(p1.s0) * amd_unpack3(p2.s0) - (0.5f - 0.00006103515625f);\n" + " r.s0 = amd_pack(f);\n" + " f.s0 = p3 * amd_unpack0(p1.s1) * amd_unpack0(p2.s1) - (0.5f - 0.00006103515625f);\n" + " f.s1 = p3 * amd_unpack1(p1.s1) * amd_unpack1(p2.s1) - (0.5f - 0.00006103515625f);\n" + " f.s2 = p3 * amd_unpack2(p1.s1) * amd_unpack2(p2.s1) - (0.5f - 0.00006103515625f);\n" + " f.s3 = p3 * amd_unpack3(p1.s1) * amd_unpack3(p2.s1) - (0.5f - 0.00006103515625f);\n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_U8_U8U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_U8_U8U8_Sat_Round(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + 
node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2, float p3)\n" + "{\n" + " U8x8 r;\n" + " float4 f;\n" + " f.s0 = p3 * amd_unpack0(p1.s0) * amd_unpack0(p2.s0);\n" + " f.s1 = p3 * amd_unpack1(p1.s0) * amd_unpack1(p2.s0);\n" + " f.s2 = p3 * amd_unpack2(p1.s0) * amd_unpack2(p2.s0);\n" + " f.s3 = p3 * amd_unpack3(p1.s0) * amd_unpack3(p2.s0);\n" + " r.s0 = amd_pack(f);\n" + " f.s0 = p3 * amd_unpack0(p1.s1) * amd_unpack0(p2.s1);\n" + " f.s1 = p3 * amd_unpack1(p1.s1) * amd_unpack1(p2.s1);\n" + " f.s2 = p3 * amd_unpack2(p1.s1) * amd_unpack2(p2.s1);\n" + " f.s3 = p3 * amd_unpack3(p1.s1) * amd_unpack3(p2.s1);\n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_And_U8_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_And_U8_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " *p0 = p1 & p2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_And_U8_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_And_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" 
+ " U8x8 r;\n" + " Convert_U8_U1(&r, p2);\n" + " *p0 = p1 & r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_And_U8_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_And_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p1);\n" + " *p0 = p2 & r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_And_U8_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_And_U8_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r1, r2;\n" + " Convert_U8_U1(&r1, p1);\n" + " Convert_U8_U1(&r2, p2);\n" + " *p0 = r1 & r2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_And_U1_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + 
AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_And_U1_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r = p1 & p2;\n" + " Convert_U1_U8(p0, r);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_And_U1_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_And_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p1);\n" + " *p0 = r & p2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_And_U1_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_And_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == 
ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p2);\n" + " *p0 = r & p1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_And_U1_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_And_U1_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " *p0 = p1 & p2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Or_U8_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Or_U8_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " *p0 = p1 | p2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Or_U8_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = 
AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Or_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p2);\n" + " *p0 = p1 | r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Or_U8_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_Or_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p1);\n" + " *p0 = p2 | r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Or_U8_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Or_U8_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize 
|| cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r1, r2;\n" + " Convert_U8_U1(&r1, p1);\n" + " Convert_U8_U1(&r2, p2);\n" + " *p0 = r1 | r2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Or_U1_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Or_U1_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r = p1 | p2;\n" + " Convert_U1_U8(p0, r);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Or_U1_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Or_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p1);\n" + " *p0 = r | p2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | 
AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Or_U1_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_Or_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p2);\n" + " *p0 = r | p1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Or_U1_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Or_U1_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " *p0 = p1 | p2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xor_U8_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xor_U8_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == 
ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " *p0 = p1 ^ p2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xor_U8_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xor_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p2);\n" + " *p0 = p1 ^ r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xor_U8_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_Xor_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p1);\n" + " *p0 = p2 ^ r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == 
ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xor_U8_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xor_U8_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r1, r2;\n" + " Convert_U8_U1(&r1, p1);\n" + " Convert_U8_U1(&r2, p2);\n" + " *p0 = r1 ^ r2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xor_U1_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xor_U1_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r = p1 ^ p2;\n" + " Convert_U1_U8(p0, r);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xor_U1_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xor_U1_U8U1(oImg->u.img.width, 
oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p1);\n" + " *p0 = r ^ p2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xor_U1_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_Xor_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p2);\n" + " *p0 = r ^ p1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xor_U1_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xor_U1_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + 
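+			// Both operands are already packed U1x8 registers here, so the generated snippet XORs them directly; no Convert_U8_U1/Convert_U1_U8 helpers are needed.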
sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " *p0 = p1 ^ p2;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nand_U8_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nand_U8_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " r = p1 & p2;\n" + " *p0 = ~r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nand_U8_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nand_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p2);\n" + " r = r & p1;\n" + " *p0 = ~r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nand_U8_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = 
VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_Nand_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p1);\n" + " r = r & p2;\n" + " *p0 = ~r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nand_U8_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nand_U8_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r1, r2;\n" + " Convert_U8_U1(&r1, p1);\n" + " Convert_U8_U1(&r2, p2);\n" + " r1 = r1 & r2;\n" + " *p0 = ~r1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nand_U1_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nand_U1_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || 
cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U1x8 r;\n" + " p1 = p1 & p2;\n" + " Convert_U1_U8(&r, p1);\n" + " *p0 = ~r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nand_U1_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nand_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p1);\n" + " *p0 = ~(r & p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nand_U1_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_Nand_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p2);\n" + " *p0 = ~(r & p1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + 
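+			// GPU support is only advertised when built with ENABLE_OPENCL; the GPU_INTEG_R2R flag corresponds to the NODE_OPENCL_TYPE_REG2REG codegen path used above.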
| AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nand_U1_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nand_U1_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " *p0 = ~(p1 & p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nor_U8_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nor_U8_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " *p0 = ~(p1 | p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nor_U8_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nor_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) 
{ + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p2);\n" + " *p0 = ~(r | p1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nor_U8_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_Nor_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p1);\n" + " *p0 = ~(r | p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nor_U8_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nor_U8_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r1, r2;\n" + " Convert_U8_U1(&r1, p1);\n" + " Convert_U8_U1(&r2, p2);\n" + " *p0 = ~(r1 | r2);\n" + "}\n" + ), node->opencl_name); + 
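+		// The generated kernel expands both U1x8 inputs to U8x8 masks via Convert_U8_U1 and computes ~(r1 | r2) on full bytes; the formatted text is appended to the node's OpenCL source below.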
node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nor_U1_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nor_U1_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p1 | p2);\n" + " *p0 = ~r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nor_U1_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nor_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p1);\n" + " *p0 = ~(r | p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nor_U1_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if 
(HafCpu_Nor_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p2);\n" + " *p0 = ~(r | p1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Nor_U1_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Nor_U1_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " *p0 = ~(p1 | p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xnor_U8_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xnor_U8_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + 
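+			// XNOR of two U8x8 registers is emitted as the complement of XOR, ~(p1 ^ p2); being purely bitwise, it needs no per-pixel unpack/pack.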
sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " *p0 = ~(p1 ^ p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xnor_U8_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xnor_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p2);\n" + " *p0 = ~(r ^ p1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xnor_U8_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_Xnor_U8_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " Convert_U8_U1(&r, p1);\n" + " *p0 = ~(r ^ p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xnor_U8_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status 
= VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xnor_U8_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " U8x8 r1, r2;\n" + " Convert_U8_U1(&r1, p1);\n" + " Convert_U8_U1(&r2, p2);\n" + " *p0 = ~(r1 ^ r2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xnor_U1_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xnor_U1_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " Convert_U1_U8(p0, ~(p1 ^ p2));\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xnor_U1_U8U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xnor_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = 
VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1, U1x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p1);\n" + " *p0 = ~(r ^ p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xnor_U1_U1U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[2]; + AgoData * iImg1 = node->paramList[1]; + if (HafCpu_Xnor_U1_U8U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U8x8 p2)\n" + "{\n" + " U1x8 r;\n" + " Convert_U1_U8(&r, p2);\n" + " *p0 = ~(r ^ p1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Xnor_U1_U1U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Xnor_U1_U1U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1, U1x8 p2)\n" + "{\n" + " *p0 = ~(p1 ^ p2);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + 
} + return status; +} + +int agoKernel_AbsDiff_U8_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_AbsDiff_U8_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack(fabs(amd_unpack(p1.s0) - amd_unpack(p2.s0)));\n" + " r.s1 = amd_pack(fabs(amd_unpack(p1.s1) - amd_unpack(p2.s1)));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_AccumulateWeighted_U8_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_float32 alpha = node->paramList[2]->u.scalar.u.f; + if (HafCpu_AccumulateWeighted_U8_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, alpha)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + // Update the valid region + node->paramList[0]->u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + node->paramList[0]->u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + node->paramList[0]->u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + node->paramList[0]->u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type 
= NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1, float p2)\n" + "{\n" + " U8x8 r = *p0;\n" + " float p3 = 1.0f - p2;\n" + " float4 f;\n" + " f.s0 = p3 * amd_unpack0(r.s0) + p2 * amd_unpack0(p1.s0);\n" + " f.s1 = p3 * amd_unpack1(r.s0) + p2 * amd_unpack1(p1.s0);\n" + " f.s2 = p3 * amd_unpack2(r.s0) + p2 * amd_unpack2(p1.s0);\n" + " f.s3 = p3 * amd_unpack3(r.s0) + p2 * amd_unpack3(p1.s0);\n" + " r.s0 = amd_pack(f);\n" + " f.s0 = p3 * amd_unpack0(r.s1) + p2 * amd_unpack0(p1.s1);\n" + " f.s1 = p3 * amd_unpack1(r.s1) + p2 * amd_unpack1(p1.s1);\n" + " f.s2 = p3 * amd_unpack2(r.s1) + p2 * amd_unpack2(p1.s1);\n" + " f.s3 = p3 * amd_unpack3(r.s1) + p2 * amd_unpack3(p1.s1);\n" + " r.s1 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Add_S16_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Add_S16_U8U8(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = ((p1.s0 & 0x000000ff) + (p2.s0 & 0x000000ff));\n" + " r.s0 |= ((p1.s0 & 0x0000ff00) + (p2.s0 & 0x0000ff00)) << 8;\n" + " r.s1 = ((p1.s0 & 0x00ff0000) + (p2.s0 & 0x00ff0000)) >> 16;\n" + " r.s1 |= ((p1.s0 >> 24) + (p2.s0 >> 24)) << 16;\n" + " r.s2 = ((p1.s1 & 0x000000ff) + (p2.s1 & 0x000000ff));\n" + " r.s2 |= ((p1.s1 & 0x0000ff00) + (p2.s1 & 0x0000ff00)) << 8;\n" + " r.s3 = ((p1.s1 & 0x00ff0000) + (p2.s1 & 0x00ff0000)) >> 16;\n" + " r.s3 |= ((p1.s1 >> 24) + (p2.s1 >> 24)) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sub_S16_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Sub_S16_U8U8(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, 
iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = ((p1.s0 & 0x000000ff) - (p2.s0 & 0x000000ff)) & 0x0000ffff;\n" + " r.s0 |= ((p1.s0 & 0x0000ff00) - (p2.s0 & 0x0000ff00)) << 8;\n" + " r.s1 = ((p1.s0 & 0x00ff0000) - (p2.s0 & 0x00ff0000)) >> 16;\n" + " r.s1 |= ((p1.s0 >> 24) - (p2.s0 >> 24)) << 16;\n" + " r.s2 = ((p1.s1 & 0x000000ff) - (p2.s1 & 0x000000ff)) & 0x0000ffff;\n" + " r.s2 |= ((p1.s1 & 0x0000ff00) - (p2.s1 & 0x0000ff00)) << 8;\n" + " r.s3 = ((p1.s1 & 0x00ff0000) - (p2.s1 & 0x00ff0000)) >> 16;\n" + " r.s3 |= ((p1.s1 >> 24) - (p2.s1 >> 24)) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_U8U8_Wrap_Trunc(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_U8U8_Wrap_Trunc(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1, U8x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (((int)(p3 * amd_unpack0(p1.s0) * amd_unpack0(p2.s0))) & 0x0000ffff) ;\n" + " r.s0 |= (((int)(p3 * amd_unpack1(p1.s0) * amd_unpack1(p2.s0))) ) << 16;\n" + " r.s1 = (((int)(p3 * amd_unpack2(p1.s0) * amd_unpack2(p2.s0))) & 0x0000ffff) ;\n" + " r.s1 |= (((int)(p3 * amd_unpack3(p1.s0) * amd_unpack3(p2.s0))) ) << 16;\n" + " r.s2 = (((int)(p3 * amd_unpack0(p1.s1) * amd_unpack0(p2.s1))) & 0x0000ffff) ;\n" + " r.s2 |= (((int)(p3 * amd_unpack1(p1.s1) * amd_unpack1(p2.s1))) ) << 16;\n" + " r.s3 = (((int)(p3 * amd_unpack2(p1.s1) * amd_unpack2(p2.s1))) & 0x0000ffff) ;\n" + " r.s3 |= (((int)(p3 * amd_unpack3(p1.s1) * amd_unpack3(p2.s1))) ) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if 
ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_U8U8_Wrap_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_U8U8_Wrap_Round(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1, U8x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (((int)(p3 * amd_unpack0(p1.s0) * amd_unpack0(p2.s0) + 0.5f)) & 0x0000ffff) ;\n" + " r.s0 |= (((int)(p3 * amd_unpack1(p1.s0) * amd_unpack1(p2.s0) + 0.5f)) ) << 16;\n" + " r.s1 = (((int)(p3 * amd_unpack2(p1.s0) * amd_unpack2(p2.s0) + 0.5f)) & 0x0000ffff) ;\n" + " r.s1 |= (((int)(p3 * amd_unpack3(p1.s0) * amd_unpack3(p2.s0) + 0.5f)) ) << 16;\n" + " r.s2 = (((int)(p3 * amd_unpack0(p1.s1) * amd_unpack0(p2.s1) + 0.5f)) & 0x0000ffff) ;\n" + " r.s2 |= (((int)(p3 * amd_unpack1(p1.s1) * amd_unpack1(p2.s1) + 0.5f)) ) << 16;\n" + " r.s3 = (((int)(p3 * amd_unpack2(p1.s1) * amd_unpack2(p2.s1) + 0.5f)) & 0x0000ffff) ;\n" + " r.s3 |= (((int)(p3 * amd_unpack3(p1.s1) * amd_unpack3(p2.s1) + 0.5f)) ) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_U8U8_Sat_Trunc(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_U8U8_Sat_Trunc(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1, U8x8 p2, float 
p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (((int)(clamp(p3 * amd_unpack0(p1.s0) * amd_unpack0(p2.s0), -32768.0f, 32767.0f))) & 0x0000ffff) ;\n" + " r.s0 |= (((int)(clamp(p3 * amd_unpack1(p1.s0) * amd_unpack1(p2.s0), -32768.0f, 32767.0f))) ) << 16;\n" + " r.s1 = (((int)(clamp(p3 * amd_unpack2(p1.s0) * amd_unpack2(p2.s0), -32768.0f, 32767.0f))) & 0x0000ffff) ;\n" + " r.s1 |= (((int)(clamp(p3 * amd_unpack3(p1.s0) * amd_unpack3(p2.s0), -32768.0f, 32767.0f))) ) << 16;\n" + " r.s2 = (((int)(clamp(p3 * amd_unpack0(p1.s1) * amd_unpack0(p2.s1), -32768.0f, 32767.0f))) & 0x0000ffff) ;\n" + " r.s2 |= (((int)(clamp(p3 * amd_unpack1(p1.s1) * amd_unpack1(p2.s1), -32768.0f, 32767.0f))) ) << 16;\n" + " r.s3 = (((int)(clamp(p3 * amd_unpack2(p1.s1) * amd_unpack2(p2.s1), -32768.0f, 32767.0f))) & 0x0000ffff) ;\n" + " r.s3 |= (((int)(clamp(p3 * amd_unpack3(p1.s1) * amd_unpack3(p2.s1), -32768.0f, 32767.0f))) ) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_U8U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_U8U8_Sat_Round(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1, U8x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (((int)(clamp(p3 * amd_unpack0(p1.s0) * amd_unpack0(p2.s0) + 0.5f, -32768.0f, 32767.0f))) & 0x0000ffff) ;\n" + " r.s0 |= (((int)(clamp(p3 * amd_unpack1(p1.s0) * amd_unpack1(p2.s0) + 0.5f, -32768.0f, 32767.0f))) ) << 16;\n" + " r.s1 = (((int)(clamp(p3 * amd_unpack2(p1.s0) * amd_unpack2(p2.s0) + 0.5f, -32768.0f, 32767.0f))) & 0x0000ffff) ;\n" + " r.s1 |= (((int)(clamp(p3 * amd_unpack3(p1.s0) * amd_unpack3(p2.s0) + 0.5f, -32768.0f, 32767.0f))) ) << 16;\n" + " r.s2 = (((int)(clamp(p3 * amd_unpack0(p1.s1) * amd_unpack0(p2.s1) + 0.5f, -32768.0f, 32767.0f))) & 0x0000ffff) ;\n" + " r.s2 |= (((int)(clamp(p3 * amd_unpack1(p1.s1) * amd_unpack1(p2.s1) + 0.5f, -32768.0f, 32767.0f))) ) << 16;\n" + " r.s3 = (((int)(clamp(p3 * amd_unpack2(p1.s1) * amd_unpack2(p2.s1) + 0.5f, -32768.0f, 32767.0f))) & 0x0000ffff) ;\n" + " r.s3 |= (((int)(clamp(p3 * amd_unpack3(p1.s1) * amd_unpack3(p2.s1) + 0.5f, -32768.0f, 32767.0f))) ) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + 
node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Add_S16_S16U8_Wrap(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Add_S16_S16U8_Wrap(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, U8x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = ((((int)(p1.s0) << 16) >> 16) + ( p2.s0 & 0x000000ff)) & 0x0000ffff;\n" + " r.s0 |= (( p1.s0 & 0xffff0000) + ((p2.s0 << 8) & 0x00ff0000));\n" + " r.s1 = ((((int)(p1.s1) << 16) >> 16) + ((p2.s0 >> 16) & 0x000000ff)) & 0x0000ffff;\n" + " r.s1 |= (( p1.s1 & 0xffff0000) + ((p2.s0 >> 8) & 0x00ff0000));\n" + " r.s2 = ((((int)(p1.s2) << 16) >> 16) + ( p2.s1 & 0x000000ff)) & 0x0000ffff;\n" + " r.s2 |= (( p1.s2 & 0xffff0000) + ((p2.s1 << 8) & 0x00ff0000));\n" + " r.s3 = ((((int)(p1.s3) << 16) >> 16) + ((p2.s1 >> 16) & 0x000000ff)) & 0x0000ffff;\n" + " r.s3 |= (( p1.s3 & 0xffff0000) + ((p2.s1 >> 8) & 0x00ff0000));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Add_S16_S16U8_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Add_S16_S16U8_Sat(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Add_S16_S16U8_Sat(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s Add_S16_S16U8_Sat\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + 
node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Accumulate_S16_S16U8_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Accumulate_S16_S16U8_Sat(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_S16 || node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // Update the valid region + node->paramList[0]->u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + node->paramList[0]->u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + node->paramList[0]->u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + node->paramList[0]->u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_Add_S16_S16U8_Sat(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1)\n" + "{\n" + " Add_S16_S16U8_Sat (p0, *p0, p1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sub_S16_S16U8_Wrap(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Sub_S16_S16U8_Wrap(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = 
NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, U8x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = ((((int)(p1.s0) << 16) >> 16) - ( p2.s0 & 0x000000ff)) & 0x0000ffff;\n" + " r.s0 |= (( p1.s0 & 0xffff0000) - ((p2.s0 << 8) & 0x00ff0000));\n" + " r.s1 = ((((int)(p1.s1) << 16) >> 16) - ((p2.s0 >> 16) & 0x000000ff)) & 0x0000ffff;\n" + " r.s1 |= (( p1.s1 & 0xffff0000) - ((p2.s0 >> 8) & 0x00ff0000));\n" + " r.s2 = ((((int)(p1.s2) << 16) >> 16) - ( p2.s1 & 0x000000ff)) & 0x0000ffff;\n" + " r.s2 |= (( p1.s2 & 0xffff0000) - ((p2.s1 << 8) & 0x00ff0000));\n" + " r.s3 = ((((int)(p1.s3) << 16) >> 16) - ((p2.s1 >> 16) & 0x000000ff)) & 0x0000ffff;\n" + " r.s3 |= (( p1.s3 & 0xffff0000) - ((p2.s1 >> 8) & 0x00ff0000));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sub_S16_S16U8_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Sub_S16_S16U8_Sat(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, U8x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (int)(clamp((float)(((int)(p1.s0) << 16) >> 16) - amd_unpack0(p2.s0), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s0 |= (int)(clamp((float)( (int)(p1.s0) >> 16) - amd_unpack1(p2.s0), -32768.0f, 32767.0f)) << 16;\n" + " r.s1 = (int)(clamp((float)(((int)(p1.s1) << 16) >> 16) - amd_unpack2(p2.s0), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s1 |= (int)(clamp((float)( (int)(p1.s1) >> 16) - amd_unpack3(p2.s0), -32768.0f, 32767.0f)) << 16;\n" + " r.s2 = (int)(clamp((float)(((int)(p1.s2) << 16) >> 16) - amd_unpack0(p2.s1), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s2 |= (int)(clamp((float)( (int)(p1.s2) >> 16) - amd_unpack1(p2.s1), -32768.0f, 32767.0f)) << 16;\n" + " r.s3 = (int)(clamp((float)(((int)(p1.s3) << 16) >> 16) - amd_unpack2(p2.s1), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s3 |= (int)(clamp((float)( (int)(p1.s3) >> 16) - amd_unpack3(p2.s1), -32768.0f, 32767.0f)) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int 
agoKernel_Mul_S16_S16U8_Wrap_Trunc(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_S16U8_Wrap_Trunc(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, U8x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (((int)(p3 * (float)(((int)(p1.s0) << 16) >> 16) * amd_unpack0(p2.s0))) & 0x0000ffff) ;\n" + " r.s0 |= (((int)(p3 * (float)( (int)(p1.s0) >> 16) * amd_unpack1(p2.s0))) ) << 16;\n" + " r.s1 = (((int)(p3 * (float)(((int)(p1.s1) << 16) >> 16) * amd_unpack2(p2.s0))) & 0x0000ffff) ;\n" + " r.s1 |= (((int)(p3 * (float)( (int)(p1.s1) >> 16) * amd_unpack3(p2.s0))) ) << 16;\n" + " r.s2 = (((int)(p3 * (float)(((int)(p1.s2) << 16) >> 16) * amd_unpack0(p2.s1))) & 0x0000ffff) ;\n" + " r.s2 |= (((int)(p3 * (float)( (int)(p1.s2) >> 16) * amd_unpack1(p2.s1))) ) << 16;\n" + " r.s3 = (((int)(p3 * (float)(((int)(p1.s3) << 16) >> 16) * amd_unpack2(p2.s1))) & 0x0000ffff) ;\n" + " r.s3 |= (((int)(p3 * (float)( (int)(p1.s3) >> 16) * amd_unpack3(p2.s1))) ) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_S16U8_Wrap_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_S16U8_Wrap_Round(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, U8x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = 
(((int)convert_short_rte(p3 * (float)(((int)(p1.s0) << 16) >> 16) * amd_unpack0(p2.s0))) & 0x0000ffff) ;\n" + " r.s0 |= (((int)convert_short_rte(p3 * (float)( (int)(p1.s0) >> 16) * amd_unpack1(p2.s0))) ) << 16;\n" + " r.s1 = (((int)convert_short_rte(p3 * (float)(((int)(p1.s1) << 16) >> 16) * amd_unpack2(p2.s0))) & 0x0000ffff) ;\n" + " r.s1 |= (((int)convert_short_rte(p3 * (float)( (int)(p1.s1) >> 16) * amd_unpack3(p2.s0))) ) << 16;\n" + " r.s2 = (((int)convert_short_rte(p3 * (float)(((int)(p1.s2) << 16) >> 16) * amd_unpack0(p2.s1))) & 0x0000ffff) ;\n" + " r.s2 |= (((int)convert_short_rte(p3 * (float)( (int)(p1.s2) >> 16) * amd_unpack1(p2.s1))) ) << 16;\n" + " r.s3 = (((int)convert_short_rte(p3 * (float)(((int)(p1.s3) << 16) >> 16) * amd_unpack2(p2.s1))) & 0x0000ffff) ;\n" + " r.s3 |= (((int)convert_short_rte(p3 * (float)( (int)(p1.s3) >> 16) * amd_unpack3(p2.s1))) ) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_S16U8_Sat_Trunc(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_S16U8_Sat_Trunc(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, U8x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " float f;\n" + " f = clamp(p3 * (float)(((int)(p1.s0) << 16) >> 16) * amd_unpack0(p2.s0), -32768.0f, 32767.0f); r.s0 = ((int)(f) & 0x0000ffff) ;\n" + " f = clamp(p3 * (float)( (int)(p1.s0) >> 16) * amd_unpack1(p2.s0), -32768.0f, 32767.0f); r.s0 |= ((int)(f) ) << 16;\n" + " f = clamp(p3 * (float)(((int)(p1.s1) << 16) >> 16) * amd_unpack2(p2.s0), -32768.0f, 32767.0f); r.s1 = ((int)(f) & 0x0000ffff) ;\n" + " f = clamp(p3 * (float)( (int)(p1.s1) >> 16) * amd_unpack3(p2.s0), -32768.0f, 32767.0f); r.s1 |= ((int)(f) ) << 16;\n" + " f = clamp(p3 * (float)(((int)(p1.s2) << 16) >> 16) * amd_unpack0(p2.s1), -32768.0f, 32767.0f); r.s2 = ((int)(f) & 0x0000ffff) ;\n" + " f = clamp(p3 * (float)( (int)(p1.s2) >> 16) * amd_unpack1(p2.s1), -32768.0f, 32767.0f); r.s2 |= ((int)(f) ) << 16;\n" + " f = clamp(p3 * (float)(((int)(p1.s3) << 16) >> 16) * amd_unpack2(p2.s1), -32768.0f, 32767.0f); r.s3 = ((int)(f) & 0x0000ffff) ;\n" + " f = clamp(p3 * (float)( (int)(p1.s3) >> 16) * amd_unpack3(p2.s1), -32768.0f, 32767.0f); r.s3 |= ((int)(f) ) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + 
node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_S16U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_S16U8_Sat_Round(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, U8x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (((int)(convert_short_sat_rte(p3 * (float)(((int)(p1.s0) << 16) >> 16) * amd_unpack0(p2.s0)))) & 0x0000ffff) ;\n" + " r.s0 |= (((int)(convert_short_sat_rte(p3 * (float)((int)(p1.s0) >> 16) * amd_unpack1(p2.s0)))) ) << 16;\n" + " r.s1 = (((int)(convert_short_sat_rte(p3 * (float)(((int)(p1.s1) << 16) >> 16) * amd_unpack2(p2.s0)))) & 0x0000ffff) ;\n" + " r.s1 |= (((int)(convert_short_sat_rte(p3 * (float)((int)(p1.s1) >> 16) * amd_unpack3(p2.s0))))) << 16;\n" + " r.s2 = (((int)(convert_short_sat_rte(p3 * (float)(((int)(p1.s2) << 16) >> 16) * amd_unpack0(p2.s1)))) & 0x0000ffff) ;\n" + " r.s2 |= (((int)(convert_short_sat_rte(p3 * (float)((int)(p1.s2) >> 16) * amd_unpack1(p2.s1))))) << 16;\n" + " r.s3 = (((int)(convert_short_sat_rte(p3 * (float)(((int)(p1.s3) << 16) >> 16) * amd_unpack2(p2.s1)))) & 0x0000ffff) ;\n" + " r.s3 |= (((int)(convert_short_sat_rte(p3 * (float)((int)(p1.s3) >> 16) * amd_unpack3(p2.s1))))) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_AccumulateSquared_S16_S16U8_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_uint32 shift = node->paramList[2]->u.scalar.u.u; + if (HafCpu_AccumulateSquared_S16_S16U8_Sat(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, shift)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[0]->u.img.width; + vx_uint32 height = node->paramList[0]->u.img.height; + 
if (node->paramList[0]->u.img.format != VX_DF_IMAGE_S16 || node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != node->paramList[1]->u.img.width || height != node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.scalar.type != VX_TYPE_UINT32) + return VX_ERROR_INVALID_TYPE; + // Update the valid region + node->paramList[0]->u.img.rect_valid.start_x = max(node->paramList[0]->u.img.rect_valid.start_x, node->paramList[1]->u.img.rect_valid.start_x); + node->paramList[0]->u.img.rect_valid.start_y = max(node->paramList[0]->u.img.rect_valid.start_y, node->paramList[1]->u.img.rect_valid.start_y); + node->paramList[0]->u.img.rect_valid.end_x = min(node->paramList[0]->u.img.rect_valid.end_x, node->paramList[1]->u.img.rect_valid.end_x); + node->paramList[0]->u.img.rect_valid.end_y = min(node->paramList[0]->u.img.rect_valid.end_y, node->paramList[1]->u.img.rect_valid.end_y); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p2, uint p3)\n" + "{\n" + " S16x8 p1 = *p0;\n" + " S16x8 r; int i;\n" + " i = (p2.s0 ) & 255; i *= i; i >>= p3; i += (p1.s0 & 0xffff); i = clamp(i, -32768, 32767); r.s0 = i & 0xffff;\n" + " i = (p2.s0 >> 8) & 255; i *= i; i >>= p3; i += (p1.s0 >> 16); i = clamp(i, -32768, 32767); r.s0 |= i << 16;\n" + " i = (p2.s0 >> 16) & 255; i *= i; i >>= p3; i += (p1.s1 & 0xffff); i = clamp(i, -32768, 32767); r.s1 = i & 0xffff;\n" + " i = (p2.s0 >> 24) & 255; i *= i; i >>= p3; i += (p1.s1 >> 16); i = clamp(i, -32768, 32767); r.s1 |= i << 16;\n" + " i = (p2.s1 ) & 255; i *= i; i >>= p3; i += (p1.s2 & 0xffff); i = clamp(i, -32768, 32767); r.s2 = i & 0xffff;\n" + " i = (p2.s1 >> 8) & 255; i *= i; i >>= p3; i += (p1.s2 >> 16); i = clamp(i, -32768, 32767); r.s2 |= i << 16;\n" + " i = (p2.s1 >> 16) & 255; i *= i; i >>= p3; i += (p1.s3 & 0xffff); i = clamp(i, -32768, 32767); r.s3 = i & 0xffff;\n" + " i = (p2.s1 >> 24) & 255; i *= i; i >>= p3; i += (p1.s3 >> 16); i = clamp(i, -32768, 32767); r.s3 |= i << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sub_S16_U8S16_Wrap(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Sub_S16_U8S16_Wrap(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = 
VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1, S16x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (( p1.s0 & 0x000000ff) - (((int)(p2.s0) << 16) >> 16)) & 0x0000ffff;\n" + " r.s0 |= (((p1.s0 << 8) & 0x00ff0000) - ( p2.s0 & 0xffff0000));\n" + " r.s1 = (((p1.s0 >> 16) & 0x000000ff) - (((int)(p2.s1) << 16) >> 16)) & 0x0000ffff;\n" + " r.s1 |= (((p1.s0 >> 8) & 0x00ff0000) - ( p2.s1 & 0xffff0000));\n" + " r.s2 = (( p1.s1 & 0x000000ff) - (((int)(p2.s2) << 16) >> 16)) & 0x0000ffff;\n" + " r.s2 |= (((p1.s1 << 8) & 0x00ff0000) - ( p2.s2 & 0xffff0000));\n" + " r.s3 = (((p1.s1 >> 16) & 0x000000ff) - (((int)(p2.s3) << 16) >> 16)) & 0x0000ffff;\n" + " r.s3 |= (((p1.s1 >> 8) & 0x00ff0000) - ( p2.s3 & 0xffff0000));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sub_S16_U8S16_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Sub_S16_U8S16_Sat(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + //NOTE: Check line 6489 from line 1014 in ago_haf_gpu_elemwise.cpp + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, U8x8 p1, S16x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (int)(clamp(amd_unpack0(p1.s0) - (float)(((int)(p2.s0) << 16) >> 16), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s0 |= (int)(clamp(amd_unpack1(p1.s0) - (float)( (int)(p2.s0) >> 16), -32768.0f, 32767.0f)) << 16;\n" + " r.s1 = (int)(clamp(amd_unpack2(p1.s0) - (float)(((int)(p2.s1) << 16) >> 16), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s1 |= (int)(clamp(amd_unpack3(p1.s0) - (float)( (int)(p2.s1) >> 16), -32768.0f, 32767.0f)) << 16;\n" + " r.s2 = (int)(clamp(amd_unpack0(p1.s1) - (float)(((int)(p2.s2) << 16) >> 16), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s2 |= (int)(clamp(amd_unpack1(p1.s1) - (float)( (int)(p2.s2) >> 16), -32768.0f, 32767.0f)) << 16;\n" + " r.s3 = (int)(clamp(amd_unpack2(p1.s1) - (float)(((int)(p2.s3) << 16) >> 16), -32768.0f, 32767.0f)) & 0x0000ffff;\n" + " r.s3 |= (int)(clamp(amd_unpack3(p1.s1) - (float)( (int)(p2.s3) >> 16), -32768.0f, 32767.0f)) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | 
AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_AbsDiff_S16_S16S16_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_AbsDiff_S16_S16S16_Sat(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_AbsDiff_S16_S16S16_Sat(node->opencl_code); + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + char item[128]; + if (iImg1->u.img.isUniform && !iImg0->u.img.isUniform) { + // avoid having to read constant uniform image for AbsDiff (users might do this for Abs operation) + node->opencl_param_discard_mask = (1 << 2); + sprintf(item, "#define %s(p0,p1) AbsDiff_S16_S16S16_Sat(p0,p1,(S16x8)(%d))\n", node->opencl_name, (int)iImg1->u.img.uniform[0]); + node->opencl_code += item; + } + else if(iImg0->u.img.isUniform && !iImg1->u.img.isUniform) { + // avoid having to read constant uniform image for AbsDiff (users might do this for Abs operation) + node->opencl_param_discard_mask = (1 << 1); + sprintf(item, "#define %s(p0,p2) AbsDiff_S16_S16S16_Sat(p0,p2,(S16x8)(%d))\n", node->opencl_name, (int)iImg0->u.img.uniform[0]); + node->opencl_code += item; + } + else { + sprintf(item, "#define %s(p0,p1,p2) AbsDiff_S16_S16S16_Sat(p0,p1,p2)\n", node->opencl_name); + node->opencl_code += item; + } + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Add_S16_S16S16_Wrap(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Add_S16_S16S16_Wrap(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 
p1, S16x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (p1.s0 + p2.s0 ) & 0x0000ffff;\n" + " r.s0 |= (p1.s0 + (p2.s0 & 0xffff0000)) & 0xffff0000;\n" + " r.s1 = (p1.s1 + p2.s1 ) & 0x0000ffff;\n" + " r.s1 |= (p1.s1 + (p2.s1 & 0xffff0000)) & 0xffff0000;\n" + " r.s2 = (p1.s2 + p2.s2 ) & 0x0000ffff;\n" + " r.s2 |= (p1.s2 + (p2.s2 & 0xffff0000)) & 0xffff0000;\n" + " r.s3 = (p1.s3 + p2.s3 ) & 0x0000ffff;\n" + " r.s3 |= (p1.s3 + (p2.s3 & 0xffff0000)) & 0xffff0000;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Add_S16_S16S16_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Add_S16_S16S16_Sat(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, S16x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = clamp((((int)(p1.s0) << 16) >> 16) + (((int)(p2.s0) << 16) >> 16), -32768, 32767) & 0x0000ffff;\n" + " r.s0 |= clamp(( (int)(p1.s0) >> 16) + ( (int)(p2.s0) >> 16), -32768, 32767) << 16;\n" + " r.s1 = clamp((((int)(p1.s1) << 16) >> 16) + (((int)(p2.s1) << 16) >> 16), -32768, 32767) & 0x0000ffff;\n" + " r.s1 |= clamp(( (int)(p1.s1) >> 16) + ( (int)(p2.s1) >> 16), -32768, 32767) << 16;\n" + " r.s2 = clamp((((int)(p1.s2) << 16) >> 16) + (((int)(p2.s2) << 16) >> 16), -32768, 32767) & 0x0000ffff;\n" + " r.s2 |= clamp(( (int)(p1.s2) >> 16) + ( (int)(p2.s2) >> 16), -32768, 32767) << 16;\n" + " r.s3 = clamp((((int)(p1.s3) << 16) >> 16) + (((int)(p2.s3) << 16) >> 16), -32768, 32767) & 0x0000ffff;\n" + " r.s3 |= clamp(( (int)(p1.s3) >> 16) + ( (int)(p2.s3) >> 16), -32768, 32767) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sub_S16_S16S16_Wrap(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Sub_S16_S16S16_Wrap(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 
*)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, S16x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (p1.s0 - p2.s0 ) & 0x0000ffff;\n" + " r.s0 |= (p1.s0 - (p2.s0 & 0xffff0000)) & 0xffff0000;\n" + " r.s1 = (p1.s1 - p2.s1 ) & 0x0000ffff;\n" + " r.s1 |= (p1.s1 - (p2.s1 & 0xffff0000)) & 0xffff0000;\n" + " r.s2 = (p1.s2 - p2.s2 ) & 0x0000ffff;\n" + " r.s2 |= (p1.s2 - (p2.s2 & 0xffff0000)) & 0xffff0000;\n" + " r.s3 = (p1.s3 - p2.s3 ) & 0x0000ffff;\n" + " r.s3 |= (p1.s3 - (p2.s3 & 0xffff0000)) & 0xffff0000;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sub_S16_S16S16_Sat(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Sub_S16_S16S16_Sat(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, S16x8 p2)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = clamp((((int)(p1.s0) << 16) >> 16) - (((int)(p2.s0) << 16) >> 16), -32768, 32767) & 0x0000ffff;\n" + " r.s0 |= clamp(( (int)(p1.s0) >> 16) - ( (int)(p2.s0) >> 16), -32768, 32767) << 16;\n" + " r.s1 = clamp((((int)(p1.s1) << 16) >> 16) - (((int)(p2.s1) << 16) >> 16), -32768, 32767) & 0x0000ffff;\n" + " r.s1 |= clamp(( (int)(p1.s1) >> 16) - ( (int)(p2.s1) >> 16), -32768, 32767) << 16;\n" + " r.s2 = clamp((((int)(p1.s2) << 16) >> 16) - (((int)(p2.s2) << 16) >> 16), -32768, 32767) & 0x0000ffff;\n" + " r.s2 |= clamp(( (int)(p1.s2) >> 16) - ( (int)(p2.s2) >> 16), -32768, 32767) << 16;\n" + " r.s3 = clamp((((int)(p1.s3) << 16) >> 16) - (((int)(p2.s3) << 16) >> 16), -32768, 32767) & 0x0000ffff;\n" + " r.s3 |= clamp(( (int)(p1.s3) >> 16) - ( (int)(p2.s3) >> 16), -32768, 32767) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | 
AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_S16S16_Wrap_Trunc(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_S16S16_Wrap_Trunc(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, S16x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = ((int)(p3 * (double)((((int)(p1.s0)) << 16) >> 16) * (double)((((int)(p2.s0)) << 16) >> 16))) & 0x0000ffff;\n" + " r.s0 |= ((int)(p3 * (double)(( (int)(p1.s0)) >> 16) * (double)(( (int)(p2.s0)) >> 16))) << 16;\n" + " r.s1 = ((int)(p3 * (double)((((int)(p1.s1)) << 16) >> 16) * (double)((((int)(p2.s1)) << 16) >> 16))) & 0x0000ffff;\n" + " r.s1 |= ((int)(p3 * (double)(( (int)(p1.s1)) >> 16) * (double)(( (int)(p2.s1)) >> 16))) << 16;\n" + " r.s2 = ((int)(p3 * (double)((((int)(p1.s2)) << 16) >> 16) * (double)((((int)(p2.s2)) << 16) >> 16))) & 0x0000ffff;\n" + " r.s2 |= ((int)(p3 * (double)(( (int)(p1.s2)) >> 16) * (double)(( (int)(p2.s2)) >> 16))) << 16;\n" + " r.s3 = ((int)(p3 * (double)((((int)(p1.s3)) << 16) >> 16) * (double)((((int)(p2.s3)) << 16) >> 16))) & 0x0000ffff;\n" + " r.s3 |= ((int)(p3 * (double)(( (int)(p1.s3)) >> 16) * (double)(( (int)(p2.s3)) >> 16))) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_S16S16_Wrap_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_S16S16_Wrap_Round(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) 
{ + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, S16x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = ((int)convert_short_rte(p3 * (float)((((int)(p1.s0)) << 16) >> 16) * (float)((((int)(p2.s0)) << 16) >> 16)) & 0x0000ffff) ;\n" + " r.s0 |= ((int)convert_short_rte(p3 * (float)(((int)(p1.s0)) >> 16) * (float)(((int)(p2.s0)) >> 16)) ) << 16;\n" + " r.s1 = ((int)convert_short_rte(p3 * (float)((((int)(p1.s1)) << 16) >> 16) * (float)((((int)(p2.s1)) << 16) >> 16)) & 0x0000ffff);\n" + " r.s1 |= ((int)convert_short_rte(p3 * (float)(((int)(p1.s1)) >> 16) * (float)(((int)(p2.s1)) >> 16)) ) << 16;\n" + " r.s2 = ((int)convert_short_rte(p3 * (float)((((int)(p1.s2)) << 16) >> 16) * (float)((((int)(p2.s2)) << 16) >> 16)) & 0x0000ffff) ;\n" + " r.s2 |= ((int)convert_short_rte(p3 * (float)(((int)(p1.s2)) >> 16) * (float)(((int)(p2.s2)) >> 16)) ) << 16;\n" + " r.s3 = ((int)convert_short_rte(p3 * (float)((((int)(p1.s3)) << 16) >> 16) * (float)((((int)(p2.s3)) << 16) >> 16)) & 0x0000ffff) ;\n" + " r.s3 |= ((int)convert_short_rte(p3 * (float)(((int)(p1.s3)) >> 16) * (float)(((int)(p2.s3)) >> 16)) ) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_S16S16_Sat_Trunc(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_S16S16_Sat_Trunc(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, S16x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (((int)clamp((p3 * (float)((((int)(p1.s0)) << 16) >> 16) * (float)((((int)(p2.s0)) << 16) >> 16)), -32768.0f, 32767.0f)) & 0x0000ffff) ;\n" + " r.s0 |= (((int)clamp((p3 * (float)(( (int)(p1.s0)) >> 16) * (float)(( (int)(p2.s0)) >> 16)), -32768.0f, 32767.0f)) ) << 16;\n" + " r.s1 = (((int)clamp((p3 * (float)((((int)(p1.s1)) << 16) >> 16) * (float)((((int)(p2.s1)) << 16) >> 16)), -32768.0f, 32767.0f)) & 0x0000ffff) ;\n" + " r.s1 |= (((int)clamp((p3 * (float)(( (int)(p1.s1)) >> 16) * (float)(( (int)(p2.s1)) >> 16)), -32768.0f, 32767.0f)) ) << 16;\n" + " r.s2 = (((int)clamp((p3 * (float)((((int)(p1.s2)) << 16) >> 16) * 
(float)((((int)(p2.s2)) << 16) >> 16)), -32768.0f, 32767.0f)) & 0x0000ffff) ;\n" + " r.s2 |= (((int)clamp((p3 * (float)(( (int)(p1.s2)) >> 16) * (float)(( (int)(p2.s2)) >> 16)), -32768.0f, 32767.0f)) ) << 16;\n" + " r.s3 = (((int)clamp((p3 * (float)((((int)(p1.s3)) << 16) >> 16) * (float)((((int)(p2.s3)) << 16) >> 16)), -32768.0f, 32767.0f)) & 0x0000ffff) ;\n" + " r.s3 |= (((int)clamp((p3 * (float)(( (int)(p1.s3)) >> 16) * (float)(( (int)(p2.s3)) >> 16)), -32768.0f, 32767.0f)) ) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_S16_S16S16_Sat_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + vx_float32 scale = node->paramList[3]->u.scalar.u.f; + if (HafCpu_Mul_S16_S16S16_Sat_Round(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes, scale)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, S16x8 p2, float p3)\n" + "{\n" + " S16x8 r;\n" + " r.s0 = (((int)convert_short_sat_rte(p3 * (float)((((int)(p1.s0)) << 16) >> 16) * (float)((((int)(p2.s0)) << 16) >> 16))) & 0x0000ffff) ;\n" + " r.s0 |= (((int)convert_short_sat_rte(p3 * (float)(((int)(p1.s0)) >> 16) * (float)(( (int)(p2.s0)) >> 16) ))) << 16;\n" + " r.s1 = (((int)convert_short_sat_rte(p3 * (float)((((int)(p1.s1)) << 16) >> 16) * (float)((((int)(p2.s1)) << 16) >> 16))) & 0x0000ffff) ;\n" + " r.s1 |= (((int)convert_short_sat_rte(p3 * (float)(((int)(p1.s1)) >> 16) * (float)(( (int)(p2.s1)) >> 16) ))) << 16;\n" + " r.s2 = (((int)convert_short_sat_rte(p3 * (float)((((int)(p1.s2)) << 16) >> 16) * (float)((((int)(p2.s2)) << 16) >> 16))) & 0x0000ffff) ;\n" + " r.s2 |= (((int)convert_short_sat_rte(p3 * (float)(((int)(p1.s2)) >> 16) * (float)(( (int)(p2.s2)) >> 16) ))) << 16;\n" + " r.s3 = (((int)convert_short_sat_rte(p3 * (float)((((int)(p1.s3)) << 16) >> 16) * (float)((((int)(p2.s3)) << 16) >> 16))) & 0x0000ffff) ;\n" + " r.s3 |= (((int)convert_short_sat_rte(p3 * (float)(((int)(p1.s3)) >> 16) * (float)(( (int)(p2.s3)) >> 16) ))) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Magnitude_S16_S16S16(AgoNode * node, 
AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if (HafCpu_Magnitude_S16_S16S16(oImg->u.img.width, oImg->u.img.height, (vx_int16 *)oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (S16x8 * p0, S16x8 p1, S16x8 p2)\n" + "{\n" + " S16x8 r;\n" + " float2 f;\n" + " f.s0 = (float)((((int)(p1.s0)) << 16) >> 16); f.s1 = (float)((((int)(p2.s0)) << 16) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s0 = (uint)(f.s0);\n" + " f.s0 = (float)(( (int)(p1.s0)) >> 16); f.s1 = (float)(( (int)(p2.s0)) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s0 |= (uint)(f.s0) << 16;\n" + " f.s0 = (float)((((int)(p1.s1)) << 16) >> 16); f.s1 = (float)((((int)(p2.s1)) << 16) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s1 = (uint)(f.s0);\n" + " f.s0 = (float)(( (int)(p1.s1)) >> 16); f.s1 = (float)(( (int)(p2.s1)) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s1 |= (uint)(f.s0) << 16;\n" + " f.s0 = (float)((((int)(p1.s2)) << 16) >> 16); f.s1 = (float)((((int)(p2.s2)) << 16) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s2 = (uint)(f.s0);\n" + " f.s0 = (float)(( (int)(p1.s2)) >> 16); f.s1 = (float)(( (int)(p2.s2)) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s2 |= (uint)(f.s0) << 16;\n" + " f.s0 = (float)((((int)(p1.s3)) << 16) >> 16); f.s1 = (float)((((int)(p2.s3)) << 16) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s3 = (uint)(f.s0);\n" + " f.s0 = (float)(( (int)(p1.s3)) >> 16); f.s1 = (float)(( (int)(p2.s3)) >> 16); f.s0 *= f.s0; f.s0 = mad(f.s1, f.s1, f.s0); f.s0 = native_sqrt(f.s0); f.s0 = min(f.s0 + 0.5f, 32767.0f); r.s3 |= (uint)(f.s0) << 16;\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Phase_U8_S16S16(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg0 = node->paramList[1]; + AgoData * iImg1 = node->paramList[2]; + if 
(HafCpu_Phase_U8_S16S16(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, (vx_int16 *)iImg0->buffer, iImg0->u.img.stride_in_bytes, (vx_int16 *)iImg1->buffer, iImg1->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, S16x8 p1, S16x8 p2)\n" + "{\n" + " U8x8 r;\n" + " float2 f; float4 p4;\n" + " f.s0 = (float)((((int)(p1.s0)) << 16) >> 16); f.s1 = (float)((((int)(p2.s0)) << 16) >> 16); p4.s0 = atan2pi(f.s1, f.s0); p4.s0 += (p4.s0 < 0.0) ? 2.0f : 0.0; p4.s0 *= 128.0f;\n" + " f.s0 = (float)(( (int)(p1.s0)) >> 16); f.s1 = (float)(( (int)(p2.s0)) >> 16); p4.s1 = atan2pi(f.s1, f.s0); p4.s1 += (p4.s1 < 0.0) ? 2.0f : 0.0; p4.s1 *= 128.0f;\n" + " f.s0 = (float)((((int)(p1.s1)) << 16) >> 16); f.s1 = (float)((((int)(p2.s1)) << 16) >> 16); p4.s2 = atan2pi(f.s1, f.s0); p4.s2 += (p4.s2 < 0.0) ? 2.0f : 0.0; p4.s2 *= 128.0f;\n" + " f.s0 = (float)(( (int)(p1.s1)) >> 16); f.s1 = (float)(( (int)(p2.s1)) >> 16); p4.s3 = atan2pi(f.s1, f.s0); p4.s3 += (p4.s3 < 0.0) ? 2.0f : 0.0; p4.s3 *= 128.0f;\n" + " p4 = select(p4, (float4) 0.0f, p4 > 255.5f);\n" + " r.s0 = amd_pack(p4);\n" + " f.s0 = (float)((((int)(p1.s2)) << 16) >> 16); f.s1 = (float)((((int)(p2.s2)) << 16) >> 16); p4.s0 = atan2pi(f.s1, f.s0); p4.s0 += (p4.s0 < 0.0) ? 2.0f : 0.0; p4.s0 *= 128.0f;\n" + " f.s0 = (float)(( (int)(p1.s2)) >> 16); f.s1 = (float)(( (int)(p2.s2)) >> 16); p4.s1 = atan2pi(f.s1, f.s0); p4.s1 += (p4.s1 < 0.0) ? 2.0f : 0.0; p4.s1 *= 128.0f;\n" + " f.s0 = (float)((((int)(p1.s3)) << 16) >> 16); f.s1 = (float)((((int)(p2.s3)) << 16) >> 16); p4.s2 = atan2pi(f.s1, f.s0); p4.s2 += (p4.s2 < 0.0) ? 2.0f : 0.0; p4.s2 *= 128.0f;\n" + " f.s0 = (float)(( (int)(p1.s3)) >> 16); f.s1 = (float)(( (int)(p2.s3)) >> 16); p4.s3 = atan2pi(f.s1, f.s0); p4.s3 += (p4.s3 < 0.0) ? 
2.0f : 0.0; p4.s3 *= 128.0f;\n" + " p4 = select(p4, (float4) 0.0f, p4 > 255.5f);\n" + " r.s1 = amd_pack(p4);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelCopy_U8_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelCopy_U8_U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 p1)\n" + "{\n" + " *p0 = p1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelCopy_U8_U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelCopy_U8_U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U1x8 p1)\n" + "{\n" + " Convert_U8_U1(p0, p1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelCopy_U1_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelCopy_U1_U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + 
} + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U8x8 p1)\n" + "{\n" + " Convert_U1_U8(p0, p1);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelCopy_U1_U1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelCopy_U1_U1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U1x8 * p0, U1x8 p1)\n" + "{\n" + " *p0 = p1;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8_U16_Pos0(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelExtract_U8_U16_Pos0(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U16 && node->paramList[1]->u.img.format != VX_DF_IMAGE_YUYV && node->paramList[1]->u.img.format != VX_DF_IMAGE_UYVY) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + 
meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U16x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack0(p1.s0), amd_unpack2(p1.s0), amd_unpack0(p1.s1), amd_unpack2(p1.s1)));\n" + " r.s1 = amd_pack((float4)(amd_unpack0(p1.s2), amd_unpack2(p1.s2), amd_unpack0(p1.s3), amd_unpack2(p1.s3)));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8_U16_Pos1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelExtract_U8_U16_Pos1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U16 && node->paramList[1]->u.img.format != VX_DF_IMAGE_YUYV && node->paramList[1]->u.img.format != VX_DF_IMAGE_UYVY) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U16x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack1(p1.s0), amd_unpack3(p1.s0), amd_unpack1(p1.s1), amd_unpack3(p1.s1)));\n" + " r.s1 = amd_pack((float4)(amd_unpack1(p1.s2), amd_unpack3(p1.s2), amd_unpack1(p1.s3), amd_unpack3(p1.s3)));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status 
= VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8_U24_Pos0(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelExtract_U8_U24_Pos0(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGB); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U24_Pos0(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ChannelExtract_U8_U24_Pos0\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8_U24_Pos1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelExtract_U8_U24_Pos1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGB); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U24_Pos1(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ChannelExtract_U8_U24_Pos1\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8_U24_Pos2(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelExtract_U8_U24_Pos2(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGB); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) 
{ + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U24_Pos2(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ChannelExtract_U8_U24_Pos2\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8_U32_Pos0(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelExtract_U8_U32_Pos0(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + vx_df_image format = node->paramList[1]->u.img.format; + if (format != VX_DF_IMAGE_RGBX && format != VX_DF_IMAGE_UYVY) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> (format == VX_DF_IMAGE_RGBX ? 0 : 1); + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + AgoData * iImg = node->paramList[1]; + if (iImg->u.img.format == VX_DF_IMAGE_RGBX){ + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos0(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ChannelExtract_U8_U32_Pos0\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } + else if (iImg->u.img.format == VX_DF_IMAGE_UYVY) + status = HafGpu_ChannelExtract_U8_U32(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + AgoData * iImg = node->paramList[1]; + if (iImg->u.img.format == VX_DF_IMAGE_RGBX) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + } + else if (iImg->u.img.format == VX_DF_IMAGE_UYVY) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + } + + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8_U32_Pos1(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg 
= node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelExtract_U8_U32_Pos1(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + vx_df_image format = node->paramList[1]->u.img.format; + if (format != VX_DF_IMAGE_RGBX && format != VX_DF_IMAGE_YUYV) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> (format == VX_DF_IMAGE_RGBX ? 0 : 1); + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + AgoData * iImg = node->paramList[1]; + if (iImg->u.img.format == VX_DF_IMAGE_RGBX){ + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos1(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ChannelExtract_U8_U32_Pos1\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } + else if (iImg->u.img.format == VX_DF_IMAGE_YUYV) + status = HafGpu_ChannelExtract_U8_U32(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + AgoData * iImg = node->paramList[1]; + if (iImg->u.img.format == VX_DF_IMAGE_RGBX) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + } + else if (iImg->u.img.format == VX_DF_IMAGE_YUYV) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + } + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8_U32_Pos2(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelExtract_U8_U32_Pos2(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + vx_df_image format = node->paramList[1]->u.img.format; + if (format != VX_DF_IMAGE_RGBX && format != VX_DF_IMAGE_UYVY) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + 
meta->data.u.img.width = width >> (format == VX_DF_IMAGE_RGBX ? 0 : 1); + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + AgoData * iImg = node->paramList[1]; + if (iImg->u.img.format == VX_DF_IMAGE_RGBX){ + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos2(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ChannelExtract_U8_U32_Pos2\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } + else if (iImg->u.img.format == VX_DF_IMAGE_UYVY) + status = HafGpu_ChannelExtract_U8_U32(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + AgoData * iImg = node->paramList[1]; + if (iImg->u.img.format == VX_DF_IMAGE_RGBX) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + } + else if (iImg->u.img.format == VX_DF_IMAGE_UYVY) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + } + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8_U32_Pos3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ChannelExtract_U8_U32_Pos3(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + vx_df_image format = node->paramList[1]->u.img.format; + if (format != VX_DF_IMAGE_RGBX && format != VX_DF_IMAGE_YUYV) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> (format == VX_DF_IMAGE_RGBX ? 
0 : 1); + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + AgoData * iImg = node->paramList[1]; + if (iImg->u.img.format == VX_DF_IMAGE_RGBX){ + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos3(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ChannelExtract_U8_U32_Pos3\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } + else if (iImg->u.img.format == VX_DF_IMAGE_YUYV) + status = HafGpu_ChannelExtract_U8_U32(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + AgoData * iImg = node->paramList[1]; + if (iImg->u.img.format == VX_DF_IMAGE_RGBX) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + } + else if (iImg->u.img.format == VX_DF_IMAGE_YUYV) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + } + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8U8U8_U24(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg0 = node->paramList[0]; + AgoData * oImg1 = node->paramList[1]; + AgoData * oImg2 = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + if (HafCpu_ChannelExtract_U8U8U8_U24(oImg0->u.img.width, oImg0->u.img.height, oImg0->buffer, oImg1->buffer, oImg2->buffer, oImg0->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_3OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGB); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U24_Pos0(node->opencl_code); + agoCodeGenOpenCL_ChannelExtract_U8_U24_Pos1(node->opencl_code); + agoCodeGenOpenCL_ChannelExtract_U8_U24_Pos2(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 * p1, U8x8 * p2, U24x8 p3)\n" + "{\n" + " ChannelExtract_U8_U24_Pos0(p0, p3);\n" + " ChannelExtract_U8_U24_Pos1(p1, p3);\n" + " ChannelExtract_U8_U24_Pos2(p2, p3);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8U8U8_U32(AgoNode * node, AgoKernelCommand 
cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg0 = node->paramList[0]; + AgoData * oImg1 = node->paramList[1]; + AgoData * oImg2 = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + if (HafCpu_ChannelExtract_U8U8U8_U32(oImg0->u.img.width, oImg0->u.img.height, oImg0->buffer, oImg1->buffer, oImg2->buffer, oImg0->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_3OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGBX); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos0(node->opencl_code); + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos1(node->opencl_code); + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos2(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 * p1, U8x8 * p2, U32x8 p3)\n" + "{\n" + " ChannelExtract_U8_U32_Pos0(p0, p3);\n" + " ChannelExtract_U8_U32_Pos1(p1, p3);\n" + " ChannelExtract_U8_U32_Pos2(p2, p3);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelExtract_U8U8U8U8_U32(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg0 = node->paramList[0]; + AgoData * oImg1 = node->paramList[1]; + AgoData * oImg2 = node->paramList[2]; + AgoData * oImg3 = node->paramList[3]; + AgoData * iImg = node->paramList[4]; + if (HafCpu_ChannelExtract_U8U8U8U8_U32(oImg0->u.img.width, oImg0->u.img.height, oImg0->buffer, oImg1->buffer, oImg2->buffer, oImg3->buffer, oImg0->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_4OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGBX); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos0(node->opencl_code); + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos1(node->opencl_code); + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos2(node->opencl_code); + agoCodeGenOpenCL_ChannelExtract_U8_U32_Pos3(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 * p1, U8x8 * p2, U8x8 * p3, U32x8 p4)\n" + "{\n" + " ChannelExtract_U8_U32_Pos0(p0, p4);\n" + " ChannelExtract_U8_U32_Pos1(p1, p4);\n" + " ChannelExtract_U8_U32_Pos2(p2, p4);\n" + " ChannelExtract_U8_U32_Pos3(p3, p4);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | 
AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelCombine_U16_U8U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + if (HafCpu_ChannelCombine_U16_U8U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U16, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U16x8 * p0, U8x8 p1, U8x8 p2)\n" + "{\n" + " U16x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack0(p1.s0), amd_unpack0(p2.s0), amd_unpack1(p1.s0), amd_unpack1(p2.s0)));\n" + " r.s1 = amd_pack((float4)(amd_unpack2(p1.s0), amd_unpack2(p2.s0), amd_unpack3(p1.s0), amd_unpack3(p2.s0)));\n" + " r.s2 = amd_pack((float4)(amd_unpack0(p1.s1), amd_unpack0(p2.s1), amd_unpack1(p1.s1), amd_unpack1(p2.s1)));\n" + " r.s3 = amd_pack((float4)(amd_unpack2(p1.s1), amd_unpack2(p2.s1), amd_unpack3(p1.s1), amd_unpack3(p2.s1)));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelCombine_U24_U8U8U8_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + AgoData * iImg3 = node->paramList[3]; + if (HafCpu_ChannelCombine_U24_U8U8U8_RGB(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes, + iImg3->buffer, iImg3->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_3IN(node, VX_DF_IMAGE_RGB, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U24x8 * p0, U8x8 p1, U8x8 p2, U8x8 p3)\n" + "{\n" + " (*p0).s0 = amd_pack((float4)(amd_unpack0(p1.s0), amd_unpack0(p2.s0), amd_unpack0(p3.s0), amd_unpack1(p1.s0)));\n" + " (*p0).s1 = amd_pack((float4)(amd_unpack1(p2.s0), amd_unpack1(p3.s0), amd_unpack2(p1.s0), amd_unpack2(p2.s0)));\n" + " (*p0).s2 = 
amd_pack((float4)(amd_unpack2(p3.s0), amd_unpack3(p1.s0), amd_unpack3(p2.s0), amd_unpack3(p3.s0)));\n" + " (*p0).s3 = amd_pack((float4)(amd_unpack0(p1.s1), amd_unpack0(p2.s1), amd_unpack0(p3.s1), amd_unpack1(p1.s1)));\n" + " (*p0).s4 = amd_pack((float4)(amd_unpack1(p2.s1), amd_unpack1(p3.s1), amd_unpack2(p1.s1), amd_unpack2(p2.s1)));\n" + " (*p0).s5 = amd_pack((float4)(amd_unpack2(p3.s1), amd_unpack3(p1.s1), amd_unpack3(p2.s1), amd_unpack3(p3.s1)));\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelCombine_U32_U8U8U8_UYVY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + AgoData * iImg3 = node->paramList[3]; + if (HafCpu_ChannelCombine_U32_U8U8U8_UYVY(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes, + iImg3->buffer, iImg3->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[3]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != (node->paramList[2]->u.img.width << 1) || height != node->paramList[2]->u.img.height || + width != (node->paramList[3]->u.img.width << 1) || height != node->paramList[3]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_UYVY; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ChannelCombine_U32_422(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelCombine_U32_U8U8U8_YUYV(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + AgoData * iImg3 = 
node->paramList[3]; + if (HafCpu_ChannelCombine_U32_U8U8U8_YUYV(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes, + iImg3->buffer, iImg3->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[3]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != (node->paramList[2]->u.img.width << 1) || height != node->paramList[2]->u.img.height || + width != (node->paramList[3]->u.img.width << 1) || height != node->paramList[3]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_YUYV; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ChannelCombine_U32_422(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ChannelCombine_U32_U8U8U8U8_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + AgoData * iImg3 = node->paramList[3]; + AgoData * iImg4 = node->paramList[4]; + if (HafCpu_ChannelCombine_U32_U8U8U8U8_RGBX(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes, + iImg3->buffer, iImg3->u.img.stride_in_bytes, iImg4->buffer, iImg4->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_4IN(node, VX_DF_IMAGE_RGBX, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U32x8 * p0, U8x8 p1, U8x8 p2, U8x8 p3, U8x8 p4)\n" + "{\n" + " U32x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack0(p1.s0), amd_unpack0(p2.s0), amd_unpack0(p3.s0), amd_unpack0(p4.s0)));\n" + " r.s1 = amd_pack((float4)(amd_unpack1(p1.s0), 
amd_unpack1(p2.s0), amd_unpack1(p3.s0), amd_unpack1(p4.s0)));\n" + " r.s2 = amd_pack((float4)(amd_unpack2(p1.s0), amd_unpack2(p2.s0), amd_unpack2(p3.s0), amd_unpack2(p4.s0)));\n" + " r.s3 = amd_pack((float4)(amd_unpack3(p1.s0), amd_unpack3(p2.s0), amd_unpack3(p3.s0), amd_unpack3(p4.s0)));\n" + " r.s4 = amd_pack((float4)(amd_unpack0(p1.s1), amd_unpack0(p2.s1), amd_unpack0(p3.s1), amd_unpack0(p4.s1)));\n" + " r.s5 = amd_pack((float4)(amd_unpack1(p1.s1), amd_unpack1(p2.s1), amd_unpack1(p3.s1), amd_unpack1(p4.s1)));\n" + " r.s6 = amd_pack((float4)(amd_unpack2(p1.s1), amd_unpack2(p2.s1), amd_unpack2(p3.s1), amd_unpack2(p4.s1)));\n" + " r.s7 = amd_pack((float4)(amd_unpack3(p1.s1), amd_unpack3(p2.s1), amd_unpack3(p3.s1), amd_unpack3(p4.s1)));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_U24_U24U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGB, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U24x8 * p0, U24x8 p1, U8x8 p2, float p3)\n" + "{\n" + " U24x8 r;\n" + " float4 f; float m3;\n" + " m3 = p3 * amd_unpack0(p2.s0);\n" + " f.s0 = m3 * amd_unpack0(p1.s0);\n" + " f.s1 = m3 * amd_unpack1(p1.s0);\n" + " f.s2 = m3 * amd_unpack2(p1.s0);\n" + " m3 = p3 * amd_unpack1(p2.s0);\n" + " f.s3 = m3 * amd_unpack3(p1.s0);\n" + " r.s0 = amd_pack(f);\n" + " f.s0 = m3 * amd_unpack0(p1.s1);\n" + " f.s1 = m3 * amd_unpack1(p1.s1);\n" + " m3 = p3 * amd_unpack2(p2.s0);\n" + " f.s2 = m3 * amd_unpack2(p1.s1);\n" + " f.s3 = m3 * amd_unpack3(p1.s1);\n" + " r.s1 = amd_pack(f);\n" + " f.s0 = m3 * amd_unpack0(p1.s2);\n" + " m3 = p3 * amd_unpack3(p2.s0);\n" + " f.s1 = m3 * amd_unpack1(p1.s2);\n" + " f.s2 = m3 * amd_unpack2(p1.s2);\n" + " f.s3 = m3 * amd_unpack3(p1.s2);\n" + " r.s2 = amd_pack(f);\n" + " m3 = p3 * amd_unpack0(p2.s1);\n" + " f.s0 = m3 * amd_unpack0(p1.s3);\n" + " f.s1 = m3 * amd_unpack1(p1.s3);\n" + " f.s2 = m3 * amd_unpack2(p1.s3);\n" + " m3 = p3 * amd_unpack1(p2.s1);\n" + " f.s3 = m3 * amd_unpack3(p1.s3);\n" + " r.s3 = amd_pack(f);\n" + " f.s0 = m3 * amd_unpack0(p1.s4);\n" + " f.s1 = m3 * amd_unpack1(p1.s4);\n" + " m3 = p3 * amd_unpack2(p2.s1);\n" + " f.s2 = m3 * amd_unpack2(p1.s4);\n" + " f.s3 = m3 * amd_unpack3(p1.s4);\n" + " r.s4 = amd_pack(f);\n" + " f.s0 = m3 * amd_unpack0(p1.s5);\n" + " m3 = p3 * amd_unpack3(p2.s1);\n" + " f.s1 = m3 * amd_unpack1(p1.s5);\n" + " f.s2 = m3 * amd_unpack2(p1.s5);\n" + " f.s3 = m3 * amd_unpack3(p1.s5);\n" + " r.s5 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | 
AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Mul_U32_U32U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN_S(node, VX_DF_IMAGE_RGBX, VX_DF_IMAGE_RGBX, VX_DF_IMAGE_U8, VX_TYPE_FLOAT32); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U32x8 * p0, U32x8 p1, U8x8 p2, float p3)\n" + "{\n" + " U32x8 r;\n" + " float4 f; float m3;\n" + " m3 = p3 * amd_unpack0(p2.s0);\n" + " f.s0 = m3 * amd_unpack0(p1.s0);\n" + " f.s1 = m3 * amd_unpack1(p1.s0);\n" + " f.s2 = m3 * amd_unpack2(p1.s0);\n" + " f.s3 = m3 * amd_unpack3(p1.s0);\n" + " r.s0 = amd_pack(f);\n" + " m3 = p3 * amd_unpack1(p2.s0);\n" + " f.s0 = m3 * amd_unpack0(p1.s1);\n" + " f.s1 = m3 * amd_unpack1(p1.s1);\n" + " f.s2 = m3 * amd_unpack2(p1.s1);\n" + " f.s3 = m3 * amd_unpack3(p1.s1);\n" + " r.s1 = amd_pack(f);\n" + " m3 = p3 * amd_unpack2(p2.s0);\n" + " f.s0 = m3 * amd_unpack0(p1.s2);\n" + " f.s1 = m3 * amd_unpack1(p1.s2);\n" + " f.s2 = m3 * amd_unpack2(p1.s2);\n" + " f.s3 = m3 * amd_unpack3(p1.s2);\n" + " r.s2 = amd_pack(f);\n" + " m3 = p3 * amd_unpack3(p2.s0);\n" + " f.s0 = m3 * amd_unpack0(p1.s3);\n" + " f.s1 = m3 * amd_unpack1(p1.s3);\n" + " f.s2 = m3 * amd_unpack2(p1.s3);\n" + " f.s3 = m3 * amd_unpack3(p1.s3);\n" + " r.s3 = amd_pack(f);\n" + " m3 = p3 * amd_unpack0(p2.s1);\n" + " f.s0 = m3 * amd_unpack0(p1.s4);\n" + " f.s1 = m3 * amd_unpack1(p1.s4);\n" + " f.s2 = m3 * amd_unpack2(p1.s4);\n" + " f.s3 = m3 * amd_unpack3(p1.s4);\n" + " r.s4 = amd_pack(f);\n" + " m3 = p3 * amd_unpack1(p2.s1);\n" + " f.s0 = m3 * amd_unpack0(p1.s5);\n" + " f.s1 = m3 * amd_unpack1(p1.s5);\n" + " f.s2 = m3 * amd_unpack2(p1.s5);\n" + " f.s3 = m3 * amd_unpack3(p1.s5);\n" + " r.s5 = amd_pack(f);\n" + " m3 = p3 * amd_unpack2(p2.s1);\n" + " f.s0 = m3 * amd_unpack0(p1.s6);\n" + " f.s1 = m3 * amd_unpack1(p1.s6);\n" + " f.s2 = m3 * amd_unpack2(p1.s6);\n" + " f.s3 = m3 * amd_unpack3(p1.s6);\n" + " r.s6 = amd_pack(f);\n" + " m3 = p3 * amd_unpack3(p2.s1);\n" + " f.s0 = m3 * amd_unpack0(p1.s7);\n" + " f.s1 = m3 * amd_unpack1(p1.s7);\n" + " f.s2 = m3 * amd_unpack2(p1.s7);\n" + " f.s3 = m3 * amd_unpack3(p1.s7);\n" + " r.s7 = amd_pack(f);\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGB_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_RGB_RGBX(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_RGB, 
VX_DF_IMAGE_RGBX); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U24x8 * p0, U32x8 p1)\n" + "{\n" + " (*p0).s0 = amd_pack((float4)(amd_unpack0(p1.s0), amd_unpack1(p1.s0), amd_unpack2(p1.s0), amd_unpack0(p1.s1)));\n" + " (*p0).s1 = amd_pack((float4)(amd_unpack1(p1.s1), amd_unpack2(p1.s1), amd_unpack0(p1.s2), amd_unpack1(p1.s2)));\n" + " (*p0).s2 = amd_pack((float4)(amd_unpack2(p1.s2), amd_unpack0(p1.s3), amd_unpack1(p1.s3), amd_unpack2(p1.s3)));\n" + " (*p0).s3 = amd_pack((float4)(amd_unpack0(p1.s4), amd_unpack1(p1.s4), amd_unpack2(p1.s4), amd_unpack0(p1.s5)));\n" + " (*p0).s4 = amd_pack((float4)(amd_unpack1(p1.s5), amd_unpack2(p1.s5), amd_unpack0(p1.s6), amd_unpack1(p1.s6)));\n" + " (*p0).s5 = amd_pack((float4)(amd_unpack2(p1.s6), amd_unpack0(p1.s7), amd_unpack1(p1.s7), amd_unpack2(p1.s7)));\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGB_UYVY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_RGB_UYVY(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_RGB, VX_DF_IMAGE_UYVY); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGB_YUYV(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_RGB_YUYV(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_RGB, VX_DF_IMAGE_YUYV); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | 
AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGB_IYUV(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + AgoData * iImg3 = node->paramList[3]; + if (HafCpu_ColorConvert_RGB_IYUV(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes, + iImg3->buffer, iImg3->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[3]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != (node->paramList[2]->u.img.width << 1) || (height != node->paramList[2]->u.img.height << 1) || + width != (node->paramList[3]->u.img.width << 1) || (height != node->paramList[3]->u.img.height << 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_RGB; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGB_NV12(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + if (HafCpu_ColorConvert_RGB_NV12(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_U16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != (node->paramList[2]->u.img.width << 1) || (height != node->paramList[2]->u.img.height << 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image 
sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_RGB; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGB_NV21(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + if (HafCpu_ColorConvert_RGB_NV21(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_U16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != (node->paramList[2]->u.img.width << 1) || (height != node->paramList[2]->u.img.height << 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_RGB; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGBX_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_RGBX_RGB(oImg->u.img.width, oImg->u.img.height, oImg->buffer, 
oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_RGBX, VX_DF_IMAGE_RGB); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U32x8 * p0, U24x8 p1)\n" + "{\n" + " U32x8 r;\n" + " r.s0 = amd_pack((float4)(amd_unpack0(p1.s0), amd_unpack1(p1.s0), amd_unpack2(p1.s0), 255.0f));\n" + " r.s1 = amd_pack((float4)(amd_unpack3(p1.s0), amd_unpack0(p1.s1), amd_unpack1(p1.s1), 255.0f));\n" + " r.s2 = amd_pack((float4)(amd_unpack2(p1.s1), amd_unpack3(p1.s1), amd_unpack0(p1.s2), 255.0f));\n" + " r.s3 = amd_pack((float4)(amd_unpack1(p1.s2), amd_unpack2(p1.s2), amd_unpack3(p1.s2), 255.0f));\n" + " r.s4 = amd_pack((float4)(amd_unpack0(p1.s3), amd_unpack1(p1.s3), amd_unpack2(p1.s3), 255.0f));\n" + " r.s5 = amd_pack((float4)(amd_unpack3(p1.s3), amd_unpack0(p1.s4), amd_unpack1(p1.s4), 255.0f));\n" + " r.s6 = amd_pack((float4)(amd_unpack2(p1.s4), amd_unpack3(p1.s4), amd_unpack0(p1.s5), 255.0f));\n" + " r.s7 = amd_pack((float4)(amd_unpack1(p1.s5), amd_unpack2(p1.s5), amd_unpack3(p1.s5), 255.0f));\n" + " *p0 = r;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGBX_UYVY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_RGBX_UYVY(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_RGBX, VX_DF_IMAGE_UYVY); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGBX_YUYV(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_RGBX_YUYV(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_RGBX, VX_DF_IMAGE_YUYV); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == 
ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGBX_IYUV(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + AgoData * iImg3 = node->paramList[3]; + if (HafCpu_ColorConvert_RGBX_IYUV(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes, + iImg3->buffer, iImg3->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[3]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != (node->paramList[2]->u.img.width << 1) || (height != node->paramList[2]->u.img.height << 1) || + width != (node->paramList[3]->u.img.width << 1) || (height != node->paramList[3]->u.img.height << 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_RGBX; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGBX_NV12(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + if (HafCpu_ColorConvert_RGBX_NV12(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = 
node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_U16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != (node->paramList[2]->u.img.width << 1) || (height != node->paramList[2]->u.img.height << 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_RGBX; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_RGBX_NV21(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg1 = node->paramList[1]; + AgoData * iImg2 = node->paramList[2]; + if (HafCpu_ColorConvert_RGBX_NV21(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg1->buffer, iImg1->u.img.stride_in_bytes, iImg2->buffer, iImg2->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 || node->paramList[2]->u.img.format != VX_DF_IMAGE_U16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || width != (node->paramList[2]->u.img.width << 1) || (height != node->paramList[2]->u.img.height << 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_RGBX; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + 
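+		// any command not handled above leaves status at AGO_ERROR_KERNEL_NOT_IMPLEMENTED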
return status; +} + +int agoKernel_ColorConvert_YUV4_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgU = node->paramList[1]; + AgoData * oImgV = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + if (HafCpu_ColorConvert_YUV4_RGB(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgU->buffer, oImgU->u.img.stride_in_bytes, oImgV->buffer, oImgV->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[3]->u.img.width; + vx_uint32 height = node->paramList[3]->u.img.height; + if (node->paramList[3]->u.img.format != VX_DF_IMAGE_RGB) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ColorConvert_Y_RGB(node->opencl_code); + agoCodeGenOpenCL_ColorConvert_U_RGB(node->opencl_code); + agoCodeGenOpenCL_ColorConvert_V_RGB(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 * p1, U8x8 * p2, U24x8 p3)\n" + "{\n" + " ColorConvert_Y_RGB(p0, p3);\n" + " ColorConvert_U_RGB(p1, p3);\n" + " ColorConvert_V_RGB(p2, p3);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int 
agoKernel_ColorConvert_YUV4_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgU = node->paramList[1]; + AgoData * oImgV = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + if (HafCpu_ColorConvert_YUV4_RGBX(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgU->buffer, oImgU->u.img.stride_in_bytes, oImgV->buffer, oImgV->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[3]->u.img.width; + vx_uint32 height = node->paramList[3]->u.img.height; + if (node->paramList[3]->u.img.format != VX_DF_IMAGE_RGBX) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[2]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ColorConvert_Y_RGBX(node->opencl_code); + agoCodeGenOpenCL_ColorConvert_U_RGBX(node->opencl_code); + agoCodeGenOpenCL_ColorConvert_V_RGBX(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s (U8x8 * p0, U8x8 * p1, U8x8 * p2, U32x8 p3)\n" + "{\n" + " ColorConvert_Y_RGBX(p0, p3);\n" + " ColorConvert_U_RGBX(p1, p3);\n" + " ColorConvert_V_RGBX(p2, p3);\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int 
agoKernel_ScaleUp2x2_U8_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ScaleUp2x2_U8_U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width << 1; + meta->data.u.img.height = height << 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x << 1; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y << 1; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x << 1; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y << 1; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FormatConvert_Chroma(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FormatConvert_UV_UV12(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgU = node->paramList[0]; + AgoData * oImgV = node->paramList[1]; + AgoData * iImgC = node->paramList[2]; + if (HafCpu_FormatConvert_UV_UV12(oImgU->u.img.width, oImgU->u.img.height, oImgU->buffer, oImgU->u.img.stride_in_bytes, + oImgV->buffer, oImgV->u.img.stride_in_bytes, iImgC->buffer, iImgC->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_U16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width << 1; + meta->data.u.img.height = height << 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width << 1; + meta->data.u.img.height = height << 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = 
node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FormatConvert_Chroma(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_IYUV_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgU = node->paramList[1]; + AgoData * oImgV = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + if (HafCpu_ColorConvert_IYUV_RGB(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgU->buffer, oImgU->u.img.stride_in_bytes, oImgV->buffer, oImgV->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[3]->u.img.width; + vx_uint32 height = node->paramList[3]->u.img.height; + if (node->paramList[3]->u.img.format != VX_DF_IMAGE_RGB) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[2]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == 
ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_IYUV_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgU = node->paramList[1]; + AgoData * oImgV = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + if (HafCpu_ColorConvert_IYUV_RGBX(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgU->buffer, oImgU->u.img.stride_in_bytes, oImgV->buffer, oImgV->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[3]->u.img.width; + vx_uint32 height = node->paramList[3]->u.img.height; + if (node->paramList[3]->u.img.format != VX_DF_IMAGE_RGBX) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[2]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FormatConvert_IYUV_UYVY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = 
AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgU = node->paramList[1]; + AgoData * oImgV = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + if (HafCpu_FormatConvert_IYUV_UYVY(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgU->buffer, oImgU->u.img.stride_in_bytes, oImgV->buffer, oImgV->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[3]->u.img.width; + vx_uint32 height = node->paramList[3]->u.img.height; + if (node->paramList[3]->u.img.format != VX_DF_IMAGE_UYVY) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[2]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FormatConvert_420_422(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FormatConvert_IYUV_YUYV(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgU = node->paramList[1]; + AgoData * oImgV = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + if (HafCpu_FormatConvert_IYUV_YUYV(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgU->buffer, oImgU->u.img.stride_in_bytes, oImgV->buffer, 
oImgV->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[3]->u.img.width; + vx_uint32 height = node->paramList[3]->u.img.height; + if (node->paramList[3]->u.img.format != VX_DF_IMAGE_YUYV) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + meta = &node->metaList[2]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[3]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[3]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[3]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[3]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FormatConvert_420_422(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FormatConvert_IUV_UV12(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgU = node->paramList[0]; + AgoData * oImgV = node->paramList[1]; + AgoData * iImgC = node->paramList[2]; + if (HafCpu_FormatConvert_IUV_UV12(oImgU->u.img.width, oImgU->u.img.height, oImgU->buffer, oImgU->u.img.stride_in_bytes, + oImgV->buffer, oImgV->u.img.stride_in_bytes, iImgC->buffer, iImgC->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_U16) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + 
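+		// output U plane keeps the interleaved U16 input's width/height (deinterleave only, no chroma upsampling)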
meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FormatConvert_Chroma(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_NV12_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgC = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + if (HafCpu_ColorConvert_NV12_RGB(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgC->buffer, oImgC->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_RGB) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U16; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } 
+#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_NV12_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgC = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + if (HafCpu_ColorConvert_NV12_RGBX(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgC->buffer, oImgC->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_RGBX) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U16; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FormatConvert_NV12_UYVY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgC = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + if (HafCpu_FormatConvert_NV12_UYVY(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgC->buffer, oImgC->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = 
node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_UYVY) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U16; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FormatConvert_420_422(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FormatConvert_NV12_YUYV(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgY = node->paramList[0]; + AgoData * oImgC = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + if (HafCpu_FormatConvert_NV12_YUYV(oImgY->u.img.width, oImgY->u.img.height, oImgY->buffer, oImgY->u.img.stride_in_bytes, + oImgC->buffer, oImgC->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_YUYV) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U16; + meta->data.u.img.rect_valid.start_x = 
node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FormatConvert_420_422(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FormatConvert_UV12_IUV(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgC = node->paramList[0]; + AgoData * iImgU = node->paramList[1]; + AgoData * iImgV = node->paramList[2]; + if (HafCpu_FormatConvert_UV12_IUV(oImgC->u.img.width, oImgC->u.img.height, oImgC->buffer, oImgC->u.img.stride_in_bytes, + iImgU->buffer, iImgU->u.img.stride_in_bytes, iImgV->buffer, iImgV->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_2IN(node, VX_DF_IMAGE_U16, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FormatConvert_Chroma(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_Y_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_Y_RGB(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGB); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ColorConvert_Y_RGB(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ColorConvert_Y_RGB\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_Y_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = 
VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_Y_RGBX(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGBX); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ColorConvert_Y_RGBX(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ColorConvert_Y_RGBX\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_U_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_U_RGB(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGB); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ColorConvert_U_RGB(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ColorConvert_U_RGB\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_U_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_U_RGBX(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGBX); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ColorConvert_U_RGBX(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ColorConvert_U_RGBX\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + 
} +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_V_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_V_RGB(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGB); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ColorConvert_V_RGB(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ColorConvert_V_RGB\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_V_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_V_RGBX(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_RGBX); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + node->opencl_type = NODE_OPENCL_TYPE_REG2REG; + agoCodeGenOpenCL_ColorConvert_V_RGBX(node->opencl_code); + char textBuffer[2048]; + sprintf(textBuffer, OPENCL_FORMAT( + "#define %s ColorConvert_V_RGBX\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_R2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_IU_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_IU_RGB(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = 
node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_RGB) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_IU_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_IU_RGBX(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_RGBX) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_IV_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_IV_RGB(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, 
iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_RGB) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_IV_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_IV_RGBX(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_RGBX) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_IUV_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * 
oImgU = node->paramList[0]; + AgoData * oImgV = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + if (HafCpu_ColorConvert_IUV_RGB(oImgU->u.img.width, oImgU->u.img.height, oImgU->buffer, oImgU->u.img.stride_in_bytes, + oImgV->buffer, oImgV->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_RGB) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_IUV_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImgU = node->paramList[0]; + AgoData * oImgV = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + if (HafCpu_ColorConvert_IUV_RGBX(oImgU->u.img.width, oImgU->u.img.height, oImgU->buffer, oImgU->u.img.stride_in_bytes, + oImgV->buffer, oImgV->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_RGBX) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = 
node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + meta = &node->metaList[1]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = node->paramList[2]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[2]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[2]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[2]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_UV12_RGB(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_UV12_RGB(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_RGB) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U16; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ColorConvert_UV12_RGBX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ColorConvert_UV12_RGBX(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == 
ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_RGBX) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height || (width & 1) || (height & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width >> 1; + meta->data.u.img.height = height >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U16; + meta->data.u.img.rect_valid.start_x = node->paramList[1]->u.img.rect_valid.start_x; + meta->data.u.img.rect_valid.start_y = node->paramList[1]->u.img.rect_valid.start_y; + meta->data.u.img.rect_valid.end_x = node->paramList[1]->u.img.rect_valid.end_x; + meta->data.u.img.rect_valid.end_y = node->paramList[1]->u.img.rect_valid.end_y; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ColorConvert(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Box_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Box_U8_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 3 * alignedWidth * sizeof(vx_uint16); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + // re-use LinearFilter_ANY_U8 + float filterCoef[] = { 1.0f / 9.0f, 1.0f / 9.0f, 1.0f / 9.0f, 1.0f / 9.0f, 1.0f / 9.0f, 1.0f / 9.0f, 1.0f / 9.0f, 1.0f / 9.0f, 1.0f / 9.0f }; + AgoData filter; + filter.ref.type = VX_TYPE_MATRIX; filter.u.mat.type = VX_TYPE_FLOAT32; filter.u.mat.columns = filter.u.mat.rows = 3; filter.buffer = (vx_uint8 *)filterCoef; filter.ref.read_only = true; + status = HafGpu_LinearFilter_ANY_U8(node, VX_DF_IMAGE_U8, &filter, false); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Dilate_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; 
+ if (HafCpu_Dilate_U8_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonLinearFilter_3x3_ANY_U8(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Erode_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Erode_U8_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonLinearFilter_3x3_ANY_U8(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Median_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Median_U8_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonLinearFilter_3x3_ANY_U8(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Gaussian_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if 
(HafCpu_Gaussian_U8_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 3 * alignedWidth * sizeof(vx_uint16); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + // re-use LinearFilter_ANY_U8 + float filterCoef[] = { 0.0625f, 0.125f, 0.0625f, 0.125f, 0.25f, 0.125f, 0.0625f, 0.125f, 0.0625f }; + AgoData filter; + filter.ref.type = VX_TYPE_MATRIX; filter.u.mat.type = VX_TYPE_FLOAT32; filter.u.mat.columns = filter.u.mat.rows = 3; filter.buffer = (vx_uint8 *)filterCoef; filter.ref.read_only = true; + status = HafGpu_LinearFilter_ANY_U8(node, VX_DF_IMAGE_U8, &filter, false); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ScaleGaussianHalf_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ScaleGaussianHalf_U8_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = (width + 1) >> 1; + meta->data.u.img.height = (height + 1) >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = ((node->paramList[1]->u.img.rect_valid.start_x + 1) >> 1) + 1; + meta->data.u.img.rect_valid.start_y = ((node->paramList[1]->u.img.rect_valid.start_y + 1) >> 1) + 1; + meta->data.u.img.rect_valid.end_x = ((node->paramList[1]->u.img.rect_valid.end_x + 1) >> 1) - 1; + meta->data.u.img.rect_valid.end_y = ((node->paramList[1]->u.img.rect_valid.end_y + 1) >> 1) - 1; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = (2 * alignedWidth * sizeof(vx_uint16))+16; // 2 rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + 
} +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ScaleGaussianHalf(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ScaleGaussianHalf_U8_U8_5x5(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + bool sampleFirstRow = (iImg->u.img.height & 1) ? true : false; + bool sampleFirstColumn = (iImg->u.img.width & 1) ? true : false; + if (iImg->u.img.width < 5 || iImg->u.img.height < 5 || oImg->u.img.width < 3 || oImg->u.img.height < 3) { + status = VX_ERROR_INVALID_DIMENSION; + } + else if (HafCpu_ScaleGaussianHalf_U8_U8_5x5(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + (2 * iImg->u.img.stride_in_bytes), iImg->u.img.stride_in_bytes, sampleFirstRow, sampleFirstColumn, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = (width + 1) >> 1; + meta->data.u.img.height = (height + 1) >> 1; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = ((node->paramList[1]->u.img.rect_valid.start_x + 1) >> 1) + 2; + meta->data.u.img.rect_valid.start_y = ((node->paramList[1]->u.img.rect_valid.start_y + 1) >> 1) + 2; + meta->data.u.img.rect_valid.end_x = ((node->paramList[1]->u.img.rect_valid.end_x + 1) >> 1) - 2; + meta->data.u.img.rect_valid.end_y = ((node->paramList[1]->u.img.rect_valid.end_y + 1) >> 1) - 2; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedDstStride = (node->paramList[0]->u.img.stride_in_bytes + 15) & ~15; + node->localDataSize = 5 * 2 * alignedDstStride * sizeof(vx_int16); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ScaleGaussianHalf(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ScaleGaussianOrb_U8_U8_5x5(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ScaleGaussianOrb_U8_U8_5x5(oImg->u.img.width, oImg->u.img.height - 4, oImg->buffer + (2 * oImg->u.img.stride_in_bytes), oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes, iImg->u.img.width, iImg->u.img.height, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if 
(cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_uint32 dwidth = (vx_uint32)ceilf(VX_SCALE_PYRAMID_ORB * width); + vx_uint32 dheight = (vx_uint32)ceilf(VX_SCALE_PYRAMID_ORB * height); + if ((node->paramList[0]->u.img.width && abs((int)dwidth - (int)node->paramList[0]->u.img.width) > 1) || + (node->paramList[0]->u.img.height && abs((int)dheight - (int)node->paramList[0]->u.img.height) > 1)) + return VX_ERROR_INVALID_DIMENSION; + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width ? node->paramList[0]->u.img.width : dwidth; + meta->data.u.img.height = node->paramList[0]->u.img.height ? node->paramList[0]->u.img.height : dheight; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = (vx_uint32)ceilf(VX_SCALE_PYRAMID_ORB * node->paramList[1]->u.img.rect_valid.start_x) + 2; + meta->data.u.img.rect_valid.start_y = (vx_uint32)ceilf(VX_SCALE_PYRAMID_ORB * node->paramList[1]->u.img.rect_valid.start_y) + 2; + meta->data.u.img.rect_valid.end_x = (vx_uint32)floorf(VX_SCALE_PYRAMID_ORB * node->paramList[1]->u.img.rect_valid.end_x) - 2; + meta->data.u.img.rect_valid.end_y = (vx_uint32)floorf(VX_SCALE_PYRAMID_ORB * node->paramList[1]->u.img.rect_valid.end_y) - 2; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = (3 * node->paramList[1]->u.img.width)+(2 * alignedWidth) + 128; // 2 rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_ScaleGaussianOrb(node, VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Convolve_U8_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iConv = node->paramList[2]; + vx_uint32 convolutionWidth = (vx_uint32)iConv->u.conv.columns; + vx_uint32 convolutionHeight = (vx_uint32)iConv->u.conv.rows; + if (convolutionWidth == 3) { + status = HafCpu_Convolve_U8_U8_3xN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionHeight, iConv->u.conv.shift); + } + else if (convolutionWidth == 5) { + status = HafCpu_Convolve_U8_U8_5xN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1), oImg->u.img.stride_in_bytes, + 
iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionHeight, iConv->u.conv.shift); + } + else if (convolutionWidth == 7) { + status = HafCpu_Convolve_U8_U8_7xN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionHeight, iConv->u.conv.shift); + } + else if (convolutionWidth == 9) { + status = HafCpu_Convolve_U8_U8_9xN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionHeight, iConv->u.conv.shift); + } + else { + status = HafCpu_Convolve_U8_U8_MxN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionWidth, convolutionHeight, iConv->u.conv.shift); + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + int M = (int) node->paramList[2]->u.conv.columns >> 1; + int N = (int) node->paramList[2]->u.conv.rows >> 1; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (!(node->paramList[2]->u.conv.rows & 1) || !(node->paramList[2]->u.conv.columns & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + meta->data.u.img.rect_valid.start_x = min(node->paramList[1]->u.img.rect_valid.start_x + M, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[1]->u.img.rect_valid.start_y + N, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[1]->u.img.rect_valid.end_x - M, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[1]->u.img.rect_valid.end_y - N, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_LinearFilter_ANY_U8(node, VX_DF_IMAGE_U8, node->paramList[2], false); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Convolve_S16_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iConv = node->paramList[2]; + vx_uint32 convolutionWidth = (vx_uint32)iConv->u.conv.columns; + vx_uint32 
convolutionHeight = (vx_uint32)iConv->u.conv.rows; + if (convolutionWidth == 3) { + status = HafCpu_Convolve_S16_U8_3xN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + (vx_int16 *)(oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1)), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionHeight, iConv->u.conv.shift); + } + else if (convolutionWidth == 5) { + status = HafCpu_Convolve_S16_U8_5xN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + (vx_int16 *)(oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1)), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionHeight, iConv->u.conv.shift); + } + else if (convolutionWidth == 7) { + status = HafCpu_Convolve_S16_U8_7xN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + (vx_int16 *)(oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1)), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionHeight, iConv->u.conv.shift); + } + else if (convolutionWidth == 9) { + status = HafCpu_Convolve_S16_U8_9xN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + (vx_int16 *)(oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1)), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionHeight, iConv->u.conv.shift); + } + else { + status = HafCpu_Convolve_S16_U8_MxN(oImg->u.img.width, oImg->u.img.height - convolutionHeight + 1, + (vx_int16 *)(oImg->buffer + oImg->u.img.stride_in_bytes * (convolutionHeight >> 1)), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes * (convolutionHeight >> 1), iImg->u.img.stride_in_bytes, (vx_int16 *)iConv->buffer, convolutionWidth, convolutionHeight, iConv->u.conv.shift); + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + int M = (int) node->paramList[2]->u.conv.columns >> 1; + int N = (int) node->paramList[2]->u.conv.rows >> 1; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (!(node->paramList[2]->u.conv.rows & 1) || !(node->paramList[2]->u.conv.columns & 1)) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = VX_DF_IMAGE_S16; + meta->data.u.img.rect_valid.start_x = min(node->paramList[1]->u.img.rect_valid.start_x + M, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[1]->u.img.rect_valid.start_y + N, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[1]->u.img.rect_valid.end_x - M, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[1]->u.img.rect_valid.end_y - N, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == 
ago_kernel_cmd_opencl_codegen) { + status = HafGpu_LinearFilter_ANY_U8(node, VX_DF_IMAGE_S16, node->paramList[2], false); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_LinearFilter_ANY_ANY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + if (node->paramList[0]->u.img.format == VX_DF_IMAGE_U8 && node->paramList[1]->u.img.format == VX_DF_IMAGE_U8) { + status = agoKernel_Convolve_U8_U8(node, cmd); + } + else if (node->paramList[0]->u.img.format == VX_DF_IMAGE_S16 && node->paramList[1]->u.img.format == VX_DF_IMAGE_U8) { + status = agoKernel_Convolve_S16_U8(node, cmd); + } + else { + // TBD: not implemented yet + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[1]->u.img.width; + vx_uint32 height = node->paramList[1]->u.img.height; + int M = (int) node->paramList[2]->u.conv.columns >> 1; + int N = (int) node->paramList[2]->u.conv.rows >> 1; + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8 && + node->paramList[1]->u.img.format != VX_DF_IMAGE_S16 && + node->paramList[1]->u.img.format != VX_DF_IMAGE_F32_AMD) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (!(node->paramList[2]->u.mat.rows & 1) || !(node->paramList[2]->u.mat.columns & 1)) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[2]->u.mat.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_FORMAT; + else if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 && + node->paramList[0]->u.img.format != VX_DF_IMAGE_S16 && + node->paramList[0]->u.img.format != VX_DF_IMAGE_F32_AMD) + return VX_ERROR_INVALID_FORMAT; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = node->paramList[0]->u.img.format; + meta->data.u.img.rect_valid.start_x = min(node->paramList[1]->u.img.rect_valid.start_x + M, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[1]->u.img.rect_valid.start_y + N, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[1]->u.img.rect_valid.end_x - M, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[1]->u.img.rect_valid.end_y - N, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + if (node->paramList[1]->u.img.format == VX_DF_IMAGE_U8) { + status = HafGpu_LinearFilter_ANY_U8(node, node->paramList[0]->u.img.format, node->paramList[2], true); + } + else if (node->paramList[1]->u.img.format == VX_DF_IMAGE_S16) { + status = HafGpu_LinearFilter_ANY_S16(node, node->paramList[0]->u.img.format, node->paramList[2], true); + } + else if (node->paramList[1]->u.img.format == VX_DF_IMAGE_F32_AMD) { + status = HafGpu_LinearFilter_ANY_F32(node, node->paramList[0]->u.img.format, node->paramList[2], true); + } + else { + // TBD: not implemented yet + } + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | 
AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_LinearFilter_ANYx2_ANY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + int M = (int) node->paramList[3]->u.conv.columns >> 1; + int N = (int) node->paramList[3]->u.conv.rows >> 1; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (!(node->paramList[3]->u.mat.rows & 1) || !(node->paramList[3]->u.mat.columns & 1) || + (node->paramList[3]->u.mat.rows != node->paramList[4]->u.mat.rows) || + (node->paramList[3]->u.mat.columns != node->paramList[4]->u.mat.columns)) + return VX_ERROR_INVALID_DIMENSION; + else if ((node->paramList[3]->u.mat.type != VX_TYPE_FLOAT32) || (node->paramList[3]->u.mat.type != node->paramList[4]->u.mat.type)) + return VX_ERROR_INVALID_FORMAT; + else if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8 && + node->paramList[0]->u.img.format != VX_DF_IMAGE_S16 && + node->paramList[0]->u.img.format != VX_DF_IMAGE_F32_AMD && + node->paramList[0]->u.img.format != node->paramList[1]->u.img.format) + return VX_ERROR_INVALID_FORMAT; + // set output image sizes and format + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = node->paramList[0]->u.img.format; + meta->data.u.img.rect_valid.start_x = min(node->paramList[2]->u.img.rect_valid.start_x + M, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[2]->u.img.rect_valid.start_y + N, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[2]->u.img.rect_valid.end_x - M, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[2]->u.img.rect_valid.end_y - N, 0); + meta = &node->metaList[1]; + meta->data.u.img.width = width; + meta->data.u.img.height = height; + meta->data.u.img.format = node->paramList[0]->u.img.format; + meta->data.u.img.rect_valid.start_x = min(node->paramList[2]->u.img.rect_valid.start_x + M, width); + meta->data.u.img.rect_valid.start_y = min(node->paramList[2]->u.img.rect_valid.start_y + N, height); + meta->data.u.img.rect_valid.end_x = max((int)node->paramList[2]->u.img.rect_valid.end_x - M, 0); + meta->data.u.img.rect_valid.end_y = max((int)node->paramList[2]->u.img.rect_valid.end_y - N, 0); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + if (node->paramList[2]->u.img.format == VX_DF_IMAGE_U8) { + status = HafGpu_LinearFilter_ANYx2_U8(node, node->paramList[0]->u.img.format, node->paramList[3], node->paramList[4], true); + } + else { + // TBD: not implemented yet + } + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_SobelMagnitude_S16_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = 
VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_SobelMagnitude_S16_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, (vx_int16 *)(oImg->buffer + oImg->u.img.stride_in_bytes), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_SobelSpecialCases(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_SobelPhase_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_SobelPhase_U8_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + int alignedStride = (node->paramList[0]->u.img.stride_in_bytes + 15) & ~15; + node->localDataSize = (alignedStride * node->paramList[0]->u.img.height * sizeof(vx_int16) * 2) + (6 * alignedWidth * sizeof(vx_int16)); // Two buffers for Gx and Gy and Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_SobelSpecialCases(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_SobelMagnitudePhase_S16U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg1 = node->paramList[0]; + AgoData * oImg2 = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + if (HafCpu_SobelMagnitudePhase_S16U8_U8_3x3(oImg1->u.img.width, oImg1->u.img.height - 2, + (vx_int16 *)(oImg1->buffer + oImg1->u.img.stride_in_bytes), oImg1->u.img.stride_in_bytes, + oImg2->buffer + oImg2->u.img.stride_in_bytes, oImg2->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_2OUT_1IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd 
== ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_SobelSpecialCases(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sobel_S16S16_U8_3x3_GXY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg1 = node->paramList[0]; + AgoData * oImg2 = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + if (HafCpu_Sobel_S16S16_U8_3x3_GXY(oImg1->u.img.width, oImg1->u.img.height - 2, + (vx_int16 *)(oImg1->buffer + oImg1->u.img.stride_in_bytes), oImg1->u.img.stride_in_bytes, + (vx_int16 *)(oImg2->buffer + oImg2->u.img.stride_in_bytes), oImg2->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_2OUT_1IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 6 * alignedWidth * sizeof(vx_int16); // Three rows (+some extra) worth of scratch memory - each row is Gx and Gy + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_SobelSpecialCases(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sobel_S16_U8_3x3_GX(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Sobel_S16_U8_3x3_GX(oImg->u.img.width, oImg->u.img.height - 2, (vx_int16 *)(oImg->buffer + oImg->u.img.stride_in_bytes), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 3 * alignedWidth * sizeof(vx_int16); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_SobelSpecialCases(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | 
AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Sobel_S16_U8_3x3_GY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Sobel_S16_U8_3x3_GY(oImg->u.img.width, oImg->u.img.height - 2, (vx_int16 *)(oImg->buffer + oImg->u.img.stride_in_bytes), oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_S16, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 3 * alignedWidth * sizeof(vx_int16); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_SobelSpecialCases(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Dilate_U1_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Dilate_U1_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonLinearFilter_3x3_ANY_U8(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Erode_U1_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Erode_U1_U8_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == 
ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonLinearFilter_3x3_ANY_U8(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Dilate_U1_U1_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Dilate_U1_U1_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonLinearFilter_3x3_ANY_U1(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Erode_U1_U1_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Erode_U1_U1_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U1_AMD, VX_DF_IMAGE_U1_AMD, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonLinearFilter_3x3_ANY_U1(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Dilate_U8_U1_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Dilate_U8_U1_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, true, 1, 1); + } + else if (cmd == 
ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonLinearFilter_3x3_ANY_U1(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Erode_U8_U1_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_Erode_U8_U1_3x3(oImg->u.img.width, oImg->u.img.height - 2, oImg->buffer + oImg->u.img.stride_in_bytes, oImg->u.img.stride_in_bytes, + iImg->buffer + iImg->u.img.stride_in_bytes, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U1_AMD, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonLinearFilter_3x3_ANY_U1(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FastCorners_XY_U8_Supression(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oXY = node->paramList[0]; + AgoData * oNumCorners = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + vx_float32 strength_threshold = node->paramList[3]->u.scalar.u.f; + vx_uint32 numXY = 0; + if (HafCpu_FastCorners_XY_U8_Supression((vx_uint32)oXY->u.arr.capacity, (vx_keypoint_t *)oXY->buffer, &numXY, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, strength_threshold, node->localDataPtr)) { + status = VX_FAILURE; + } + else { + oXY->u.arr.numitems = min(numXY, (vx_uint32)oXY->u.arr.capacity); + if (oNumCorners) oNumCorners->u.scalar.u.s = numXY; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[3]->u.scalar.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_KEYPOINT; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_SIZE; + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FastCorners_XY_U8(node); + } +#endif + else if (cmd == ago_kernel_cmd_initialize) { + node->localDataSize = node->paramList[2]->u.img.width * node->paramList[2]->u.img.height; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) 
{ + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FastCorners_XY_U8_NoSupression(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oXY = node->paramList[0]; + AgoData * oNumCorners = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + vx_float32 strength_threshold = node->paramList[3]->u.scalar.u.f; + vx_uint32 numXY = 0; + if (HafCpu_FastCorners_XY_U8_NoSupression((vx_uint32)oXY->u.arr.capacity, (vx_keypoint_t *)oXY->buffer, &numXY, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, strength_threshold)) { + status = VX_FAILURE; + } + else { + oXY->u.arr.numitems = min(numXY, (vx_uint32)oXY->u.arr.capacity); + if (oNumCorners) oNumCorners->u.scalar.u.s = numXY; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + vx_uint32 width = node->paramList[2]->u.img.width; + vx_uint32 height = node->paramList[2]->u.img.height; + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!width || !height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[3]->u.scalar.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_KEYPOINT; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_SIZE; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_FastCorners_XY_U8(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_HarrisSobel_HG3_U8_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_HarrisSobel_HG3_U8_3x3(oImg->u.img.width, oImg->u.img.height, (vx_float32 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_F32x3_AMD, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 6 * alignedWidth * sizeof(vx_int16); // Three rows (one vx_int16 for Gx and one for Gy + some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_HarrisSobelFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | 
AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_HarrisSobel_HG3_U8_5x5(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_HarrisSobel_HG3_U8_5x5(oImg->u.img.width, oImg->u.img.height, (vx_float32 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_F32x3_AMD, VX_DF_IMAGE_U8, true, 2, 2); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 10 * alignedWidth * sizeof(vx_int16); // Five rows (one vx_int16 for Gx and one for Gy + some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_HarrisSobelFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_HarrisSobel_HG3_U8_7x7(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_HarrisSobel_HG3_U8_7x7(oImg->u.img.width, oImg->u.img.height, (vx_float32 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_F32x3_AMD, VX_DF_IMAGE_U8, true, 3, 3); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 14 * alignedWidth * sizeof(vx_int16); // Seven rows (one vx_int16 for Gx and one for Gy + some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_HarrisSobelFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_HarrisScore_HVC_HG3_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_float32 sensitivity = node->paramList[2]->u.scalar.u.f; + vx_int32 gradient_size = node->paramList[4]->u.scalar.u.i; + vx_float32 strength_threshold = 
node->paramList[3]->u.scalar.u.f; + vx_float32 normFactor = 255.0f * (1 << (gradient_size - 1)) * 3; + normFactor = normFactor * normFactor * normFactor * normFactor; + if (HafCpu_HarrisScore_HVC_HG3_3x3(oImg->u.img.width, oImg->u.img.height, (vx_float32 *)oImg->buffer, oImg->u.img.stride_in_bytes, + (vx_float32 *)iImg->buffer, iImg->u.img.stride_in_bytes, sensitivity, strength_threshold, normFactor)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN_3S(node, VX_DF_IMAGE_F32_AMD, VX_DF_IMAGE_F32x3_AMD, VX_TYPE_FLOAT32, VX_TYPE_FLOAT32, VX_TYPE_INT32, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_HarrisScoreFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_HarrisScore_HVC_HG3_5x5(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_float32 sensitivity = node->paramList[2]->u.scalar.u.f; + vx_int32 gradient_size = node->paramList[4]->u.scalar.u.i; + vx_float32 strength_threshold = node->paramList[3]->u.scalar.u.f; + vx_float32 normFactor = 255.0f * (1 << (gradient_size - 1)) * 5; + normFactor = normFactor * normFactor * normFactor * normFactor; + if (HafCpu_HarrisScore_HVC_HG3_5x5(oImg->u.img.width, oImg->u.img.height, (vx_float32 *)oImg->buffer, oImg->u.img.stride_in_bytes, + (vx_float32 *)iImg->buffer, iImg->u.img.stride_in_bytes, sensitivity, strength_threshold, normFactor)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN_3S(node, VX_DF_IMAGE_F32_AMD, VX_DF_IMAGE_F32x3_AMD, VX_TYPE_FLOAT32, VX_TYPE_FLOAT32, VX_TYPE_INT32, true, 2, 2); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_HarrisScoreFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_HarrisScore_HVC_HG3_7x7(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_float32 sensitivity = node->paramList[2]->u.scalar.u.f; + vx_int32 gradient_size = node->paramList[4]->u.scalar.u.i; + vx_float32 strength_threshold = node->paramList[3]->u.scalar.u.f; + vx_float32 normFactor = 255.0f * (1 << (gradient_size - 1)) * 7; + normFactor = normFactor * normFactor * normFactor * normFactor; + if (HafCpu_HarrisScore_HVC_HG3_7x7(oImg->u.img.width, oImg->u.img.height, (vx_float32 *)oImg->buffer, oImg->u.img.stride_in_bytes, + (vx_float32 *)iImg->buffer, iImg->u.img.stride_in_bytes, sensitivity, strength_threshold, normFactor)) { 
+ status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN_3S(node, VX_DF_IMAGE_F32_AMD, VX_DF_IMAGE_F32x3_AMD, VX_TYPE_FLOAT32, VX_TYPE_FLOAT32, VX_TYPE_INT32, true, 3, 3); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_HarrisScoreFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8_U8_3x3_L1NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8(node, VX_DF_IMAGE_U8, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8_U8_3x3_L2NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8(node, VX_DF_IMAGE_U8, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8_U8_5x5_L1NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8(node, VX_DF_IMAGE_U8, 2, 2); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8_U8_5x5_L2NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8(node, VX_DF_IMAGE_U8, 2, 2); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8_U8_7x7_L1NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8(node, VX_DF_IMAGE_U8, 3, 3); + } + else if (cmd == ago_kernel_cmd_initialize 
|| cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8_U8_7x7_L2NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8(node, VX_DF_IMAGE_U8, 3, 3); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8XY_U8_3x3_L1NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * oStack = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + AgoData * iThr = node->paramList[3]; + oStack->u.cannystack.stackTop = 0; + if (HafCpu_CannySobelSuppThreshold_U8XY_U8_3x3_L1NORM(oStack->u.cannystack.count, (ago_coord2d_ushort_t *)oStack->buffer, &oStack->u.cannystack.stackTop, + oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes, + iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8XY(node, VX_DF_IMAGE_U8, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedStride = (node->paramList[0]->u.img.stride_in_bytes + 15) & ~15; + node->localDataSize = ((2 * alignedStride * node->paramList[0]->u.img.height) + (6 * alignedStride)) * sizeof(vx_int16); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8XY_U8_3x3_L2NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * oStack = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + AgoData * iThr = node->paramList[3]; + oStack->u.cannystack.stackTop = 0; + if (HafCpu_CannySobelSuppThreshold_U8XY_U8_3x3_L2NORM(oStack->u.cannystack.count, (ago_coord2d_ushort_t *)oStack->buffer, &oStack->u.cannystack.stackTop, + oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes, + iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8XY(node, VX_DF_IMAGE_U8, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8XY_U8_5x5_L1NORM(AgoNode * node, AgoKernelCommand cmd) +{ + 
vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * oStack = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + AgoData * iThr = node->paramList[3]; + oStack->u.cannystack.stackTop = 0; + if (HafCpu_CannySobelSuppThreshold_U8XY_U8_5x5_L1NORM(oStack->u.cannystack.count, (ago_coord2d_ushort_t *)oStack->buffer, &oStack->u.cannystack.stackTop, + oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes, + iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8XY(node, VX_DF_IMAGE_U8, 2, 2); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8XY_U8_5x5_L2NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * oStack = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + AgoData * iThr = node->paramList[3]; + oStack->u.cannystack.stackTop = 0; + if (HafCpu_CannySobelSuppThreshold_U8XY_U8_5x5_L2NORM(oStack->u.cannystack.count, (ago_coord2d_ushort_t *)oStack->buffer, &oStack->u.cannystack.stackTop, + oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes, + iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8XY(node, VX_DF_IMAGE_U8, 2, 2); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8XY_U8_7x7_L1NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * oStack = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + AgoData * iThr = node->paramList[3]; + oStack->u.cannystack.stackTop = 0; + if (HafCpu_CannySobelSuppThreshold_U8XY_U8_7x7_L1NORM(oStack->u.cannystack.count, (ago_coord2d_ushort_t *)oStack->buffer, &oStack->u.cannystack.stackTop, + oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes, + iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8XY(node, VX_DF_IMAGE_U8, 3, 3); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobelSuppThreshold_U8XY_U8_7x7_L2NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = 
AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * oStack = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + AgoData * iThr = node->paramList[3]; + oStack->u.cannystack.stackTop = 0; + if (HafCpu_CannySobelSuppThreshold_U8XY_U8_7x7_L2NORM(oStack->u.cannystack.count, (ago_coord2d_ushort_t *)oStack->buffer, &oStack->u.cannystack.stackTop, + oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes, + iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8XY(node, VX_DF_IMAGE_U8, 3, 3); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobel_U16_U8_3x3_L1NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_CannySobel_U16_U8_3x3_L1NORM(oImg->u.img.width, oImg->u.img.height, (vx_uint16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U16, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + node->localDataSize = (alignedWidth * 4) + 128; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_CannySobelFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobel_U16_U8_3x3_L2NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_CannySobel_U16_U8_3x3_L2NORM(oImg->u.img.width, oImg->u.img.height, (vx_uint16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U16, VX_DF_IMAGE_U8, true, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + node->localDataSize = (alignedWidth * 4) + 128; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_CannySobelFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | 
AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobel_U16_U8_5x5_L1NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_CannySobel_U16_U8_5x5_L1NORM(oImg->u.img.width, oImg->u.img.height, (vx_uint16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U16, VX_DF_IMAGE_U8, true, 2, 2); + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + node->localDataSize = (alignedWidth * 4) + 128; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_CannySobelFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobel_U16_U8_5x5_L2NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_CannySobel_U16_U8_5x5_L2NORM(oImg->u.img.width, oImg->u.img.height, (vx_uint16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U16, VX_DF_IMAGE_U8, true, 2, 2); + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + node->localDataSize = (alignedWidth * 4) + 128; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_CannySobelFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobel_U16_U8_7x7_L1NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_CannySobel_U16_U8_7x7_L1NORM(oImg->u.img.width, oImg->u.img.height, (vx_uint16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U16, VX_DF_IMAGE_U8, true, 3, 3); + } + else if (cmd == ago_kernel_cmd_initialize) { + status = 
VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + node->localDataSize = (alignedWidth * 4) + 128; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_CannySobelFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySobel_U16_U8_7x7_L2NORM(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_CannySobel_U16_U8_7x7_L2NORM(oImg->u.img.width, oImg->u.img.height, (vx_uint16 *)oImg->buffer, oImg->u.img.stride_in_bytes, iImg->buffer, iImg->u.img.stride_in_bytes, node->localDataPtr)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U16, VX_DF_IMAGE_U8, true, 3, 3); + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + node->localDataSize = (alignedWidth * 4) + 128; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_CannySobelFilters(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySuppThreshold_U8_U16_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8(node, VX_DF_IMAGE_U16, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_CannySuppThreshold(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannySuppThreshold_U8XY_U16_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * oStack = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + AgoData * iThr = node->paramList[3]; + oStack->u.cannystack.stackTop = 0; + if (HafCpu_CannySuppThreshold_U8XY_U16_3x3(oStack->u.cannystack.count, (ago_coord2d_ushort_t *)oStack->buffer, &oStack->u.cannystack.stackTop, + oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + (vx_uint16 *)iImg->buffer, iImg->u.img.stride_in_bytes, + iThr->u.thr.threshold_lower, iThr->u.thr.threshold_upper)) + { + status = VX_FAILURE; + } + } + else 
if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_CannySuppThreshold_U8XY(node, VX_DF_IMAGE_U16, 1, 1); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_CannySuppThreshold(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_NonMaxSupp_XY_ANY_3x3(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oList = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_uint32 numitems = 0; + if (HafCpu_NonMaxSupp_XY_ANY_3x3((vx_uint32)oList->u.arr.capacity, (ago_keypoint_xys_t *)oList->buffer, &numitems, + iImg->u.img.width, iImg->u.img.height, (vx_float32 *)iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else + { + oList->u.arr.numitems = numitems; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_F32_AMD) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[1]->u.img.width || !node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = AGO_TYPE_KEYPOINT_XYS; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = HafGpu_NonMaxSupp_XY_ANY_3x3(node); + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Remap_U8_U8_Nearest(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMap = node->paramList[2]; + if (HafCpu_Remap_U8_U8_Nearest(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, + (ago_coord2d_ushort_t *)iMap->buffer, iMap->u.remap.dst_width * sizeof(ago_coord2d_ushort_t))) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[1]->u.img.width != node->paramList[2]->u.remap.src_width || + node->paramList[1]->u.img.height != node->paramList[2]->u.remap.src_height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[2]->u.remap.dst_width; + meta->data.u.img.height = node->paramList[2]->u.remap.dst_height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } 
+#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, __global uchar * remap_, uint remap_stride_in_bytes)\n" + "{\n" + " __global int * remap = (__global int *) (remap_ + y * remap_stride_in_bytes + (x << 2));\n" + " U8x8 rv;\n" + " int map; uint v;\n" + "#define COMPUTE x = ((map & 0xffff) + 4) >> 3; y = (map + 0x00040000) >> 19; v = p[mad24(stride, y, x)]\n" + " map = remap[0]; COMPUTE ; rv.s0 = v;\n" + " map = remap[1]; COMPUTE ; rv.s0 |= v << 8;\n" + " map = remap[2]; COMPUTE ; rv.s0 |= v << 16;\n" + " map = remap[3]; COMPUTE ; rv.s0 |= v << 24;\n" + " map = remap[4]; COMPUTE ; rv.s1 = v;\n" + " map = remap[5]; COMPUTE ; rv.s1 |= v << 8;\n" + " map = remap[6]; COMPUTE ; rv.s1 |= v << 16;\n" + " map = remap[7]; COMPUTE ; rv.s1 |= v << 24;\n" + "#undef COMPUTE\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code = textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Remap_U8_U8_Nearest_Constant(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMap = node->paramList[2]; + if (HafCpu_Remap_U8_U8_Nearest_Constant(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, + (ago_coord2d_ushort_t *)iMap->buffer, iMap->u.remap.dst_width * sizeof(ago_coord2d_ushort_t), node->paramList[3]->u.scalar.u.u)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[1]->u.img.width != node->paramList[2]->u.remap.src_width || + node->paramList[1]->u.img.height != node->paramList[2]->u.remap.src_height) + return VX_ERROR_INVALID_DIMENSION; + if (node->paramList[3]->u.scalar.type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_FORMAT; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[2]->u.remap.dst_width; + meta->data.u.img.height = node->paramList[2]->u.remap.dst_height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, __global uchar * remap_, uint remap_stride_in_bytes, uint borderValue)\n" + "{\n" + " __global int * remap = (__global int *) (remap_ + y * remap_stride_in_bytes + (x << 2));\n" + " U8x8 rv;\n" + " int map; uint mask, v;\n" + " width -= 1; height -= 1;\n" + "#define COMPUTE x = ((map & 0xffff) + 4) >> 3; y = (map + 0x00040000) >> 19; mask = ((int)(x | (width - 
x) | y | (height - y))) >> 31; mask = ~mask; x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(borderValue, v, mask)\n" + " map = remap[0]; COMPUTE ; rv.s0 = v;\n" + " map = remap[1]; COMPUTE ; rv.s0 |= v << 8;\n" + " map = remap[2]; COMPUTE ; rv.s0 |= v << 16;\n" + " map = remap[3]; COMPUTE ; rv.s0 |= v << 24;\n" + " map = remap[4]; COMPUTE ; rv.s1 = v;\n" + " map = remap[5]; COMPUTE ; rv.s1 |= v << 8;\n" + " map = remap[6]; COMPUTE ; rv.s1 |= v << 16;\n" + " map = remap[7]; COMPUTE ; rv.s1 |= v << 24;\n" + "#undef COMPUTE\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Remap_U8_U8_Bilinear(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMap = node->paramList[2]; + if (HafCpu_Remap_U8_U8_Bilinear(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, + (ago_coord2d_ushort_t *)iMap->buffer, iMap->u.remap.dst_width * sizeof(ago_coord2d_ushort_t))) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[1]->u.img.width != node->paramList[2]->u.remap.src_width || + node->paramList[1]->u.img.height != node->paramList[2]->u.remap.src_height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[2]->u.remap.dst_width; + meta->data.u.img.height = node->paramList[2]->u.remap.dst_height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + agoCodeGenOpenCL_BilinearSample(node->opencl_code); + agoCodeGenOpenCL_BilinearSampleFXY(node->opencl_code); + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, __global uchar * remap_, uint remap_stride_in_bytes)\n" + "{\n" + " __global int * remap = (__global int *) (remap_ + y * remap_stride_in_bytes + (x << 2));\n" + " U8x8 rv;\n" + " float4 f; int map;\n" + " map = remap[0]; f.s0 = BilinearSampleFXY(p, stride, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f);\n" + " map = remap[1]; f.s1 = BilinearSampleFXY(p, stride, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f);\n" + " map = remap[2]; f.s2 = BilinearSampleFXY(p, stride, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f);\n" + " map = remap[3]; f.s3 = BilinearSampleFXY(p, stride, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f);\n" + " rv.s0 = amd_pack(f);\n" + " map = remap[4]; f.s0 = BilinearSampleFXY(p, stride, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f);\n" + " map = remap[5]; f.s1 = 
BilinearSampleFXY(p, stride, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f);\n" + " map = remap[6]; f.s2 = BilinearSampleFXY(p, stride, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f);\n" + " map = remap[7]; f.s3 = BilinearSampleFXY(p, stride, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f);\n" + " rv.s1 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Remap_U8_U8_Bilinear_Constant(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMap = node->paramList[2]; + if (HafCpu_Remap_U8_U8_Bilinear_Constant(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, + (ago_coord2d_ushort_t *)iMap->buffer, iMap->u.remap.dst_width * sizeof(ago_coord2d_ushort_t), node->paramList[3]->u.scalar.u.u)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[1]->u.img.width != node->paramList[2]->u.remap.src_width || + node->paramList[1]->u.img.height != node->paramList[2]->u.remap.src_height) + return VX_ERROR_INVALID_DIMENSION; + if (node->paramList[3]->u.scalar.type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_FORMAT; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[2]->u.remap.dst_width; + meta->data.u.img.height = node->paramList[2]->u.remap.dst_height; + meta->data.u.img.format = VX_DF_IMAGE_U8; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + agoCodeGenOpenCL_BilinearSample(node->opencl_code); + agoCodeGenOpenCL_BilinearSampleFXYConstantForRemap(node->opencl_code); + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, __global uchar * remap_, uint remap_stride_in_bytes, uint borderValue)\n" + "{\n" + " __global int * remap = (__global int *) (remap_ + y * remap_stride_in_bytes + (x << 2));\n" + " U8x8 rv;\n" + " float4 f; int map;\n" + " map = remap[0]; f.s0 = BilinearSampleFXYConstantForRemap(p, stride, width, height, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f, borderValue);\n" + " map = remap[1]; f.s1 = BilinearSampleFXYConstantForRemap(p, stride, width, height, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f, borderValue);\n" + " map = remap[2]; f.s2 = BilinearSampleFXYConstantForRemap(p, stride, width, height, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f, borderValue);\n" + " map = remap[3]; f.s3 = BilinearSampleFXYConstantForRemap(p, stride, width, height, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f, borderValue);\n" + " rv.s0 = 
amd_pack(f);\n" + " map = remap[4]; f.s0 = BilinearSampleFXYConstantForRemap(p, stride, width, height, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f, borderValue);\n" + " map = remap[5]; f.s1 = BilinearSampleFXYConstantForRemap(p, stride, width, height, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f, borderValue);\n" + " map = remap[6]; f.s2 = BilinearSampleFXYConstantForRemap(p, stride, width, height, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f, borderValue);\n" + " map = remap[7]; f.s3 = BilinearSampleFXYConstantForRemap(p, stride, width, height, ((map << 16) >> 16) * 0.125f, (map >> 16) * 0.125f, borderValue);\n" + " rv.s1 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Remap_U24_U24_Bilinear(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGB); + if (!status) { + if (node->paramList[1]->u.img.width != node->paramList[2]->u.remap.src_width || + node->paramList[1]->u.img.height != node->paramList[2]->u.remap.src_height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[2]->u.remap.dst_width; + meta->data.u.img.height = node->paramList[2]->u.remap.dst_height; + meta->data.u.img.format = VX_DF_IMAGE_RGB; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + char textBuffer[1024]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U24x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, __global uchar * remap_, uint remap_stride_in_bytes)\n" + "{\n" + " uint QF = %d;\n" + ), node->opencl_name, node->paramList[2]->u.remap.remap_fractional_bits); + node->opencl_code += textBuffer; + node->opencl_code += OPENCL_FORMAT( + " __global int * remap = (__global int *) (remap_ + y * remap_stride_in_bytes + (x << 2));\n" + " U24x8 rv;\n" + " float4 f; uint map, sx, sy, offset; uint3 px0, px1; __global uchar * pt; float4 mf;\n" + " uint QFB = (1 << QF) - 1; float QFM = 1.0f / (1 << QF);\n" + " // pixel[0]\n" + " map = remap[0]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) * 3; pt = p + (offset & ~3); px0 = vload3(0, (__global uint *)pt); px1 = vload3(0, (__global uint *)(pt + stride)); px0.s0 = amd_bytealign(px0.s1, px0.s0, offset); px0.s1 = amd_bytealign(px0.s2, px0.s1, offset); px1.s0 = amd_bytealign(px1.s1, px1.s0, offset); px1.s1 = amd_bytealign(px1.s2, px1.s1, offset); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack3(px0.s0) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack3(px1.s0) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * 
mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[1]\n" + " map = remap[1]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) * 3; pt = p + (offset & ~3); px0 = vload3(0, (__global uint *)pt); px1 = vload3(0, (__global uint *)(pt + stride)); px0.s0 = amd_bytealign(px0.s1, px0.s0, offset); px0.s1 = amd_bytealign(px0.s2, px0.s1, offset); px1.s0 = amd_bytealign(px1.s1, px1.s0, offset); px1.s1 = amd_bytealign(px1.s2, px1.s1, offset); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s3 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack3(px0.s0) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack3(px1.s0) * mf.s0) * mf.s1;\n" + " rv.s0 = amd_pack(f);\n" + " f.s0 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[2]\n" + " map = remap[2]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) * 3; pt = p + (offset & ~3); px0 = vload3(0, (__global uint *)pt); px1 = vload3(0, (__global uint *)(pt + stride)); px0.s0 = amd_bytealign(px0.s1, px0.s0, offset); px0.s1 = amd_bytealign(px0.s2, px0.s1, offset); px1.s0 = amd_bytealign(px1.s1, px1.s0, offset); px1.s1 = amd_bytealign(px1.s2, px1.s1, offset); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s2 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack3(px0.s0) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack3(px1.s0) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s1 = amd_pack(f);\n" + " f.s0 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[3]\n" + " map = remap[3]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) * 3; pt = p + (offset & ~3); px0 = vload3(0, (__global uint *)pt); px1 = vload3(0, (__global uint *)(pt + stride)); px0.s0 = amd_bytealign(px0.s1, px0.s0, offset); px0.s1 = amd_bytealign(px0.s2, px0.s1, offset); px1.s0 = amd_bytealign(px1.s1, px1.s0, offset); px1.s1 = amd_bytealign(px1.s2, px1.s1, offset); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s1 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack3(px0.s0) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack3(px1.s0) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s2 = amd_pack(f);\n" + " // pixel[4]\n" + " map = remap[4]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) * 3; pt = p + (offset & ~3); px0 = vload3(0, (__global uint *)pt); px1 = vload3(0, (__global uint *)(pt + stride)); px0.s0 = amd_bytealign(px0.s1, px0.s0, 
offset); px0.s1 = amd_bytealign(px0.s2, px0.s1, offset); px1.s0 = amd_bytealign(px1.s1, px1.s0, offset); px1.s1 = amd_bytealign(px1.s2, px1.s1, offset); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack3(px0.s0) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack3(px1.s0) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[5]\n" + " map = remap[5]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) * 3; pt = p + (offset & ~3); px0 = vload3(0, (__global uint *)pt); px1 = vload3(0, (__global uint *)(pt + stride)); px0.s0 = amd_bytealign(px0.s1, px0.s0, offset); px0.s1 = amd_bytealign(px0.s2, px0.s1, offset); px1.s0 = amd_bytealign(px1.s1, px1.s0, offset); px1.s1 = amd_bytealign(px1.s2, px1.s1, offset); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s3 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack3(px0.s0) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack3(px1.s0) * mf.s0) * mf.s1;\n" + " rv.s3 = amd_pack(f);\n" + " f.s0 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[6]\n" + " map = remap[6]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) * 3; pt = p + (offset & ~3); px0 = vload3(0, (__global uint *)pt); px1 = vload3(0, (__global uint *)(pt + stride)); px0.s0 = amd_bytealign(px0.s1, px0.s0, offset); px0.s1 = amd_bytealign(px0.s2, px0.s1, offset); px1.s0 = amd_bytealign(px1.s1, px1.s0, offset); px1.s1 = amd_bytealign(px1.s2, px1.s1, offset); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s2 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack3(px0.s0) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack3(px1.s0) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s4 = amd_pack(f);\n" + " f.s0 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[7]\n" + " map = remap[7]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) * 3; pt = p + (offset & ~3); px0 = vload3(0, (__global uint *)pt); px1 = vload3(0, (__global uint *)(pt + stride)); px0.s0 = amd_bytealign(px0.s1, px0.s0, offset); px0.s1 = amd_bytealign(px0.s2, px0.s1, offset); px1.s0 = amd_bytealign(px1.s1, px1.s0, offset); px1.s1 = amd_bytealign(px1.s2, px1.s1, offset); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s1 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack3(px0.s0) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack3(px1.s0) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) 
* mf.s1;\n" + " f.s3 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s5 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + ); + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Remap_U24_U32_Bilinear(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGBX); + if (!status) { + if (node->paramList[1]->u.img.width != node->paramList[2]->u.remap.src_width || + node->paramList[1]->u.img.height != node->paramList[2]->u.remap.src_height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[2]->u.remap.dst_width; + meta->data.u.img.height = node->paramList[2]->u.remap.dst_height; + meta->data.u.img.format = VX_DF_IMAGE_RGB; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + char textBuffer[1024]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U24x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, __global uchar * remap_, uint remap_stride_in_bytes)\n" + "{\n" + " uint QF = %d;\n" + ), node->opencl_name, node->paramList[2]->u.remap.remap_fractional_bits); + node->opencl_code += textBuffer; + node->opencl_code += OPENCL_FORMAT( + " __global int * remap = (__global int *) (remap_ + y * remap_stride_in_bytes + (x << 2));\n" + " U24x8 rv;\n" + " float4 f; uint map, sx, sy, offset; uint2 px0, px1; __global uchar * pt; float4 mf;\n" + " uint QFB = (1 << QF) - 1; float QFM = 1.0f / (1 << QF);\n" + " // pixel[0]\n" + " map = remap[0]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[1]\n" + " map = remap[1]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s3 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s0 = amd_pack(f);\n" + " f.s0 = 
(amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[2]\n" + " map = remap[2]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s2 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s1 = amd_pack(f);\n" + " f.s0 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[3]\n" + " map = remap[3]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s1 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s2 = amd_pack(f);\n" + " // pixel[4]\n" + " map = remap[4]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[5]\n" + " map = remap[5]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s3 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s3 = amd_pack(f);\n" + " f.s0 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[6]\n" + " map = 
remap[6]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s2 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s4 = amd_pack(f);\n" + " f.s0 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " // pixel[7]\n" + " map = remap[7]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s1 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s5 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + ); + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Remap_U32_U32_Bilinear(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_RGBX, VX_DF_IMAGE_RGBX); + if (!status) { + if (node->paramList[1]->u.img.width != node->paramList[2]->u.remap.src_width || + node->paramList[1]->u.img.height != node->paramList[2]->u.remap.src_height) + return VX_ERROR_INVALID_DIMENSION; + // set output image sizes are same as input image size + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[2]->u.remap.dst_width; + meta->data.u.img.height = node->paramList[2]->u.remap.dst_height; + meta->data.u.img.format = VX_DF_IMAGE_RGBX; + } + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + char textBuffer[1024]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U32x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, __global uchar * remap_, uint remap_stride_in_bytes)\n" + "{\n" + " uint QF = %d;\n" + ), node->opencl_name, node->paramList[2]->u.remap.remap_fractional_bits); + node->opencl_code += textBuffer; + node->opencl_code += OPENCL_FORMAT( + " __global int * remap = (__global int *) (remap_ + y * remap_stride_in_bytes + (x << 2));\n" + " U32x8 rv;\n" + " float4 f; uint map, sx, sy, offset; uint2 px0, 
px1; __global uchar * pt; float4 mf;\n" + " uint QFB = (1 << QF) - 1; float QFM = 1.0f / (1 << QF);\n" + " // pixel[0]\n" + " map = remap[0]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack3(px0.s0) * mf.s2 + amd_unpack3(px0.s1) * mf.s0) * mf.s3 + (amd_unpack3(px1.s0) * mf.s2 + amd_unpack3(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s0 = amd_pack(f);\n" + " // pixel[1]\n" + " map = remap[1]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack3(px0.s0) * mf.s2 + amd_unpack3(px0.s1) * mf.s0) * mf.s3 + (amd_unpack3(px1.s0) * mf.s2 + amd_unpack3(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s1 = amd_pack(f);\n" + " // pixel[2]\n" + " map = remap[2]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack3(px0.s0) * mf.s2 + amd_unpack3(px0.s1) * mf.s0) * mf.s3 + (amd_unpack3(px1.s0) * mf.s2 + amd_unpack3(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s2 = amd_pack(f);\n" + " // pixel[3]\n" + " map = remap[3]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) 
* mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack3(px0.s0) * mf.s2 + amd_unpack3(px0.s1) * mf.s0) * mf.s3 + (amd_unpack3(px1.s0) * mf.s2 + amd_unpack3(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s3 = amd_pack(f);\n" + " // pixel[4]\n" + " map = remap[4]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack3(px0.s0) * mf.s2 + amd_unpack3(px0.s1) * mf.s0) * mf.s3 + (amd_unpack3(px1.s0) * mf.s2 + amd_unpack3(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s4 = amd_pack(f);\n" + " // pixel[5]\n" + " map = remap[5]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack3(px0.s0) * mf.s2 + amd_unpack3(px0.s1) * mf.s0) * mf.s3 + (amd_unpack3(px1.s0) * mf.s2 + amd_unpack3(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s5 = amd_pack(f);\n" + " // pixel[6]\n" + " map = remap[6]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = (sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack3(px0.s0) * mf.s2 + amd_unpack3(px0.s1) * mf.s0) * mf.s3 + (amd_unpack3(px1.s0) * mf.s2 + amd_unpack3(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s6 = amd_pack(f);\n" + " // pixel[7]\n" + " map = remap[7]; sx = map & 0xffff; sy = (map >> 16); offset = (sy >> QF) * stride + (sx >> QF) << 2; pt = p + offset; px0 = vload2(0, (__global uint *)pt); px1 = vload2(0, (__global uint *)(pt + stride)); mf.s0 = 
(sx & QFB) * QFM; mf.s1 = (sy & QFB) * QFM; mf.s2 = 1.0f - mf.s0; mf.s3 = 1.0f - mf.s1;\n" + " f.s0 = (amd_unpack0(px0.s0) * mf.s2 + amd_unpack0(px0.s1) * mf.s0) * mf.s3 + (amd_unpack0(px1.s0) * mf.s2 + amd_unpack0(px1.s1) * mf.s0) * mf.s1;\n" + " f.s1 = (amd_unpack1(px0.s0) * mf.s2 + amd_unpack1(px0.s1) * mf.s0) * mf.s3 + (amd_unpack1(px1.s0) * mf.s2 + amd_unpack1(px1.s1) * mf.s0) * mf.s1;\n" + " f.s2 = (amd_unpack2(px0.s0) * mf.s2 + amd_unpack2(px0.s1) * mf.s0) * mf.s3 + (amd_unpack2(px1.s0) * mf.s2 + amd_unpack2(px1.s1) * mf.s0) * mf.s1;\n" + " f.s3 = (amd_unpack3(px0.s0) * mf.s2 + amd_unpack3(px0.s1) * mf.s0) * mf.s3 + (amd_unpack3(px1.s0) * mf.s2 + amd_unpack3(px1.s1) * mf.s0) * mf.s1;\n" + " rv.s7 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + ); + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_WarpAffine_U8_U8_Nearest(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMat = node->paramList[2]; + if (HafCpu_WarpAffine_U8_U8_Nearest(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (ago_affine_matrix_t *)iMat->buffer, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[2]->u.mat.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + if (node->paramList[2]->u.mat.columns != 2 || node->paramList[2]->u.mat.rows != 3) + return VX_ERROR_INVALID_DIMENSION; + // output image dimensions have no constraints + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + } + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 2 * alignedWidth*sizeof(float); // 2 rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, ago_affine_matrix_t matrix)\n" + "{\n" + " U8x8 rv;\n" + " float sx, sy;\n" + " float dx = (float)x, dy = (float)y;\n" + " sx = mad(dy, matrix.M[1][0], matrix.M[2][0]); sx = mad(dx, matrix.M[0][0], sx);\n" + " sy = mad(dy, matrix.M[1][1], matrix.M[2][1]); sy = mad(dx, matrix.M[0][1], sy);\n" + " rv.s0 = p[mad24(stride, (uint)sy, (uint)sx)];\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; rv.s0 |= p[mad24(stride, (uint)sy, (uint)sx)] << 8;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; rv.s0 |= p[mad24(stride, (uint)sy, (uint)sx)] << 16;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; rv.s0 |= 
p[mad24(stride, (uint)sy, (uint)sx)] << 24;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; rv.s1 = p[mad24(stride, (uint)sy, (uint)sx)];\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; rv.s1 |= p[mad24(stride, (uint)sy, (uint)sx)] << 8;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; rv.s1 |= p[mad24(stride, (uint)sy, (uint)sx)] << 16;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; rv.s1 |= p[mad24(stride, (uint)sy, (uint)sx)] << 24;\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_WarpAffine_U8_U8_Nearest_Constant(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMat = node->paramList[2]; + if (HafCpu_WarpAffine_U8_U8_Nearest_Constant(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (ago_affine_matrix_t *)iMat->buffer, node->paramList[3]->u.scalar.u.u, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[2]->u.mat.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + if (node->paramList[2]->u.mat.columns != 2 || node->paramList[2]->u.mat.rows != 3) + return VX_ERROR_INVALID_DIMENSION; + if (node->paramList[3]->u.scalar.type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_FORMAT; + // output image dimensions have no constraints + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + } + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 2 * alignedWidth*sizeof(float); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, ago_affine_matrix_t matrix, uint border)\n" + "{\n" + " U8x8 rv;\n" + " float sx, sy; uint mask, v;\n" + " float dx = (float)x, dy = (float)y;\n" + " sx = mad(dy, matrix.M[1][0], matrix.M[2][0]); sx = mad(dx, matrix.M[0][0], sx);\n" + " sy = mad(dy, matrix.M[1][1], matrix.M[2][1]); sy = mad(dx, matrix.M[0][1], sy);\n" + " x = (uint)(int)sx; y = (uint)(int)sy;\n" + " width -= 2; height -= 2;\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s0 = v;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; x = (uint)(int)sx; y = (uint)(int)sy; \n" 
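+ " // mask is all 1s when (x, y) passes the unsigned bounds test against width and height\n"
+ " // (both already reduced by 2 above); failing taps read pixel (0, 0) and bitselect inserts the constant border\n"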
+ " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s0 |= v << 8;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; x = (uint)(int)sx; y = (uint)(int)sy;\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s0 |= v << 16;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; x = (uint)(int)sx; y = (uint)(int)sy;\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s0 |= v << 24;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; x = (uint)(int)sx; y = (uint)(int)sy;\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s1 = v;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; x = (uint)(int)sx; y = (uint)(int)sy;\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s1 |= v << 8;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; x = (uint)(int)sx; y = (uint)(int)sy;\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s1 |= v << 16;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; x = (uint)(int)sx; y = (uint)(int)sy;\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s1 |= v << 24;\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_WarpAffine_U8_U8_Bilinear(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMat = node->paramList[2]; + if (HafCpu_WarpAffine_U8_U8_Bilinear(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (ago_affine_matrix_t *)iMat->buffer, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[2]->u.mat.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + if (node->paramList[2]->u.mat.columns != 2 || node->paramList[2]->u.mat.rows != 3) + return VX_ERROR_INVALID_DIMENSION; + // output image dimensions have no constraints + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + } + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = 
(node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 2 * alignedWidth*sizeof(float); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + agoCodeGenOpenCL_BilinearSample(node->opencl_code); + agoCodeGenOpenCL_BilinearSampleFXY(node->opencl_code); + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, ago_affine_matrix_t matrix)\n" + "{\n" + " U8x8 rv; float4 f;\n" + " float sx, sy;\n" + " float dx = (float)x, dy = (float)y;\n" + " sx = mad(dy, matrix.M[1][0], matrix.M[2][0]); sx = mad(dx, matrix.M[0][0], sx);\n" + " sy = mad(dy, matrix.M[1][1], matrix.M[2][1]); sy = mad(dx, matrix.M[0][1], sy);\n" + " f.s0 = BilinearSampleFXY(p, stride, sx, sy);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s1 = BilinearSampleFXY(p, stride, sx, sy);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s2 = BilinearSampleFXY(p, stride, sx, sy);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s3 = BilinearSampleFXY(p, stride, sx, sy);\n" + " rv.s0 = amd_pack(f);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s0 = BilinearSampleFXY(p, stride, sx, sy);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s1 = BilinearSampleFXY(p, stride, sx, sy);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s2 = BilinearSampleFXY(p, stride, sx, sy);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s3 = BilinearSampleFXY(p, stride, sx, sy);\n" + " rv.s1 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_WarpAffine_U8_U8_Bilinear_Constant(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMat = node->paramList[2]; + if (HafCpu_WarpAffine_U8_U8_Bilinear_Constant(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (ago_affine_matrix_t *)iMat->buffer, node->paramList[3]->u.scalar.u.u, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[2]->u.mat.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + if (node->paramList[2]->u.mat.columns != 2 || node->paramList[2]->u.mat.rows != 3) + return VX_ERROR_INVALID_DIMENSION; + if (node->paramList[3]->u.scalar.type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_FORMAT; + // output image dimensions have no constraints + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; 
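+ // The 2x3 affine matrix maps each destination pixel (x, y) back to a source
+ // coordinate; the generated OpenCL below evaluates sx and sy incrementally,
+ // adding M[0][0] and M[0][1] once per output pixel along the run of eight pixels.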
+ } + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 2 * alignedWidth*sizeof(float); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + agoCodeGenOpenCL_SampleWithConstBorder(node->opencl_code); + agoCodeGenOpenCL_BilinearSample(node->opencl_code); + agoCodeGenOpenCL_BilinearSampleWithConstBorder(node->opencl_code); + agoCodeGenOpenCL_BilinearSampleFXYConstant(node->opencl_code); + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, ago_affine_matrix_t matrix, uint borderValue)\n" + "{\n" + " U8x8 rv; float4 f;\n" + " float sx, sy;\n" + " float dx = (float)x, dy = (float)y;\n" + " sx = mad(dy, matrix.M[1][0], matrix.M[2][0]); sx = mad(dx, matrix.M[0][0], sx);\n" + " sy = mad(dy, matrix.M[1][1], matrix.M[2][1]); sy = mad(dx, matrix.M[0][1], sy);\n" + " f.s0 = BilinearSampleFXYConstant(p, stride, width, height, sx, sy, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s1 = BilinearSampleFXYConstant(p, stride, width, height, sx, sy, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s2 = BilinearSampleFXYConstant(p, stride, width, height, sx, sy, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s3 = BilinearSampleFXYConstant(p, stride, width, height, sx, sy, borderValue);\n" + " rv.s0 = amd_pack(f);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s0 = BilinearSampleFXYConstant(p, stride, width, height, sx, sy, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s1 = BilinearSampleFXYConstant(p, stride, width, height, sx, sy, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s2 = BilinearSampleFXYConstant(p, stride, width, height, sx, sy, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; f.s3 = BilinearSampleFXYConstant(p, stride, width, height, sx, sy, borderValue);\n" + " rv.s1 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_WarpPerspective_U8_U8_Nearest(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMat = node->paramList[2]; + if (HafCpu_WarpPerspective_U8_U8_Nearest(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (ago_perspective_matrix_t *)iMat->buffer, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[2]->u.mat.type != 
VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + if (node->paramList[2]->u.mat.columns != 3 || node->paramList[2]->u.mat.rows != 3) + return VX_ERROR_INVALID_DIMENSION; + // output image dimensions have no constraints + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + } + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 3 * alignedWidth*sizeof(float); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, ago_perspective_matrix_t matrix)\n" + "{\n" + " U8x8 rv;\n" + " float sx, sy, sz, isz;\n" + " float dx = (float)x, dy = (float)y;\n" + " sx = mad(dy, matrix.M[1][0], matrix.M[2][0]); sx = mad(dx, matrix.M[0][0], sx);\n" + " sy = mad(dy, matrix.M[1][1], matrix.M[2][1]); sy = mad(dx, matrix.M[0][1], sy);\n" + " sz = mad(dy, matrix.M[1][2], matrix.M[2][2]); sz = mad(dx, matrix.M[0][2], sz);\n" + " isz = 1.0f / sz; rv.s0 = p[mad24(stride, (uint)(sy*isz), (uint)(sx*isz))];\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; rv.s0 |= p[mad24(stride, (uint)(sy*isz), (uint)(sx*isz))] << 8;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; rv.s0 |= p[mad24(stride, (uint)(sy*isz), (uint)(sx*isz))] << 16;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; rv.s0 |= p[mad24(stride, (uint)(sy*isz), (uint)(sx*isz))] << 24;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; rv.s1 = p[mad24(stride, (uint)(sy*isz), (uint)(sx*isz))];\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; rv.s1 |= p[mad24(stride, (uint)(sy*isz), (uint)(sx*isz))] << 8;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; rv.s1 |= p[mad24(stride, (uint)(sy*isz), (uint)(sx*isz))] << 16;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; rv.s1 |= p[mad24(stride, (uint)(sy*isz), (uint)(sx*isz))] << 24;\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_WarpPerspective_U8_U8_Nearest_Constant(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMat = node->paramList[2]; + if (HafCpu_WarpPerspective_U8_U8_Nearest_Constant(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, 
iImg->buffer, iImg->u.img.stride_in_bytes, (ago_perspective_matrix_t *)iMat->buffer, node->paramList[3]->u.scalar.u.u, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[2]->u.mat.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + if (node->paramList[2]->u.mat.columns != 3 || node->paramList[2]->u.mat.rows != 3) + return VX_ERROR_INVALID_DIMENSION; + if (node->paramList[3]->u.scalar.type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_FORMAT; + // output image dimensions have no constraints + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + } + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 3 * alignedWidth*sizeof(float); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, ago_perspective_matrix_t matrix, uint border)\n" + "{\n" + " width -= 2; height -= 2;\n" + " U8x8 rv;\n" + " float sx, sy, sz, isz; uint mask, v;\n" + " float dx = (float)x, dy = (float)y;\n" + " sx = mad(dy, matrix.M[1][0], matrix.M[2][0]); sx = mad(dx, matrix.M[0][0], sx);\n" + " sy = mad(dy, matrix.M[1][1], matrix.M[2][1]); sy = mad(dx, matrix.M[0][1], sy);\n" + " sz = mad(dy, matrix.M[1][2], matrix.M[2][2]); sz = mad(dx, matrix.M[0][2], sz);\n" + " isz = 1.0f / sz; x = (uint)(int)(sx*isz); y = (uint)(int)(sy*isz);\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s0 = v;\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; x = (uint)(int)(sx*isz); y = (uint)(int)(sy*isz);\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s0 |= (v << 8);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; x = (uint)(int)(sx*isz); y = (uint)(int)(sy*isz);\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s0 |= (v << 16);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; x = (uint)(int)(sx*isz); y = (uint)(int)(sy*isz);\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s0 |= (v << 24);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; x = (uint)(int)(sx*isz); y = (uint)(int)(sy*isz);\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s1 = v;\n" + " sx += matrix.M[0][0]; sy += 
matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; x = (uint)(int)(sx*isz); y = (uint)(int)(sy*isz);\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s1 |= (v << 8);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; x = (uint)(int)(sx*isz); y = (uint)(int)(sy*isz);\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s1 |= (v << 16);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; x = (uint)(int)(sx*isz); y = (uint)(int)(sy*isz);\n" + " mask = ((int)(x | (width - x) | y | (height - y))) >> 31; mask = ~mask;\n" + " x &= mask; y &= mask; v = p[mad24(stride, y, x)]; v = bitselect(border, v, mask); rv.s1 |= (v << 24);\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_WarpPerspective_U8_U8_Bilinear(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMat = node->paramList[2]; + if (HafCpu_WarpPerspective_U8_U8_Bilinear(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (ago_perspective_matrix_t *)iMat->buffer, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[2]->u.mat.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + if (node->paramList[2]->u.mat.columns != 3 || node->paramList[2]->u.mat.rows != 3) + return VX_ERROR_INVALID_DIMENSION; + // output image dimensions have no constraints + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + } + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 3 * alignedWidth*sizeof(float); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + agoCodeGenOpenCL_BilinearSample(node->opencl_code); + agoCodeGenOpenCL_BilinearSampleFXY(node->opencl_code); + char textBuffer[4096]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, ago_perspective_matrix_t matrix)\n" + "{\n" + " U8x8 rv; float4 f;\n" + " float sx, sy, sz, isz;\n" + " float dx = (float)x, dy = (float)y;\n" + " sx = mad(dy, matrix.M[1][0], 
matrix.M[2][0]); sx = mad(dx, matrix.M[0][0], sx);\n" + " sy = mad(dy, matrix.M[1][1], matrix.M[2][1]); sy = mad(dx, matrix.M[0][1], sy);\n" + " sz = mad(dy, matrix.M[1][2], matrix.M[2][2]); sz = mad(dx, matrix.M[0][2], sz);\n" + " isz = 1.0f / sz; f.s0 = BilinearSampleFXY(p, stride, sx*isz, sy*isz);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s1 = BilinearSampleFXY(p, stride, sx*isz, sy*isz);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s2 = BilinearSampleFXY(p, stride, sx*isz, sy*isz);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s3 = BilinearSampleFXY(p, stride, sx*isz, sy*isz);\n" + " rv.s0 = amd_pack(f);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s0 = BilinearSampleFXY(p, stride, sx*isz, sy*isz);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s1 = BilinearSampleFXY(p, stride, sx*isz, sy*isz);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s2 = BilinearSampleFXY(p, stride, sx*isz, sy*isz);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s3 = BilinearSampleFXY(p, stride, sx*isz, sy*isz);\n" + " rv.s1 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_WarpPerspective_U8_U8_Bilinear_Constant(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iMat = node->paramList[2]; + if (HafCpu_WarpPerspective_U8_U8_Bilinear_Constant(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (ago_perspective_matrix_t *)iMat->buffer, node->paramList[3]->u.scalar.u.u, node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + if (node->paramList[2]->u.mat.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + if (node->paramList[2]->u.mat.columns != 3 || node->paramList[2]->u.mat.rows != 3) + return VX_ERROR_INVALID_DIMENSION; + if (node->paramList[3]->u.scalar.type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_FORMAT; + // output image dimensions have no constraints + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + } + } + else if (cmd == ago_kernel_cmd_initialize) { + int alignedWidth = (node->paramList[0]->u.img.width + 15) & ~15; // Next highest multiple of 16, so that the buffer is aligned for all three lines + node->localDataSize = 3 * alignedWidth*sizeof(float); // Three rows (+some extra) worth of scratch memory + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = 
VX_SUCCESS; + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + agoCodeGenOpenCL_SampleWithConstBorder(node->opencl_code); + agoCodeGenOpenCL_BilinearSample(node->opencl_code); + agoCodeGenOpenCL_BilinearSampleWithConstBorder(node->opencl_code); + agoCodeGenOpenCL_BilinearSampleFXYConstant(node->opencl_code); + char textBuffer[8192]; + sprintf(textBuffer, OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, ago_perspective_matrix_t matrix, uint borderValue)\n" + "{\n" + " U8x8 rv; float4 f;\n" + " float sx, sy, sz, isz;\n" + " float dx = (float)x, dy = (float)y;\n" + " sx = mad(dy, matrix.M[1][0], matrix.M[2][0]); sx = mad(dx, matrix.M[0][0], sx);\n" + " sy = mad(dy, matrix.M[1][1], matrix.M[2][1]); sy = mad(dx, matrix.M[0][1], sy);\n" + " sz = mad(dy, matrix.M[1][2], matrix.M[2][2]); sz = mad(dx, matrix.M[0][2], sz);\n" + " isz = 1.0f / sz; f.s0 = BilinearSampleFXYConstant(p, stride, width, height, sx*isz, sy*isz, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s1 = BilinearSampleFXYConstant(p, stride, width, height, sx*isz, sy*isz, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s2 = BilinearSampleFXYConstant(p, stride, width, height, sx*isz, sy*isz, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s3 = BilinearSampleFXYConstant(p, stride, width, height, sx*isz, sy*isz, borderValue);\n" + " rv.s0 = amd_pack(f);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s0 = BilinearSampleFXYConstant(p, stride, width, height, sx*isz, sy*isz, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s1 = BilinearSampleFXYConstant(p, stride, width, height, sx*isz, sy*isz, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s2 = BilinearSampleFXYConstant(p, stride, width, height, sx*isz, sy*isz, borderValue);\n" + " sx += matrix.M[0][0]; sy += matrix.M[0][1]; sz += matrix.M[0][2]; isz = 1.0f / sz; f.s3 = BilinearSampleFXYConstant(p, stride, width, height, sx*isz, sy*isz, borderValue);\n" + " rv.s1 = amd_pack(f);\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ScaleImage_U8_U8_Nearest(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ScaleImage_U8_U8_Nearest(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (AgoConfigScaleMatrix *)node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + vx_meta_format meta; + meta = &node->metaList[0]; 
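+				// fill the output meta from the output image parameter: copy its width/height, then map the input valid rectangle into output coordinates with pixel-center alignment, i.e. out = ((in + 0.5) * outSize / inSize) - 0.5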
+ meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + // set the valid region + vx_float32 widthOut = (vx_float32)node->paramList[0]->u.img.width; + vx_float32 widthIn = (vx_float32)node->paramList[1]->u.img.width; + vx_float32 heightOut = (vx_float32)node->paramList[0]->u.img.height; + vx_float32 heightIn = (vx_float32)node->paramList[1]->u.img.height; + meta->data.u.img.rect_valid.start_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.start_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_y + 0.5f) * heightOut / heightIn) - 0.5f); + meta->data.u.img.rect_valid.end_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.end_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_y + 0.5f) * heightOut / heightIn) - 0.5f); + } + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + int alignedHeight = (oImg->u.img.height + 15) & ~15; + node->localDataSize = sizeof(AgoConfigScaleMatrix) + (alignedWidth * 2) + (alignedHeight * 2); + node->localDataPtr = (vx_uint8 *)agoAllocMemory(node->localDataSize); + if (!node->localDataPtr) return VX_ERROR_NO_MEMORY; + // compute scale matrix from the input and output image sizes + AgoConfigScaleMatrix * scalemat = (AgoConfigScaleMatrix *)node->localDataPtr; + scalemat->xscale = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width); + scalemat->yscale = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height); + scalemat->xoffset = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width * 0.5); + scalemat->yoffset = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height * 0.5); + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + if (node->localDataPtr) { + agoReleaseMemory(node->localDataPtr); + node->localDataPtr = nullptr; + } + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoConfigScaleMatrix scalemat; // compute scale matrix from the input and output image sizes + scalemat.xscale = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width); + scalemat.yscale = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height); + scalemat.xoffset = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width * 0.5); + scalemat.yoffset = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height * 0.5); + char textBuffer[1024]; + sprintf(textBuffer, + OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " float4 scaleInfo = (float4)(%.12f,%.12f,%.12f,%.12f);\n" + " U8x8 rv;\n" + " p += stride*(uint)mad((float)y, scaleInfo.s1, scaleInfo.s3);\n" + " float fx = mad((float)x, scaleInfo.s0, scaleInfo.s2);\n" + " rv.s0 = p[(int)fx];\n" + " fx += scaleInfo.s0; rv.s0 |= p[(int)fx] << 8;\n" + " fx += scaleInfo.s0; rv.s0 |= p[(int)fx] << 16;\n" + " fx += scaleInfo.s0; rv.s0 |= p[(int)fx] << 24;\n" + " fx += scaleInfo.s0; rv.s1 = p[(int)fx];\n" + " fx += scaleInfo.s0; rv.s1 |= p[(int)fx] << 8;\n" + " fx += scaleInfo.s0; rv.s1 |= p[(int)fx] << 
16;\n" + " fx += scaleInfo.s0; rv.s1 |= p[(int)fx] << 24;\n" + " *r = rv;\n" + "}\n" + ), node->opencl_name, scalemat.xscale, scalemat.yscale, scalemat.xoffset, scalemat.yoffset); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ScaleImage_U8_U8_Bilinear(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ScaleImage_U8_U8_Bilinear(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (AgoConfigScaleMatrix *)node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + // set the valid region + vx_float32 widthOut = (vx_float32)node->paramList[0]->u.img.width; + vx_float32 widthIn = (vx_float32)node->paramList[1]->u.img.width; + vx_float32 heightOut = (vx_float32)node->paramList[0]->u.img.height; + vx_float32 heightIn = (vx_float32)node->paramList[1]->u.img.height; + meta->data.u.img.rect_valid.start_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.start_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_y + 0.5f) * heightOut / heightIn) - 0.5f); + meta->data.u.img.rect_valid.end_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.end_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_y + 0.5f) * heightOut / heightIn) - 0.5f); + } + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + node->localDataSize = sizeof(AgoConfigScaleMatrix) + (alignedWidth * 6); + node->localDataPtr = (vx_uint8 *)agoAllocMemory(node->localDataSize); + if (!node->localDataPtr) return VX_ERROR_NO_MEMORY; + // compute scale matrix from the input and output image sizes + AgoConfigScaleMatrix * scalemat = (AgoConfigScaleMatrix *)node->localDataPtr; + scalemat->xscale = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width); + scalemat->yscale = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height); + scalemat->xoffset = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width * 0.5 - 0.5); + scalemat->yoffset = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height * 0.5 - 0.5); + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + if (node->localDataPtr) { + agoReleaseMemory(node->localDataPtr); + node->localDataPtr = nullptr; + } + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + AgoData * oImg = 
node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoConfigScaleMatrix scalemat; // compute scale matrix from the input and output image sizes + scalemat.xscale = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width); + scalemat.yscale = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height); + scalemat.xoffset = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width * 0.5 - 0.5); + scalemat.yoffset = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height * 0.5 - 0.5); + agoCodeGenOpenCL_BilinearSample(node->opencl_code); + agoCodeGenOpenCL_ScaleImage_U8_U8_Bilinear(node->opencl_code); + char textBuffer[8192]; + sprintf(textBuffer, + OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " float4 scaleInfo = (float4)(%.12f,%.12f,%.12f,%.12f);\n" + " ScaleImage_U8_U8_Bilinear(r, x, y, p, stride, scaleInfo);" + "}\n" + ), node->opencl_name, scalemat.xscale, scalemat.yscale, scalemat.xoffset, scalemat.yoffset); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ScaleImage_U8_U8_Bilinear_Replicate(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ScaleImage_U8_U8_Bilinear_Replicate(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (AgoConfigScaleMatrix *)node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + // set the valid region + vx_float32 widthOut = (vx_float32)node->paramList[0]->u.img.width; + vx_float32 widthIn = (vx_float32)node->paramList[1]->u.img.width; + vx_float32 heightOut = (vx_float32)node->paramList[0]->u.img.height; + vx_float32 heightIn = (vx_float32)node->paramList[1]->u.img.height; + meta->data.u.img.rect_valid.start_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.start_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_y + 0.5f) * heightOut / heightIn) - 0.5f); + meta->data.u.img.rect_valid.end_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.end_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_y + 0.5f) * heightOut / heightIn) - 0.5f); + } + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + node->localDataSize = sizeof(AgoConfigScaleMatrix) + (alignedWidth * 6); + node->localDataPtr = (vx_uint8 *)agoAllocMemory(node->localDataSize); + if 
(!node->localDataPtr) return VX_ERROR_NO_MEMORY; + // compute scale matrix from the input and output image sizes + AgoConfigScaleMatrix * scalemat = (AgoConfigScaleMatrix *)node->localDataPtr; + scalemat->xscale = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width); + scalemat->yscale = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height); + scalemat->xoffset = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width * 0.5 - 0.5); + scalemat->yoffset = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height * 0.5 - 0.5); + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + if (node->localDataPtr) { + agoReleaseMemory(node->localDataPtr); + node->localDataPtr = nullptr; + } + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoConfigScaleMatrix scalemat; // compute scale matrix from the input and output image sizes + scalemat.xscale = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width); + scalemat.yscale = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height); + scalemat.xoffset = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width * 0.5 - 0.5); + scalemat.yoffset = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height * 0.5 - 0.5); + agoCodeGenOpenCL_ClampPixelCoordinatesToBorder(node->opencl_code); + agoCodeGenOpenCL_BilinearSample(node->opencl_code); + agoCodeGenOpenCL_ScaleImage_U8_U8_Bilinear(node->opencl_code); + char textBuffer[8192]; + sprintf(textBuffer, + OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height)\n" + "{\n" + " float4 scaleInfo = (float4)(%.12f,%.12f,%.12f,%.12f);\n" + " // compute source x, y coordinates\n" + " float fx = mad((float)x, scaleInfo.s0, scaleInfo.s2);\n" + " float fy = mad((float)y, scaleInfo.s1, scaleInfo.s3);\n" + " // check if all pixels stay within borders\n" + " if (fx >= 0.0f && fy >= 0.0f && mad(8.0f, scaleInfo.s0, fx) < (width - 1) && mad(1.0f, scaleInfo.s1, fy) < (height - 1)) {\n" + " ScaleImage_U8_U8_Bilinear(r, x, y, p, stride, scaleInfo);\n" + " }\n" + " else {\n" + " // compute x and y upper limits\n" + " float fxlimit = (float)(width - 1), fylimit = (float)(height - 1);\n" + " // compute y coordinate and y interpolation factors\n" + " float fy0, fy1;\n" + " fy0 = floor(fy); fy1 = fy - fy0; fy0 = 1.0f - fy1;\n" + " // calculate sy and ystride\n" + " uint2 ycoord = ClampPixelCoordinatesToBorder(fy, height - 1, stride);\n" + " // process pixels\n" + " p += mul24(ycoord.s0, stride);\n" + " float frac;\n" + " uint2 xcoord;\n" + " uint xlimit = width - 1;\n" + " U8x8 rv; float4 f; xcoord = ClampPixelCoordinatesToBorder(fx, xlimit, 1); frac = fx - floor(fx); f.s0 = BilinearSample(p, ycoord.s1, xcoord.s1, fy0, fy1, xcoord.s0, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; xcoord = ClampPixelCoordinatesToBorder(fx, xlimit, 1); frac = fx - floor(fx); f.s1 = BilinearSample(p, ycoord.s1, xcoord.s1, fy0, fy1, xcoord.s0, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; xcoord = ClampPixelCoordinatesToBorder(fx, xlimit, 1); frac = fx - floor(fx); f.s2 = BilinearSample(p, ycoord.s1, xcoord.s1, fy0, fy1, xcoord.s0, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; xcoord = ClampPixelCoordinatesToBorder(fx, xlimit, 1); frac = fx - floor(fx); f.s3 = BilinearSample(p, ycoord.s1, 
xcoord.s1, fy0, fy1, xcoord.s0, 1.0f - frac, frac);\n" + " rv.s0 = amd_pack(f);\n" + " fx += scaleInfo.s0; xcoord = ClampPixelCoordinatesToBorder(fx, xlimit, 1); frac = fx - floor(fx); f.s0 = BilinearSample(p, ycoord.s1, xcoord.s1, fy0, fy1, xcoord.s0, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; xcoord = ClampPixelCoordinatesToBorder(fx, xlimit, 1); frac = fx - floor(fx); f.s1 = BilinearSample(p, ycoord.s1, xcoord.s1, fy0, fy1, xcoord.s0, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; xcoord = ClampPixelCoordinatesToBorder(fx, xlimit, 1); frac = fx - floor(fx); f.s2 = BilinearSample(p, ycoord.s1, xcoord.s1, fy0, fy1, xcoord.s0, 1.0f - frac, frac);\n" + " fx += scaleInfo.s0; xcoord = ClampPixelCoordinatesToBorder(fx, xlimit, 1); frac = fx - floor(fx); f.s3 = BilinearSample(p, ycoord.s1, xcoord.s1, fy0, fy1, xcoord.s0, 1.0f - frac, frac);\n" + " rv.s1 = amd_pack(f);\n" + " *r = rv;\n" + " }\n" + "}\n" + ), node->opencl_name, scalemat.xscale, scalemat.yscale, scalemat.xoffset, scalemat.yoffset); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ScaleImage_U8_U8_Bilinear_Constant(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoData * iBorder = node->paramList[2]; + if (HafCpu_ScaleImage_U8_U8_Bilinear_Constant(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (AgoConfigScaleMatrix *)node->localDataPtr, iBorder->u.scalar.u.u)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN_S(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8, VX_TYPE_UINT8); + if (!status) { + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + // set the valid region + vx_float32 widthOut = (vx_float32)node->paramList[0]->u.img.width; + vx_float32 widthIn = (vx_float32)node->paramList[1]->u.img.width; + vx_float32 heightOut = (vx_float32)node->paramList[0]->u.img.height; + vx_float32 heightIn = (vx_float32)node->paramList[1]->u.img.height; + meta->data.u.img.rect_valid.start_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.start_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_y + 0.5f) * heightOut / heightIn) - 0.5f); + meta->data.u.img.rect_valid.end_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.end_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_y + 0.5f) * heightOut / heightIn) - 0.5f); + } + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + int alignedWidth = (oImg->u.img.width + 15) & ~15; + node->localDataSize = sizeof(AgoConfigScaleMatrix) + (alignedWidth * 6) + (iImg->u.img.width+15)&~15; + 
node->localDataPtr = (vx_uint8 *)agoAllocMemory(node->localDataSize); + if (!node->localDataPtr) return VX_ERROR_NO_MEMORY; + // compute scale matrix from the input and output image sizes + AgoConfigScaleMatrix * scalemat = (AgoConfigScaleMatrix *)node->localDataPtr; + scalemat->xscale = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width); + scalemat->yscale = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height); + scalemat->xoffset = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width * 0.5 - 0.5); + scalemat->yoffset = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height * 0.5 - 0.5); + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + if (node->localDataPtr) { + agoReleaseMemory(node->localDataPtr); + node->localDataPtr = nullptr; + } + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + AgoConfigScaleMatrix scalemat; // compute scale matrix from the input and output image sizes + scalemat.xscale = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width); + scalemat.yscale = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height); + scalemat.xoffset = (vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width * 0.5 - 0.5); + scalemat.yoffset = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height * 0.5 - 0.5); + agoCodeGenOpenCL_ClampPixelCoordinatesToBorder(node->opencl_code); + agoCodeGenOpenCL_SampleWithConstBorder(node->opencl_code); + agoCodeGenOpenCL_BilinearSampleWithConstBorder(node->opencl_code); + agoCodeGenOpenCL_BilinearSample(node->opencl_code); + agoCodeGenOpenCL_ScaleImage_U8_U8_Bilinear(node->opencl_code); + char textBuffer[8192]; + sprintf(textBuffer, + OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride, uint width, uint height, uint borderValue)\n" + "{\n" + " float4 scaleInfo = (float4)(%.12f,%.12f,%.12f,%.12f);\n" + " // compute source x, y coordinates\n" + " float fx = mad((float)x, scaleInfo.s0, scaleInfo.s2);\n" + " float fy = mad((float)y, scaleInfo.s1, scaleInfo.s3);\n" + " // check if all pixels stay within borders\n" + " if (fx >= 0.0f && fy >= 0.0f && mad(8.0f, scaleInfo.s0, fx) < (width - 1) && mad(1.0f, scaleInfo.s1, fy) < (height - 1)) {\n" + " ScaleImage_U8_U8_Bilinear(r, x, y, p, stride, scaleInfo);\n" + " }\n" + " else {\n" + " // compute y coordinate interpolation factors\n" + " float fy1 = fy - floor(fy);\n" + " float fy0 = 1.0f - fy1;\n" + " // compute pixel values\n" + " int sy = (int)floor(fy);\n" + " float frac;\n" + " U8x8 rv; float4 f; frac = fx - floor(fx); f.s0 = BilinearSampleWithConstBorder(p, (int)floor(fx), sy, width, height, stride, 1.0f - frac, frac, fy0, fy1, borderValue);\n" + " fx += scaleInfo.s0; frac = fx - floor(fx); f.s1 = BilinearSampleWithConstBorder(p, (int)floor(fx), sy, width, height, stride, 1.0f - frac, frac, fy0, fy1, borderValue);\n" + " fx += scaleInfo.s0; frac = fx - floor(fx); f.s2 = BilinearSampleWithConstBorder(p, (int)floor(fx), sy, width, height, stride, 1.0f - frac, frac, fy0, fy1, borderValue);\n" + " fx += scaleInfo.s0; frac = fx - floor(fx); f.s3 = BilinearSampleWithConstBorder(p, (int)floor(fx), sy, width, height, stride, 1.0f - frac, frac, fy0, fy1, borderValue);\n" + " rv.s0 = amd_pack(f);\n" + " fx += scaleInfo.s0; frac = fx - floor(fx); f.s0 = 
BilinearSampleWithConstBorder(p, (int)floor(fx), sy, width, height, stride, 1.0f - frac, frac, fy0, fy1, borderValue);\n" + " fx += scaleInfo.s0; frac = fx - floor(fx); f.s1 = BilinearSampleWithConstBorder(p, (int)floor(fx), sy, width, height, stride, 1.0f - frac, frac, fy0, fy1, borderValue);\n" + " fx += scaleInfo.s0; frac = fx - floor(fx); f.s2 = BilinearSampleWithConstBorder(p, (int)floor(fx), sy, width, height, stride, 1.0f - frac, frac, fy0, fy1, borderValue);\n" + " fx += scaleInfo.s0; frac = fx - floor(fx); f.s3 = BilinearSampleWithConstBorder(p, (int)floor(fx), sy, width, height, stride, 1.0f - frac, frac, fy0, fy1, borderValue);\n" + " rv.s1 = amd_pack(f);\n" + " *r = rv;\n" + " }\n" + "}\n" + ), node->opencl_name, scalemat.xscale, scalemat.yscale, scalemat.xoffset, scalemat.yoffset); + node->opencl_code += textBuffer; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_ScaleImage_U8_U8_Area(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_ScaleImage_U8_U8_Area(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes, (AgoConfigScaleMatrix *)node->localDataPtr)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U8, VX_DF_IMAGE_U8); + if (!status) { + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.img.width = node->paramList[0]->u.img.width; + meta->data.u.img.height = node->paramList[0]->u.img.height; + // set the valid region + vx_float32 widthOut = (vx_float32)node->paramList[0]->u.img.width; + vx_float32 widthIn = (vx_float32)node->paramList[1]->u.img.width; + vx_float32 heightOut = (vx_float32)node->paramList[0]->u.img.height; + vx_float32 heightIn = (vx_float32)node->paramList[1]->u.img.height; + meta->data.u.img.rect_valid.start_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.start_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.start_y + 0.5f) * heightOut / heightIn) - 0.5f); + meta->data.u.img.rect_valid.end_x = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_x + 0.5f) * widthOut / widthIn) - 0.5f); + meta->data.u.img.rect_valid.end_y = (vx_uint32)(((node->paramList[1]->u.img.rect_valid.end_y + 0.5f) * heightOut / heightIn) - 0.5f); + } + } + else if (cmd == ago_kernel_cmd_initialize) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + int alignedWidth = ((oImg->u.img.width + 15) & ~15) + ((iImg->u.img.width + 15) & ~15); + node->localDataSize = sizeof(AgoConfigScaleMatrix) + alignedWidth * 2 + 16; + node->localDataPtr = (vx_uint8 *)agoAllocMemory(node->localDataSize); + if (!node->localDataPtr) return VX_ERROR_NO_MEMORY; + // compute scale matrix from the input and output image sizes + AgoConfigScaleMatrix * scalemat = (AgoConfigScaleMatrix *)node->localDataPtr; + scalemat->xscale = 
(vx_float32)((vx_float64)iImg->u.img.width / (vx_float64)oImg->u.img.width); + scalemat->yscale = (vx_float32)((vx_float64)iImg->u.img.height / (vx_float64)oImg->u.img.height); + scalemat->xoffset = -0.5f; + scalemat->yoffset = -0.5f; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + if (node->localDataPtr) { + agoReleaseMemory(node->localDataPtr); + node->localDataPtr = nullptr; + } + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + status = VX_SUCCESS; + // compute configuration parameters + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + int dstWidth = oImg->u.img.width; + int dstHeight = oImg->u.img.height; + int srcWidth = iImg->u.img.width; + int srcHeight = iImg->u.img.height; + // generate code + float Sx = (float)srcWidth / (float)dstWidth; + float Sy = (float)srcHeight / (float)dstHeight; + float fSx = Sx - floorf(Sx); + float fSy = Sy - floorf(Sy); + int Nx = (int)ceilf(Sx), Nxf = 0; + int Ny = (int)ceilf(Sy), Nyf = 0; + bool use_sad = (Nx % 4) ? false : true; + if ((srcWidth % dstWidth) > 0) { + use_sad = false; + if ((dstWidth % (srcWidth % dstWidth)) > 0) Nxf++; + } + if ((srcHeight % dstHeight) > 0) { + use_sad = false; + if ((dstHeight % (srcHeight % dstHeight)) > 0) Nyf++; + } + bool need_align = ((Sx * 2.0f) != floorf(Sx * 2.0f)) ? true : false; + std::string code; + char item[1024]; + sprintf(item, + OPENCL_FORMAT( + "void %s(U8x8 * r, uint x, uint y, __global uchar * p, uint stride) // ScaleArea %gx%g using %dx%d window\n" + "{\n" + ), node->opencl_name, Sx, Sy, Nx, Ny); code += item; + if (fSx != 0.0f && fSy != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " float X = (float)x * %.12ff;\n" + " float Y = (float)y * %.12ff;\n" + " float fX = fract(X, &X);\n" + " float fY = fract(Y, &Y);\n" + " uint offset = stride * (int)Y + (int)X;\n" + ), Sx, Sy); code += item; + } + else if (fSx != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " float X = (float)x * %.12ff;\n" + " float fX = fract(X, &X);\n" + " uint offset = stride * (y * %d) + (int)X;\n" + ), Sx, Ny); code += item; + } + else if (fSy != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " float Y = (float)y * %.12ff;\n" + " float fY = fract(Y, &Y);\n" + " uint offset = stride * (int)Y + (x * %d);\n" + ), Sy, Nx); code += item; + } + else { + sprintf(item, + " uint offset = stride * (y * %d) + (x * %d);\n" + , Ny, Nx); code += item; + } + if (need_align) { + code += " uint align = offset & 3; offset -= align;\n"; + } + code += " p += offset;\n"; + if (fSy != 0.0f) { + sprintf(item, + OPENCL_FORMAT( + " F32x8 ftotal = (F32x8)0.0f;\n" + " float Sy = %.12ff, Syf = 1.0f - fY;\n" + ), Sy); code += item; + } + else if (use_sad) { + code += " U32x8 sum = (U32x8)0;\n"; + } + else { + code += " F32x8 f = (F32x8)0.0f;\n"; + } + sprintf(item, + OPENCL_FORMAT( + " for (uint iy = 0; iy < %d; iy++) {\n" + " uint4 dw;\n" + ), Ny + Nyf); code += item; + if (fSy != 0.0f) { + code += " F32x8 f = (F32x8)0.0f;\n"; + } + if (fSx == 0.0f) { + if (need_align) { + for (int ix = 0, bpos = 0, lastdw = 0, nbytesprocessed = 0, jx = 0; ix < 8;) { + int nbytes = 8 * Nx - nbytesprocessed; + if (nbytes > 16) nbytes = 16; + if (bpos > 0 && nbytes > 12) nbytes = 12; + int ndw = (nbytes + 3) >> 2; + char slist[] = "0123"; slist[ndw] = '\0'; + char vload[] = "vloadn"; vload[5] = ndw > 1 ? ('0' + ndw) : 0; + sprintf(item, " dw.s%s = %s(0, (__global uint *)&p[%d]);\n", slist, vload, bpos); code += item; + for (int idw = 0, ldw = lastdw, jdw = lastdw ? 0 : 1; idw < ndw - (lastdw ? 
0 : 1); idw++, jdw++) { + sprintf(item, " dw.s%d = amd_bytealign(dw.s%d, dw.s%d, align);\n", ldw, jdw, ldw); code += item; + for (int jj = 0; jj < 4 && (nbytesprocessed + jj < 8 * Nx); jj++) { + sprintf(item, " f.s%d += amd_unpack%d(dw.s%d);\n", ix, jj, ldw); code += item; + if (((nbytesprocessed + jj) % Nx) == (Nx - 1)) ix++; + } + // update for next iteration + ldw = lastdw ? idw : idw + 1; + nbytesprocessed += 4; + } + // update for next iteration + bpos += nbytes; + lastdw = ndw - 1; + } + } + else { + for (int ix = 0, bpos = 0; ix < 8;) { + sprintf(item, " dw = *((__global uint4 *)&p[%d]);\n", bpos); code += item; + for (int jj = 0; jj < 4; jj++) { + if (use_sad) { + sprintf(item, " sum.s%d = amd_sad(dw.s%d, 0u, sum.s%d);\n", ix, jj, ix); code += item; + bpos += 4; + if ((bpos % Nx) == 0) ix++; + } + else { + for (int k = 0; k < 4 && ix < 8; k++) { + sprintf(item, " f.s%d += amd_unpack%d(dw.s%d);\n", ix, k, jj); code += item; + bpos += 1; + if ((bpos % Nx) == 0) ix++; + } + } + } + } + } + } + else if ((Sx * 8.0f) == floorf(Sx * 8.0f)) { + int nbytes = (int)(Sx * 8.0f); + float factorOffset = 0.0f, factorRemaining = Sx; + int xpos = 0; + for (int offset = 0, bpos = 0, ix = 0, lastdw = 0; offset < nbytes;) { + int N = nbytes - offset + (need_align && !lastdw ? 4 : 0); + if (N > 16) N = 16; + if (need_align && offset > 0 && N > 12) N = 12; + int ndw = (N + 3) >> 2; + char slist[] = "0123"; slist[ndw] = '\0'; + char vload[] = "vloadn"; vload[5] = ndw > 1 ? ('0' + ndw) : 0; + sprintf(item, " dw.s%s = %s(0, (__global uint *)&p[%d]);\n", slist, vload, bpos); code += item; + if (need_align) { + if (bpos == 0) bpos += 4; + ndw -= (lastdw ? 0 : 1); + for (int idw = 0, ldw = lastdw, jdw = lastdw ? 0 : 1; idw < ndw; idw++, jdw++) { + sprintf(item, " dw.s%d = amd_bytealign(dw.s%d, dw.s%d, align);\n", ldw, jdw, ldw); code += item; + slist[idw] = '0' + ldw; + // update for next iteration + ldw = lastdw ? idw : idw + 1; + } + lastdw = ndw - (lastdw ? 
1 : 0); + } + for (int jj = 0; jj < ndw; jj++, offset += 4, bpos += 4) { + int jjdw = slist[jj] - '0'; + for (int k = 0; k < 4 && ix < 8;) { + if (factorOffset == floorf(factorOffset)) { + if (factorRemaining >= 1.0f) { + sprintf(item, " f.s%d += amd_unpack%d(dw.s%d);\n", ix, k, jjdw); + factorOffset += 1.0f; + factorRemaining -= 1.0f; + k++; + } + else { + sprintf(item, " f.s%d += amd_unpack%d(dw.s%d) * %.12ff;\n", ix, k, jjdw, factorRemaining); + factorOffset += factorRemaining; + factorRemaining = 0.0f; + } + } + else { + float factorOffsetRemain = factorOffset - floorf(factorOffset); + if ((factorOffsetRemain + factorRemaining) >= 1.0f) { + float factor = 1.0f - factorOffsetRemain; + sprintf(item, " f.s%d += amd_unpack%d(dw.s%d) * %.12ff;\n", ix, k, jjdw, factor); + factorOffset += factor; + factorRemaining -= factor; + k++; + } + else { + sprintf(item, " f.s%d += amd_unpack%d(dw.s%d) * %.12ff;\n", ix, k, jjdw, factorRemaining); + factorOffset += factorRemaining; + factorRemaining = 0.0f; + } + } + code += item; + if (factorRemaining <= 0.0f) { + factorRemaining = Sx; + ix++; + } + } + } + } + } + else { + code += " float Xs = fX, factor, Xi, Xf;\n"; + code += " uint offset, align;\n"; + for (int ix = 0; ix < 8; ix++) { + code += " Xf = fract(Xs, &Xi); offset = (uint)Xi; align = offset & 3; offset -= align;"; + if (ix < 7) { + sprintf(item, " Xs += %.12ff;", Sx); code += item; + } + code += "\n"; + int N = Nx + Nxf; + if (N > 12) { + printf("ERROR: ScalarArea OCL Nx+Nxf=%d not supported yet\n", N); + return -1; + } + int ndw = (N + 4 + 3) >> 2; + char slist[] = "0123"; slist[ndw] = '\0'; + char vload[] = "vloadn"; vload[5] = ndw > 1 ? ('0' + ndw) : 0; + sprintf(item, " dw.s%s = %s(0, (__global uint *)&p[offset]);\n", slist, vload); code += item; + for (int idw = 0; idw < ndw - 1; idw++) { + sprintf(item, " dw.s%d = amd_bytealign(dw.s%d, dw.s%d, align);\n", idw, idw + 1, idw); code += item; + } + int i = 0; + sprintf(item, " f.s%d += amd_unpack%d(dw.s%d) * (1.0f - Xf);\n", ix, i & 3, i >> 2); code += item; + for (i = 1; i < Nx - 1; i++) { + sprintf(item, " f.s%d += amd_unpack%d(dw.s%d);\n", ix, i & 3, i >> 2); code += item; + } + sprintf(item, " factor = %.12ff + Xf;", Sx - (Nx - 1)); code += item; + sprintf(item, " f.s%d += amd_unpack%d(dw.s%d) * clamp(factor, 0.0f, 1.0f) +", ix, i & 3, i >> 2); i++; code += item; + sprintf(item, " amd_unpack%d(dw.s%d) * clamp(factor-1.0f, 0.0f, 1.0f);\n", i & 3, i >> 2); code += item; + } + } + if (fSy != 0.0f) { + code += + OPENCL_FORMAT( + " f *= Syf;\n" + " ftotal += f;\n" + " Sy -= Syf;\n" + " Syf = clamp(Sy, 0.0f, 1.0f);\n" + ); + } + code += + " p += stride;\n" + " }\n"; + if (use_sad) { + code += + " F32x8 f = convert_float8(sum);\n"; + } + const char * fvar = (fSy != 0.0f) ? 
"ftotal" : "f"; + sprintf(item, + OPENCL_FORMAT( + " %s *= %.12lff;\n" + " U8x8 rv;\n" + " rv.s0 = amd_pack(%s.s0123);\n" + " rv.s1 = amd_pack(%s.s4567);\n" + " *r = rv;\n" + "}\n" + ), fvar, 1.0 / (double)(Sx*Sy), fvar, fvar); code += item; + // save the OpenCL program code + node->opencl_code += code; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_M2R +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_OpticalFlowPyrLK_XY_XY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * newXY = node->paramList[0]; + AgoData * oldPyr = node->paramList[1]; + AgoData * newPyr = node->paramList[2]; + AgoData * oldXY = node->paramList[3]; + AgoData * newXYest = node->paramList[4]; + vx_enum termination = node->paramList[5]->u.scalar.u.e; + vx_float32 epsilon = node->paramList[6]->u.scalar.u.f; + vx_uint32 num_iterations = node->paramList[7]->u.scalar.u.u; + vx_bool use_initial_estimate = node->paramList[8]->u.scalar.u.i ? vx_true_e : vx_false_e; + vx_int32 window_dimension = (vx_int32)node->paramList[9]->u.scalar.u.s; + ago_pyramid_u8_t *pPyrBuff = (ago_pyramid_u8_t *)oldPyr->buffer; + if (oldXY->u.arr.numitems != newXYest->u.arr.numitems || oldXY->u.arr.numitems > newXY->u.arr.capacity) { + status = VX_ERROR_INVALID_DIMENSION; + } + else if (HafCpu_OpticalFlowPyrLK_XY_XY_Generic((vx_keypoint_t *)newXY->buffer, oldPyr->u.pyr.scale, (vx_uint32)oldPyr->u.pyr.levels, (ago_pyramid_u8_t *)oldPyr->buffer, + (ago_pyramid_u8_t *)newPyr->buffer, (vx_uint32)newXYest->u.arr.numitems, (vx_keypoint_t *)oldXY->buffer, (vx_keypoint_t *)newXYest->buffer, + termination, epsilon, num_iterations, use_initial_estimate, pPyrBuff->width * 4, node->localDataPtr, window_dimension)) + { + status = VX_FAILURE; + } + else { + newXY->u.arr.numitems = oldXY->u.arr.numitems; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_OpticalFlowPyrLK_XY_XY(node); + if (!status) { + if (node->paramList[9]->u.scalar.type != VX_TYPE_SIZE) { + status = VX_ERROR_INVALID_TYPE; + } + else { + vx_size window_dimension = node->paramList[9]->u.scalar.u.s; + if (window_dimension < 3 || window_dimension > AGO_OPTICALFLOWPYRLK_MAX_DIM) { + status = VX_ERROR_INVALID_VALUE; + } + } + } + } + else if (cmd == ago_kernel_cmd_initialize){ + // allocate pyramid images for storing scharr output + AgoData * oldPyr = node->paramList[1]; + ago_pyramid_u8_t *pPyrBuff = (ago_pyramid_u8_t *)oldPyr->buffer; + AgoData * newXYest = node->paramList[3]; + int pyrWidth = pPyrBuff[0].width; + node->localDataSize = ((pPyrBuff->height*pPyrBuff->width * 4) + newXYest->u.arr.capacity*sizeof(ago_keypoint_t) + 256) + ((pyrWidth + 2) * 4 + 64); // same as level 0 buffer; will be reused for lower levels. 
The second term, temp buffer for scharr + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_OpticalFlowPrepareLK_XY_XY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD + } + else if (cmd == ago_kernel_cmd_validate) { + // TBD + } + else if (cmd == ago_kernel_cmd_initialize){ + // TBD + } + else if (cmd == ago_kernel_cmd_shutdown) { + // TBD + } + else if (cmd == ago_kernel_cmd_query_target_support) { + // TBD + } + return status; +} + +int agoKernel_OpticalFlowImageLK_XY_XY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD + } + else if (cmd == ago_kernel_cmd_validate) { + // TBD + } + else if (cmd == ago_kernel_cmd_initialize){ + // TBD + } + else if (cmd == ago_kernel_cmd_shutdown) { + // TBD + } + else if (cmd == ago_kernel_cmd_query_target_support) { + // TBD + } + return status; +} + +int agoKernel_OpticalFlowFinalLK_XY_XY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD + } + else if (cmd == ago_kernel_cmd_validate) { + // TBD + } + else if (cmd == ago_kernel_cmd_initialize){ + // TBD + } + else if (cmd == ago_kernel_cmd_shutdown) { + // TBD + } + else if (cmd == ago_kernel_cmd_query_target_support) { + // TBD + } + return status; +} + +int agoKernel_HarrisMergeSortAndPick_XY_HVC(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oXY = node->paramList[0]; + AgoData * oNum = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + vx_float32 min_distance = node->paramList[3]->u.scalar.u.f; + vx_uint32 cornerCount = 0; + if (HafCpu_HarrisMergeSortAndPick_XY_HVC((vx_uint32)oXY->u.arr.capacity, (vx_keypoint_t *)oXY->buffer, &cornerCount, + iImg->u.img.width, iImg->u.img.height, (vx_float32 *)iImg->buffer, iImg->u.img.stride_in_bytes, min_distance)) { + status = VX_FAILURE; + } + else { + oXY->u.arr.numitems = min(cornerCount, (vx_uint32)oXY->u.arr.capacity); + if (oNum) { + oNum->u.scalar.u.s = cornerCount; + } + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_F32_AMD) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[2]->u.img.width || !node->paramList[2]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + else if (node->paramList[3]->u.scalar.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_KEYPOINT; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_SIZE; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_HarrisMergeSortAndPick_XY_XYS(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == 
ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oXY = node->paramList[0]; + AgoData * oNum = node->paramList[1]; + AgoData * iXYS = node->paramList[2]; + vx_float32 min_distance = node->paramList[3]->u.scalar.u.f; + ago_harris_grid_header_t * gridInfo = (ago_harris_grid_header_t *)node->localDataPtr; + ago_coord2d_short_t * gridBuf = (ago_coord2d_short_t *)(node->localDataPtr ? &node->localDataPtr[sizeof(ago_harris_grid_header_t)] : nullptr); + vx_uint32 cornerCount = 0; + if (HafCpu_HarrisMergeSortAndPick_XY_XYS((vx_uint32)oXY->u.arr.capacity, (vx_keypoint_t *)oXY->buffer, &cornerCount, + (ago_keypoint_xys_t *)iXYS->buffer, (vx_uint32)iXYS->u.arr.numitems, min_distance, gridInfo, gridBuf)) { + status = VX_FAILURE; + } + else { + oXY->u.arr.numitems = min(cornerCount, (vx_uint32)oXY->u.arr.capacity); + if (oNum) { + oNum->u.scalar.u.s = cornerCount; + } + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + if (node->paramList[2]->u.arr.itemtype != AGO_TYPE_KEYPOINT_XYS) + return VX_ERROR_INVALID_FORMAT; + else if (node->paramList[3]->u.scalar.type != VX_TYPE_FLOAT32) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[3]->u.scalar.u.f <= 0.0f) + return VX_ERROR_INVALID_VALUE; + else if (node->paramList[4] && node->paramList[4]->u.scalar.type != VX_TYPE_UINT32) + return VX_ERROR_INVALID_TYPE; + else if (node->paramList[5] && node->paramList[5]->u.scalar.type != VX_TYPE_UINT32) + return VX_ERROR_INVALID_TYPE; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_KEYPOINT; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_SIZE; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize) { + vx_float32 min_distance = node->paramList[3]->u.scalar.u.f; + if (min_distance > 2.0f) { // no need to check neighorhood when min_distance <= 2.0f + // allocate a local buffer for a grid buffer with grid meta data + vx_uint32 width = node->paramList[4]->u.scalar.u.u; + vx_uint32 height = node->paramList[5]->u.scalar.u.u; + vx_uint32 cellSize = (vx_uint32)floor(min_distance / M_SQRT2); + vx_uint32 gridWidth = (width + cellSize - 1) / cellSize; + vx_uint32 gridHeight = (height + cellSize - 1) / cellSize; + vx_uint32 gridBufSize = (vx_uint32)(gridWidth * gridHeight * sizeof(ago_coord2d_short_t)); + node->localDataSize = sizeof(ago_harris_grid_header_t) + gridBufSize; + node->localDataPtr = (vx_uint8 *)agoAllocMemory(node->localDataSize); if (!node->localDataPtr) return VX_ERROR_NO_MEMORY; + ago_harris_grid_header_t * gridInfo = (ago_harris_grid_header_t *)node->localDataPtr; + gridInfo->width = gridWidth; + gridInfo->height = gridHeight; + gridInfo->cellSize = cellSize; + gridInfo->gridBufSize = gridBufSize; + } + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_shutdown) { + if (node->localDataPtr) { + agoReleaseMemory(node->localDataPtr); + node->localDataPtr = nullptr; + } + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_FastCornerMerge_XY_XY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oXY = node->paramList[0]; + vx_keypoint_t * srcCorners[AGO_MAX_PARAMS] = { 0 }; + vx_uint32 srcCornerCount[AGO_MAX_PARAMS] = { 0 }; + vx_uint32 numSrcCornerBuffers = 0; + for (vx_uint32 i = 1, j = 0; 
i < node->paramCount; i++) { + if (node->paramList[i] && node->paramList[i]->u.arr.numitems) { + srcCorners[numSrcCornerBuffers] = (vx_keypoint_t *)node->paramList[i]->buffer; + srcCornerCount[numSrcCornerBuffers] = (vx_uint32)node->paramList[i]->u.arr.numitems; + numSrcCornerBuffers++; + } + } + vx_uint32 cornerCount = 0; + if (HafCpu_FastCornerMerge_XY_XY((vx_uint32)oXY->u.arr.capacity, (vx_keypoint_t *)oXY->buffer, &cornerCount, numSrcCornerBuffers, srcCorners, srcCornerCount)) { + status = VX_FAILURE; + } + else { + oXY->u.arr.numitems = min(cornerCount, (vx_uint32)oXY->u.arr.capacity); + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + for (vx_uint32 i = 1; i < node->paramCount; i++) { + if (node->paramList[i] && node->paramList[i]->u.arr.itemtype != VX_TYPE_KEYPOINT) + return VX_ERROR_INVALID_TYPE; + } + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_KEYPOINT; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannyEdgeTrace_U8_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iStack = node->paramList[1]; + if (HafCpu_CannyEdgeTrace_U8_U8(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iStack->u.cannystack.count, (ago_coord2d_ushort_t *)iStack->buffer)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[0]->u.img.width || !node->paramList[0]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_CannyEdgeTrace_U8_U8XY(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iStack = node->paramList[1]; + if (HafCpu_CannyEdgeTrace_U8_U8XY(oImg->u.img.width, oImg->u.img.height, oImg->buffer, oImg->u.img.stride_in_bytes, + iStack->u.cannystack.count, (ago_coord2d_ushort_t *)iStack->buffer, iStack->u.cannystack.stackTop)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + if (node->paramList[0]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[0]->u.img.width || !node->paramList[0]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_IntegralImage_U32_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = 
AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oImg = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_IntegralImage_U32_U8(oImg->u.img.width, oImg->u.img.height, (vx_uint32 *)oImg->buffer, oImg->u.img.stride_in_bytes, + iImg->buffer, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1OUT_1IN(node, VX_DF_IMAGE_U32, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Histogram_DATA_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oDist = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + vx_uint32 numbins = (vx_uint32) oDist->u.dist.numbins; + vx_uint32 offset = (vx_uint32)oDist->u.dist.offset; + vx_uint32 range = (vx_uint32)oDist->u.dist.range; + vx_uint32 window = oDist->u.dist.window; + vx_uint32 * histOut = (vx_uint32 *)oDist->buffer; + if (HafCpu_HistogramFixedBins_DATA_U8(histOut, numbins, offset, range, window, iImg->u.img.width, iImg->u.img.height, iImg->buffer, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1IN(node, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MeanStdDev_DATA_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oData = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_MeanStdDev_DATA_U8(&((ago_meanstddev_data_t *)oData->buffer)->sum, &((ago_meanstddev_data_t *)oData->buffer)->sumSquared, + iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + else { + ((ago_meanstddev_data_t *)oData->buffer)->sampleCount = iImg->u.img.width * iImg->u.img.height; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1IN(node, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMax_DATA_U8(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oData = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_MinMax_DATA_U8(&((ago_minmaxloc_data_t *)oData->buffer)->min, &((ago_minmaxloc_data_t *)oData->buffer)->max, + iImg->u.img.rect_valid.end_x - 
iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1IN(node, VX_DF_IMAGE_U8); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMax_DATA_S16(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oData = node->paramList[0]; + AgoData * iImg = node->paramList[1]; + if (HafCpu_MinMax_DATA_S16(&((ago_minmaxloc_data_t *)oData->buffer)->min, &((ago_minmaxloc_data_t *)oData->buffer)->max, + iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + (vx_int16 *)(iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes)) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = ValidateArguments_Img_1IN(node, VX_DF_IMAGE_S16); + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Equalize_DATA_DATA(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_LUT_U8_U8 kernel + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oLut = node->paramList[0]; + AgoData * iDist = node->paramList[1]; + if (HafCpu_Equalize_DATA_DATA(oLut->buffer, 1, (vx_uint32 **)&iDist->buffer)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + if (node->paramList[0]->u.lut.type != VX_TYPE_UINT8) + return VX_ERROR_INVALID_FORMAT; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_HistogramMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + AgoData * oDist = node->paramList[0]; + vx_uint32 * srcDist[AGO_MAX_PARAMS]; + vx_uint32 numSrcDist = 0; + for (vx_uint32 i = 1; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcDist[numSrcDist++] = (vx_uint32 *)node->paramList[i]->buffer; + } + } + if (HafCpu_HistogramMerge_DATA_DATA((vx_uint32 *)oDist->buffer, numSrcDist, srcDist)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + for (vx_uint32 i = 1; i < node->paramCount; i++) { + if (node->paramList[i] && node->paramList[i]->u.arr.itemtype != VX_TYPE_KEYPOINT) + return VX_ERROR_INVALID_TYPE; + } + // set output info + vx_meta_format meta; + meta = 
&node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_KEYPOINT; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MeanStdDevMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_uint32 totalSampleCount = 0; + vx_uint32 numPartitions = 0; + vx_float32 partSum[AGO_MAX_PARAMS]; + vx_float32 partSumOfSquared[AGO_MAX_PARAMS]; + for (vx_uint32 i = 2; i < node->paramCount; i++) { + if (node->paramList[i]) { + totalSampleCount += ((ago_meanstddev_data_t *)node->paramList[i]->buffer)->sampleCount; + partSum[numPartitions] = ((ago_meanstddev_data_t *)node->paramList[i]->buffer)->sum; + partSumOfSquared[numPartitions] = ((ago_meanstddev_data_t *)node->paramList[i]->buffer)->sumSquared; + numPartitions++; + } + } + if (HafCpu_MeanStdDevMerge_DATA_DATA(&node->paramList[0]->u.scalar.u.f, &node->paramList[1]->u.scalar.u.f, totalSampleCount, numPartitions, partSum, partSumOfSquared)) { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.scalar.type = VX_TYPE_FLOAT32; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_FLOAT32; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 3; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + if (HafCpu_MinMaxMerge_DATA_DATA(&((ago_minmaxloc_data_t *)node->paramList[2]->buffer)->min, + &((ago_minmaxloc_data_t *)node->paramList[2]->buffer)->max, numDataPartitions, srcMinValue, srcMaxValue)) + { + status = VX_FAILURE; + } + else { + // save the output values to output scalar values too + node->paramList[0]->u.scalar.u.i = ((ago_minmaxloc_data_t *)node->paramList[2]->buffer)->min; + node->paramList[1]->u.scalar.u.i = ((ago_minmaxloc_data_t *)node->paramList[2]->buffer)->max; + } + } + else if (cmd == ago_kernel_cmd_validate) { + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.scalar.type = node->paramList[0]->u.scalar.type; + meta = &node->metaList[1]; + meta->data.u.scalar.type = node->paramList[1]->u.scalar.type; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int 
agoKernel_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Min(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 2; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iImg = node->paramList[1]; + vx_int32 finalMinValue, finalMaxValue; + if (HafCpu_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Min(&node->paramList[0]->u.scalar.u.u, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[1]->u.img.width || !node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Max(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 2; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iImg = node->paramList[1]; + vx_int32 finalMinValue, finalMaxValue; + if (HafCpu_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Max(&node->paramList[0]->u.scalar.u.u, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[1]->u.img.width || !node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + 
status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_None_Count_MinMax(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 3; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iImg = node->paramList[2]; + vx_int32 finalMinValue, finalMaxValue; + if (HafCpu_MinMaxLoc_DATA_U8DATA_Loc_None_Count_MinMax(&node->paramList[0]->u.scalar.u.u, &node->paramList[1]->u.scalar.u.u, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[2]->u.img.width || !node->paramList[2]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_Min(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 3; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMinLoc = node->paramList[0]; + AgoData * iMinCount = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 minCount = 0; + if (HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_Min(&minCount, (vx_uint32)iMinLoc->u.arr.capacity, (vx_coordinates2d_t *)iMinLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMinLoc->u.arr.numitems = 
min(minCount, (vx_uint32)iMinLoc->u.arr.capacity); + if (iMinCount) iMinCount->u.scalar.u.u = minCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[2]; + if (iImg->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_MinMax(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 4; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMinLoc = node->paramList[0]; + AgoData * iMinCount = node->paramList[1]; + AgoData * iMaxCount = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 minCount = 0, maxCount = 0; + if (HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_MinMax(&minCount, &maxCount, (vx_uint32)iMinLoc->u.arr.capacity, (vx_coordinates2d_t *)iMinLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMinLoc->u.arr.numitems = min(minCount, (vx_uint32)iMinLoc->u.arr.capacity); + if (iMinCount) iMinCount->u.scalar.u.u = minCount; + if (iMaxCount) iMaxCount->u.scalar.u.u = maxCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[3]; + if (iImg->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + meta = &node->metaList[2]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_Max(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + 
status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 3; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMaxLoc = node->paramList[0]; + AgoData * iMaxCount = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 maxCount = 0; + if (HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_Max(&maxCount, (vx_uint32)iMaxLoc->u.arr.capacity, (vx_coordinates2d_t *)iMaxLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMaxLoc->u.arr.numitems = min(maxCount, (vx_uint32)iMaxLoc->u.arr.capacity); + if (iMaxCount) iMaxCount->u.scalar.u.u = maxCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[2]; + if (iImg->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_MinMax(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 4; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMaxLoc = node->paramList[0]; + AgoData * iMinCount = node->paramList[1]; + AgoData * iMaxCount = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 minCount = 0, maxCount = 0; + if (HafCpu_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_MinMax(&minCount, &maxCount, (vx_uint32)iMaxLoc->u.arr.capacity, (vx_coordinates2d_t *)iMaxLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMaxLoc->u.arr.numitems = min(maxCount, 
(vx_uint32)iMaxLoc->u.arr.capacity); + if (iMaxCount) iMaxCount->u.scalar.u.u = maxCount; + if (iMinCount) iMinCount->u.scalar.u.u = minCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[3]; + if (iImg->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + meta = &node->metaList[2]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_MinMax_Count_MinMax(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 5; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMinLoc = node->paramList[0]; + AgoData * iMaxLoc = node->paramList[1]; + AgoData * iMinCount = node->paramList[2]; + AgoData * iMaxCount = node->paramList[3]; + AgoData * iImg = node->paramList[4]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 minCount = 0, maxCount = 0; + if (HafCpu_MinMaxLoc_DATA_U8DATA_Loc_MinMax_Count_MinMax(&minCount, &maxCount, (vx_uint32)iMinLoc->u.arr.capacity, (vx_coordinates2d_t *)iMinLoc->buffer, + (vx_uint32)iMaxLoc->u.arr.capacity, (vx_coordinates2d_t *)iMaxLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMinLoc->u.arr.numitems = min(minCount, (vx_uint32)iMinLoc->u.arr.capacity); + iMaxLoc->u.arr.numitems = min(maxCount, (vx_uint32)iMaxLoc->u.arr.capacity); + if (iMinCount) iMinCount->u.scalar.u.u = minCount; + if (iMaxCount) iMaxCount->u.scalar.u.u = maxCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[4]; + if (iImg->u.img.format != VX_DF_IMAGE_U8) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[2]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + meta = &node->metaList[3]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == 
ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Min(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 2; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iImg = node->paramList[1]; + vx_int32 finalMinValue, finalMaxValue; + if (HafCpu_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Min(&node->paramList[0]->u.scalar.u.u, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + (vx_int16 *)(iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes)) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[1]->u.img.width || !node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Max(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 2; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iImg = node->paramList[1]; + vx_int32 finalMinValue, finalMaxValue; + if (HafCpu_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Max(&node->paramList[0]->u.scalar.u.u, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + (vx_int16 *)(iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes)) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + if (node->paramList[1]->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[1]->u.img.width || 
!node->paramList[1]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_None_Count_MinMax(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 3; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iImg = node->paramList[2]; + vx_int32 finalMinValue, finalMaxValue; + if (HafCpu_MinMaxLoc_DATA_S16DATA_Loc_None_Count_MinMax(&node->paramList[0]->u.scalar.u.u, &node->paramList[1]->u.scalar.u.u, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + (vx_int16 *)(iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes)) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + if (node->paramList[2]->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!node->paramList[2]->u.img.width || !node->paramList[2]->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_Min(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 3; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMinLoc = node->paramList[0]; + AgoData * iMinCount = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 minCount = 0; + if (HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_Min(&minCount, (vx_uint32)iMinLoc->u.arr.capacity, (vx_coordinates2d_t *)iMinLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - 
iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + (vx_int16 *)(iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes)) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMinLoc->u.arr.numitems = min(minCount, (vx_uint32)iMinLoc->u.arr.capacity); + if (iMinCount) iMinCount->u.scalar.u.u = minCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[2]; + if (iImg->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_MinMax(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 4; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMinLoc = node->paramList[0]; + AgoData * iMinCount = node->paramList[1]; + AgoData * iMaxCount = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 minCount = 0, maxCount = 0; + if (HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_MinMax(&minCount, &maxCount, (vx_uint32)iMinLoc->u.arr.capacity, (vx_coordinates2d_t *)iMinLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + (vx_int16 *)(iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes)) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMinLoc->u.arr.numitems = min(minCount, (vx_uint32)iMinLoc->u.arr.capacity); + if (iMinCount) iMinCount->u.scalar.u.u = minCount; + if (iMaxCount) iMaxCount->u.scalar.u.u = maxCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[3]; + if (iImg->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + meta = &node->metaList[2]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == 
ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_Max(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 3; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMaxLoc = node->paramList[0]; + AgoData * iMaxCount = node->paramList[1]; + AgoData * iImg = node->paramList[2]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 maxCount = 0; + if (HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_Max(&maxCount, (vx_uint32)iMaxLoc->u.arr.capacity, (vx_coordinates2d_t *)iMaxLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + (vx_int16 *)(iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes)) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMaxLoc->u.arr.numitems = min(maxCount, (vx_uint32)iMaxLoc->u.arr.capacity); + if (iMaxCount) iMaxCount->u.scalar.u.u = maxCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[2]; + if (iImg->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_MinMax(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 4; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMaxLoc = node->paramList[0]; + AgoData * iMinCount = node->paramList[1]; + AgoData * iMaxCount = node->paramList[2]; + AgoData * iImg = node->paramList[3]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 minCount = 0, maxCount = 0; + if (HafCpu_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_MinMax(&minCount, &maxCount, (vx_uint32)iMaxLoc->u.arr.capacity, (vx_coordinates2d_t *)iMaxLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, 
iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + (vx_int16 *)(iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes)) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMaxLoc->u.arr.numitems = min(maxCount, (vx_uint32)iMaxLoc->u.arr.capacity); + if (iMinCount) iMinCount->u.scalar.u.u = minCount; + if (iMaxCount) iMaxCount->u.scalar.u.u = maxCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[3]; + if (iImg->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + meta = &node->metaList[2]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_MinMax_Count_MinMax(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_int32 srcMinValue[AGO_MAX_PARAMS], srcMaxValue[AGO_MAX_PARAMS]; + vx_uint32 numDataPartitions = 0; + for (vx_uint32 i = 5; i < node->paramCount; i++) { + if (node->paramList[i]) { + srcMinValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->min; + srcMaxValue[numDataPartitions] = ((ago_minmaxloc_data_t *)node->paramList[i]->buffer)->max; + numDataPartitions++; + } + } + AgoData * iMinLoc = node->paramList[0]; + AgoData * iMaxLoc = node->paramList[1]; + AgoData * iMinCount = node->paramList[2]; + AgoData * iMaxCount = node->paramList[3]; + AgoData * iImg = node->paramList[4]; + vx_int32 finalMinValue, finalMaxValue; + vx_uint32 minCount = 0, maxCount = 0; + if (HafCpu_MinMaxLoc_DATA_S16DATA_Loc_MinMax_Count_MinMax(&minCount, &maxCount, (vx_uint32)iMinLoc->u.arr.capacity, (vx_coordinates2d_t *)iMinLoc->buffer, + (vx_uint32)iMaxLoc->u.arr.capacity, (vx_coordinates2d_t *)iMaxLoc->buffer, &finalMinValue, &finalMaxValue, + numDataPartitions, srcMinValue, srcMaxValue, iImg->u.img.rect_valid.end_x - iImg->u.img.rect_valid.start_x, iImg->u.img.rect_valid.end_y - iImg->u.img.rect_valid.start_y, + (vx_int16 *)(iImg->buffer + (iImg->u.img.rect_valid.start_y*iImg->u.img.stride_in_bytes)) + iImg->u.img.rect_valid.start_x, iImg->u.img.stride_in_bytes)) + { + status = VX_FAILURE; + } + else { + iMinLoc->u.arr.numitems = min(minCount, (vx_uint32)iMinLoc->u.arr.capacity); + iMaxLoc->u.arr.numitems = min(maxCount, (vx_uint32)iMaxLoc->u.arr.capacity); + if (iMinCount) iMinCount->u.scalar.u.u = minCount; + if (iMaxCount) iMaxCount->u.scalar.u.u = maxCount; + } + } + else if (cmd == ago_kernel_cmd_validate) { + status = VX_SUCCESS; + // validate parameters + AgoData * iImg = node->paramList[4]; + if (iImg->u.img.format != VX_DF_IMAGE_S16) + return VX_ERROR_INVALID_FORMAT; + else if (!iImg->u.img.width || !iImg->u.img.height) + return VX_ERROR_INVALID_DIMENSION; + // set output info + vx_meta_format meta; + meta = 
&node->metaList[0]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[1]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + meta = &node->metaList[2]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + meta = &node->metaList[3]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_MinMaxLocMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + status = VX_SUCCESS; + vx_uint32 numDataPartitions = 0; + vx_uint32 partLocCount[AGO_MAX_PARAMS]; + vx_coordinates2d_t * partLocList[AGO_MAX_PARAMS]; + for (vx_uint32 i = 2; i < node->paramCount; i++) { + if (node->paramList[i] && node->paramList[i]->u.arr.numitems) { + partLocCount[numDataPartitions] = (vx_uint32) node->paramList[i]->u.arr.numitems; + partLocList[numDataPartitions] = (vx_coordinates2d_t *)node->paramList[i]->buffer; + numDataPartitions++; + } + } + vx_uint32 countMinMaxLoc = 0; + if (HafCpu_MinMaxLocMerge_DATA_DATA(&node->paramList[0]->u.scalar.u.u, (vx_uint32)node->paramList[1]->u.arr.capacity, (vx_coordinates2d_t *)node->paramList[1]->buffer, + numDataPartitions, partLocCount, partLocList)) + { + status = VX_FAILURE; + } + else { + node->paramList[1]->u.arr.numitems = min(node->paramList[0]->u.scalar.u.u, (vx_uint32)node->paramList[1]->u.arr.capacity); + } + } + else if (cmd == ago_kernel_cmd_validate) { + // validate inputs + for (vx_uint32 i = 2; i < node->paramCount; i++) { + if (node->paramList[i]) { + if (node->paramList[i]->u.arr.itemtype != VX_TYPE_COORDINATES2D) + return VX_ERROR_INVALID_TYPE; + } + } + // set output info + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.u.scalar.type = VX_TYPE_UINT32; + meta = &node->metaList[1]; + meta->data.u.arr.itemtype = VX_TYPE_COORDINATES2D; + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0 + | AGO_KERNEL_FLAG_DEVICE_CPU + ; + status = VX_SUCCESS; + } + return status; +} diff --git a/openvx/ago/ago_kernel_api.h b/openvx/ago/ago_kernel_api.h new file mode 100644 index 0000000..f311a90 --- /dev/null +++ b/openvx/ago/ago_kernel_api.h @@ -0,0 +1,354 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __ago_kernels_api_h__ +#define __ago_kernels_api_h__ + +#include "ago_internal.h" + +// import all kernels into framework +int agoPublishKernels(AgoContext * acontext); + +// OpenVX 1.0 built-in kernels +int ovxKernel_Invalid(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_ColorConvert(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_ChannelExtract(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_ChannelCombine(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Sobel3x3(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Magnitude(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Phase(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_ScaleImage(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_TableLookup(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Histogram(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_EqualizeHistogram(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_AbsDiff(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_MeanStdDev(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Threshold(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_IntegralImage(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Dilate3x3(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Erode3x3(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Median3x3(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Box3x3(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Gaussian3x3(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_CustomConvolution(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_GaussianPyramid(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Accumulate(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_AccumulateWeighted(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_AccumulateSquare(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_MinMaxLoc(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_ConvertDepth(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_CannyEdgeDetector(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_And(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Or(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Xor(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Not(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Multiply(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Add(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Subtract(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_WarpAffine(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_WarpPerspective(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_HarrisCorners(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_FastCorners(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_OpticalFlowPyrLK(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Remap(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_HalfScaleGaussian(AgoNode * node, AgoKernelCommand cmd); + +// AMD low-level kernels +int agoKernel_Set00_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_SetFF_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Not_U8_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Not_U8_U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Not_U1_U8(AgoNode * node, 
AgoKernelCommand cmd); +int agoKernel_Not_U1_U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Lut_U8_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Threshold_U8_U8_Binary(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Threshold_U8_U8_Range(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Threshold_U1_U8_Binary(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Threshold_U1_U8_Range(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ThresholdNot_U8_U8_Binary(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ThresholdNot_U8_U8_Range(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ThresholdNot_U1_U8_Binary(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ThresholdNot_U1_U8_Range(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorDepth_U8_S16_Wrap(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorDepth_U8_S16_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorDepth_S16_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Add_U8_U8U8_Wrap(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Add_U8_U8U8_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sub_U8_U8U8_Wrap(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sub_U8_U8U8_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_U8_U8U8_Wrap_Trunc(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_U8_U8U8_Wrap_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_U8_U8U8_Sat_Trunc(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_U8_U8U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_And_U8_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_And_U8_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_And_U8_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_And_U8_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_And_U1_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_And_U1_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_And_U1_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_And_U1_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Or_U8_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Or_U8_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Or_U8_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Or_U8_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Or_U1_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Or_U1_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Or_U1_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Or_U1_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xor_U8_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xor_U8_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xor_U8_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xor_U8_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xor_U1_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xor_U1_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xor_U1_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xor_U1_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nand_U8_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nand_U8_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nand_U8_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nand_U8_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nand_U1_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nand_U1_U8U1(AgoNode * node, 
AgoKernelCommand cmd); +int agoKernel_Nand_U1_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nand_U1_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nor_U8_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nor_U8_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nor_U8_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nor_U8_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nor_U1_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nor_U1_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nor_U1_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Nor_U1_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xnor_U8_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xnor_U8_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xnor_U8_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xnor_U8_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xnor_U1_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xnor_U1_U8U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xnor_U1_U1U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Xnor_U1_U1U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_AbsDiff_U8_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_AccumulateWeighted_U8_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Add_S16_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sub_S16_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_U8U8_Wrap_Trunc(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_U8U8_Wrap_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_U8U8_Sat_Trunc(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_U8U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Add_S16_S16U8_Wrap(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Add_S16_S16U8_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Accumulate_S16_S16U8_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sub_S16_S16U8_Wrap(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sub_S16_S16U8_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_S16U8_Wrap_Trunc(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_S16U8_Wrap_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_S16U8_Sat_Trunc(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_S16U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_AccumulateSquared_S16_S16U8_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sub_S16_U8S16_Wrap(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sub_S16_U8S16_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_AbsDiff_S16_S16S16_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Add_S16_S16S16_Wrap(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Add_S16_S16S16_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sub_S16_S16S16_Wrap(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sub_S16_S16S16_Sat(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_S16S16_Wrap_Trunc(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_S16S16_Wrap_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_S16S16_Sat_Trunc(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_S16_S16S16_Sat_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Magnitude_S16_S16S16(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Phase_U8_S16S16(AgoNode * node, 
AgoKernelCommand cmd); +int agoKernel_ChannelCopy_U8_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelCopy_U8_U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelCopy_U1_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelCopy_U1_U1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8_U16_Pos0(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8_U16_Pos1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8_U24_Pos0(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8_U24_Pos1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8_U24_Pos2(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8_U32_Pos0(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8_U32_Pos1(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8_U32_Pos2(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8_U32_Pos3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8U8U8_U24(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8U8U8_U32(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelExtract_U8U8U8U8_U32(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelCombine_U16_U8U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelCombine_U24_U8U8U8_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelCombine_U32_U8U8U8_UYVY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelCombine_U32_U8U8U8_YUYV(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ChannelCombine_U32_U8U8U8U8_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_U24_U24U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Mul_U32_U32U8_Sat_Round(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGB_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGB_UYVY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGB_YUYV(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGB_IYUV(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGB_NV12(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGB_NV21(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGBX_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGBX_UYVY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGBX_YUYV(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGBX_IYUV(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGBX_NV12(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_RGBX_NV21(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_YUV4_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_YUV4_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ScaleUp2x2_U8_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FormatConvert_UV_UV12(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_IYUV_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_IYUV_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FormatConvert_IYUV_UYVY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FormatConvert_IYUV_YUYV(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FormatConvert_IUV_UV12(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_NV12_RGB(AgoNode * node, AgoKernelCommand cmd); +int 
agoKernel_ColorConvert_NV12_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FormatConvert_NV12_UYVY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FormatConvert_NV12_YUYV(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FormatConvert_UV12_IUV(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_Y_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_Y_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_U_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_U_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_V_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_V_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_IU_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_IU_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_IV_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_IV_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_IUV_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_IUV_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_UV12_RGB(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ColorConvert_UV12_RGBX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Box_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Dilate_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Erode_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Median_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Gaussian_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ScaleGaussianHalf_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ScaleGaussianHalf_U8_U8_5x5(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ScaleGaussianOrb_U8_U8_5x5(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Convolve_U8_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Convolve_S16_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_LinearFilter_ANY_ANY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_LinearFilter_ANYx2_ANY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_SobelMagnitude_S16_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_SobelPhase_U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_SobelMagnitudePhase_S16U8_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sobel_S16S16_U8_3x3_GXY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sobel_S16_U8_3x3_GX(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Sobel_S16_U8_3x3_GY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Dilate_U1_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Erode_U1_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Dilate_U1_U1_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Erode_U1_U1_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Dilate_U8_U1_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Erode_U8_U1_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FastCorners_XY_U8_Supression(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FastCorners_XY_U8_NoSupression(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_HarrisSobel_HG3_U8_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_HarrisSobel_HG3_U8_5x5(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_HarrisSobel_HG3_U8_7x7(AgoNode * node, AgoKernelCommand cmd); +int 
agoKernel_HarrisScore_HVC_HG3_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_HarrisScore_HVC_HG3_5x5(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_HarrisScore_HVC_HG3_7x7(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8_U8_3x3_L1NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8_U8_3x3_L2NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8_U8_5x5_L1NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8_U8_5x5_L2NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8_U8_7x7_L1NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8_U8_7x7_L2NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8XY_U8_3x3_L1NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8XY_U8_3x3_L2NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8XY_U8_5x5_L1NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8XY_U8_5x5_L2NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8XY_U8_7x7_L1NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobelSuppThreshold_U8XY_U8_7x7_L2NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobel_U16_U8_3x3_L1NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobel_U16_U8_3x3_L2NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobel_U16_U8_5x5_L1NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobel_U16_U8_5x5_L2NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobel_U16_U8_7x7_L1NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySobel_U16_U8_7x7_L2NORM(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySuppThreshold_U8_U16_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannySuppThreshold_U8XY_U16_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_NonMaxSupp_XY_ANY_3x3(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Remap_U8_U8_Nearest(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Remap_U8_U8_Nearest_Constant(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Remap_U8_U8_Bilinear(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Remap_U8_U8_Bilinear_Constant(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Remap_U24_U24_Bilinear(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Remap_U24_U32_Bilinear(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Remap_U32_U32_Bilinear(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_WarpAffine_U8_U8_Nearest(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_WarpAffine_U8_U8_Nearest_Constant(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_WarpAffine_U8_U8_Bilinear(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_WarpAffine_U8_U8_Bilinear_Constant(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_WarpPerspective_U8_U8_Nearest(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_WarpPerspective_U8_U8_Nearest_Constant(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_WarpPerspective_U8_U8_Bilinear(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_WarpPerspective_U8_U8_Bilinear_Constant(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ScaleImage_U8_U8_Nearest(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ScaleImage_U8_U8_Bilinear(AgoNode * node, AgoKernelCommand cmd); +int 
agoKernel_ScaleImage_U8_U8_Bilinear_Replicate(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ScaleImage_U8_U8_Bilinear_Constant(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_ScaleImage_U8_U8_Area(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_OpticalFlowPyrLK_XY_XY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_OpticalFlowPrepareLK_XY_XY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_OpticalFlowImageLK_XY_XY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_OpticalFlowFinalLK_XY_XY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_HarrisMergeSortAndPick_XY_HVC(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_HarrisMergeSortAndPick_XY_XYS(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_FastCornerMerge_XY_XY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannyEdgeTrace_U8_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_CannyEdgeTrace_U8_U8XY(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_IntegralImage_U32_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Histogram_DATA_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MeanStdDev_DATA_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMax_DATA_U8(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMax_DATA_S16(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Equalize_DATA_DATA(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_HistogramMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MeanStdDevMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Min(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_None_Count_Max(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_None_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_Min(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_Min_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_Max(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_Max_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_U8DATA_Loc_MinMax_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Min(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_None_Count_Max(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_None_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_Min(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Min_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_Max(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_MinMax_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_MinMaxLocMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd); + +#endif // __ago_kernels_api_h__ diff --git a/openvx/ago/ago_kernel_list.cpp b/openvx/ago/ago_kernel_list.cpp new file mode 100644 index 0000000..b781edc --- /dev/null +++ b/openvx/ago/ago_kernel_list.cpp @@ -0,0 +1,544 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_kernel_api.h" + +// for argConfig[] +#define AIN ( AGO_KERNEL_ARG_INPUT_FLAG ) +#define AINOUT ( AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG ) +#define AOUT ( AGO_KERNEL_ARG_OUTPUT_FLAG ) +#define AOPTIN ( AGO_KERNEL_ARG_OPTIONAL_FLAG | AGO_KERNEL_ARG_INPUT_FLAG ) +#define AOPTOUT ( AGO_KERNEL_ARG_OPTIONAL_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG ) +#define AIN_AOUT { AIN, AOUT } +#define AINx2_AOUT { AIN, AIN, AOUT } +#define AINx3_AOUT { AIN, AIN, AIN, AOUT } +#define AINx4_AOUT { AIN, AIN, AIN, AIN, AOUT } +#define AINx5_AOUT { AIN, AIN, AIN, AIN, AIN, AOUT } +#define AINx2_AOPTINx2_AOUT { AIN, AIN, AOPTIN, AOPTIN, AOUT } +#define AIN_AOPTOUTx2 { AIN, AOPTOUT, AOPTOUT } +#define AIN_AOUT_AIN { AIN, AOUT, AIN } +#define AIN_AOUTx2 { AIN, AOUT, AOUT } +#define AIN_AINOUT { AIN, AINOUT } +#define AINx2_AINOUT { AIN, AIN, AINOUT } +#define AIN_AOUTx2_AOPTOUTx4 { AIN, AOUT, AOUT, AOPTOUT, AOPTOUT, AOPTOUT, AOPTOUT } +#define AIN_AOUT_AINx2 { AIN, AOUT, AIN, AIN } +#define AINx3_AOUT_AOPTOUT { AIN, AIN, AIN, AOUT, AOPTOUT } +#define AINx6_AOUT_AOPTOUT { AIN, AIN, AIN, AIN, AIN, AIN, AOUT, AOPTOUT } +#define AINx4_AOUT_AINx5 { AIN, AIN, AIN, AIN, AOUT, AIN, AIN, AIN, AIN, AIN } +#define AOUT_AIN { AOUT, AIN } +#define AOUT_AIN_AOPTIN { AOUT, AIN, AOPTIN } +#define AOUT_AINx2 { AOUT, AIN, AIN } +#define AOUT_AINx2_AOPTIN { AOUT, AIN, AIN, AOPTIN } +#define AOUT_AINx3 { AOUT, AIN, AIN, AIN } +#define AOUT_AINx4 { AOUT, AIN, AIN, AIN, AIN } +#define AOUT_AINx8 { AOUT, AIN, AIN, AIN, AIN, AIN, AIN, AIN, AIN } +#define AOUT_AINx9 { AOUT, AIN, AIN, AIN, AIN, AIN, AIN, AIN, AIN, AIN } +#define AOUTx2_AIN { AOUT, AOUT, AIN } +#define AOUTx2_AINx2 { AOUT, AOUT, AIN, AIN } +#define AOUTx2_AINx2_AOPTIN { AOUT, AOUT, AIN, AIN, AOPTIN } +#define AOUTx2_AINx3 { AOUT, AOUT, AIN, AIN, AIN } +#define AOUTx3_AIN { AOUT, AOUT, AOUT, AIN } +#define AOUTx3_AINx2 { AOUT, AOUT, AOUT, AIN, AIN } +#define AOUTx4_AIN { AOUT, AOUT, AOUT, AOUT, AIN } +#define AOUTx4_AINx2 { AOUT, AOUT, AOUT, AOUT, AIN, AIN } +#define AOUTx3_AINx3 { AOUT, AOUT, AOUT, AIN, AIN, AIN } +#define AOUTx2_AIN_AOPTINx7 { AOUT, AOUT, AIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN } +#define AOUTx3_AIN_AOPTINx6 { AOUT, AOUT, AOUT, AIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN } +#define AOUTx2_AINx2_AOPTINx6 { AOUT, AOUT, AIN, AIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN } +#define AOUT_AIN_AOPTINx8 { AOUT, AIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN 
} +#define AINOUT_AIN { AINOUT, AIN } +#define AINOUT_AINx2 { AINOUT, AIN, AIN } +#define AOUTx3_AINx2_AOPTINx5 { AOUT, AOUT, AOUT, AIN, AIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN } +#define AOUTx4_AINx2_AOPTINx4 { AOUT, AOUT, AOUT, AOUT, AIN, AIN, AOPTIN, AOPTIN, AOPTIN, AOPTIN } +#define AOUTx5_AINx2_AOPTINx2 { AOUT, AOUT, AOUT, AOUT, AOUT, AIN, AIN, AOPTIN, AOPTIN } +#define AOUTx6_AINx2_AOPTINx2 { AOUT, AOUT, AOUT, AOUT, AOUT, AOUT, AIN, AIN, AOPTIN, AOPTIN } +#define AOUT_AOPTOUT_AINx2 { AOUT, AOPTOUT, AIN, AIN } +#define AOUT_AOPTOUT_AINx4 { AOUT, AOPTOUT, AIN, AIN, AIN, AIN } +#define AOUT_AOPTOUTx2_AINx2 { AOUT, AOPTOUT, AOPTOUT, AIN, AIN } +#define AOUTx2_AOPTOUTx2_AINx2 { AOUT, AOUT, AOPTOUT, AOPTOUT, AIN, AIN } + +// for argType[] +#define ATYPE_I { VX_TYPE_IMAGE } +#define ATYPE_II { VX_TYPE_IMAGE, VX_TYPE_IMAGE } +#define ATYPE_III { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE } +#define ATYPE_IIII { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE } +#define ATYPE_IIIII { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE } +#define ATYPE_IIIIII { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE } +#define ATYPE_IIS { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_SCALAR } +#define ATYPE_ISI { VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_IMAGE } +#define ATYPE_IIL { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_LUT } +#define ATYPE_ILI { VX_TYPE_IMAGE, VX_TYPE_LUT, VX_TYPE_IMAGE } +#define ATYPE_ID { VX_TYPE_IMAGE, VX_TYPE_DISTRIBUTION } +#define ATYPE_ISS { VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_SCALAR } +#define ATYPE_IIT { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_THRESHOLD } +#define ATYPE_IITS { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_THRESHOLD, VX_TYPE_SCALAR } +#define ATYPE_ITI { VX_TYPE_IMAGE, VX_TYPE_THRESHOLD, VX_TYPE_IMAGE } +#define ATYPE_IIC { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_CONVOLUTION } +#define ATYPE_IIICC { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_CONVOLUTION, VX_TYPE_CONVOLUTION } +#define ATYPE_ICI { VX_TYPE_IMAGE, VX_TYPE_CONVOLUTION, VX_TYPE_IMAGE } +#define ATYPE_IP { VX_TYPE_IMAGE, VX_TYPE_PYRAMID } +#define ATYPE_ISSAASS { VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_SCALAR } +#define ATYPE_IISSSI { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_IMAGE } +#define ATYPE_IISI { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_IMAGE } +#define ATYPE_IISS { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_SCALAR } +#define ATYPE_IISSS { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR } +#define ATYPE_ITSSI { VX_TYPE_IMAGE, VX_TYPE_THRESHOLD, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_IMAGE } +#define ATYPE_IMSI { VX_TYPE_IMAGE, VX_TYPE_MATRIX, VX_TYPE_SCALAR, VX_TYPE_IMAGE } +#define ATYPE_ISSSSSAS { VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_ARRAY, VX_TYPE_SCALAR } +#define ATYPE_ISSAS { VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_ARRAY, VX_TYPE_SCALAR } +#define ATYPE_PPAAASSSSS { VX_TYPE_PYRAMID, VX_TYPE_PYRAMID, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR } +#define ATYPE_IRSI { VX_TYPE_IMAGE, VX_TYPE_REMAP, VX_TYPE_SCALAR, VX_TYPE_IMAGE } +#define ATYPE_IIIS { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_SCALAR } +#define ATYPE_AI { VX_TYPE_ARRAY, VX_TYPE_IMAGE } +#define ATYPE_AIS { VX_TYPE_ARRAY, 
VX_TYPE_IMAGE, VX_TYPE_SCALAR } +#define ATYPE_ASIS { VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_IMAGE, VX_TYPE_SCALAR } +#define ATYPE_ASASSS { VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR } +#define ATYPE_IcIT { VX_TYPE_IMAGE, AGO_TYPE_CANNY_STACK, VX_TYPE_IMAGE, VX_TYPE_THRESHOLD } +#define ATYPE_IcITS { VX_TYPE_IMAGE, AGO_TYPE_CANNY_STACK, VX_TYPE_IMAGE, VX_TYPE_THRESHOLD, VX_TYPE_SCALAR } +#define ATYPE_IIR { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_REMAP } +#define ATYPE_IIM { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_MATRIX } +#define ATYPE_IIIMM { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_MATRIX, VX_TYPE_MATRIX } +#define ATYPE_IIRS { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_REMAP, VX_TYPE_SCALAR } +#define ATYPE_IIMS { VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_MATRIX, VX_TYPE_SCALAR } +#define ATYPE_IIx { VX_TYPE_IMAGE, VX_TYPE_IMAGE, AGO_TYPE_SCALE_MATRIX } +#define ATYPE_AAA { VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY } +#define ATYPE_AAAAS { VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_SCALAR } +#define ATYPE_AAIISSSSS { VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_IMAGE, VX_TYPE_IMAGE, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR } +#define ATYPE_APPAASSSSS { VX_TYPE_ARRAY, VX_TYPE_PYRAMID, VX_TYPE_PYRAMID, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_SCALAR } +#define ATYPE_ASAAAAAAAA { VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY } +#define ATYPE_Ic { VX_TYPE_IMAGE, AGO_TYPE_CANNY_STACK } +#define ATYPE_DI { VX_TYPE_DISTRIBUTION, VX_TYPE_IMAGE } +#define ATYPE_mI { AGO_TYPE_MINMAXLOC_DATA, VX_TYPE_IMAGE } +#define ATYPE_sI { AGO_TYPE_MEANSTDDEV_DATA, VX_TYPE_IMAGE } +#define ATYPE_LDDDDDDDDD { VX_TYPE_LUT, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION } +#define ATYPE_DDDDDDDDDD { VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION, VX_TYPE_DISTRIBUTION } +#define ATYPE_SSssssssss { VX_TYPE_SCALAR, VX_TYPE_SCALAR, AGO_TYPE_MEANSTDDEV_DATA, AGO_TYPE_MEANSTDDEV_DATA, AGO_TYPE_MEANSTDDEV_DATA, AGO_TYPE_MEANSTDDEV_DATA, AGO_TYPE_MEANSTDDEV_DATA, AGO_TYPE_MEANSTDDEV_DATA, AGO_TYPE_MEANSTDDEV_DATA, AGO_TYPE_MEANSTDDEV_DATA } +#define ATYPE_SSmmmmmmmm { VX_TYPE_SCALAR, VX_TYPE_SCALAR, AGO_TYPE_MINMAXLOC_DATA, AGO_TYPE_MINMAXLOC_DATA, AGO_TYPE_MINMAXLOC_DATA, AGO_TYPE_MINMAXLOC_DATA, AGO_TYPE_MINMAXLOC_DATA, AGO_TYPE_MINMAXLOC_DATA, AGO_TYPE_MINMAXLOC_DATA, AGO_TYPE_MINMAXLOC_DATA } +#define ATYPE_SIm { VX_TYPE_SCALAR, VX_TYPE_IMAGE, AGO_TYPE_MINMAXLOC_DATA } +#define ATYPE_SSIm { VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_IMAGE, AGO_TYPE_MINMAXLOC_DATA } +#define ATYPE_ASIm { VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_IMAGE, AGO_TYPE_MINMAXLOC_DATA } +#define ATYPE_ASSIm { VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_IMAGE, AGO_TYPE_MINMAXLOC_DATA } +#define ATYPE_AASSIm { VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_IMAGE, AGO_TYPE_MINMAXLOC_DATA } +#define ATYPE_SAAAAAAAAA { VX_TYPE_SCALAR, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, 
VX_TYPE_ARRAY } + +// for kernOpType & kernOpInfo +#define KOP_UNKNOWN AGO_KERNEL_OP_TYPE_UNKNOWN, 0, +#define KOP_ELEMWISE AGO_KERNEL_OP_TYPE_ELEMENT_WISE, 0, +#define KOP_FIXED(N) AGO_KERNEL_OP_TYPE_FIXED_NEIGHBORS, N, + +// list of all built-in kernels +static struct { + vx_enum id; + int(*func)(AgoNode * node, AgoKernelCommand cmd); + const char * name; + vx_uint32 flags; + vx_uint8 argConfig[AGO_MAX_PARAMS]; + vx_enum argType[AGO_MAX_PARAMS]; + vx_uint8 kernOpType; + vx_uint8 kernOpInfo; +} ago_kernel_list[] = { +#define OVX_KERNEL_ENTRY(kernel_id,name,kname,argCfg,argType) \ + { \ + kernel_id, ovxKernel_ ## name, "org.khronos.openvx." kname, \ + AGO_KERNEL_FLAG_GROUP_OVX10, argCfg, argType \ + } +#define AGO_KERNEL_ENTRY(kernel_id,cpu_avail,gpu_avail,name,argCfg,argType,kernOp) \ + { \ + kernel_id, agoKernel_ ## name, "com.amd.openvx." #name, \ + AGO_KERNEL_FLAG_GROUP_AMDLL | (cpu_avail ? AGO_KERNEL_FLAG_DEVICE_CPU : 0) | (gpu_avail ? AGO_KERNEL_FLAG_DEVICE_GPU : 0), argCfg, argType, kernOp \ + } + // OpenVX 1.0 built-in kernels + + OVX_KERNEL_ENTRY( VX_KERNEL_INVALID , Invalid, "invalid", { }, { } ), + OVX_KERNEL_ENTRY( VX_KERNEL_COLOR_CONVERT , ColorConvert, "color_convert", AIN_AOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_CHANNEL_EXTRACT , ChannelExtract, "channel_extract", AINx2_AOUT, ATYPE_ISI ), + OVX_KERNEL_ENTRY( VX_KERNEL_CHANNEL_COMBINE , ChannelCombine, "channel_combine", AINx2_AOPTINx2_AOUT, ATYPE_IIIII ), + OVX_KERNEL_ENTRY( VX_KERNEL_SOBEL_3x3 , Sobel3x3, "sobel_3x3", AIN_AOPTOUTx2, ATYPE_III ), + OVX_KERNEL_ENTRY( VX_KERNEL_MAGNITUDE , Magnitude, "magnitude", AINx2_AOUT, ATYPE_III ), + OVX_KERNEL_ENTRY( VX_KERNEL_PHASE , Phase, "phase", AINx2_AOUT, ATYPE_III ), + OVX_KERNEL_ENTRY( VX_KERNEL_SCALE_IMAGE , ScaleImage, "scale_image", AIN_AOUT_AIN, ATYPE_IIS ), + OVX_KERNEL_ENTRY( VX_KERNEL_TABLE_LOOKUP , TableLookup, "table_lookup", AINx2_AOUT, ATYPE_ILI ), + OVX_KERNEL_ENTRY( VX_KERNEL_HISTOGRAM , Histogram, "histogram", AIN_AOUT, ATYPE_ID ), + OVX_KERNEL_ENTRY( VX_KERNEL_EQUALIZE_HISTOGRAM , EqualizeHistogram, "equalize_histogram", AIN_AOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_ABSDIFF , AbsDiff, "absdiff", AINx2_AOUT, ATYPE_III ), + OVX_KERNEL_ENTRY( VX_KERNEL_MEAN_STDDEV , MeanStdDev, "mean_stddev", AIN_AOUTx2, ATYPE_ISS ), + OVX_KERNEL_ENTRY( VX_KERNEL_THRESHOLD , Threshold, "threshold", AINx2_AOUT, ATYPE_ITI ), + OVX_KERNEL_ENTRY( VX_KERNEL_INTEGRAL_IMAGE , IntegralImage, "integral_image", AIN_AOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_DILATE_3x3 , Dilate3x3, "dilate_3x3", AIN_AOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_ERODE_3x3 , Erode3x3, "erode_3x3", AIN_AOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_MEDIAN_3x3 , Median3x3, "median_3x3", AIN_AOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_BOX_3x3 , Box3x3, "box_3x3", AIN_AOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_GAUSSIAN_3x3 , Gaussian3x3, "gaussian_3x3", AIN_AOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_CUSTOM_CONVOLUTION , CustomConvolution, "custom_convolution", AINx2_AOUT, ATYPE_ICI ), + OVX_KERNEL_ENTRY( VX_KERNEL_GAUSSIAN_PYRAMID , GaussianPyramid, "gaussian_pyramid", AIN_AOUT, ATYPE_IP ), + OVX_KERNEL_ENTRY( VX_KERNEL_ACCUMULATE , Accumulate, "accumulate", AIN_AINOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_ACCUMULATE_WEIGHTED , AccumulateWeighted, "accumulate_weighted", AINx2_AINOUT, ATYPE_ISI ), + OVX_KERNEL_ENTRY( VX_KERNEL_ACCUMULATE_SQUARE , AccumulateSquare, "accumulate_square", AINx2_AINOUT, ATYPE_ISI ), + OVX_KERNEL_ENTRY( VX_KERNEL_MINMAXLOC , MinMaxLoc, "minmaxloc", 
AIN_AOUTx2_AOPTOUTx4, ATYPE_ISSAASS ), + OVX_KERNEL_ENTRY( VX_KERNEL_CONVERTDEPTH , ConvertDepth, "convertdepth", AIN_AOUT_AINx2, ATYPE_IISS ), + OVX_KERNEL_ENTRY( VX_KERNEL_CANNY_EDGE_DETECTOR , CannyEdgeDetector, "canny_edge_detector", AINx4_AOUT, ATYPE_ITSSI ), + OVX_KERNEL_ENTRY( VX_KERNEL_AND , And, "and", AINx2_AOUT, ATYPE_III ), + OVX_KERNEL_ENTRY( VX_KERNEL_OR , Or, "or", AINx2_AOUT, ATYPE_III ), + OVX_KERNEL_ENTRY( VX_KERNEL_XOR , Xor, "xor", AINx2_AOUT, ATYPE_III ), + OVX_KERNEL_ENTRY( VX_KERNEL_NOT , Not, "not", AIN_AOUT, ATYPE_II ), + OVX_KERNEL_ENTRY( VX_KERNEL_MULTIPLY , Multiply, "multiply", AINx5_AOUT, ATYPE_IISSSI ), + OVX_KERNEL_ENTRY( VX_KERNEL_ADD , Add, "add", AINx3_AOUT, ATYPE_IISI ), + OVX_KERNEL_ENTRY( VX_KERNEL_SUBTRACT , Subtract, "subtract", AINx3_AOUT, ATYPE_IISI ), + OVX_KERNEL_ENTRY( VX_KERNEL_WARP_AFFINE , WarpAffine, "warp_affine", AINx3_AOUT, ATYPE_IMSI ), + OVX_KERNEL_ENTRY( VX_KERNEL_WARP_PERSPECTIVE , WarpPerspective, "warp_perspective", AINx3_AOUT, ATYPE_IMSI ), + OVX_KERNEL_ENTRY( VX_KERNEL_HARRIS_CORNERS , HarrisCorners, "harris_corners", AINx6_AOUT_AOPTOUT, ATYPE_ISSSSSAS ), + OVX_KERNEL_ENTRY( VX_KERNEL_FAST_CORNERS , FastCorners, "fast_corners", AINx3_AOUT_AOPTOUT, ATYPE_ISSAS ), + OVX_KERNEL_ENTRY( VX_KERNEL_OPTICAL_FLOW_PYR_LK , OpticalFlowPyrLK, "optical_flow_pyr_lk", AINx4_AOUT_AINx5, ATYPE_PPAAASSSSS ), + OVX_KERNEL_ENTRY( VX_KERNEL_REMAP , Remap, "remap", AINx3_AOUT, ATYPE_IRSI ), + OVX_KERNEL_ENTRY( VX_KERNEL_HALFSCALE_GAUSSIAN , HalfScaleGaussian, "halfscale_gaussian", AIN_AOUT_AIN, ATYPE_IIS ), + // AMD low-level kernel primitives + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SET_00_U8 , 1, 1, Set00_U8, { AOUT }, ATYPE_I , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SET_FF_U8 , 1, 1, SetFF_U8, { AOUT }, ATYPE_I , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOT_U8_U8 , 1, 1, Not_U8_U8, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOT_U8_U1 , 1, 1, Not_U8_U1, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOT_U1_U8 , 1, 1, Not_U1_U8, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOT_U1_U1 , 1, 1, Not_U1_U1, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_LUT_U8_U8 , 1, 1, Lut_U8_U8, AOUT_AINx2, ATYPE_IIL , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_THRESHOLD_U8_U8_BINARY , 1, 1, Threshold_U8_U8_Binary, AOUT_AINx2, ATYPE_IIT , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_THRESHOLD_U8_U8_RANGE , 1, 1, Threshold_U8_U8_Range, AOUT_AINx2, ATYPE_IIT , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_THRESHOLD_U1_U8_BINARY , 1, 1, Threshold_U1_U8_Binary, AOUT_AINx2, ATYPE_IIT , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_THRESHOLD_U1_U8_RANGE , 1, 1, Threshold_U1_U8_Range, AOUT_AINx2, ATYPE_IIT , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_BINARY , 1, 1, ThresholdNot_U8_U8_Binary, AOUT_AINx2, ATYPE_IIT , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_RANGE , 1, 1, ThresholdNot_U8_U8_Range, AOUT_AINx2, ATYPE_IIT , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_THRESHOLD_NOT_U1_U8_BINARY , 1, 1, ThresholdNot_U1_U8_Binary, AOUT_AINx2, ATYPE_IIT , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_THRESHOLD_NOT_U1_U8_RANGE , 1, 1, ThresholdNot_U1_U8_Range, AOUT_AINx2, ATYPE_IIT , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_DEPTH_U8_S16_WRAP , 1, 1, ColorDepth_U8_S16_Wrap, AOUT_AINx2, ATYPE_IIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_DEPTH_U8_S16_SAT , 1, 1, 
ColorDepth_U8_S16_Sat, AOUT_AINx2, ATYPE_IIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_DEPTH_S16_U8 , 1, 1, ColorDepth_S16_U8, AOUT_AINx2, ATYPE_IIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ADD_U8_U8U8_WRAP , 1, 1, Add_U8_U8U8_Wrap, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ADD_U8_U8U8_SAT , 1, 1, Add_U8_U8U8_Sat, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SUB_U8_U8U8_WRAP , 1, 1, Sub_U8_U8U8_Wrap, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SUB_U8_U8U8_SAT , 1, 1, Sub_U8_U8U8_Sat, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_U8_U8U8_WRAP_TRUNC , 1, 1, Mul_U8_U8U8_Wrap_Trunc, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_U8_U8U8_WRAP_ROUND , 1, 1, Mul_U8_U8U8_Wrap_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_U8_U8U8_SAT_TRUNC , 1, 1, Mul_U8_U8U8_Sat_Trunc, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_U8_U8U8_SAT_ROUND , 1, 1, Mul_U8_U8U8_Sat_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_AND_U8_U8U8 , 1, 1, And_U8_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_AND_U8_U8U1 , 1, 1, And_U8_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_AND_U8_U1U8 , 1, 1, And_U8_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_AND_U8_U1U1 , 1, 1, And_U8_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_AND_U1_U8U8 , 1, 1, And_U1_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_AND_U1_U8U1 , 1, 1, And_U1_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_AND_U1_U1U8 , 1, 1, And_U1_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_AND_U1_U1U1 , 1, 1, And_U1_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OR_U8_U8U8 , 1, 1, Or_U8_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OR_U8_U8U1 , 1, 1, Or_U8_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OR_U8_U1U8 , 1, 1, Or_U8_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OR_U8_U1U1 , 1, 1, Or_U8_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OR_U1_U8U8 , 1, 1, Or_U1_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OR_U1_U8U1 , 1, 1, Or_U1_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OR_U1_U1U8 , 1, 1, Or_U1_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OR_U1_U1U1 , 1, 1, Or_U1_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XOR_U8_U8U8 , 1, 1, Xor_U8_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XOR_U8_U8U1 , 1, 1, Xor_U8_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XOR_U8_U1U8 , 1, 1, Xor_U8_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XOR_U8_U1U1 , 1, 1, Xor_U8_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XOR_U1_U8U8 , 1, 1, Xor_U1_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XOR_U1_U8U1 , 1, 1, Xor_U1_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XOR_U1_U1U8 , 1, 1, 
Xor_U1_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XOR_U1_U1U1 , 1, 1, Xor_U1_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NAND_U8_U8U8 , 1, 1, Nand_U8_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NAND_U8_U8U1 , 1, 1, Nand_U8_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NAND_U8_U1U8 , 1, 1, Nand_U8_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NAND_U8_U1U1 , 1, 1, Nand_U8_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NAND_U1_U8U8 , 1, 1, Nand_U1_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NAND_U1_U8U1 , 1, 1, Nand_U1_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NAND_U1_U1U8 , 1, 1, Nand_U1_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NAND_U1_U1U1 , 1, 1, Nand_U1_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOR_U8_U8U8 , 1, 1, Nor_U8_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOR_U8_U8U1 , 1, 1, Nor_U8_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOR_U8_U1U8 , 1, 1, Nor_U8_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOR_U8_U1U1 , 1, 1, Nor_U8_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOR_U1_U8U8 , 1, 1, Nor_U1_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOR_U1_U8U1 , 1, 1, Nor_U1_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOR_U1_U1U8 , 1, 1, Nor_U1_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NOR_U1_U1U1 , 1, 1, Nor_U1_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XNOR_U8_U8U8 , 1, 1, Xnor_U8_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XNOR_U8_U8U1 , 1, 1, Xnor_U8_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XNOR_U8_U1U8 , 1, 1, Xnor_U8_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XNOR_U8_U1U1 , 1, 1, Xnor_U8_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XNOR_U1_U8U8 , 1, 1, Xnor_U1_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XNOR_U1_U8U1 , 1, 1, Xnor_U1_U8U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XNOR_U1_U1U8 , 1, 1, Xnor_U1_U1U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_XNOR_U1_U1U1 , 1, 1, Xnor_U1_U1U1, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ABS_DIFF_U8_U8U8 , 1, 1, AbsDiff_U8_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ACCUMULATE_WEIGHTED_U8_U8U8 , 1, 1, AccumulateWeighted_U8_U8U8, AINOUT_AINx2, ATYPE_IIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ADD_S16_U8U8 , 1, 1, Add_S16_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SUB_S16_U8U8 , 1, 1, Sub_S16_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_U8U8_WRAP_TRUNC , 1, 1, Mul_S16_U8U8_Wrap_Trunc, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_U8U8_WRAP_ROUND , 1, 1, Mul_S16_U8U8_Wrap_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_U8U8_SAT_TRUNC 
, 1, 1, Mul_S16_U8U8_Sat_Trunc, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_U8U8_SAT_ROUND , 1, 1, Mul_S16_U8U8_Sat_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ADD_S16_S16U8_WRAP , 1, 1, Add_S16_S16U8_Wrap, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ADD_S16_S16U8_SAT , 1, 1, Add_S16_S16U8_Sat, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ACCUMULATE_S16_S16U8_SAT , 1, 1, Accumulate_S16_S16U8_Sat, AINOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SUB_S16_S16U8_WRAP , 1, 1, Sub_S16_S16U8_Wrap, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SUB_S16_S16U8_SAT , 1, 1, Sub_S16_S16U8_Sat, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_S16U8_WRAP_TRUNC , 1, 1, Mul_S16_S16U8_Wrap_Trunc, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_S16U8_WRAP_ROUND , 1, 1, Mul_S16_S16U8_Wrap_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_S16U8_SAT_TRUNC , 1, 1, Mul_S16_S16U8_Sat_Trunc, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_S16U8_SAT_ROUND , 1, 1, Mul_S16_S16U8_Sat_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ACCUMULATE_SQUARED_S16_S16U8_SAT , 1, 1, AccumulateSquared_S16_S16U8_Sat, AINOUT_AINx2, ATYPE_IIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SUB_S16_U8S16_WRAP , 1, 1, Sub_S16_U8S16_Wrap, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SUB_S16_U8S16_SAT , 1, 1, Sub_S16_U8S16_Sat, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ABS_DIFF_S16_S16S16_SAT , 1, 1, AbsDiff_S16_S16S16_Sat, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ADD_S16_S16S16_WRAP , 1, 1, Add_S16_S16S16_Wrap, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ADD_S16_S16S16_SAT , 1, 1, Add_S16_S16S16_Sat, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SUB_S16_S16S16_WRAP , 1, 1, Sub_S16_S16S16_Wrap, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SUB_S16_S16S16_SAT , 1, 1, Sub_S16_S16S16_Sat, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_S16S16_WRAP_TRUNC , 1, 1, Mul_S16_S16S16_Wrap_Trunc, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_S16S16_WRAP_ROUND , 1, 1, Mul_S16_S16S16_Wrap_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_S16S16_SAT_TRUNC , 1, 1, Mul_S16_S16S16_Sat_Trunc, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_S16_S16S16_SAT_ROUND , 1, 1, Mul_S16_S16S16_Sat_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MAGNITUDE_S16_S16S16 , 1, 1, Magnitude_S16_S16S16, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_PHASE_U8_S16S16 , 1, 1, Phase_U8_S16S16, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_COPY_U8_U8 , 1, 1, ChannelCopy_U8_U8, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_COPY_U8_U1 , 1, 1, ChannelCopy_U8_U1, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_COPY_U1_U8 , 1, 1, ChannelCopy_U1_U8, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_COPY_U1_U1 , 1, 1, 
ChannelCopy_U1_U1, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS0 , 1, 1, ChannelExtract_U8_U16_Pos0, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS1 , 1, 1, ChannelExtract_U8_U16_Pos1, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS0 , 1, 1, ChannelExtract_U8_U24_Pos0, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS1 , 1, 1, ChannelExtract_U8_U24_Pos1, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS2 , 1, 1, ChannelExtract_U8_U24_Pos2, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS0 , 1, 1, ChannelExtract_U8_U32_Pos0, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS1 , 1, 1, ChannelExtract_U8_U32_Pos1, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS2 , 1, 1, ChannelExtract_U8_U32_Pos2, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS3 , 1, 1, ChannelExtract_U8_U32_Pos3, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8U8U8_U24 , 1, 1, ChannelExtract_U8U8U8_U24, AOUTx3_AIN, ATYPE_IIII , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8U8U8_U32 , 1, 1, ChannelExtract_U8U8U8_U32, AOUTx3_AIN, ATYPE_IIII , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_EXTRACT_U8U8U8U8_U32 , 1, 1, ChannelExtract_U8U8U8U8_U32, AOUTx4_AIN, ATYPE_IIIII , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_COMBINE_U16_U8U8 , 1, 1, ChannelCombine_U16_U8U8, AOUT_AINx2, ATYPE_III , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_COMBINE_U24_U8U8U8_RGB , 1, 1, ChannelCombine_U24_U8U8U8_RGB, AOUT_AINx3, ATYPE_IIII , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_UYVY , 1, 1, ChannelCombine_U32_U8U8U8_UYVY, AOUT_AINx3, ATYPE_IIII , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_YUYV , 1, 1, ChannelCombine_U32_U8U8U8_YUYV, AOUT_AINx3, ATYPE_IIII , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8U8_RGBX , 1, 1, ChannelCombine_U32_U8U8U8U8_RGBX, AOUT_AINx4, ATYPE_IIIII , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_U24_U24U8_SAT_ROUND , 1, 1, Mul_U24_U24U8_Sat_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MUL_U32_U32U8_SAT_ROUND , 1, 1, Mul_U32_U32U8_Sat_Round, AOUT_AINx3, ATYPE_IIIS , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGB_RGBX , 1, 1, ColorConvert_RGB_RGBX, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGB_UYVY , 1, 1, ColorConvert_RGB_UYVY, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGB_YUYV , 1, 1, ColorConvert_RGB_YUYV, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGB_IYUV , 1, 1, ColorConvert_RGB_IYUV, AOUT_AINx3, ATYPE_IIII , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV12 , 1, 1, ColorConvert_RGB_NV12, AOUT_AINx2, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV21 , 1, 1, ColorConvert_RGB_NV21, AOUT_AINx2, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGBX_RGB , 1, 1, ColorConvert_RGBX_RGB, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + 
AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGBX_UYVY , 1, 1, ColorConvert_RGBX_UYVY, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGBX_YUYV , 1, 1, ColorConvert_RGBX_YUYV, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGBX_IYUV , 1, 1, ColorConvert_RGBX_IYUV, AOUT_AINx3, ATYPE_IIII , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV12 , 1, 1, ColorConvert_RGBX_NV12, AOUT_AINx2, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV21 , 1, 1, ColorConvert_RGBX_NV21, AOUT_AINx2, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_YUV4_RGB , 1, 1, ColorConvert_YUV4_RGB, AOUTx3_AIN, ATYPE_IIII , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_YUV4_RGBX , 1, 1, ColorConvert_YUV4_RGBX, AOUTx3_AIN, ATYPE_IIII , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SCALE_UP_2x2_U8_U8 , 1, 1, ScaleUp2x2_U8_U8, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_FORMAT_CONVERT_UV_UV12 , 1, 1, FormatConvert_UV_UV12, AOUTx2_AIN, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGB , 1, 1, ColorConvert_IYUV_RGB, AOUTx3_AIN, ATYPE_IIII , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGBX , 1, 1, ColorConvert_IYUV_RGBX, AOUTx3_AIN, ATYPE_IIII , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_UYVY , 1, 1, FormatConvert_IYUV_UYVY, AOUTx3_AIN, ATYPE_IIII , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_YUYV , 1, 1, FormatConvert_IYUV_YUYV, AOUTx3_AIN, ATYPE_IIII , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_FORMAT_CONVERT_IUV_UV12 , 1, 1, FormatConvert_IUV_UV12, AOUTx2_AIN, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGB , 1, 1, ColorConvert_NV12_RGB, AOUTx2_AIN, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGBX , 1, 1, ColorConvert_NV12_RGBX, AOUTx2_AIN, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_FORMAT_CONVERT_NV12_UYVY , 1, 1, FormatConvert_NV12_UYVY, AOUTx2_AIN, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_FORMAT_CONVERT_NV12_YUYV , 1, 1, FormatConvert_NV12_YUYV, AOUTx2_AIN, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_FORMAT_CONVERT_UV12_IUV , 1, 1, FormatConvert_UV12_IUV, AOUT_AINx2, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_Y_RGB , 1, 1, ColorConvert_Y_RGB, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_Y_RGBX , 1, 1, ColorConvert_Y_RGBX, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_U_RGB , 1, 1, ColorConvert_U_RGB, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_U_RGBX , 1, 1, ColorConvert_U_RGBX, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_V_RGB , 1, 1, ColorConvert_V_RGB, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_V_RGBX , 1, 1, ColorConvert_V_RGBX, AOUT_AIN, ATYPE_II , KOP_ELEMWISE ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_IU_RGB , 1, 1, ColorConvert_IU_RGB, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_IU_RGBX , 1, 1, ColorConvert_IU_RGBX, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_IV_RGB , 1, 1, ColorConvert_IV_RGB, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( 
VX_KERNEL_AMD_COLOR_CONVERT_IV_RGBX , 1, 1, ColorConvert_IV_RGBX, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGB , 1, 1, ColorConvert_IUV_RGB, AOUTx2_AIN, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGBX , 1, 1, ColorConvert_IUV_RGBX, AOUTx2_AIN, ATYPE_III , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGB , 1, 1, ColorConvert_UV12_RGB, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGBX , 1, 1, ColorConvert_UV12_RGBX, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_BOX_U8_U8_3x3 , 1, 1, Box_U8_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_DILATE_U8_U8_3x3 , 1, 1, Dilate_U8_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ERODE_U8_U8_3x3 , 1, 1, Erode_U8_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MEDIAN_U8_U8_3x3 , 1, 1, Median_U8_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_GAUSSIAN_U8_U8_3x3 , 1, 1, Gaussian_U8_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_3x3 , 1, 1, ScaleGaussianHalf_U8_U8_3x3, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_5x5 , 1, 1, ScaleGaussianHalf_U8_U8_5x5, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SCALE_GAUSSIAN_ORB_U8_U8_5x5 , 1, 1, ScaleGaussianOrb_U8_U8_5x5, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CONVOLVE_U8_U8 , 1, 1, Convolve_U8_U8, AOUT_AINx2, ATYPE_IIC , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CONVOLVE_S16_U8 , 1, 1, Convolve_S16_U8, AOUT_AINx2, ATYPE_IIC , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_LINEAR_FILTER_ANY_ANY , 1, 1, LinearFilter_ANY_ANY, AOUT_AINx2, ATYPE_IIM , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_LINEAR_FILTER_ANYx2_ANY , 1, 1, LinearFilter_ANYx2_ANY, AOUTx2_AINx3, ATYPE_IIIMM , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SOBEL_MAGNITUDE_S16_U8_3x3 , 1, 1, SobelMagnitude_S16_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SOBEL_PHASE_U8_U8_3x3 , 1, 1, SobelPhase_U8_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SOBEL_MAGNITUDE_PHASE_S16U8_U8_3x3 , 1, 1, SobelMagnitudePhase_S16U8_U8_3x3, AOUTx2_AIN, ATYPE_III , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SOBEL_S16S16_U8_3x3_GXY , 1, 1, Sobel_S16S16_U8_3x3_GXY, AOUTx2_AIN, ATYPE_III , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GX , 1, 1, Sobel_S16_U8_3x3_GX, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GY , 1, 1, Sobel_S16_U8_3x3_GY, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_DILATE_U1_U8_3x3 , 1, 1, Dilate_U1_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ERODE_U1_U8_3x3 , 1, 1, Erode_U1_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_DILATE_U1_U1_3x3 , 1, 1, Dilate_U1_U1_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ERODE_U1_U1_3x3 , 1, 1, Erode_U1_U1_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_DILATE_U8_U1_3x3 , 1, 1, Dilate_U8_U1_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_ERODE_U8_U1_3x3 , 1, 1, Erode_U8_U1_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( 
VX_KERNEL_AMD_FAST_CORNERS_XY_U8_SUPRESSION , 1, 1, FastCorners_XY_U8_Supression, AOUT_AOPTOUT_AINx2, ATYPE_ASIS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_FAST_CORNERS_XY_U8_NOSUPRESSION , 1, 1, FastCorners_XY_U8_NoSupression, AOUT_AOPTOUT_AINx2, ATYPE_ASIS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_3x3 , 1, 1, HarrisSobel_HG3_U8_3x3, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_5x5 , 1, 1, HarrisSobel_HG3_U8_5x5, AOUT_AIN, ATYPE_II , KOP_FIXED(5) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_7x7 , 1, 1, HarrisSobel_HG3_U8_7x7, AOUT_AIN, ATYPE_II , KOP_FIXED(7) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_3x3 , 1, 1, HarrisScore_HVC_HG3_3x3, AOUT_AINx4, ATYPE_IISSS , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_5x5 , 1, 1, HarrisScore_HVC_HG3_5x5, AOUT_AINx4, ATYPE_IISSS , KOP_FIXED(5) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_7x7 , 1, 1, HarrisScore_HVC_HG3_7x7, AOUT_AINx4, ATYPE_IISSS , KOP_FIXED(7) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_3x3_L1NORM , 0, 1, CannySobelSuppThreshold_U8_U8_3x3_L1NORM, AOUT_AINx2, ATYPE_IIT , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_3x3_L2NORM , 0, 1, CannySobelSuppThreshold_U8_U8_3x3_L2NORM, AOUT_AINx2, ATYPE_IIT , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_5x5_L1NORM , 0, 1, CannySobelSuppThreshold_U8_U8_5x5_L1NORM, AOUT_AINx2, ATYPE_IIT , KOP_FIXED(5) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_5x5_L2NORM , 0, 1, CannySobelSuppThreshold_U8_U8_5x5_L2NORM, AOUT_AINx2, ATYPE_IIT , KOP_FIXED(5) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_7x7_L1NORM , 0, 1, CannySobelSuppThreshold_U8_U8_7x7_L1NORM, AOUT_AINx2, ATYPE_IIT , KOP_FIXED(7) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_7x7_L2NORM , 0, 1, CannySobelSuppThreshold_U8_U8_7x7_L2NORM, AOUT_AINx2, ATYPE_IIT , KOP_FIXED(7) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_3x3_L1NORM , 1, 0, CannySobelSuppThreshold_U8XY_U8_3x3_L1NORM, AOUTx2_AINx2, ATYPE_IcIT , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_3x3_L2NORM , 1, 0, CannySobelSuppThreshold_U8XY_U8_3x3_L2NORM, AOUTx2_AINx2, ATYPE_IcIT , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_5x5_L1NORM , 1, 0, CannySobelSuppThreshold_U8XY_U8_5x5_L1NORM, AOUTx2_AINx2, ATYPE_IcIT , KOP_FIXED(5) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_5x5_L2NORM , 1, 0, CannySobelSuppThreshold_U8XY_U8_5x5_L2NORM, AOUTx2_AINx2, ATYPE_IcIT , KOP_FIXED(5) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_7x7_L1NORM , 1, 0, CannySobelSuppThreshold_U8XY_U8_7x7_L1NORM, AOUTx2_AINx2, ATYPE_IcIT , KOP_FIXED(7) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_7x7_L2NORM , 1, 0, CannySobelSuppThreshold_U8XY_U8_7x7_L2NORM, AOUTx2_AINx2, ATYPE_IcIT , KOP_FIXED(7) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L1NORM , 1, 1, CannySobel_U16_U8_3x3_L1NORM, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L2NORM , 1, 1, CannySobel_U16_U8_3x3_L2NORM, AOUT_AIN, ATYPE_II , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L1NORM , 1, 1, CannySobel_U16_U8_5x5_L1NORM, AOUT_AIN, ATYPE_II , KOP_FIXED(5) ), + 
AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L2NORM , 1, 1, CannySobel_U16_U8_5x5_L2NORM, AOUT_AIN, ATYPE_II , KOP_FIXED(5) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L1NORM , 1, 1, CannySobel_U16_U8_7x7_L1NORM, AOUT_AIN, ATYPE_II , KOP_FIXED(7) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L2NORM , 1, 1, CannySobel_U16_U8_7x7_L2NORM, AOUT_AIN, ATYPE_II , KOP_FIXED(7) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8_U16_3x3 , 0, 1, CannySuppThreshold_U8_U16_3x3, AOUT_AINx2_AOPTIN, ATYPE_IITS , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8XY_U16_3x3 , 1, 1, CannySuppThreshold_U8XY_U16_3x3, AOUTx2_AINx2_AOPTIN, ATYPE_IcITS , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_NON_MAX_SUPP_XY_ANY_3x3 , 0, 1, NonMaxSupp_XY_ANY_3x3, AOUT_AIN, ATYPE_AI , KOP_FIXED(3) ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_REMAP_U8_U8_NEAREST , 1, 1, Remap_U8_U8_Nearest, AOUT_AINx2, ATYPE_IIR , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_REMAP_U8_U8_NEAREST_CONSTANT , 1, 1, Remap_U8_U8_Nearest_Constant, AOUT_AINx3, ATYPE_IIRS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_REMAP_U8_U8_BILINEAR , 1, 1, Remap_U8_U8_Bilinear, AOUT_AINx2, ATYPE_IIR , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_REMAP_U8_U8_BILINEAR_CONSTANT , 1, 1, Remap_U8_U8_Bilinear_Constant, AOUT_AINx3, ATYPE_IIRS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_REMAP_U24_U24_BILINEAR , 1, 1, Remap_U24_U24_Bilinear, AOUT_AINx2, ATYPE_IIR , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_REMAP_U24_U32_BILINEAR , 1, 1, Remap_U24_U32_Bilinear, AOUT_AINx2, ATYPE_IIR , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_REMAP_U32_U32_BILINEAR , 1, 1, Remap_U32_U32_Bilinear, AOUT_AINx2, ATYPE_IIR , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_WARP_AFFINE_U8_U8_NEAREST , 1, 1, WarpAffine_U8_U8_Nearest, AOUT_AINx2, ATYPE_IIM , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_WARP_AFFINE_U8_U8_NEAREST_CONSTANT , 1, 1, WarpAffine_U8_U8_Nearest_Constant, AOUT_AINx3, ATYPE_IIMS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_WARP_AFFINE_U8_U8_BILINEAR , 1, 1, WarpAffine_U8_U8_Bilinear, AOUT_AINx2, ATYPE_IIM , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_WARP_AFFINE_U8_U8_BILINEAR_CONSTANT , 1, 1, WarpAffine_U8_U8_Bilinear_Constant, AOUT_AINx3, ATYPE_IIMS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_NEAREST , 1, 1, WarpPerspective_U8_U8_Nearest, AOUT_AINx2, ATYPE_IIM , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_NEAREST_CONSTANT , 1, 1, WarpPerspective_U8_U8_Nearest_Constant, AOUT_AINx3, ATYPE_IIMS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_BILINEAR , 1, 1, WarpPerspective_U8_U8_Bilinear, AOUT_AINx2, ATYPE_IIM , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_BILINEAR_CONSTANT , 1, 1, WarpPerspective_U8_U8_Bilinear_Constant, AOUT_AINx3, ATYPE_IIMS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_NEAREST , 1, 1, ScaleImage_U8_U8_Nearest, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR , 1, 1, ScaleImage_U8_U8_Bilinear, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR_REPLICATE , 1, 1, ScaleImage_U8_U8_Bilinear_Replicate, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR_CONSTANT , 1, 1, ScaleImage_U8_U8_Bilinear_Constant, AOUT_AINx2, ATYPE_IIS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( 
VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_AREA , 1, 1, ScaleImage_U8_U8_Area, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OPTICAL_FLOW_PYR_LK_XY_XY , 1, 1, OpticalFlowPyrLK_XY_XY, AOUT_AINx9, ATYPE_APPAASSSSS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OPTICAL_FLOW_PREPARE_LK_XY_XY , 1, 1, OpticalFlowPrepareLK_XY_XY, AOUT_AINx4, ATYPE_AAAAS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OPTICAL_FLOW_IMAGE_LK_XY_XY , 1, 1, OpticalFlowImageLK_XY_XY, AOUT_AINx8, ATYPE_AAIISSSSS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_OPTICAL_FLOW_FINAL_LK_XY_XY , 1, 1, OpticalFlowFinalLK_XY_XY, AOUT_AINx2, ATYPE_AAA , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HARRIS_MERGE_SORT_AND_PICK_XY_HVC , 1, 0, HarrisMergeSortAndPick_XY_HVC, AOUT_AOPTOUT_AINx2, ATYPE_ASIS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HARRIS_MERGE_SORT_AND_PICK_XY_XYS , 1, 0, HarrisMergeSortAndPick_XY_XYS, AOUT_AOPTOUT_AINx4, ATYPE_ASASSS , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_FAST_CORNER_MERGE_XY_XY , 1, 0, FastCornerMerge_XY_XY, AOUTx2_AIN_AOPTINx7, ATYPE_ASAAAAAAAA , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_EDGE_TRACE_U8_U8 , 1, 0, CannyEdgeTrace_U8_U8, AINOUT_AIN, ATYPE_Ic , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_CANNY_EDGE_TRACE_U8_U8XY , 1, 0, CannyEdgeTrace_U8_U8XY, AINOUT_AIN, ATYPE_Ic , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_INTEGRAL_IMAGE_U32_U8 , 1, 0, IntegralImage_U32_U8, AOUT_AIN, ATYPE_II , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HISTOGRAM_DATA_U8 , 1, 0, Histogram_DATA_U8, AOUT_AIN, ATYPE_DI , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MEAN_STD_DEV_DATA_U8 , 1, 0, MeanStdDev_DATA_U8, AOUT_AIN, ATYPE_sI , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_DATA_U8 , 1, 0, MinMax_DATA_U8, AOUT_AIN, ATYPE_mI , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_DATA_S16 , 1, 0, MinMax_DATA_S16, AOUT_AIN, ATYPE_mI , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_EQUALIZE_DATA_DATA , 1, 0, Equalize_DATA_DATA, AOUT_AIN_AOPTINx8, ATYPE_LDDDDDDDDD , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_HISTOGRAM_MERGE_DATA_DATA , 1, 0, HistogramMerge_DATA_DATA, AOUT_AIN_AOPTINx8, ATYPE_DDDDDDDDDD , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MEAN_STD_DEV_MERGE_DATA_DATA , 1, 0, MeanStdDevMerge_DATA_DATA, AOUTx2_AIN_AOPTINx7, ATYPE_SSssssssss , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_MERGE_DATA_DATA , 1, 0, MinMaxMerge_DATA_DATA, AOUTx3_AIN_AOPTINx6, ATYPE_SSmmmmmmmm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_NONE_COUNT_MIN , 1, 0, MinMaxLoc_DATA_U8DATA_Loc_None_Count_Min, AOUT_AINx2, ATYPE_SIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_NONE_COUNT_MAX , 1, 0, MinMaxLoc_DATA_U8DATA_Loc_None_Count_Max, AOUT_AINx2, ATYPE_SIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_NONE_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_U8DATA_Loc_None_Count_MinMax, AOUTx2_AINx2, ATYPE_SSIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MIN_COUNT_MIN , 1, 0, MinMaxLoc_DATA_U8DATA_Loc_Min_Count_Min, AOUT_AOPTOUT_AINx2, ATYPE_ASIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MIN_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_U8DATA_Loc_Min_Count_MinMax, AOUT_AOPTOUTx2_AINx2, ATYPE_ASSIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MAX_COUNT_MAX , 1, 0, MinMaxLoc_DATA_U8DATA_Loc_Max_Count_Max, AOUT_AOPTOUT_AINx2, ATYPE_ASIm , 
KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MAX_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_U8DATA_Loc_Max_Count_MinMax, AOUT_AOPTOUTx2_AINx2, ATYPE_ASSIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MINMAX_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_U8DATA_Loc_MinMax_Count_MinMax, AOUTx2_AOPTOUTx2_AINx2, ATYPE_AASSIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_NONE_COUNT_MIN , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_None_Count_Min, AOUT_AINx2, ATYPE_SIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_NONE_COUNT_MAX , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_None_Count_Max, AOUT_AINx2, ATYPE_SIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_NONE_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_None_Count_MinMax, AOUTx2_AINx2, ATYPE_SSIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MIN_COUNT_MIN , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_Min_Count_Min, AOUT_AOPTOUT_AINx2, ATYPE_ASIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MIN_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_Min_Count_MinMax, AOUT_AOPTOUTx2_AINx2, ATYPE_ASSIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MAX_COUNT_MAX , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_Max_Count_Max, AOUT_AOPTOUT_AINx2, ATYPE_ASIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MAX_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_Max_Count_MinMax, AOUT_AOPTOUTx2_AINx2, ATYPE_ASSIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MINMAX_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_MinMax_Count_MinMax, AOUTx2_AOPTOUTx2_AINx2, ATYPE_AASSIm , KOP_UNKNOWN ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_MERGE_DATA_DATA , 1, 0, MinMaxLocMerge_DATA_DATA, AOUTx2_AIN_AOPTINx7, ATYPE_SAAAAAAAAA , KOP_UNKNOWN ), +#undef AGO_KERNEL_ENTRY +#undef OVX_KERNEL_ENTRY +}; +size_t ago_kernel_count = sizeof(ago_kernel_list) / sizeof(ago_kernel_list[0]); + +int agoPublishKernels(AgoContext * acontext) +{ + int ovxKernelCount = 0; + int agoKernelCount = 0, agoKernelCountCpu = 0, agoKernelCountGpu = 0; + for (vx_size i = 0; i < ago_kernel_count; i++) + { + AgoKernel * kernel = new AgoKernel; + agoResetReference(&kernel->ref, VX_TYPE_KERNEL, acontext, NULL); + kernel->id = ago_kernel_list[i].id; + kernel->func = ago_kernel_list[i].func; + kernel->flags = ago_kernel_list[i].flags; + kernel->kernOpType = ago_kernel_list[i].kernOpType; + kernel->kernOpInfo = ago_kernel_list[i].kernOpInfo; + kernel->finalized = true; + kernel->ref.internal_count = 1; + strcpy(kernel->name, ago_kernel_list[i].name); + memcpy(kernel->argConfig, ago_kernel_list[i].argConfig, sizeof(kernel->argConfig)); + kernel->argCount = 0; + for (vx_uint32 j = 0; j < AGO_MAX_PARAMS; j++) { + // if arg[j] is valid, then there are atleast j+1 arguments + if (kernel->argConfig[j]) + kernel->argCount = j + 1; + } + memcpy(kernel->argType, ago_kernel_list[i].argType, sizeof(kernel->argType)); + for (vx_uint32 j = 0; j < kernel->argCount; j++) { + // initialize for vx_parameter use + agoResetReference(&kernel->parameters[j].ref, VX_TYPE_PARAMETER, acontext, &kernel->ref); + kernel->parameters[j].index = j; + kernel->parameters[j].direction = VX_INPUT; + if (kernel->argConfig[j] & AGO_KERNEL_ARG_OUTPUT_FLAG) + kernel->parameters[j].direction = (kernel->argConfig[j] & AGO_KERNEL_ARG_INPUT_FLAG) ? 
VX_BIDIRECTIONAL : VX_OUTPUT; + kernel->parameters[j].type = ago_kernel_list[i].argType[j]; + kernel->parameters[j].state = (kernel->argConfig[j] & AGO_KERNEL_ARG_OPTIONAL_FLAG) ? VX_PARAMETER_STATE_OPTIONAL : VX_PARAMETER_STATE_REQUIRED; + kernel->parameters[j].scope = &kernel->ref; + } + agoAddKernel(&acontext->kernelList, kernel); + int kernelGroup = kernel->flags & AGO_KERNEL_FLAG_GROUP_MASK; + if (kernelGroup == AGO_KERNEL_FLAG_GROUP_OVX10) ovxKernelCount++; + else if (kernelGroup == AGO_KERNEL_FLAG_GROUP_AMDLL) { + agoKernelCount++; + if (kernel->flags & AGO_KERNEL_FLAG_DEVICE_CPU) agoKernelCountCpu++; + if (kernel->flags & AGO_KERNEL_FLAG_DEVICE_GPU) agoKernelCountGpu++; + } + } +#if ENABLE_DEBUG_MESSAGES + printf("OK: ago imported %d(VX) + %d(AMD:[cpu-%d][gpu-%d]) kernels\n", ovxKernelCount, agoKernelCount, agoKernelCountCpu, agoKernelCountGpu); +#endif + return 0; +} diff --git a/openvx/ago/ago_kernels.h b/openvx/ago/ago_kernels.h new file mode 100644 index 0000000..9f503b4 --- /dev/null +++ b/openvx/ago/ago_kernels.h @@ -0,0 +1,444 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#ifndef __ago_kernel_h__ +#define __ago_kernel_h__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum vx_kernel_amd_e { + VX_KERNEL_AMD_INVALID = VX_KERNEL_BASE(VX_ID_AMD, VX_LIBRARY_KHR_BASE) + 0x0, + + // kernel enumeration naming: + // VX_KERNEL_AMD____ + + // Element-wise 1-channel: U8 = const (2) + VX_KERNEL_AMD_SET_00_U8, + VX_KERNEL_AMD_SET_FF_U8, + + // Element-wise 1-channel: U8 = op U8 (6) + VX_KERNEL_AMD_NOT_U8_U8, + VX_KERNEL_AMD_LUT_U8_U8, + VX_KERNEL_AMD_THRESHOLD_U8_U8_BINARY, + VX_KERNEL_AMD_THRESHOLD_U8_U8_RANGE, + VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_BINARY, + VX_KERNEL_AMD_THRESHOLD_NOT_U8_U8_RANGE, + + // Element-wise 1-channel: U8 = op U1 (1) + VX_KERNEL_AMD_NOT_U8_U1, + + // Element-wise 1-channel: U1 = op U8 (5) + VX_KERNEL_AMD_THRESHOLD_U1_U8_BINARY, + VX_KERNEL_AMD_THRESHOLD_U1_U8_RANGE, + VX_KERNEL_AMD_THRESHOLD_NOT_U1_U8_BINARY, + VX_KERNEL_AMD_THRESHOLD_NOT_U1_U8_RANGE, + VX_KERNEL_AMD_NOT_U1_U8, + + // Element-wise 1-channel: U1 = op U1 (1) + VX_KERNEL_AMD_NOT_U1_U1, + + // Element-wise 1-channel: U8 = op S16 (2) + VX_KERNEL_AMD_COLOR_DEPTH_U8_S16_WRAP, + VX_KERNEL_AMD_COLOR_DEPTH_U8_S16_SAT, + + // Element-wise 1-channel: S16 = op U8 (3) + VX_KERNEL_AMD_COLOR_DEPTH_S16_U8, + VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GX, + VX_KERNEL_AMD_SOBEL_S16_U8_3x3_GY, + + // Element-wise 1-channel: U8 = U8 op U8 (16) + VX_KERNEL_AMD_ADD_U8_U8U8_WRAP, + VX_KERNEL_AMD_ADD_U8_U8U8_SAT, + VX_KERNEL_AMD_SUB_U8_U8U8_WRAP, + VX_KERNEL_AMD_SUB_U8_U8U8_SAT, + VX_KERNEL_AMD_MUL_U8_U8U8_WRAP_TRUNC, + VX_KERNEL_AMD_MUL_U8_U8U8_WRAP_ROUND, + VX_KERNEL_AMD_MUL_U8_U8U8_SAT_TRUNC, + VX_KERNEL_AMD_MUL_U8_U8U8_SAT_ROUND, + VX_KERNEL_AMD_AND_U8_U8U8, + VX_KERNEL_AMD_OR_U8_U8U8, + VX_KERNEL_AMD_XOR_U8_U8U8, + VX_KERNEL_AMD_NAND_U8_U8U8, + VX_KERNEL_AMD_NOR_U8_U8U8, + VX_KERNEL_AMD_XNOR_U8_U8U8, + VX_KERNEL_AMD_ABS_DIFF_U8_U8U8, + VX_KERNEL_AMD_ACCUMULATE_WEIGHTED_U8_U8U8, + + // Element-wise 1-channel: U8 = U8 op U1 (6) + VX_KERNEL_AMD_AND_U8_U8U1, + VX_KERNEL_AMD_OR_U8_U8U1, + VX_KERNEL_AMD_XOR_U8_U8U1, + VX_KERNEL_AMD_NAND_U8_U8U1, + VX_KERNEL_AMD_NOR_U8_U8U1, + VX_KERNEL_AMD_XNOR_U8_U8U1, + + // Element-wise 1-channel: U8 = U1 op U8 (6) + VX_KERNEL_AMD_AND_U8_U1U8, + VX_KERNEL_AMD_OR_U8_U1U8, + VX_KERNEL_AMD_XOR_U8_U1U8, + VX_KERNEL_AMD_NAND_U8_U1U8, + VX_KERNEL_AMD_NOR_U8_U1U8, + VX_KERNEL_AMD_XNOR_U8_U1U8, + + // Element-wise 1-channel: U8 = U1 op U1 (6) + VX_KERNEL_AMD_AND_U8_U1U1, + VX_KERNEL_AMD_OR_U8_U1U1, + VX_KERNEL_AMD_XOR_U8_U1U1, + VX_KERNEL_AMD_NAND_U8_U1U1, + VX_KERNEL_AMD_NOR_U8_U1U1, + VX_KERNEL_AMD_XNOR_U8_U1U1, + + // Element-wise 1-channel: U1 = U8 op U8 (6) + VX_KERNEL_AMD_AND_U1_U8U8, + VX_KERNEL_AMD_OR_U1_U8U8, + VX_KERNEL_AMD_XOR_U1_U8U8, + VX_KERNEL_AMD_NAND_U1_U8U8, + VX_KERNEL_AMD_NOR_U1_U8U8, + VX_KERNEL_AMD_XNOR_U1_U8U8, + + // Element-wise 1-channel: U1 = U8 op U1 (6) + VX_KERNEL_AMD_AND_U1_U8U1, + VX_KERNEL_AMD_OR_U1_U8U1, + VX_KERNEL_AMD_XOR_U1_U8U1, + VX_KERNEL_AMD_NAND_U1_U8U1, + VX_KERNEL_AMD_NOR_U1_U8U1, + VX_KERNEL_AMD_XNOR_U1_U8U1, + + // Element-wise 1-channel: U1 = U1 op U8 (6) + VX_KERNEL_AMD_AND_U1_U1U8, + VX_KERNEL_AMD_OR_U1_U1U8, + VX_KERNEL_AMD_XOR_U1_U1U8, + VX_KERNEL_AMD_NAND_U1_U1U8, + VX_KERNEL_AMD_NOR_U1_U1U8, + VX_KERNEL_AMD_XNOR_U1_U1U8, + + // Element-wise 1-channel: U1 = U1 op U1 (6) + VX_KERNEL_AMD_AND_U1_U1U1, + VX_KERNEL_AMD_OR_U1_U1U1, + VX_KERNEL_AMD_XOR_U1_U1U1, + VX_KERNEL_AMD_NAND_U1_U1U1, + VX_KERNEL_AMD_NOR_U1_U1U1, + VX_KERNEL_AMD_XNOR_U1_U1U1, + + // Element-wise 1-channel: S16 = U8 op U8 (6) + 
VX_KERNEL_AMD_ADD_S16_U8U8, + VX_KERNEL_AMD_SUB_S16_U8U8, + VX_KERNEL_AMD_MUL_S16_U8U8_WRAP_TRUNC, + VX_KERNEL_AMD_MUL_S16_U8U8_WRAP_ROUND, + VX_KERNEL_AMD_MUL_S16_U8U8_SAT_TRUNC, + VX_KERNEL_AMD_MUL_S16_U8U8_SAT_ROUND, + + // Element-wise 1-channel: S16 = U8 op S16 (2) + VX_KERNEL_AMD_SUB_S16_U8S16_WRAP, + VX_KERNEL_AMD_SUB_S16_U8S16_SAT, + + // Element-wise 1-channel: S16 = S16 op U8 (10) + VX_KERNEL_AMD_ADD_S16_S16U8_WRAP, + VX_KERNEL_AMD_ADD_S16_S16U8_SAT, + VX_KERNEL_AMD_ACCUMULATE_S16_S16U8_SAT, + VX_KERNEL_AMD_SUB_S16_S16U8_WRAP, + VX_KERNEL_AMD_SUB_S16_S16U8_SAT, + VX_KERNEL_AMD_MUL_S16_S16U8_WRAP_TRUNC, + VX_KERNEL_AMD_MUL_S16_S16U8_WRAP_ROUND, + VX_KERNEL_AMD_MUL_S16_S16U8_SAT_TRUNC, + VX_KERNEL_AMD_MUL_S16_S16U8_SAT_ROUND, + VX_KERNEL_AMD_ACCUMULATE_SQUARED_S16_S16U8_SAT, + + // Element-wise 1-channel: S16 = S16 op S16 (10) + VX_KERNEL_AMD_ABS_DIFF_S16_S16S16_SAT, + VX_KERNEL_AMD_ADD_S16_S16S16_WRAP, + VX_KERNEL_AMD_ADD_S16_S16S16_SAT, + VX_KERNEL_AMD_SUB_S16_S16S16_WRAP, + VX_KERNEL_AMD_SUB_S16_S16S16_SAT, + VX_KERNEL_AMD_MUL_S16_S16S16_WRAP_TRUNC, + VX_KERNEL_AMD_MUL_S16_S16S16_WRAP_ROUND, + VX_KERNEL_AMD_MUL_S16_S16S16_SAT_TRUNC, + VX_KERNEL_AMD_MUL_S16_S16S16_SAT_ROUND, + VX_KERNEL_AMD_MAGNITUDE_S16_S16S16, + + // Element-wise 1-channel: U8 = S16 op S16 (1) + VX_KERNEL_AMD_PHASE_U8_S16S16, + + // Element-wise n-channel: U8 = op U8 (1) + VX_KERNEL_AMD_CHANNEL_COPY_U8_U8, + + // Element-wise n-channel: U8 = op U1 (1) + VX_KERNEL_AMD_CHANNEL_COPY_U8_U1, + + // Element-wise n-channel: U1 = op U8 (1) + VX_KERNEL_AMD_CHANNEL_COPY_U1_U8, + + // Element-wise n-channel: U1 = op U1 (1) + VX_KERNEL_AMD_CHANNEL_COPY_U1_U1, + + // Element-wise n-channel: U8 = op U16 (2) + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS0, + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U16_POS1, + + // Element-wise n-channel: U8 = op U24 (4) + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS0, + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS1, + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U24_POS2, + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8U8U8_U24, + + // Element-wise n-channel: U8 = op U32 (6) + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS0, + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS1, + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS2, + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8_U32_POS3, + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8U8U8_U32, + VX_KERNEL_AMD_CHANNEL_EXTRACT_U8U8U8U8_U32, + + // Element-wise n-channel: U16 = U8 op U8 (1) + VX_KERNEL_AMD_CHANNEL_COMBINE_U16_U8U8, + + // Element-wise n-channel: U24 = U8 op U8 op U8 (1) + VX_KERNEL_AMD_CHANNEL_COMBINE_U24_U8U8U8_RGB, + + // Element-wise n-channel: U32 = U8 op U8 op U8 (2) + VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_UYVY, + VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8_YUYV, + + // Element-wise n-channel: U32 = U8 op U8 op U8 op U8 (1) + VX_KERNEL_AMD_CHANNEL_COMBINE_U32_U8U8U8U8_RGBX, + + // Element-wise n-channel: U24/U32 = U24/U32 op U8 (2) + VX_KERNEL_AMD_MUL_U24_U24U8_SAT_ROUND, + VX_KERNEL_AMD_MUL_U32_U32U8_SAT_ROUND, + + // Element-wise n-channel: arbitrary (40) + VX_KERNEL_AMD_COLOR_CONVERT_RGB_RGBX, + VX_KERNEL_AMD_COLOR_CONVERT_RGB_UYVY, + VX_KERNEL_AMD_COLOR_CONVERT_RGB_YUYV, + VX_KERNEL_AMD_COLOR_CONVERT_RGB_IYUV, + VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV12, + VX_KERNEL_AMD_COLOR_CONVERT_RGB_NV21, + VX_KERNEL_AMD_COLOR_CONVERT_RGBX_RGB, + VX_KERNEL_AMD_COLOR_CONVERT_RGBX_UYVY, + VX_KERNEL_AMD_COLOR_CONVERT_RGBX_YUYV, + VX_KERNEL_AMD_COLOR_CONVERT_RGBX_IYUV, + VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV12, + VX_KERNEL_AMD_COLOR_CONVERT_RGBX_NV21, + VX_KERNEL_AMD_COLOR_CONVERT_YUV4_RGB, + VX_KERNEL_AMD_COLOR_CONVERT_YUV4_RGBX, + 
VX_KERNEL_AMD_SCALE_UP_2x2_U8_U8, // IUV 4:2:0 to 4:4:4 format conversion + VX_KERNEL_AMD_FORMAT_CONVERT_UV_UV12, // UV12 4:2:0 to 4:4:4 format conversion + VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGB, + VX_KERNEL_AMD_COLOR_CONVERT_IYUV_RGBX, + VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_UYVY, + VX_KERNEL_AMD_FORMAT_CONVERT_IYUV_YUYV, + VX_KERNEL_AMD_FORMAT_CONVERT_IUV_UV12, // UV 4:2:0 from IYUV to NV12 + VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGB, + VX_KERNEL_AMD_COLOR_CONVERT_NV12_RGBX, + VX_KERNEL_AMD_FORMAT_CONVERT_NV12_UYVY, + VX_KERNEL_AMD_FORMAT_CONVERT_NV12_YUYV, + VX_KERNEL_AMD_FORMAT_CONVERT_UV12_IUV, // UV 4:2:0 from NV12 to IYUV + VX_KERNEL_AMD_COLOR_CONVERT_Y_RGB, // Y plane + VX_KERNEL_AMD_COLOR_CONVERT_Y_RGBX, // Y plane + VX_KERNEL_AMD_COLOR_CONVERT_U_RGB, // U plane + VX_KERNEL_AMD_COLOR_CONVERT_U_RGBX, // U plane + VX_KERNEL_AMD_COLOR_CONVERT_V_RGB, // V plane + VX_KERNEL_AMD_COLOR_CONVERT_V_RGBX, // V plane + VX_KERNEL_AMD_COLOR_CONVERT_IU_RGB, // U plane in IYUV 4:2:0 + VX_KERNEL_AMD_COLOR_CONVERT_IU_RGBX, // U plane in IYUV 4:2:0 + VX_KERNEL_AMD_COLOR_CONVERT_IV_RGB, // V plane in IYUV 4:2:0 + VX_KERNEL_AMD_COLOR_CONVERT_IV_RGBX, // V plane in IYUV 4:2:0 + VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGB, // U & V planes in IYUV 4:2:0 + VX_KERNEL_AMD_COLOR_CONVERT_IUV_RGBX, // U & V planes in IYUV 4:2:0 + VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGB, // UV plane in NV12 4:2:0 + VX_KERNEL_AMD_COLOR_CONVERT_UV12_RGBX, // UV plane in NV12 4:2:0 + + // Fixed Neighbors: U8 = op U8 (16) + VX_KERNEL_AMD_BOX_U8_U8_3x3, + VX_KERNEL_AMD_DILATE_U8_U8_3x3, + VX_KERNEL_AMD_ERODE_U8_U8_3x3, + VX_KERNEL_AMD_MEDIAN_U8_U8_3x3, + VX_KERNEL_AMD_GAUSSIAN_U8_U8_3x3, + VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_3x3, + VX_KERNEL_AMD_SCALE_GAUSSIAN_HALF_U8_U8_5x5, + VX_KERNEL_AMD_SCALE_GAUSSIAN_ORB_U8_U8_5x5, + VX_KERNEL_AMD_SOBEL_PHASE_U8_U8_3x3, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_3x3_L1NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_3x3_L2NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_5x5_L1NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_5x5_L2NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_7x7_L1NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8_U8_7x7_L2NORM, + VX_KERNEL_AMD_CONVOLVE_U8_U8, + + // Fixed Neighbors: S16 = op U8 (2) + VX_KERNEL_AMD_CONVOLVE_S16_U8, + VX_KERNEL_AMD_SOBEL_MAGNITUDE_S16_U8_3x3, + + // Fixed Neighbors: S16U8 = op U8 (1) + VX_KERNEL_AMD_SOBEL_MAGNITUDE_PHASE_S16U8_U8_3x3, + + // Fixed Neighbors: S16S16 = op U8 (1) + VX_KERNEL_AMD_SOBEL_S16S16_U8_3x3_GXY, + + // Fixed Neighbors: F32/S16/U8 = op F32/S16/U8 (1) + VX_KERNEL_AMD_LINEAR_FILTER_ANY_ANY, + + // Fixed Neighbors: F32/S16/U8 = op U8 (1) + VX_KERNEL_AMD_LINEAR_FILTER_ANYx2_ANY, + + // Fixed Neighbors: U1 = op U8 (2) + VX_KERNEL_AMD_DILATE_U1_U8_3x3, + VX_KERNEL_AMD_ERODE_U1_U8_3x3, + + // Fixed Neighbors: U1 = op U1 (2) + VX_KERNEL_AMD_DILATE_U1_U1_3x3, + VX_KERNEL_AMD_ERODE_U1_U1_3x3, + + // Fixed Neighbors: U8 = op U1 (2) + VX_KERNEL_AMD_DILATE_U8_U1_3x3, + VX_KERNEL_AMD_ERODE_U8_U1_3x3, + + // Fixed Neighbors: XY = op U8 (2) + VX_KERNEL_AMD_FAST_CORNERS_XY_U8_SUPRESSION, + VX_KERNEL_AMD_FAST_CORNERS_XY_U8_NOSUPRESSION, + + // Fixed Neighbors: HG3 = op U8 (3) + VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_3x3, + VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_5x5, + VX_KERNEL_AMD_HARRIS_SOBEL_HG3_U8_7x7, + + // Fixed Neighbors: Fxy = op HG3 (3) + VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_3x3, + VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_5x5, + VX_KERNEL_AMD_HARRIS_SCORE_HVC_HG3_7x7, + + // Fixed Neighbors: U8xy = op U8 (6) + 
VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_3x3_L1NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_3x3_L2NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_5x5_L1NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_5x5_L2NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_7x7_L1NORM, + VX_KERNEL_AMD_CANNY_SOBEL_SUPP_THRESHOLD_U8XY_U8_7x7_L2NORM, + + // Fixed Neighbors: U16(mag,phase) = op U8 (6) + VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L1NORM, + VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_3x3_L2NORM, + VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L1NORM, + VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_5x5_L2NORM, + VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L1NORM, + VX_KERNEL_AMD_CANNY_SOBEL_U16_U8_7x7_L2NORM, + + // Fixed Neighbors: U8 = U16(mag,phase) (1) + VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8_U16_3x3, + + // Fixed Neighbors: U8xy = U16(mag,phase) (1) + VX_KERNEL_AMD_CANNY_SUPP_THRESHOLD_U8XY_U16_3x3, + + // Fixed Neighbors: xy = ANY (1) + VX_KERNEL_AMD_NON_MAX_SUPP_XY_ANY_3x3, + + // Arbitrary Neighbors: U8 = op U8 (20) + VX_KERNEL_AMD_REMAP_U8_U8_NEAREST, + VX_KERNEL_AMD_REMAP_U8_U8_NEAREST_CONSTANT, + VX_KERNEL_AMD_REMAP_U8_U8_BILINEAR, + VX_KERNEL_AMD_REMAP_U8_U8_BILINEAR_CONSTANT, + VX_KERNEL_AMD_WARP_AFFINE_U8_U8_NEAREST, + VX_KERNEL_AMD_WARP_AFFINE_U8_U8_NEAREST_CONSTANT, + VX_KERNEL_AMD_WARP_AFFINE_U8_U8_BILINEAR, + VX_KERNEL_AMD_WARP_AFFINE_U8_U8_BILINEAR_CONSTANT, + VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_NEAREST, + VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_NEAREST_CONSTANT, + VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_BILINEAR, + VX_KERNEL_AMD_WARP_PERSPECTIVE_U8_U8_BILINEAR_CONSTANT, + VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_NEAREST, + VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR, + VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR_REPLICATE, + VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_BILINEAR_CONSTANT, + VX_KERNEL_AMD_SCALE_IMAGE_U8_U8_AREA, + VX_KERNEL_AMD_REMAP_U24_U24_BILINEAR, + VX_KERNEL_AMD_REMAP_U24_U32_BILINEAR, + VX_KERNEL_AMD_REMAP_U32_U32_BILINEAR, + + // Point Neighbors: XY = op XY (4) + VX_KERNEL_AMD_OPTICAL_FLOW_PYR_LK_XY_XY, + VX_KERNEL_AMD_OPTICAL_FLOW_PREPARE_LK_XY_XY, + VX_KERNEL_AMD_OPTICAL_FLOW_IMAGE_LK_XY_XY, + VX_KERNEL_AMD_OPTICAL_FLOW_FINAL_LK_XY_XY, + + // Sequential: XY = op HVC (1) + VX_KERNEL_AMD_HARRIS_MERGE_SORT_AND_PICK_XY_HVC, + + // Sequential: XY = op XY (2) + VX_KERNEL_AMD_FAST_CORNER_MERGE_XY_XY, + VX_KERNEL_AMD_HARRIS_MERGE_SORT_AND_PICK_XY_XYS, + + // Sequential: U8 = op U8 (1) + VX_KERNEL_AMD_CANNY_EDGE_TRACE_U8_U8, + + // Sequential: U8 = op U8XY (1) + VX_KERNEL_AMD_CANNY_EDGE_TRACE_U8_U8XY, + + // Sequential: U32 = op U8 (1) + VX_KERNEL_AMD_INTEGRAL_IMAGE_U32_U8, + + // Sequential: DATA = op U8 (3) + VX_KERNEL_AMD_HISTOGRAM_DATA_U8, + VX_KERNEL_AMD_MEAN_STD_DEV_DATA_U8, + VX_KERNEL_AMD_MIN_MAX_DATA_U8, + + // Sequential: DATA = op S16 (1) + VX_KERNEL_AMD_MIN_MAX_DATA_S16, + + // Sequential: DATA = op DATA (1) + VX_KERNEL_AMD_EQUALIZE_DATA_DATA, + + // Sequential: DATA = DATA op DATA (20) + VX_KERNEL_AMD_HISTOGRAM_MERGE_DATA_DATA, + VX_KERNEL_AMD_MEAN_STD_DEV_MERGE_DATA_DATA, + VX_KERNEL_AMD_MIN_MAX_MERGE_DATA_DATA, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_NONE_COUNT_MIN, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_NONE_COUNT_MAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_NONE_COUNT_MINMAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MIN_COUNT_MIN, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MIN_COUNT_MINMAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MAX_COUNT_MAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MAX_COUNT_MINMAX, + 
VX_KERNEL_AMD_MIN_MAX_LOC_DATA_U8DATA_LOC_MINMAX_COUNT_MINMAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_NONE_COUNT_MIN, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_NONE_COUNT_MAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_NONE_COUNT_MINMAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MIN_COUNT_MIN, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MIN_COUNT_MINMAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MAX_COUNT_MAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MAX_COUNT_MINMAX, + VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MINMAX_COUNT_MINMAX, + VX_KERNEL_AMD_MIN_MAX_LOC_MERGE_DATA_DATA, + + VX_KERNEL_AMD_MAX_1_0, // Used for bounds checking in the internal conformance test +}; + +#ifdef __cplusplus +} +#endif + +#endif // __ago_kernel_h__ diff --git a/openvx/ago/ago_platform.cpp b/openvx/ago/ago_platform.cpp new file mode 100644 index 0000000..eaa6395 --- /dev/null +++ b/openvx/ago/ago_platform.cpp @@ -0,0 +1,228 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_platform.h" + +// macro to port VisualStudio __cpuid to g++ +#if !_WIN32 +#define __cpuid(out, infoType) asm("cpuid": "=a" (out[0]), "=b" (out[1]), "=c" (out[2]), "=d" (out[3]): "a" (infoType)); +#endif + +#if _WIN32 && ENABLE_OPENCL +#pragma comment(lib, "OpenCL.lib") +#endif + +bool agoIsCpuHardwareSupported() +{ + bool isHardwareSupported = false; + int CPUInfo[4] = { -1 }; + __cpuid(CPUInfo, 0); + if (CPUInfo[0] > 1) { + __cpuid(CPUInfo, 1); + // check for SSE4.2 support + if (CPUInfo[2] & 0x100000) + isHardwareSupported = true; + } + return isHardwareSupported; +} + +uint32_t agoControlFpSetRoundEven() +{ + uint32_t state; +#if _WIN32 + state = _controlfp(0, 0); + _controlfp(_RC_NEAR, _MCW_RC); // round to nearest even: RC_CHOP gives matching output with sample code + return state; +#else + state = fegetround(); + fesetround(FE_TONEAREST); +#endif + return state; +} + +void agoControlFpReset(uint32_t state) +{ +#if _WIN32 + _controlfp(state, _MCW_RC); +#else + fesetround(state); +#endif +} + +bool agoGetEnvironmentVariable(const char * name, char * value, size_t valueSize) +{ +#if _WIN32 + DWORD len = GetEnvironmentVariableA(name, value, (DWORD)valueSize); + value[valueSize-1] = 0; + return (len > 0) ? true : false; +#else + const char * v = getenv(name); + if (v) { + strncpy(value, v, valueSize); + value[valueSize-1] = 0; + } + return v ? 
true : false; +#endif +} + +ago_module agoOpenModule(const char * libFileName) +{ +#if _WIN32 + return (ago_module)LoadLibraryA(libFileName); +#else + return (ago_module) dlopen(libFileName, RTLD_NOW | RTLD_LOCAL); +#endif +} + +void * agoGetFunctionAddress(ago_module module, const char * functionName) +{ +#if _WIN32 + return GetProcAddress((HMODULE)module, functionName); +#else + return dlsym(module, functionName); +#endif +} + +void agoCloseModule(ago_module module) +{ +#if _WIN32 + FreeLibrary((HMODULE)module); +#else + dlclose(module); +#endif +} + +int64_t agoGetClockCounter() +{ +#if _WIN32 + LARGE_INTEGER v; + QueryPerformanceCounter(&v); + return v.QuadPart; +#else + return chrono::high_resolution_clock::now().time_since_epoch().count(); +#endif +} + +int64_t agoGetClockFrequency() +{ +#if _WIN32 + LARGE_INTEGER v; + QueryPerformanceFrequency(&v); + return v.QuadPart; +#else + return chrono::high_resolution_clock::period::den / chrono::high_resolution_clock::period::num; +#endif +} + +#if __linux__ +#include "ago_internal.h" + +#include +#include +#include +#include + +#define VX_SEMAPHORE 1 +#define VX_THREAD 2 + +typedef struct { + int type; // should be VX_SEMAPHORE + int count; + mutex mtx; + condition_variable cv; +} vx_semaphore; + +// TBD +void EnterCriticalSection(CRITICAL_SECTION cs) +{ +} +void LeaveCriticalSection(CRITICAL_SECTION cs) +{ +} +void InitializeCriticalSection(CRITICAL_SECTION cs) +{ +} +void DeleteCriticalSection(CRITICAL_SECTION cs) +{ +} + +HANDLE CreateSemaphore(void *, LONG, LONG, void *) +{ + vx_semaphore * sem = new vx_semaphore; + sem->type = VX_SEMAPHORE; + sem->count = 0; + return sem; +} +HANDLE CreateThread(void *, size_t dwStackSize, LPTHREAD_START_ROUTINE lpStartAddress, LPVOID lpParameter, DWORD dwCreationFlags, void *) +{ + return nullptr; +} +void CloseHandle(HANDLE h) +{ + if(h) { + if(*(int*)h == VX_SEMAPHORE) { + vx_semaphore * sem = (vx_semaphore *)h; + sem->type = 0; + delete sem; + } + else if(*(int*)h == VX_THREAD) { + // TBD + } + } +} +DWORD WaitForSingleObject(HANDLE h, DWORD dwMilliseconds) +{ + if(h) { + if(*(int*)h == VX_SEMAPHORE) { + vx_semaphore * sem = (vx_semaphore *)h; + { + unique_lock lk(sem->mtx); + sem->cv.wait(lk); // TBD: implement with timeout + } + { + lock_guard lk(sem->mtx); + sem->count--; + } + } + } + return 0; +} +BOOL ReleaseSemaphore(HANDLE h, LONG lReleaseCount, LPLONG lpPreviousCount) +{ + if(h) { + if(*(int*)h == VX_SEMAPHORE) { + vx_semaphore * sem = (vx_semaphore *)h; + { + lock_guard lk(sem->mtx); + if(lpPreviousCount) *lpPreviousCount = sem->count; + sem->count += lReleaseCount; + } + for(LONG i = 0; i < lReleaseCount; i++) { + sem->cv.notify_one(); + } + } + } + return 0; +} + +#endif diff --git a/openvx/ago/ago_platform.h b/openvx/ago/ago_platform.h new file mode 100644 index 0000000..67b8e14 --- /dev/null +++ b/openvx/ago/ago_platform.h @@ -0,0 +1,137 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __ago_platform_h__ +#define __ago_platform_h__ + +// OpenCL: enabled unless disabled explicitly by setting ENABLE_OPENCL=0 +#ifndef ENABLE_OPENCL +#define ENABLE_OPENCL 1 +#endif + +#define _CRT_SECURE_NO_WARNINGS +#define _USE_MATH_DEFINES +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std; + +#if _WIN32 +#include +#include +#else +#include +#endif + +#if ENABLE_OPENCL +#include +#endif + +// platform specific shared library file extension +#if _WIN32 +#define SHARED_LIBRARY_PREFIX "" +#define SHARED_LIBRARY_EXTENSION ".dll" +#elif __APPLE__ +#define SHARED_LIBRARY_PREFIX "lib" +#define SHARED_LIBRARY_EXTENSION ".dylib" +#else +#define SHARED_LIBRARY_PREFIX "lib" +#define SHARED_LIBRARY_EXTENSION ".so" +#endif + +// platform specific alignment attributes +#if _WIN32 +#define DECL_ALIGN(n) __declspec(align(n)) +#define ATTR_ALIGN(n) +#else +#define DECL_ALIGN(n) +#define ATTR_ALIGN(n) __attribute__((aligned(n))) +#endif + +// macro to port VisualStudio m128i fields of __m128i to g++ +#if _WIN32 +#define M128I(m128i_register) m128i_register +#else +#define M128I(m128i_register) (*((_m128i_union*)&m128i_register)) +typedef union { + char m128i_i8[16]; + short m128i_i16[8]; + int m128i_i32[4]; + long long m128i_i64[2]; + unsigned char m128i_u8[16]; + unsigned short m128i_u16[8]; + unsigned int m128i_u32[4]; + unsigned long long m128i_u64[2]; +} _m128i_union; +#endif + +// platform independent data types +typedef struct _ago_module * ago_module; + +// platform independent functions +bool agoIsCpuHardwareSupported(); +uint32_t agoControlFpSetRoundEven(); +void agoControlFpReset(uint32_t state); +int64_t agoGetClockCounter(); +int64_t agoGetClockFrequency(); +bool agoGetEnvironmentVariable(const char * name, char * value, size_t valueSize); // returns true if success +ago_module agoOpenModule(const char * libFileName); +void * agoGetFunctionAddress(ago_module module, const char * functionName); +void agoCloseModule(ago_module module); + +#if !_WIN32 +typedef void * CRITICAL_SECTION; +typedef void * HANDLE; +typedef unsigned long DWORD; +typedef void * LPVOID; +typedef int BOOL; +typedef long LONG, * LPLONG; +typedef DWORD (*LPTHREAD_START_ROUTINE)(LPVOID lpThreadParameter); +extern void EnterCriticalSection(CRITICAL_SECTION cs); +extern void LeaveCriticalSection(CRITICAL_SECTION cs); +extern void InitializeCriticalSection(CRITICAL_SECTION cs); +extern void DeleteCriticalSection(CRITICAL_SECTION cs); +extern void CloseHandle(HANDLE h); +extern HANDLE CreateSemaphore(void *, LONG, LONG, void *); +extern HANDLE CreateThread(void *, size_t dwStackSize, LPTHREAD_START_ROUTINE lpStartAddress, LPVOID lpParameter, DWORD dwCreationFlags, void *); +extern DWORD WaitForSingleObject(HANDLE hHandle, DWORD dwMilliseconds); +extern BOOL ReleaseSemaphore(HANDLE hSemaphore, LONG lReleaseCount, LPLONG lpPreviousCount); +#define WINAPI +#define INFINITE 0xFFFFFFFF +#define WAIT_OBJECT_0 0 +#endif + +#endif 
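Aside: the block below is an illustrative usage sketch for the platform-abstraction API declared in ago_platform.h above (agoOpenModule, agoGetFunctionAddress, agoCloseModule and the SHARED_LIBRARY_PREFIX / SHARED_LIBRARY_EXTENSION macros); it is not part of the amdovx sources. The entry-point name "anInitFunction" and its int(void) signature are assumptions made purely for the example.

    // Sketch: load a shared library portably and call an assumed entry point.
    #include <stdio.h>
    #include "ago_platform.h"

    typedef int (*ModuleInitFn)(void);   // hypothetical entry-point signature

    int runModuleInit(const char * moduleName)
    {
        // e.g. "libmymodule.so" on Linux, "mymodule.dll" on Windows
        char fileName[256];
        sprintf(fileName, "%s%s%s", SHARED_LIBRARY_PREFIX, moduleName, SHARED_LIBRARY_EXTENSION);
        ago_module module = agoOpenModule(fileName);                  // dlopen / LoadLibraryA under the hood
        if (!module) return -1;
        ModuleInitFn init = (ModuleInitFn)agoGetFunctionAddress(module, "anInitFunction"); // dlsym / GetProcAddress
        int status = init ? init() : -1;
        agoCloseModule(module);                                       // dlclose / FreeLibrary
        return status;
    }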
diff --git a/openvx/ago/ago_util.cpp b/openvx/ago/ago_util.cpp new file mode 100644 index 0000000..7a3f108 --- /dev/null +++ b/openvx/ago/ago_util.cpp @@ -0,0 +1,2776 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" +#include +#include + +// global locks +static vx_bool g_cs_context_initialized = vx_false_e; +static CRITICAL_SECTION g_cs_context; + +// enumeration constants +static struct { const char * name; vx_enum value; vx_size size; } s_table_constants[] = { + { "CHANNEL_0", VX_CHANNEL_0 }, + { "CHANNEL_1", VX_CHANNEL_1 }, + { "CHANNEL_2", VX_CHANNEL_2 }, + { "CHANNEL_3", VX_CHANNEL_3 }, + { "CHANNEL_R", VX_CHANNEL_R }, + { "CHANNEL_G", VX_CHANNEL_G }, + { "CHANNEL_B", VX_CHANNEL_B }, + { "CHANNEL_A", VX_CHANNEL_A }, + { "CHANNEL_Y", VX_CHANNEL_Y }, + { "CHANNEL_U", VX_CHANNEL_U }, + { "CHANNEL_V", VX_CHANNEL_V }, + { "WRAP", VX_CONVERT_POLICY_WRAP }, + { "SATURATE", VX_CONVERT_POLICY_SATURATE }, + { "NEAREST_NEIGHBOR", VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR }, + { "BILINEAR", VX_INTERPOLATION_TYPE_BILINEAR }, + { "AREA", VX_INTERPOLATION_TYPE_AREA }, + { "BINARY", VX_THRESHOLD_TYPE_BINARY }, + { "RANGE", VX_THRESHOLD_TYPE_RANGE }, + { "NORM_L1", VX_NORM_L1 }, + { "NORM_L2", VX_NORM_L2 }, + { "ROUND_POLICY_TO_ZERO", VX_ROUND_POLICY_TO_ZERO }, + { "ROUND_POLICY_TO_NEAREST_EVEN", VX_ROUND_POLICY_TO_NEAREST_EVEN }, + { "CRITERIA_ITERATIONS", VX_TERM_CRITERIA_ITERATIONS }, + { "CRITERIA_EPSILON", VX_TERM_CRITERIA_EPSILON }, + { "CRITERIA_BOTH", VX_TERM_CRITERIA_BOTH }, + { "BORDER_MODE_UNDEFINED", VX_BORDER_MODE_UNDEFINED }, + { "BORDER_MODE_REPLICATE", VX_BORDER_MODE_REPLICATE }, + { "BORDER_MODE_CONSTANT", VX_BORDER_MODE_CONSTANT }, + { "VX_DIRECTIVE_DISABLE_LOGGING", VX_DIRECTIVE_DISABLE_LOGGING }, + { "VX_DIRECTIVE_ENABLE_LOGGING", VX_DIRECTIVE_ENABLE_LOGGING }, + { "VX_DIRECTIVE_READ_ONLY", VX_DIRECTIVE_AMD_READ_ONLY }, + { "RECTANGLE", VX_TYPE_RECTANGLE, sizeof(vx_rectangle_t) }, + { "KEYPOINT", VX_TYPE_KEYPOINT, sizeof(vx_keypoint_t) }, + { "COORDINATES2D", VX_TYPE_COORDINATES2D, sizeof(vx_coordinates2d_t) }, + { "COORDINATES3D", VX_TYPE_COORDINATES3D, sizeof(vx_coordinates3d_t) }, + { "DF_IMAGE", VX_TYPE_DF_IMAGE, sizeof(vx_df_image) }, + { "ENUM", VX_TYPE_ENUM, sizeof(vx_enum) }, + { "UINT64", VX_TYPE_UINT64, sizeof(vx_uint64) }, + { "INT64", VX_TYPE_INT64, sizeof(vx_int64) }, + { "UINT32", VX_TYPE_UINT32, sizeof(vx_uint32) }, + { "INT32", VX_TYPE_INT32, sizeof(vx_int32) }, + { "UINT16", 
VX_TYPE_UINT16, sizeof(vx_uint16) },
+ { "INT16", VX_TYPE_INT16, sizeof(vx_int16) },
+ { "UINT8", VX_TYPE_UINT8, sizeof(vx_uint8) },
+ { "INT8", VX_TYPE_INT8, sizeof(vx_int8) },
+ { "FLOAT32", VX_TYPE_FLOAT32, sizeof(vx_float32) },
+ { "FLOAT64", VX_TYPE_FLOAT64, sizeof(vx_float64) },
+ { "SIZE", VX_TYPE_SIZE, sizeof(vx_size) },
+ { "BOOL", VX_TYPE_BOOL, sizeof(vx_bool) },
+ { "KEYPOINT_XYS", AGO_TYPE_KEYPOINT_XYS, sizeof(ago_keypoint_xys_t) },
+ { "STRING", VX_TYPE_STRING_AMD },
+ // for debug purposes only
+ { "VX_TYPE_LUT", VX_TYPE_LUT },
+ { "VX_TYPE_DISTRIBUTION", VX_TYPE_DISTRIBUTION },
+ { "VX_TYPE_PYRAMID", VX_TYPE_PYRAMID },
+ { "VX_TYPE_THRESHOLD", VX_TYPE_THRESHOLD },
+ { "VX_TYPE_MATRIX", VX_TYPE_MATRIX },
+ { "VX_TYPE_CONVOLUTION", VX_TYPE_CONVOLUTION },
+ { "VX_TYPE_SCALAR", VX_TYPE_SCALAR },
+ { "VX_TYPE_ARRAY", VX_TYPE_ARRAY },
+ { "VX_TYPE_IMAGE", VX_TYPE_IMAGE },
+ { "VX_TYPE_REMAP", VX_TYPE_REMAP },
+ { "VX_TYPE_STRING", VX_TYPE_STRING_AMD },
+ { "AGO_TYPE_MEANSTDDEV_DATA", AGO_TYPE_MEANSTDDEV_DATA },
+ { "AGO_TYPE_MINMAXLOC_DATA", AGO_TYPE_MINMAXLOC_DATA },
+ { "AGO_TYPE_CANNY_STACK", AGO_TYPE_CANNY_STACK },
+ { "AGO_TYPE_SCALE_MATRIX", AGO_TYPE_SCALE_MATRIX },
+ { NULL, 0 }
+};
+
+const char * agoEnum2Name(vx_enum e)
+{
+ for (vx_uint32 i = 0; s_table_constants[i].name; i++) {
+ if (s_table_constants[i].value == e)
+ return s_table_constants[i].name;
+ }
+ return NULL;
+}
+
+size_t agoType2Size(vx_context context, vx_enum type)
+{
+ for (vx_uint32 i = 0; s_table_constants[i].name; i++) {
+ if (s_table_constants[i].value == type)
+ return s_table_constants[i].size;
+ }
+ if (context) {
+ return agoGetUserStructSize(context, type);
+ }
+ return 0;
+}
+
+int agoChannelEnum2Index(vx_enum channel)
+{
+ int index = -1;
+ if (channel >= VX_CHANNEL_0 && channel <= VX_CHANNEL_3)
+ index = channel - VX_CHANNEL_0;
+ else if (channel >= VX_CHANNEL_R && channel <= VX_CHANNEL_A)
+ index = channel - VX_CHANNEL_R;
+ else if (channel >= VX_CHANNEL_Y && channel <= VX_CHANNEL_V)
+ index = channel - VX_CHANNEL_Y;
+ return index;
+}
+
+vx_enum agoName2Enum(const char * name)
+{
+ for (vx_uint32 i = 0; s_table_constants[i].name; i++) {
+ if (!strcmp(name, s_table_constants[i].name))
+ return s_table_constants[i].value;
+ }
+ return 0;
+}
+
+std::vector<std::string> split(const std::string &s, char delim) {
+ std::vector<std::string> elems;
+ std::stringstream ss(s);
+ std::string item;
+ while (std::getline(ss, item, delim)) {
+ elems.push_back(item);
+ }
+ return elems;
+}
+
+std::vector<std::string> split(const char * s_, char delim) {
+ const std::string s(s_);
+ return split(s, delim);
+}
+
+void agoLockGlobalContext()
+{
+ if (!g_cs_context_initialized) {
+ InitializeCriticalSection(&g_cs_context);
+ g_cs_context_initialized = vx_true_e;
+ }
+ EnterCriticalSection(&g_cs_context);
+}
+
+void agoUnlockGlobalContext()
+{
+ LeaveCriticalSection(&g_cs_context);
+}
+
+void * agoAllocMemory(vx_size size)
+{
+ // to keep track of allocations
+ static vx_int32 s_ago_alloc_id_count = 0;
+ // make the buffer allocation 256-bit aligned and add header for debug
+ vx_size size_alloc = size;
+ if (size_alloc & 31) size_alloc += 32 - (size_alloc & 31);
+ size_alloc += sizeof(vx_uint32) + sizeof(AgoAllocInfo) + 32 + AGO_MEMORY_ALLOC_EXTRA_PADDING + AGO_MEMORY_ALLOC_EXTRA_PADDING;
+ if (size_alloc & 31) size_alloc += 32 - (size_alloc & 31);
+ vx_uint8 * mem = (vx_uint8 *)calloc(1, size_alloc); if (!mem) return nullptr;
+ ((vx_uint32 *)mem)[0] = 0xfadedcab; // marker for debug
+ vx_uint8 * mem_aligned = mem + sizeof(vx_uint32) + 
sizeof(AgoAllocInfo) + AGO_MEMORY_ALLOC_EXTRA_PADDING; + mem_aligned += ((((size_t)mem_aligned) & 31) ? (32 - (((size_t)mem_aligned) & 31)) : 0); + AgoAllocInfo * mem_info = &((AgoAllocInfo *)(mem_aligned - AGO_MEMORY_ALLOC_EXTRA_PADDING))[-1]; + mem_info->allocated = mem; + mem_info->requested_size = size; + mem_info->retain_count = 1; + mem_info->allocate_id = s_ago_alloc_id_count++; + return mem_aligned; +} + +void agoRetainMemory(void * mem) +{ + AgoAllocInfo * mem_info = &((AgoAllocInfo *)((vx_uint8 *)mem - AGO_MEMORY_ALLOC_EXTRA_PADDING))[-1]; + // increment retain_count + mem_info->retain_count++; +} + +void agoReleaseMemory(void * mem) +{ + AgoAllocInfo * mem_info = &((AgoAllocInfo *)((vx_uint8 *)mem - AGO_MEMORY_ALLOC_EXTRA_PADDING))[-1]; + // decrement retain_count + mem_info->retain_count--; + if (((vx_uint32 *)mem_info->allocated)[0] != 0xfadedcab) { + agoAddLogEntry(NULL, VX_SUCCESS, "WARNING: agoReleaseMemory: invalid pointer\n"); + } + else if (mem_info->retain_count < 0) { + agoAddLogEntry(NULL, VX_SUCCESS, "WARNING: agoReleaseMemory: detected retain_count=%d for allocate_id=%d with size=%d\n", mem_info->retain_count, mem_info->allocate_id, (vx_uint32)mem_info->requested_size); + } + else if (mem_info->retain_count == 0) { + // free the allocated pointer + free(mem_info->allocated); + } +} + +void agoResetReference(AgoReference * ref, vx_enum type, vx_context context, vx_reference scope) +{ + ref->magic = AGO_MAGIC_VALID; + ref->type = type; + ref->context = context; + ref->scope = scope; + ref->external_count = 0; + ref->internal_count = 0; + ref->read_count = 0; + ref->write_count = 0; + ref->enable_logging = ENABLE_LOG_MESSAGES_DEFAULT; + if (context) ref->enable_logging = context->ref.enable_logging; + if (scope) ref->enable_logging = scope->enable_logging; +} + +void agoAddData(AgoDataList * dataList, AgoData * data) +{ + if (dataList->tail) dataList->tail->next = data; + else dataList->head = data; + dataList->tail = data; + dataList->count++; +} + +void agoAddNode(AgoNodeList * nodeList, AgoNode * node) +{ + if (nodeList->tail) nodeList->tail->next = node; + else nodeList->head = node; + nodeList->tail = node; + nodeList->count++; +} + +void agoAddKernel(AgoKernelList * kernelList, AgoKernel * kernel) +{ + if (kernelList->tail) kernelList->tail->next = kernel; + else kernelList->head = kernel; + kernelList->tail = kernel; + kernelList->count++; +} + +void agoAddGraph(AgoGraphList * graphList, AgoGraph * graph) +{ + if (graphList->tail) graphList->tail->next = graph; + else graphList->head = graph; + graphList->tail = graph; + graphList->count++; +} + +AgoGraph * agoRemoveGraph(AgoGraphList * list, AgoGraph * item) +{ + if (list->head == item) { + if (list->tail == item) + list->head = list->tail = NULL; + else + list->head = item->next; + list->count--; + item->next = 0; + return item; + } + else { + for (AgoGraph * cur = list->head; cur->next; cur = cur->next) { + if (cur->next == item) { + if (list->tail == item) + list->tail = cur; + cur->next = item->next; + list->count--; + item->next = 0; + return item; + } + } + } + return 0; +} + +AgoKernel * agoRemoveKernel(AgoKernelList * list, AgoKernel * item) +{ + if (list->head == item) { + if (list->tail == item) + list->head = list->tail = NULL; + else + list->head = item->next; + list->count--; + item->next = 0; + return item; + } + else { + for (AgoKernel * cur = list->head; cur->next; cur = cur->next) { + if (cur->next == item) { + if (list->tail == item) + list->tail = cur; + cur->next = item->next; + 
list->count--; + item->next = 0; + return item; + } + } + } + return 0; +} + +int agoRemoveNode(AgoNodeList * list, AgoNode * item, bool moveToTrash) +{ + int status = -1; + if (!item) { + return status; + } + if (list->head) { + if (list->head == item) { + if (list->tail == item) + list->head = list->tail = NULL; + else + list->head = item->next; + list->count--; + item->next = 0; + status = 0; + } + else { + for (AgoNode * cur = list->head; cur->next; cur = cur->next) { + if (cur->next == item) { + if (list->tail == item) + list->tail = cur; + cur->next = item->next; + list->count--; + item->next = 0; + status = 0; + break; + } + } + } + } + if (status != 0) { + // check in trash, if it has no external references + if (list->trash) { + for (AgoNode * cur = list->trash; cur == list->trash || cur->next; cur = cur->next) { + if (cur == item || cur->next == item) { + if (cur == item) list->trash = item->next; + else cur->next = item->next; + list->count--; + item->next = 0; + status = 0; + break; + } + } + } + } + if (status == 0) { + if (moveToTrash) { + // still has external references, so keep into trash + item->ref.internal_count = 0; + item->next = list->trash; + list->trash = item; + } + else { + // not needed anymore, just release it + delete item; + } + } + return status; +} + +int agoRemoveData(AgoDataList * list, AgoData * item, AgoData ** trash) +{ + int status = -1; + if (!item) { + return status; + } + if (list->head == item) { + if (list->tail == item) + list->head = list->tail = NULL; + else + list->head = item->next; + list->count--; + item->next = 0; + status = 0; + } + else if (list->head) { + for (AgoData * cur = list->head; cur->next; cur = cur->next) { + if (cur->next == item) { + if (list->tail == item) + list->tail = cur; + cur->next = item->next; + list->count--; + item->next = 0; + status = 0; + break; + } + } + } + if (status != 0) { + // check in trash + if (list->trash) { + for (AgoData * cur = list->trash; cur && (cur == list->trash || cur->next); cur = cur->next) { + if (cur == item || cur->next == item) { + if (cur == item) list->trash = item->next; + else cur->next = item->next; + list->count--; + item->next = 0; + status = 0; + break; + } + } + } + } + if (status == 0) { + if (trash) { + // keep in trash + item->next = *trash; + *trash = item; + } + else { + // remove parent/child cross references in existing list + for (int i = 0; i < 2; i++) { + for (AgoData * data = i ? list->trash : list->head; data; data = data->next) { + if (data->parent == item) + data->parent = nullptr; + for (vx_uint32 child = 0; child < data->numChildren; child++) + if (data->children[child] == item) + data->children[child] = nullptr; + } + } + // not needed anymore, just release it + delete item; + } + } + return status; +} + +int agoRemoveDataTree(AgoDataList * list, AgoData * item, AgoData ** trash) +{ + for (vx_uint32 child = 0; child < item->numChildren; child++) { + if (item->children[child]) { + if (agoRemoveDataTree(list, item->children[child], trash) < 0) + return -1; + item->children[child] = nullptr; + } + } + return agoRemoveData(list, item, trash); +} + +void agoRemoveDataInGraph(AgoGraph * agraph, AgoData * adata) +{ +#if ENABLE_DEBUG_MESSAGES + char name[256]; + agoGetDataName(name, adata); + debug_printf("INFO: agoRemoveDataInGraph: removing data %s\n", name[0] ? 
name : ""); +#endif + // clear parent link in it's children and give them a name, when missing + if (adata->children) { + char dataName[256]; + agoGetDataName(dataName, adata); + for (vx_uint32 i = 0; dataName[i]; i++) { + if (dataName[i] == '[' || dataName[i] == ']') + dataName[i] = '!'; + } + for (vx_uint32 i = 0; i < adata->numChildren; i++) { + if (adata->children[i]) { + // make sure that adata is the owner parent, before clearing the parent link + if (adata->children[i]->parent == adata) { + if (dataName[0] && !adata->children[i]->name.length()) { + char nameChild[256]; + sprintf(nameChild, "%s!%d!", dataName, i); + adata->children[i]->name = nameChild; + } + adata->children[i]->parent = NULL; + } + } + } + } + // clear child link in it's paren link + if (adata->parent) { + for (vx_uint32 i = 0; i < adata->parent->numChildren; i++) { + if (adata->parent->children[i] == adata) { + adata->parent->children[i] = NULL; + } + } + } + // move the virtual data to trash + if (agoRemoveData(&agraph->dataList, adata, &agraph->dataList.trash)) { + char name[256]; + agoGetDataName(name, adata); + agoAddLogEntry(&adata->ref, VX_FAILURE, "ERROR: agoRemoveDataInGraph: agoRemoveData(*,%s) failed\n", name[0] ? name : ""); + } +} + +void agoReplaceDataInGraph(AgoGraph * agraph, AgoData * dataFind, AgoData * dataReplace) +{ + // replace all references to dataFind in the node parameters with dataReplace + for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) { + for (vx_uint32 arg = 0; arg < anode->paramCount; arg++) { + if (anode->paramList[arg]) { + if (anode->paramList[arg] == dataFind) { + anode->paramList[arg] = dataReplace; + } + } + } + } + + // replace all ROI master links + for (AgoData * adata = agraph->dataList.head; adata; adata = adata->next) { + if (adata->ref.type == VX_TYPE_IMAGE && adata->u.img.isROI && adata->u.img.roiMasterImage == dataFind) { + adata->u.img.roiMasterImage = dataReplace; + } + } + + // + // remove dataFind from graph and replaces its links with dataReplace + // + + // replace parent link in it's children and give them a name, when missing + if (dataFind->children) { + char dataName[256]; + agoGetDataName(dataName, dataFind); + for (vx_uint32 i = 0; dataName[i]; i++) { + if (dataName[i] == '[' || dataName[i] == ']') + dataName[i] = '!'; + } + for (vx_uint32 i = 0; i < dataFind->numChildren; i++) { + if (dataFind->children[i]) { + if (dataName[0] && !dataFind->children[i]->name.length()) { + char nameChild[256]; + sprintf(nameChild, "%s!%d!", dataName, i); + dataFind->children[i]->name = nameChild; + } + dataFind->children[i]->parent = dataReplace; + } + } + } + // replace child link in it's parent link with dataReplace + bool removed = false; + if (dataFind->parent) { + bool found_in_parent = false; + for (vx_uint32 i = 0; i < dataFind->parent->numChildren; i++) { + if (dataFind->parent->children[i] == dataFind) { + dataFind->parent->children[i] = dataReplace; + found_in_parent = true; + } + } + if (found_in_parent) { + // TBD: need to handle the case if vxVerifyGraph is called second time with changes to the graph + agoRemoveData(&agraph->dataList, dataFind, &agraph->ref.context->graph_garbage_data); + removed = true; + } + } + if (!removed) { + // move the virtual data into trash + if (agoRemoveDataTree(&agraph->dataList, dataFind, &agraph->dataList.trash)) { + char name[256]; + agoGetDataName(name, dataFind); + agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReplaceDataInGraph: agoRemoveDataTree(*,%s) failed\n", name[0] ? 
name : ""); + } + } +} + +int agoShutdownNode(AgoNode * node) +{ + vx_status status = VX_SUCCESS; + if (node->initialized) { + AgoKernel * kernel = node->akernel; + if (kernel) { + if (kernel->func) { + status = kernel->func(node, ago_kernel_cmd_shutdown); + } + else if (kernel->deinitialize_f) { + status = kernel->deinitialize_f(node, (vx_reference *)node->paramList, node->paramCount); + } + if (status) { + return status; + } + node->akernel = nullptr; + } + if (node->localDataPtr_allocated) { + agoReleaseMemory(node->localDataPtr_allocated); + node->localDataPtr_allocated = nullptr; + } + node->initialized = false; + } + return status; +} + +void agoResetDataList(AgoDataList * dataList) +{ + for (int i = 0; i < 2; i++) { + for (AgoData * data = i ? dataList->trash : dataList->head; data;) { + // save next item pointer + AgoData * next = data->next; + // release current item + delete data; + // proceed to next item + data = next; + } + } + memset(dataList, 0, sizeof(*dataList)); +} + +void agoResetNodeList(AgoNodeList * nodeList) +{ + for (int i = 0; i < 2; i++) { + for (AgoNode * node = i ? nodeList->trash : nodeList->head; node;) { + // save next item pointer + AgoNode * next = node->next; + // release current item + delete node; + // proceed to next item + node = next; + } + } + memset(nodeList, 0, sizeof(*nodeList)); +} + +void agoResetKernelList(AgoKernelList * kernelList) +{ + for (AgoKernel * kernel = kernelList->head; kernel;) { + // save next item pointer + AgoKernel * next = kernel->next; + // release current item + delete kernel; + // proceed to next item + kernel = next; + } + memset(kernelList, 0, sizeof(*kernelList)); +} + +static void agoResetSuperNodeList(AgoSuperNode * supernodeList) +{ + for (AgoSuperNode * supernode = supernodeList; supernode;) { + // save next item pointer + AgoSuperNode * next = supernode->next; + // release current item +#if ENABLE_OPENCL + agoGpuOclReleaseSuperNode(supernode); +#endif + // TBD: agoResetSuperNode(supernode); + delete supernode; + // proceed to next item + supernode = next; + } +} + +AgoKernel * agoFindKernelByEnum(AgoContext * acontext, vx_enum kernel_id) +{ + // search context + for (AgoKernel * kernel = acontext->kernelList.head; kernel; kernel = kernel->next) { + if (kernel->id == kernel_id) return kernel; + } + return 0; +} + +AgoKernel * agoFindKernelByName(AgoContext * acontext, const vx_char * name) +{ + // search context + for (AgoKernel * kernel = acontext->kernelList.head; kernel; kernel = kernel->next) { + if (!strcmp(kernel->name, name)) return kernel; + } + if (!strstr(name, ".")) { + char fullName[VX_MAX_KERNEL_NAME]; + // search for org.khronos.openvx. + sprintf(fullName, "org.khronos.openvx.%s", name); + for (AgoKernel * kernel = acontext->kernelList.head; kernel; kernel = kernel->next) { + if (!strcmp(kernel->name, fullName)) return kernel; + } + // search for org.amd.openvx. 
+ sprintf(fullName, "com.amd.openvx.%s", name); + for (AgoKernel * kernel = acontext->kernelList.head; kernel; kernel = kernel->next) { + if (!strcmp(kernel->name, fullName)) return kernel; + } + } + return 0; +} + +AgoData * agoFindDataByName(AgoContext * acontext, AgoGraph * agraph, vx_char * name) +{ + // check for [index] syntax + char actualName[256]; strcpy(actualName, name); + int index[4] = { -1, -1, -1, -1 }; // index >=0 indicates special object + const char * s = strstr(name, "["); + if (s && name[strlen(name) - 1] == ']') { + actualName[s - name] = 0; + for (int i = 0; i < 4 && *s == '['; i++) { + s++; if (*s == '-') s++; + index[i] = atoi(s); + for (; *s != ']'; s++) { + if (!(*s >= '0' || *s <= '9')) return NULL; + } + if (*s != ']') return NULL; + s++; + } + } + // search graph + AgoData * data = NULL; + if (agraph) { + for (data = agraph->dataList.head; data; data = data->next) { + if (!strcmp(data->name.c_str(), actualName)) break; + } + } + if (!data) { + // search context + for (data = acontext->dataList.head; data; data = data->next) { + if (!strcmp(data->name.c_str(), actualName)) break; + } + } + if(data) { + for (int i = 0; i < 4 && index[i] >= 0; i++) { + if (index[i] < (int)data->numChildren) { + data = data->children[index[i]]; + } + else return NULL; + } + } + return data; +} + +void agoMarkChildrenAsPartOfDelay(AgoData * adata) +{ + // recursively mark children as part of delay + for (vx_uint32 child = 0; child < adata->numChildren; child++) { + if (adata->children[child]) { + adata->children[child]->isDelayed = vx_true_e; + agoMarkChildrenAsPartOfDelay(adata->children[child]); + } + } +} + +bool agoIsPartOfDelay(AgoData * adata) +{ + return adata->isDelayed ? true : false; +} + +AgoData * agoGetSiblingTraceToDelay(AgoData * data, int trace[], int& traceCount) +{ + if (data && data->isDelayed) { + traceCount = 0; + while (data && data->ref.type != VX_TYPE_DELAY && traceCount < AGO_MAX_DEPTH_FROM_DELAY_OBJECT) { + trace[traceCount++] = data->siblingIndex; + data = data->parent; + } + } + return (data->ref.type == VX_TYPE_DELAY) ? data : nullptr; +} + +AgoData * agoGetDataFromSiblingTrace(AgoData * data, int trace[], int traceCount) +{ + for (int i = traceCount - 1; data && i >= 0; i--) { + vx_uint32 child = (vx_uint32)trace[i]; + if (child < data->numChildren) + data = data->children[child]; + else + data = nullptr; + } + return data; +} + +void agoGetDescriptionFromData(AgoContext * acontext, char * desc, AgoData * data) +{ + const char * virt = data->isVirtual ? 
"-virtual" : ""; + desc[0] = 0; + if (data->ref.type == VX_TYPE_DELAY) { + sprintf(desc + strlen(desc), "delay%s:%d,[", virt, data->u.delay.count); + agoGetDescriptionFromData(acontext, desc + strlen(desc), data->children[0]); + sprintf(desc + strlen(desc), "]"); + } + else if (data->ref.type == VX_TYPE_IMAGE) { + if (data->u.img.isROI) { + sprintf(desc + strlen(desc), "image-roi:%s,%d,%d,%d,%d", data->u.img.roiMasterImage->name.c_str(), data->u.img.rect_roi.start_x, data->u.img.rect_roi.start_y, data->u.img.rect_roi.end_x, data->u.img.rect_roi.end_y); + } + else if (data->u.img.isUniform) { + sprintf(desc + strlen(desc), "image-uniform%s:%4.4s,%d,%d", virt, FORMAT_STR(data->u.img.format), data->u.img.width, data->u.img.height); + for (vx_size i = 0; i < data->u.img.components; i++) sprintf(desc + strlen(desc), ",%d", (unsigned int)data->u.img.uniform[i]); + } + else sprintf(desc + strlen(desc), "image%s:%4.4s,%d,%d", virt, FORMAT_STR(data->u.img.format), data->u.img.width, data->u.img.height); + } + else if (data->ref.type == VX_TYPE_PYRAMID) { + char scale[64]; + if (data->u.pyr.scale == VX_SCALE_PYRAMID_HALF) sprintf(scale, "HALF"); + else if (data->u.pyr.scale == VX_SCALE_PYRAMID_ORB) sprintf(scale, "ORB"); + else sprintf(scale, "%g", data->u.pyr.scale); + sprintf(desc + strlen(desc), "pyramid%s:%4.4s,%d,%d," VX_FMT_SIZE ",%s", virt, FORMAT_STR(data->u.pyr.format), data->u.pyr.width, data->u.pyr.height, data->u.pyr.levels, scale); + } + else if (data->ref.type == VX_TYPE_ARRAY) { + if (data->u.arr.itemtype >= VX_TYPE_USER_STRUCT_START && data->u.arr.itemtype <= VX_TYPE_USER_STRUCT_END) { + const char * name = agoGetUserStructName(acontext, data->u.arr.itemtype); + if (name) + sprintf(desc + strlen(desc), "array%s:%s," VX_FMT_SIZE "", virt, name, data->u.arr.capacity); + else + sprintf(desc + strlen(desc), "array%s:USER-STRUCT-" VX_FMT_SIZE "," VX_FMT_SIZE "", virt, data->u.arr.itemsize, data->u.arr.capacity); + } + else + sprintf(desc + strlen(desc), "array%s:%s," VX_FMT_SIZE "", virt, agoEnum2Name(data->u.arr.itemtype), data->u.arr.capacity); + } + else if (data->ref.type == VX_TYPE_SCALAR) { + if (data->u.scalar.type == VX_TYPE_ENUM) { + const char * name = agoEnum2Name(data->u.scalar.u.e); + if (name) + sprintf(desc + strlen(desc), "scalar%s:ENUM,%s", virt, name); + else + sprintf(desc + strlen(desc), "scalar%s:ENUM,0x%08x", virt, data->u.scalar.u.e); + } + else if (data->u.scalar.type == VX_TYPE_UINT32) sprintf(desc + strlen(desc), "scalar%s:UINT32,%u", virt, data->u.scalar.u.u); + else if (data->u.scalar.type == VX_TYPE_INT32) sprintf(desc + strlen(desc), "scalar%s:INT32,%d", virt, data->u.scalar.u.i); + else if (data->u.scalar.type == VX_TYPE_UINT16) sprintf(desc + strlen(desc), "scalar%s:UINT16,%u", virt, data->u.scalar.u.u); + else if (data->u.scalar.type == VX_TYPE_INT16) sprintf(desc + strlen(desc), "scalar%s:INT16,%d", virt, data->u.scalar.u.i); + else if (data->u.scalar.type == VX_TYPE_UINT8) sprintf(desc + strlen(desc), "scalar%s:UINT8,%u", virt, data->u.scalar.u.u); + else if (data->u.scalar.type == VX_TYPE_INT8) sprintf(desc + strlen(desc), "scalar%s:INT8,%u", virt, data->u.scalar.u.i); + else if (data->u.scalar.type == VX_TYPE_CHAR) sprintf(desc + strlen(desc), "scalar%s:CHAR,%u", virt, data->u.scalar.u.i); + else if (data->u.scalar.type == VX_TYPE_FLOAT32) sprintf(desc + strlen(desc), "scalar%s:FLOAT32,%g", virt, data->u.scalar.u.f); + else if (data->u.scalar.type == VX_TYPE_SIZE) sprintf(desc + strlen(desc), "scalar%s:SIZE," VX_FMT_SIZE "", virt, data->u.scalar.u.s); 
+ else if (data->u.scalar.type == VX_TYPE_BOOL) sprintf(desc + strlen(desc), "scalar%s:BOOL,%d", virt, data->u.scalar.u.i); + else if (data->u.scalar.type == VX_TYPE_DF_IMAGE) sprintf(desc + strlen(desc), "scalar%s:DF_IMAGE,%4.4s", virt, (const char *)&data->u.scalar.u.df); + else if (data->u.scalar.type == VX_TYPE_FLOAT64) sprintf(desc + strlen(desc), "scalar%s:FLOAT64,%lg", virt, data->u.scalar.u.f64); + else if (data->u.scalar.type == VX_TYPE_INT64) sprintf(desc + strlen(desc), "scalar%s:INT64,%" PRId64, virt, data->u.scalar.u.i64); + else if (data->u.scalar.type == VX_TYPE_UINT64) sprintf(desc + strlen(desc), "scalar%s:UINT64,%" PRIu64, virt, data->u.scalar.u.u64); + else if (data->u.scalar.type == VX_TYPE_STRING_AMD) sprintf(desc + strlen(desc), "scalar%s:STRING,%s", virt, data->buffer ? (const char *)data->buffer : ""); + else sprintf(desc + strlen(desc), "scalar%s:UNSUPPORTED,NULL", virt); + } + else if (data->ref.type == VX_TYPE_DISTRIBUTION) { + sprintf(desc + strlen(desc), "distribution%s:" VX_FMT_SIZE ",%d,%u", virt, data->u.dist.numbins, data->u.dist.offset, data->u.dist.range); + } + else if (data->ref.type == VX_TYPE_LUT) { + sprintf(desc + strlen(desc), "lut%s:%s," VX_FMT_SIZE "", virt, agoEnum2Name(data->u.lut.type), data->u.lut.count); + } + else if (data->ref.type == VX_TYPE_THRESHOLD) { + sprintf(desc + strlen(desc), "threshold%s:%s,%s", virt, agoEnum2Name(data->u.thr.thresh_type), agoEnum2Name(data->u.thr.data_type)); + if (data->u.thr.thresh_type == VX_THRESHOLD_TYPE_BINARY) + sprintf(desc + strlen(desc), ":I,%d", data->u.thr.threshold_lower); + else if (data->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) + sprintf(desc + strlen(desc), ":I,%d,%d", data->u.thr.threshold_lower, data->u.thr.threshold_upper); + } + else if (data->ref.type == VX_TYPE_CONVOLUTION) { + sprintf(desc + strlen(desc), "convolution%s:" VX_FMT_SIZE "," VX_FMT_SIZE "", virt, data->u.conv.columns, data->u.conv.rows); + if (data->u.conv.shift) + sprintf(desc + strlen(desc), ",%u", data->u.conv.shift); + } + else if (data->ref.type == VX_TYPE_MATRIX) { + sprintf(desc + strlen(desc), "matrix%s:%s," VX_FMT_SIZE "," VX_FMT_SIZE "", virt, agoEnum2Name(data->u.mat.type), data->u.mat.columns, data->u.mat.rows); + } + else if (data->ref.type == VX_TYPE_REMAP) { + sprintf(desc + strlen(desc), "remap%s:%u,%u,%u,%u", virt, data->u.remap.src_width, data->u.remap.src_height, data->u.remap.dst_width, data->u.remap.dst_height); + } + else if (data->ref.type == AGO_TYPE_MEANSTDDEV_DATA) { + sprintf(desc + strlen(desc), "ago-meanstddev-data%s:", virt); + } + else if (data->ref.type == AGO_TYPE_MINMAXLOC_DATA) { + sprintf(desc + strlen(desc), "ago-minmaxloc-data%s:", virt); + } + else if (data->ref.type == AGO_TYPE_CANNY_STACK) { + sprintf(desc + strlen(desc), "ago-canny-stack%s:%u", virt, data->u.cannystack.count); + } + else if (data->ref.type == AGO_TYPE_SCALE_MATRIX) { + sprintf(desc + strlen(desc), "ago-scale-matrix%s:%.12e,%.12e,%.12e,%.12e", virt, data->u.scalemat.xscale, data->u.scalemat.yscale, data->u.scalemat.xoffset, data->u.scalemat.yoffset); + } + else sprintf(desc + strlen(desc), "UNSUPPORTED%s:0x%08x", virt, data->ref.type); +} + +int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData * data, const char * desc) +{ + if (!strncmp(desc, "delay:", 6) || !strncmp(desc, "delay-virtual:", 14)) { + if (!strncmp(desc, "delay-virtual:", 14)) { + data->isVirtual = vx_true_e; + desc += 14; + } + else desc += 6; + // get configuration + data->ref.type = VX_TYPE_DELAY; + if (sscanf(desc, 
"%u", &data->u.delay.count) != 1) return -1; + if (data->u.delay.count < 1) return -1; + while (*desc && *desc != '[') desc++; + vx_uint32 epos = (vx_uint32)strlen(desc) - 1; + if ((*desc != '[') || (desc[epos] != ']')) return -1; + desc++; epos--; + char desc_child[1024]; + strncpy(desc_child, desc, epos); + if (data->children) + delete [] data->children; + data->numChildren = (vx_uint32)data->u.delay.count; + data->children = new AgoData * [data->numChildren]; + for (vx_uint32 child = 0; child < data->numChildren; child++) { + if ((data->children[child] = agoCreateDataFromDescription(acontext, agraph, desc_child, false)) == NULL) return -1; + data->children[child]->parent = data; + data->children[child]->siblingIndex = (vx_int32)child; + if (data->children[child]->isVirtual != data->isVirtual) return -1; + if (child == 0) { + data->u.delay.type = data->children[child]->ref.type; + if (data->u.delay.type == VX_TYPE_DELAY) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: delay of delay is not permitted\n"); + return -1; + } + } + } + // mark the children of delay element as part of delay object + agoMarkChildrenAsPartOfDelay(data); + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for delay\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "image:", 6) || !strncmp(desc, "image-virtual:", 14)) { + if (!strncmp(desc, "image-virtual:", 14)) { + data->u.img.isVirtual = vx_true_e; + data->isVirtual = vx_true_e; + desc += 14; + } + else desc += 6; + // get configuration + data->ref.type = VX_TYPE_IMAGE; + memcpy(&data->u.img.format, desc, sizeof(data->u.img.format)); + if (sscanf(desc + 5, "%d,%d", &data->u.img.width, &data->u.img.height) != 2) return -1; + if (data->isVirtual && !data->isNotFullyConfigured && (data->u.img.format == VX_DF_IMAGE_VIRT || data->u.img.width == 0 || data->u.img.height == 0)) { + // incomplete information needs to process this again later + data->isNotFullyConfigured = vx_true_e; + return 0; + } + if (agoGetImageComponentsAndPlanes(data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits, &data->u.img.color_space, &data->u.img.channel_range)) return -1; + if (data->u.img.planes > 1) { + if (data->children) + delete [] data->children; + data->numChildren = (vx_uint32)data->u.img.planes; + data->children = new AgoData *[data->numChildren]; + for (vx_uint32 child = 0; child < data->numChildren; child++) { + vx_df_image format; + vx_uint32 width, height; + if (agoGetImagePlaneFormat(data->u.img.format, data->u.img.width, data->u.img.height, child, &format, &width, &height)) return -1; + char imgdesc[64]; sprintf(imgdesc, "image%s:%4.4s,%d,%d", data->isVirtual ? "-virtual" : "", FORMAT_STR(format), width, height); + if ((data->children[child] = agoCreateDataFromDescription(acontext, agraph, imgdesc, false)) == NULL) return -1; + if (agoGetImageComponentsAndPlanes(data->children[child]->u.img.format, &data->children[child]->u.img.components, &data->children[child]->u.img.planes, &data->children[child]->u.img.pixel_size_in_bits, &data->children[child]->u.img.color_space, &data->children[child]->u.img.channel_range)) return -1; + data->children[child]->siblingIndex = (vx_int32)child; + data->children[child]->parent = data; + data->children[child]->u.img.x_scale_factor_is_2 = (data->children[child]->u.img.width != data->u.img.width ) ? 
1 : 0; + data->children[child]->u.img.y_scale_factor_is_2 = (data->children[child]->u.img.height != data->u.img.height) ? 1 : 0; + } + } + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for image\n"); + return -1; + } + // set valid region of the image to FULL AREA + data->u.img.rect_valid.start_x = 0; + data->u.img.rect_valid.start_y = 0; + data->u.img.rect_valid.end_x = data->u.img.width; + data->u.img.rect_valid.end_y = data->u.img.height; + return 0; + } + else if (!strncmp(desc, "image-uniform:", 14) || !strncmp(desc, "image-uniform-virtual:", 22)) { + if (!strncmp(desc, "image-uniform-virtual:", 22)) { + data->u.img.isVirtual = vx_true_e; + data->isVirtual = vx_true_e; + desc += 22; + } + else desc += 14; + // get configuration + data->ref.type = VX_TYPE_IMAGE; + data->u.img.isUniform = vx_true_e; + memcpy(&data->u.img.format, desc, sizeof(data->u.img.format)); + if (sscanf(desc + 5, "%d,%d," VX_FMT_SIZE "," VX_FMT_SIZE "," VX_FMT_SIZE "," VX_FMT_SIZE "", &data->u.img.width, &data->u.img.height, &data->u.img.uniform[0], &data->u.img.uniform[1], &data->u.img.uniform[2], &data->u.img.uniform[3]) < 2) return -1; + if (agoGetImageComponentsAndPlanes(data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits, &data->u.img.color_space, &data->u.img.channel_range)) return -1; + data->isInitialized = vx_true_e; + if (data->u.img.planes > 1) { + if (data->children) + delete [] data->children; + data->numChildren = (vx_uint32)data->u.img.planes; + data->children = new AgoData *[data->numChildren]; + for (vx_uint32 child = 0; child < data->numChildren; child++) { + vx_df_image format; + vx_uint32 width, height; + if (agoGetImagePlaneFormat(data->u.img.format, data->u.img.width, data->u.img.height, child, &format, &width, &height)) return -1; + vx_uint32 value = (vx_uint32)data->u.img.uniform[child]; + + // special handling required for NV12/NV21 image formats + if (data->u.img.format == VX_DF_IMAGE_NV21 && child == 1) value = (value << 8) | (vx_uint32)data->u.img.uniform[child + 1]; + else if (data->u.img.format == VX_DF_IMAGE_NV12 && child == 1) value = value | ((vx_uint32)data->u.img.uniform[child + 1] << 8); + + char imgdesc[64]; sprintf(imgdesc, "image-uniform%s:%4.4s,%d,%d,%d", data->isVirtual ? "-virtual" : "", FORMAT_STR(format), width, height, value); + if ((data->children[child] = agoCreateDataFromDescription(acontext, agraph, imgdesc, false)) == NULL) return -1; + if (agoGetImageComponentsAndPlanes(data->children[child]->u.img.format, &data->children[child]->u.img.components, &data->children[child]->u.img.planes, &data->children[child]->u.img.pixel_size_in_bits, &data->children[child]->u.img.color_space, &data->children[child]->u.img.channel_range)) return -1; + data->children[child]->isInitialized = vx_true_e; + data->children[child]->parent = data; + data->children[child]->u.img.x_scale_factor_is_2 = (data->children[child]->u.img.width != data->u.img.width ) ? 1 : 0; + data->children[child]->u.img.y_scale_factor_is_2 = (data->children[child]->u.img.height != data->u.img.height) ? 
1 : 0; + // set min/max values as uniform value + if (data->children[child]->u.img.format == VX_DF_IMAGE_U8 || + data->children[child]->u.img.format == VX_DF_IMAGE_S16 || + data->children[child]->u.img.format == VX_DF_IMAGE_U16 || + data->children[child]->u.img.format == VX_DF_IMAGE_S32 || + data->children[child]->u.img.format == VX_DF_IMAGE_U32 || + data->children[child]->u.img.format == VX_DF_IMAGE_U1_AMD) + { + data->children[child]->u.img.hasMinMax = vx_true_e; + data->children[child]->u.img.minValue = (vx_int32)data->children[child]->u.img.uniform[0]; + data->children[child]->u.img.maxValue = (vx_int32)data->children[child]->u.img.uniform[0]; + } + } + } + // set min/max values as uniform value + if (data->u.img.format == VX_DF_IMAGE_U8 || + data->u.img.format == VX_DF_IMAGE_S16 || + data->u.img.format == VX_DF_IMAGE_U16 || + data->u.img.format == VX_DF_IMAGE_S32 || + data->u.img.format == VX_DF_IMAGE_U32 || + data->u.img.format == VX_DF_IMAGE_U1_AMD) + { + data->u.img.hasMinMax = vx_true_e; + data->u.img.minValue = (vx_int32)data->u.img.uniform[0]; + data->u.img.maxValue = (vx_int32)data->u.img.uniform[0]; + } + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for image-uniform\n"); + return -1; + } + // set valid region of the image to FULL AREA + data->u.img.rect_valid.start_x = 0; + data->u.img.rect_valid.start_y = 0; + data->u.img.rect_valid.end_x = data->u.img.width; + data->u.img.rect_valid.end_y = data->u.img.height; + return 0; + } + else if (!strncmp(desc, "image-roi:", 10)) { + desc += 10; + // get configuration + data->ref.type = VX_TYPE_IMAGE; + data->u.img.isROI = vx_true_e; + const char *s = strstr(desc, ","); if (!s) return -1; + char master_name[128]; + memcpy(master_name, desc, s - desc); master_name[s - desc] = 0; + s++; + if (sscanf(s, "%u,%u,%u,%u", &data->u.img.rect_roi.start_x, &data->u.img.rect_roi.start_y, &data->u.img.rect_roi.end_x, &data->u.img.rect_roi.end_y) != 4) return -1; + vx_rectangle_t rect = data->u.img.rect_roi; + // traverse and link ROI to top-level image + AgoData * pdata = agoFindDataByName(acontext, agraph, master_name); + while (pdata && pdata->ref.type == VX_TYPE_IMAGE && pdata->u.img.isROI) { + rect.start_x += pdata->u.img.rect_roi.start_x; + rect.start_y += pdata->u.img.rect_roi.start_y; + rect.end_x += pdata->u.img.rect_roi.start_x; + rect.end_y += pdata->u.img.rect_roi.start_y; + pdata = pdata->u.img.roiMasterImage; + } + if (!pdata || pdata->ref.type != VX_TYPE_IMAGE) { + agoAddLogEntry(&pdata->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: image-roi: master image is invalid: %s\n", master_name); + return -1; + } + data->isVirtual = pdata->isVirtual; + data->isInitialized = pdata->isInitialized; + data->u.img = pdata->u.img; + data->u.img.roiMasterImage = pdata; + data->u.img.isROI = vx_true_e; + data->u.img.rect_roi = rect; + data->u.img.width = data->u.img.rect_roi.end_x - data->u.img.rect_roi.start_x; + data->u.img.height = data->u.img.rect_roi.end_y - data->u.img.rect_roi.start_y; + // create ROI entries for children, if image has multiple planes + data->numChildren = pdata->numChildren; + if (pdata->children) { + data->children = new AgoData *[data->numChildren]; + for (vx_uint32 child = 0; child < data->numChildren; child++) { + data->children[child] = new AgoData; + agoResetReference(&data->children[child]->ref, data->children[child]->ref.type, acontext, data->children[child]->isVirtual ? 
&agraph->ref : NULL); + data->children[child]->ref.internal_count++; + data->children[child]->ref.type = pdata->children[child]->ref.type; + data->children[child]->isVirtual = pdata->children[child]->isVirtual; + data->children[child]->isInitialized = pdata->children[child]->isInitialized; + data->children[child]->u.img = pdata->children[child]->u.img; + data->children[child]->u.img.roiMasterImage = pdata->children[child]; + data->children[child]->u.img.isROI = vx_true_e; + data->children[child]->u.img.rect_roi = rect; + data->children[child]->parent = data; + if (pdata->children[child]->u.img.width < pdata->u.img.width) { + // this is a 2x2 decimated plane of an image: IYUV, NV12, NV21 + data->children[child]->u.img.rect_roi.start_x = data->u.img.rect_roi.start_x >> 1; + data->children[child]->u.img.rect_roi.end_x = data->children[child]->u.img.rect_roi.start_x + ((data->u.img.width + 1) >> 1); + } + if (pdata->children[child]->u.img.height < pdata->u.img.height) { + // this is a 2x2 decimated plane of an image: IYUV, NV12, NV21 + data->children[child]->u.img.rect_roi.start_y = data->u.img.rect_roi.start_y >> 1; + data->children[child]->u.img.rect_roi.end_y = data->children[child]->u.img.rect_roi.start_y + ((data->u.img.height + 1) >> 1); + } + data->children[child]->u.img.width = data->children[child]->u.img.rect_roi.end_x - data->children[child]->u.img.rect_roi.start_x; + data->children[child]->u.img.height = data->children[child]->u.img.rect_roi.end_y - data->children[child]->u.img.rect_roi.start_y; + data->children[child]->u.img.x_scale_factor_is_2 = (data->children[child]->u.img.width != data->u.img.width ) ? 1 : 0; + data->children[child]->u.img.y_scale_factor_is_2 = (data->children[child]->u.img.height != data->u.img.height) ? 1 : 0; + } + } + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for image-roi\n"); + return -1; + } + // set valid region of the image to FULL AREA + data->u.img.rect_valid.start_x = 0; + data->u.img.rect_valid.start_y = 0; + data->u.img.rect_valid.end_x = data->u.img.width; + data->u.img.rect_valid.end_y = data->u.img.height; + return 0; + } + else if (!strncmp(desc, "pyramid:", 8) || !strncmp(desc, "pyramid-virtual:", 8 + 8)) { + data->isVirtual = !strncmp(desc, "pyramid-virtual:", 8 + 8) ? vx_true_e : vx_false_e; + desc += 8 + (data->isVirtual ? 
8 : 0); + // get configuration + data->ref.type = VX_TYPE_PYRAMID; + memcpy(&data->u.pyr.format, desc, sizeof(data->u.pyr.format)); + char scale[64] = ""; + if (sscanf(desc + 5, "%d,%d," VX_FMT_SIZE ",%s", &data->u.pyr.width, &data->u.pyr.height, &data->u.pyr.levels, scale) != 4) return -1; + if (!strncmp(scale, "HALF", 4)) data->u.pyr.scale = VX_SCALE_PYRAMID_HALF; + else if (!strncmp(scale, "ORB", 3)) data->u.pyr.scale = VX_SCALE_PYRAMID_ORB; + else data->u.pyr.scale = (vx_float32)atof(scale); + if (data->isVirtual && !data->isNotFullyConfigured && (data->u.pyr.format == VX_DF_IMAGE_VIRT || data->u.pyr.width == 0 || data->u.pyr.height == 0)) { + // incomplete information needs to process this again later + data->isNotFullyConfigured = vx_true_e; + return 0; + } + if (data->children) + delete [] data->children; + data->numChildren = (vx_uint32)data->u.pyr.levels; + data->children = new AgoData *[data->numChildren]; + for (vx_uint32 level = 0, width = data->u.pyr.width, height = data->u.pyr.height; level < data->u.pyr.levels; level++) { + char imgdesc[64]; + sprintf(imgdesc, "image%s:%4.4s,%d,%d", data->isVirtual ? "-virtual" : "", FORMAT_STR(data->u.pyr.format), width, height); + if ((data->children[level] = agoCreateDataFromDescription(acontext, agraph, imgdesc, false)) == NULL) return -1; + if (agoGetImageComponentsAndPlanes(data->u.pyr.format, &data->children[level]->u.img.components, &data->children[level]->u.img.planes, &data->children[level]->u.img.pixel_size_in_bits, &data->children[level]->u.img.color_space, &data->children[level]->u.img.channel_range)) return -1; + data->children[level]->siblingIndex = (vx_int32)level; + data->children[level]->parent = data; + if (data->u.pyr.scale == VX_SCALE_PYRAMID_ORB) { + float orb_scale_factor[4] = { + VX_SCALE_PYRAMID_ORB, + VX_SCALE_PYRAMID_ORB * VX_SCALE_PYRAMID_ORB, + VX_SCALE_PYRAMID_ORB * VX_SCALE_PYRAMID_ORB * VX_SCALE_PYRAMID_ORB, + VX_SCALE_PYRAMID_HALF + }; + width = (vx_uint32)ceilf(orb_scale_factor[level & 3] * data->children[level & ~3]->u.img.width); + height = (vx_uint32)ceilf(orb_scale_factor[level & 3] * data->children[level & ~3]->u.img.height); + } + else { + width = (vx_uint32)ceilf(data->u.pyr.scale * width); + height = (vx_uint32)ceilf(data->u.pyr.scale * height); + } + } + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for pyramid\n"); + return -1; + } + // set valid region of the pyramid to FULL AREA + data->u.pyr.rect_valid.start_x = 0; + data->u.pyr.rect_valid.start_y = 0; + data->u.pyr.rect_valid.end_x = data->u.pyr.width; + data->u.pyr.rect_valid.end_y = data->u.pyr.height; + return 0; + } + else if (!strncmp(desc, "array:", 6) || !strncmp(desc, "array-virtual:", 6 + 8)) { + if (!strncmp(desc, "array-virtual:", 6 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 6; + // get configuration + data->ref.type = VX_TYPE_ARRAY; + const char *s = strstr(desc, ","); if (!s) return -1; + char data_type[64]; + memcpy(data_type, desc, s - desc); data_type[s - desc] = 0; + (void)sscanf(++s, "" VX_FMT_SIZE "", &data->u.arr.capacity); + data->u.arr.itemtype = agoName2Enum(data_type); + if (!data->u.arr.itemtype) data->u.arr.itemtype = atoi(data_type); + if (data->isVirtual && !data->isNotFullyConfigured && (!strcmp(data_type, "0") || !data->u.arr.capacity)) { + // incomplete information needs to process this again later + data->isNotFullyConfigured = vx_true_e; + return 0; + } + 
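// resolve the item size: agoType2Size covers the built-in item types; otherwise the type name must refer to a user-registered struct +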
data->u.arr.itemsize = agoType2Size(acontext, data->u.arr.itemtype); + if (!data->u.arr.itemsize) { + vx_enum id = agoGetUserStructType(acontext, data_type); + if (!id) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: invalid data type in array: %s\n", data_type); + return -1; + } + data->u.arr.itemtype = id; + } + // sanity check and update + data->ref.context = acontext; // array sanity check requires access to context + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for array\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "distribution:", 13) || !strncmp(desc, "distribution-virtual:", 13 + 8)) { + if (!strncmp(desc, "distribution-virtual:", 13 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 13; + // get configuration + data->ref.type = VX_TYPE_DISTRIBUTION; + if (sscanf(desc, "" VX_FMT_SIZE ",%d,%u", &data->u.dist.numbins, &data->u.dist.offset, &data->u.dist.range) != 3) + return -1; + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for distribution\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "lut:", 4) || !strncmp(desc, "lut-virtual:", 4 + 8)) { + if (!strncmp(desc, "lut-virtual:", 4 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 4; + // get configuration + data->ref.type = VX_TYPE_LUT; + const char *s = strstr(desc, ","); if (!s) return -1; + char data_type[64]; + memcpy(data_type, desc, s - desc); data_type[s - desc] = 0; + data->u.lut.type = agoName2Enum(data_type); + if (!data->u.lut.type) + return -1; + if (sscanf(++s, "" VX_FMT_SIZE "", &data->u.lut.count) != 1) return -1; + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for lut\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "threshold:", 10) || !strncmp(desc, "threshold-virtual:", 10 + 8)) { + if (!strncmp(desc, "threshold-virtual:", 10 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 10; + // get configuration + data->ref.type = VX_TYPE_THRESHOLD; + const char *s = strstr(desc, ","); if (!s) return -1; + char thresh_type[64], data_type[64]; + memcpy(thresh_type, desc, s - desc); thresh_type[s - desc] = 0; + strcpy(data_type, s + 1); + for (int i = 0; i < 64 && data_type[i]; i++) if (data_type[i] == ':' || data_type[i] == ',') { data_type[i] = 0; s += i + 2; break; } + data->u.thr.thresh_type = agoName2Enum(thresh_type); + data->u.thr.data_type = agoName2Enum(data_type); + if (!data->u.thr.thresh_type || !data->u.thr.data_type) return -1; + if (data->u.thr.data_type != VX_TYPE_UINT8 && data->u.thr.data_type != VX_TYPE_UINT16 && data->u.thr.data_type != VX_TYPE_INT16) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: invalid threshold data_type %s\n", data_type); + return -1; + } + if (data->u.thr.thresh_type == VX_THRESHOLD_TYPE_BINARY) { + if (sscanf(s, "%d", &data->u.thr.threshold_lower) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) { + if (sscanf(s, "%d,%d", &data->u.thr.threshold_lower, &data->u.thr.threshold_upper) == 2) + data->isInitialized = vx_true_e; + } + else { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: 
agoGetDataFromDescription: invalid threshold thresh_type %s\n", thresh_type); + return -1; + } + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for threshold\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "convolution:", 12) || !strncmp(desc, "convolution-virtual:", 12 + 8)) { + if (!strncmp(desc, "convolution-virtual:", 12 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 12; + // get configuration + data->ref.type = VX_TYPE_CONVOLUTION; + vx_uint32 scale = 1; + if (sscanf(desc, "" VX_FMT_SIZE "," VX_FMT_SIZE ",%u", &data->u.conv.columns, &data->u.conv.rows, &scale) < 2) + return -1; + vx_uint32 shift = 0; + for (; shift < 32; shift++) { + if (scale == (1u << shift)) + break; + } + data->u.conv.shift = shift; + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for convolution\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "matrix:", 7) || !strncmp(desc, "matrix-virtual:", 7 + 8)) { + if (!strncmp(desc, "matrix-virtual:", 7 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 7; + // get configuration + data->ref.type = VX_TYPE_MATRIX; + const char *s = strstr(desc, ","); if (!s) return -1; + char data_type[64]; + memcpy(data_type, desc, s - desc); data_type[s - desc] = 0; + data->u.mat.type = agoName2Enum(data_type); + if (data->u.mat.type != VX_TYPE_INT32 && data->u.mat.type != VX_TYPE_FLOAT32) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: invalid matrix type %s\n", data_type); + return -1; + } + if (sscanf(++s, "" VX_FMT_SIZE "," VX_FMT_SIZE "", &data->u.mat.columns, &data->u.mat.rows) != 2) + return -1; + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for matrix\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "remap:", 6) || !strncmp(desc, "remap-virtual:", 6 + 8)) { + if (!strncmp(desc, "remap-virtual:", 6 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 6; + // get configuration + data->ref.type = VX_TYPE_REMAP; + if (sscanf(desc, "%u,%u,%u,%u", &data->u.remap.src_width, &data->u.remap.src_height, &data->u.remap.dst_width, &data->u.remap.dst_height) != 4) + return -1; + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for remap\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "scalar:", 7) || !strncmp(desc, "scalar-virtual:", 7 + 8)) { + if (!strncmp(desc, "scalar-virtual:", 7 + 8)) { + desc += 8; + data->isVirtual = vx_true_e; + } + desc += 7; + // get configuration + data->ref.type = VX_TYPE_SCALAR; + const char *s = strstr(desc, ","); if (!s) return -1; + char data_type[64]; + memcpy(data_type, desc, s - desc); data_type[s - desc] = 0; + s++; + data->u.scalar.type = agoName2Enum(data_type); + data->u.scalar.u.u64 = 0; + if (data->u.scalar.type == VX_TYPE_UINT32) { + data->u.scalar.itemsize = sizeof(vx_uint32); + if (sscanf(s, "%u", &data->u.scalar.u.u) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_INT32) { + data->u.scalar.itemsize = sizeof(vx_int32); + if (sscanf(s, "%d", 
&data->u.scalar.u.i) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_UINT16) { + data->u.scalar.itemsize = sizeof(vx_uint16); + if (sscanf(s, "%d", &data->u.scalar.u.i) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_INT16) { + data->u.scalar.itemsize = sizeof(vx_int16); + if (sscanf(s, "%d", &data->u.scalar.u.i) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_UINT8) { + data->u.scalar.itemsize = sizeof(vx_uint8); + if (sscanf(s, "%d", &data->u.scalar.u.i) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_INT8) { + data->u.scalar.itemsize = sizeof(vx_int8); + if (sscanf(s, "%d", &data->u.scalar.u.i) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_CHAR) { + data->u.scalar.itemsize = sizeof(vx_char); + if (sscanf(s, "%d", &data->u.scalar.u.i) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_FLOAT32) { + data->u.scalar.itemsize = sizeof(vx_float32); + if (sscanf(s, "%g", &data->u.scalar.u.f) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_BOOL) { + data->u.scalar.itemsize = sizeof(vx_bool); + if (sscanf(s, "%d", &data->u.scalar.u.i) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_SIZE) { + data->u.scalar.itemsize = sizeof(vx_size); + if (sscanf(s, "" VX_FMT_SIZE "", &data->u.scalar.u.s)) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_ENUM) { + data->u.scalar.itemsize = sizeof(vx_enum); + data->u.scalar.u.e = agoName2Enum(s); + if (!data->u.scalar.u.e) { + if (sscanf(s, "%i", &data->u.scalar.u.e) != 1) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription(*,%s) invalid enum value\n", desc); + return -1; + } + data->isInitialized = vx_true_e; + } + } + else if (data->u.scalar.type == VX_TYPE_DF_IMAGE) { + data->u.scalar.itemsize = sizeof(vx_df_image); + if (strlen(s) >= 4) { + data->u.scalar.u.df = VX_DF_IMAGE(s[0], s[1], s[2], s[3]); + data->isInitialized = vx_true_e; + } + } + else if (data->u.scalar.type == VX_TYPE_FLOAT64) { + data->u.scalar.itemsize = sizeof(vx_float64); + if (sscanf(s, "%lg", &data->u.scalar.u.f64) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_INT64) { + data->u.scalar.itemsize = sizeof(vx_int64); + if (sscanf(s, "%" PRId64, &data->u.scalar.u.i64) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_UINT64) { + data->u.scalar.itemsize = sizeof(vx_uint64); + if (sscanf(s, "%" PRIu64, &data->u.scalar.u.u64) == 1) + data->isInitialized = vx_true_e; + } + else if (data->u.scalar.type == VX_TYPE_STRING_AMD) { + data->u.scalar.itemsize = sizeof(vx_char *); + data->size = VX_MAX_STRING_BUFFER_SIZE_AMD; + data->buffer_allocated = data->buffer = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: memory allocation (%d) failed for %s\n", (int)data->size, data_type); + return -1; + } + strncpy((char *)data->buffer, s, VX_MAX_STRING_BUFFER_SIZE_AMD); + data->buffer[VX_MAX_STRING_BUFFER_SIZE_AMD - 1] = 0; // NUL terminate string in case of overflow + data->isInitialized = vx_true_e; + } + else { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: invalid scalar type %s\n", data_type); + return -1; + } + // sanity check and update + 
if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for scalar\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "ago-meanstddev-data:", 20) || !strncmp(desc, "ago-meanstddev-data-virtual:", 20 + 8)) { + if (!strncmp(desc, "ago-meanstddev-data-virtual:", 20 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 20; + // get configuration + data->ref.type = AGO_TYPE_MEANSTDDEV_DATA; + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for ago-meanstddev-data\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "ago-minmaxloc-data:", 19) || !strncmp(desc, "ago-minmaxloc-data-virtual:", 19 + 8)) { + if (!strncmp(desc, "ago-minmaxloc-data-virtual:", 19 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 19; + // get configuration + data->ref.type = AGO_TYPE_MINMAXLOC_DATA; + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for ago-minmaxloc-data\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "ago-canny-stack:", 16) || !strncmp(desc, "ago-canny-stack-virtual:", 16 + 8)) { + if (!strncmp(desc, "ago-canny-stack-virtual:", 16 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 16; + // get configuration + data->ref.type = AGO_TYPE_CANNY_STACK; + if (sscanf(desc, "%u", &data->u.cannystack.count) != 1) return -1; + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for ago-canny-stack\n"); + return -1; + } + return 0; + } + else if (!strncmp(desc, "ago-scale-matrix:", 17) || !strncmp(desc, "ago-scale-matrix-virtual:", 17 + 8)) { + if (!strncmp(desc, "ago-scale-matrix-virtual:", 17 + 8)) { + data->isVirtual = vx_true_e; + desc += 8; + } + desc += 17; + // get configuration + data->ref.type = AGO_TYPE_SCALE_MATRIX; + if (sscanf(desc, "%g,%g,%g,%g", &data->u.scalemat.xscale, &data->u.scalemat.yscale, &data->u.scalemat.xoffset, &data->u.scalemat.yoffset) != 4) + return -1; + data->isInitialized = vx_true_e; + // sanity check and update + if (agoDataSanityCheckAndUpdate(data)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: agoDataSanityCheckAndUpdate failed for ago-scale-matrix\n"); + return -1; + } + return 0; + } + return -1; +} + +AgoData * agoCreateDataFromDescription(AgoContext * acontext, AgoGraph * agraph, const char * desc, bool isForExternalUse) +{ + AgoData * data = new AgoData; + int status = agoGetDataFromDescription(acontext, agraph, data, desc); + if (status < 0) { + agoAddLogEntry(&acontext->ref, VX_FAILURE, "ERROR: agoCreateDataFromDescription: agoGetDataFromDescription(%s) failed\n", desc); + delete data; + return NULL; + } + agoResetReference(&data->ref, data->ref.type, acontext, data->isVirtual ? 
&agraph->ref : NULL); + if (isForExternalUse) { + data->ref.external_count = 1; + } + else { + data->ref.internal_count = 1; + } + return data; +} + +void agoGenerateDataName(AgoContext * acontext, const char * postfix, std::string& name_) +{ + char name[1024]; + sprintf(name, "AUTOX!%04d!%s", acontext->dataGenerationCount++, postfix); + name_ = name; +} + +void agoGenerateVirtualDataName(AgoGraph * agraph, const char * postfix, std::string& name_) +{ + char name[1024]; + sprintf(name, "AUTO!%04d!%s", agraph->virtualDataGenerationCount++, postfix); + name_ = name; +} + +int agoGetImageComponentsAndPlanes(vx_df_image format, vx_size * pComponents, vx_size * pPlanes, vx_size * pPixelSizeInBits, vx_color_space_e * pColorSpace, vx_channel_range_e * pChannelRange) +{ + if (format == VX_DF_IMAGE_RGBX) *pComponents = 4, *pPlanes = 1, *pPixelSizeInBits = 4 * 8, *pColorSpace = VX_COLOR_SPACE_DEFAULT, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_RGB) *pComponents = 3, *pPlanes = 1, *pPixelSizeInBits = 3 * 8, *pColorSpace = VX_COLOR_SPACE_DEFAULT, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_NV12) *pComponents = 3, *pPlanes = 2, *pPixelSizeInBits = 0, *pColorSpace = VX_COLOR_SPACE_DEFAULT, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_NV21) *pComponents = 3, *pPlanes = 2, *pPixelSizeInBits = 0, *pColorSpace = VX_COLOR_SPACE_DEFAULT, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_UYVY) *pComponents = 3, *pPlanes = 1, *pPixelSizeInBits = 2 * 8, *pColorSpace = VX_COLOR_SPACE_DEFAULT, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_YUYV) *pComponents = 3, *pPlanes = 1, *pPixelSizeInBits = 2 * 8, *pColorSpace = VX_COLOR_SPACE_DEFAULT, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_IYUV) *pComponents = 3, *pPlanes = 3, *pPixelSizeInBits = 0, *pColorSpace = VX_COLOR_SPACE_DEFAULT, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_YUV4) *pComponents = 3, *pPlanes = 3, *pPixelSizeInBits = 0, *pColorSpace = VX_COLOR_SPACE_DEFAULT, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_U8) *pComponents = 1, *pPlanes = 1, *pPixelSizeInBits = 8, *pColorSpace = VX_COLOR_SPACE_NONE, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_U16) *pComponents = 1, *pPlanes = 1, *pPixelSizeInBits = 16, *pColorSpace = VX_COLOR_SPACE_NONE, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_S16) *pComponents = 1, *pPlanes = 1, *pPixelSizeInBits = 16, *pColorSpace = VX_COLOR_SPACE_NONE, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_U32) *pComponents = 1, *pPlanes = 1, *pPixelSizeInBits = 32, *pColorSpace = VX_COLOR_SPACE_NONE, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_S32) *pComponents = 1, *pPlanes = 1, *pPixelSizeInBits = 32, *pColorSpace = VX_COLOR_SPACE_NONE, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_U1_AMD) *pComponents = 1, *pPlanes = 1, *pPixelSizeInBits = 1, *pColorSpace = VX_COLOR_SPACE_NONE, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_F32x3_AMD) *pComponents = 3, *pPlanes = 1, *pPixelSizeInBits = 3*32, *pColorSpace = VX_COLOR_SPACE_NONE, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if (format == VX_DF_IMAGE_F32_AMD) *pComponents = 1, *pPlanes = 1, *pPixelSizeInBits = 32, *pColorSpace = VX_COLOR_SPACE_NONE, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else if 
(format == VX_DF_IMAGE_F64_AMD) *pComponents = 1, *pPlanes = 1, *pPixelSizeInBits = 64, *pColorSpace = VX_COLOR_SPACE_NONE, *pChannelRange = VX_CHANNEL_RANGE_FULL; + else return -1; + return 0; +} + +int agoGetImagePlaneFormat(vx_df_image format, vx_uint32 width, vx_uint32 height, vx_uint32 plane, vx_df_image *pFormat, vx_uint32 * pWidth, vx_uint32 * pHeight) +{ + if (format == VX_DF_IMAGE_YUV4) { + if (plane < 3) { + *pFormat = VX_DF_IMAGE_U8; + *pWidth = width; + *pHeight = height; + return 0; + } + } + else if (format == VX_DF_IMAGE_IYUV) { + if (plane == 0) { + *pFormat = VX_DF_IMAGE_U8; + *pWidth = width; + *pHeight = height; + return 0; + } + else if (plane < 3) { + *pFormat = VX_DF_IMAGE_U8; + *pWidth = (width + 1) >> 1; + *pHeight = (height + 1) >> 1; + return 0; + } + } + else if (format == VX_DF_IMAGE_NV12 || format == VX_DF_IMAGE_NV21) { + if (plane == 0) { + *pFormat = VX_DF_IMAGE_U8; + *pWidth = width; + *pHeight = height; + return 0; + } + else if (plane == 1) { + *pFormat = VX_DF_IMAGE_U16; + *pWidth = (width + 1) >> 1; + *pHeight = (height + 1) >> 1; + return 0; + } + } + else { + if (plane == 0) { + *pFormat = format; + *pWidth = width; + *pHeight = height; + return 0; + } + } + return -1; +} + +void agoGetDataName(vx_char * name, AgoData * data) +{ + name[0] = 0; + for (AgoData * pdata = data; pdata; pdata = pdata->parent) { + char tmp[1024]; strcpy(tmp, name); + if (pdata->parent) { + sprintf(name, "[%d]%s", (pdata->parent->ref.type == VX_TYPE_DELAY) ? -pdata->siblingIndex : pdata->siblingIndex, tmp); + } + else if (pdata->name.length()) { + sprintf(name, "%s%s", pdata->name.c_str(), tmp); + } + else { + name[0] = 0; + break; + } + } +} + +vx_enum agoAddUserStruct(AgoContext * acontext, vx_size size, vx_char * name) +{ + CAgoLock lock(acontext->cs); + if (name && agoGetUserStructSize(acontext, name) > 0) { + agoAddLogEntry(&acontext->ref, VX_FAILURE, "ERROR: agoAddUserStruct(*," VX_FMT_SIZE ",%s): already exists\n", size, name); + return VX_TYPE_INVALID; + } + if (acontext->nextUserStructId >= (VX_TYPE_USER_STRUCT_START + 256)) { + agoAddLogEntry(&acontext->ref, VX_FAILURE, "ERROR: agoAddUserStruct(*," VX_FMT_SIZE ",%s): number of user-structures exceeded MAX\n", size, name ? 
name : "*"); + return VX_TYPE_INVALID; + } + AgoUserStruct aus; + aus.id = acontext->nextUserStructId++; + aus.size = size; + if(name) aus.name = name; + else agoGenerateDataName(acontext, "UserStruct", aus.name); + acontext->userStructList.push_back(aus); + return aus.id; +} + +vx_size agoGetUserStructSize(AgoContext * acontext, vx_char * name) +{ + for (auto it = acontext->userStructList.begin(); it != acontext->userStructList.end(); it++) { + if (!strcmp(it->name.c_str(), name)) { + return it->size; + } + } + return 0; +} + +vx_size agoGetUserStructSize(AgoContext * acontext, vx_enum id) +{ + for (auto it = acontext->userStructList.begin(); it != acontext->userStructList.end(); it++) { + if (it->id == id) { + return it->size; + } + } + return 0; +} + +vx_enum agoGetUserStructType(AgoContext * acontext, vx_char * name) +{ + for (auto it = acontext->userStructList.begin(); it != acontext->userStructList.end(); it++) { + if (!strcmp(it->name.c_str(), name)) { + return it->id; + } + } + return 0; +} + +const char * agoGetUserStructName(AgoContext * acontext, vx_enum id) +{ + for (auto it = acontext->userStructList.begin(); it != acontext->userStructList.end(); it++) { + if (it->id == id) { + return it->name.c_str(); + } + } + return nullptr; +} + +bool agoIsValidParameter(vx_parameter parameter) +{ + bool ret = false; + if (parameter && parameter->ref.type == VX_TYPE_PARAMETER && parameter->scope && parameter->scope->magic == AGO_MAGIC_VALID && + ((parameter->scope->type == VX_TYPE_NODE) || (parameter->scope->type == VX_TYPE_KERNEL) || (parameter->scope->type == VX_TYPE_GRAPH))) + { + ret = true; + } + return ret; +} + +bool agoIsValidReference(vx_reference ref) +{ + bool ret = false; + if ((ref != NULL) && (ref->magic == AGO_MAGIC_VALID) && ((ref->external_count + ref->internal_count) > 0)) { + ret = true; + } + return ret; +} + +bool agoIsValidContext(vx_context context) +{ + bool ret = false; + if (agoIsValidReference((vx_reference) context) && (context->ref.type == VX_TYPE_CONTEXT)) { + ret = true; /* this is the top level context */ + } + return ret; +} + +bool agoIsValidGraph(vx_graph graph) +{ + bool ret = false; + if (agoIsValidReference((vx_reference) graph) && (graph->ref.type == VX_TYPE_GRAPH)) { + ret = true; + } + return ret; +} + +bool agoIsValidKernel(vx_kernel kernel) +{ + bool ret = false; + if (agoIsValidReference((vx_reference) kernel) && (kernel->ref.type == VX_TYPE_KERNEL)) { + ret = true; + } + return ret; +} + +bool agoIsValidNode(vx_node node) +{ + bool ret = false; + if (agoIsValidReference((vx_reference) node) && (node->ref.type == VX_TYPE_NODE)) { + ret = true; + } + return ret; +} + +bool agoIsValidData(AgoData * data, vx_enum type) +{ + bool ret = false; + if (agoIsValidReference((vx_reference) data) && (data->ref.type == type)) { + ret = true; + } + return ret; +} + +int agoDataSanityCheckAndUpdate(AgoData * data) +{ + if (data->ref.type == VX_TYPE_DELAY) { + // make sure number of children is +ve integer and consistent number of children exist + if (data->u.delay.count < 1 || !data->children || data->numChildren != data->u.delay.count) + return -1; + // do sanity check and update on each children + for (vx_uint32 child = 0; child < data->numChildren; child++) { + if (!data->children[child] || agoDataSanityCheckAndUpdate(data->children[child])) + return -1; + // make sure delay type matches with it's children + if (data->u.delay.type != data->children[child]->ref.type) + return -1; + } + // initialize/update other attributes (if needed) + data->u.delay.age = 0; 
+ } + else if (data->ref.type == VX_TYPE_PYRAMID) { + // make sure number of children is +ve integer and consistent number of children exist + if (data->u.pyr.levels < 1 || !data->children || data->numChildren != data->u.pyr.levels) + return -1; + // restrict the range of scale factors to 1/8 to 8 + if (data->u.pyr.scale < 0.125f || data->u.pyr.scale > 8.0f) + return -1; + // do sanity check and update on each child + for (vx_uint32 level = 0, width = data->u.pyr.width, height = data->u.pyr.height; level < data->u.pyr.levels; level++) { + // make sure children are valid images of the same type + if (!data->children[level] || data->children[level]->ref.type != VX_TYPE_IMAGE || data->u.pyr.format != data->children[level]->u.img.format) + return -1; + // set width and height of children + data->children[level]->u.img.width = width; + data->children[level]->u.img.height = height; + if (data->u.pyr.scale == VX_SCALE_PYRAMID_ORB) { + float orb_scale_factor[4] = { + VX_SCALE_PYRAMID_ORB, + VX_SCALE_PYRAMID_ORB * VX_SCALE_PYRAMID_ORB, + VX_SCALE_PYRAMID_ORB * VX_SCALE_PYRAMID_ORB * VX_SCALE_PYRAMID_ORB, + VX_SCALE_PYRAMID_HALF + }; + width = (vx_uint32)ceilf(orb_scale_factor[level & 3] * data->children[level & ~3]->u.img.width); + height = (vx_uint32)ceilf(orb_scale_factor[level & 3] * data->children[level & ~3]->u.img.height); + } + else { + width = (vx_uint32)ceilf(data->u.pyr.scale * width); + height = (vx_uint32)ceilf(data->u.pyr.scale * height); + } + // sanity check and update the images + if (agoDataSanityCheckAndUpdate(data->children[level])) + return -1; + } + data->size = sizeof(ago_pyramid_u8_t) * data->u.pyr.levels; + } + else if (data->ref.type == VX_TYPE_IMAGE) { + if (data->children) { + for (vx_uint32 child = 0; child < data->numChildren; child++) { + if (!data->children[child] || agoDataSanityCheckAndUpdate(data->children[child])) + return -1; + data->children[child]->u.img.x_scale_factor_is_2 = (data->children[child]->u.img.width != data->u.img.width) ? 1 : 0; + data->children[child]->u.img.y_scale_factor_is_2 = (data->children[child]->u.img.height != data->u.img.height) ? 1 : 0; + } + } + else if (data->u.img.isROI) { + // re-compute image parameters to deal with parameter changes + agoGetImageComponentsAndPlanes(data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits, &data->u.img.color_space, &data->u.img.channel_range); + // get buffer stride and compute buffer start address + data->u.img.stride_in_bytes = data->u.img.roiMasterImage->u.img.stride_in_bytes; + if ((data->u.img.rect_roi.start_x * data->u.img.pixel_size_in_bits) & 7) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: detected U1 ROI that doesn't start on 8-bit boundary: %s at (%d,%d)\n", data->name.length() ?
data->name.c_str() : "", data->u.img.rect_roi.start_x, data->u.img.rect_roi.start_y); + return -1; + } + } + else { + // re-compute image parameters to deal with parameter changes + // NOTE: image buffer stride needs to be multiple of 16 bytes to support CPU/GPU optimizations + // NOTE: image buffer height needs to be mutliple of 16 to support OpenCL workgroup height=16 + agoGetImageComponentsAndPlanes(data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits, &data->u.img.color_space, &data->u.img.channel_range); + if (data->u.img.isUniform) { + // calculate other attributes and buffer size: + // - make sure that the stride is multiple of 16 bytes + data->u.img.stride_in_bytes = ALIGN16((data->u.img.width * data->u.img.pixel_size_in_bits + 7) >> 3); + data->size = ALIGN16(data->u.img.height) * data->u.img.stride_in_bytes; + if (!data->size) + return -1; + // set min/max values as uniform value + if (data->u.img.format == VX_DF_IMAGE_U8 || + data->u.img.format == VX_DF_IMAGE_S16 || + data->u.img.format == VX_DF_IMAGE_U16 || + data->u.img.format == VX_DF_IMAGE_S32 || + data->u.img.format == VX_DF_IMAGE_U32 || + data->u.img.format == VX_DF_IMAGE_U1_AMD) + { + data->u.img.hasMinMax = vx_true_e; + data->u.img.minValue = (vx_int32)data->u.img.uniform[0]; + data->u.img.maxValue = (vx_int32)data->u.img.uniform[0]; + } + } + else { + // calculate other attributes and buffer size: + // - make sure that the stride is multiple of 16 bytes + data->u.img.stride_in_bytes = (((data->u.img.width * data->u.img.pixel_size_in_bits + 7) >> 3) + 15) & ~15; + data->size = ALIGN16(data->u.img.height) * data->u.img.stride_in_bytes; + if (!data->size) + return -1; + } + } + } + else if (data->ref.type == VX_TYPE_ARRAY) { + // calculate other attributes and buffer size + data->u.arr.itemsize = agoType2Size(data->ref.context, data->u.arr.itemtype); + if (!data->u.arr.itemsize) { + vx_size size = agoGetUserStructSize(data->ref.context, data->u.arr.itemtype); + if(!size) + return -1; + data->u.arr.itemsize = size; + } + data->size = data->u.arr.itemsize * data->u.arr.capacity; + if (!data->size) + return -1; + data->u.arr.numitems = 0; + } + else if (data->ref.type == VX_TYPE_DISTRIBUTION) { + // calculate other attributes and buffer size + data->size = data->u.dist.numbins * sizeof(vx_uint32); + if (!data->size) + return -1; + data->u.dist.window = (vx_uint32)((data->u.dist.range + data->u.dist.numbins - 1) / data->u.dist.numbins); + } + else if (data->ref.type == VX_TYPE_LUT) { + // calculate other attributes and buffer size + if (data->u.lut.type != VX_TYPE_UINT8 || data->u.lut.count != 256) return -1; + data->size = sizeof(vx_uint8) * 256; + if (!data->size) + return -1; + } + else if (data->ref.type == VX_TYPE_THRESHOLD) { + // calculate other attributes and buffer size + data->u.thr.false_value = 0; + if (data->u.thr.data_type == VX_TYPE_UINT8) data->u.thr.true_value = 0xff; + else if (data->u.thr.data_type == VX_TYPE_UINT16) data->u.thr.true_value = 0xffff; + else if (data->u.thr.data_type == VX_TYPE_INT16) data->u.thr.true_value = 0x7fff; + else + return -1; + } + else if (data->ref.type == VX_TYPE_CONVOLUTION) { + // check validity of shift + if (data->u.conv.shift >= 32) + return -1; + // calculate other attributes and buffer size + data->size = data->u.conv.columns * data->u.conv.rows * sizeof(vx_int16); + if (!data->size) + return -1; + } + else if (data->ref.type == VX_TYPE_MATRIX) { + // calculate other attributes and buffer size + if (data->u.mat.type == 
VX_TYPE_INT32) + data->u.mat.itemsize = sizeof(vx_int32); + else if (data->u.mat.type == VX_TYPE_FLOAT32) + data->u.mat.itemsize = sizeof(vx_float32); + else + return -1; + data->size = data->u.mat.columns * data->u.mat.rows * data->u.mat.itemsize; + if (!data->size) + return -1; + } + else if (data->ref.type == VX_TYPE_REMAP) { + // calculate remap_fractional_bits + if (data->u.remap.src_width >= (1 << (15 - (AGO_REMAP_FRACTIONAL_BITS - 3))) || data->u.remap.src_height >= (1 << (15 - (AGO_REMAP_FRACTIONAL_BITS - 3)))) + data->u.remap.remap_fractional_bits = AGO_REMAP_FRACTIONAL_BITS - 3; + else if (data->u.remap.src_width >= (1 << (15 - (AGO_REMAP_FRACTIONAL_BITS - 2))) || data->u.remap.src_height >= (1 << (15 - (AGO_REMAP_FRACTIONAL_BITS - 2)))) + data->u.remap.remap_fractional_bits = AGO_REMAP_FRACTIONAL_BITS - 2; + else if (data->u.remap.src_width >= (1 << (15 - (AGO_REMAP_FRACTIONAL_BITS - 1))) || data->u.remap.src_height >= (1 << (15 - (AGO_REMAP_FRACTIONAL_BITS - 1)))) + data->u.remap.remap_fractional_bits = AGO_REMAP_FRACTIONAL_BITS - 1; + else + data->u.remap.remap_fractional_bits = AGO_REMAP_FRACTIONAL_BITS; + // calculate other attributes and buffer size + data->size = data->u.remap.dst_width * data->u.remap.dst_height * sizeof(ago_coord2d_ushort_t); + if (!data->size) + return -1; + } + else if (data->ref.type == VX_TYPE_SCALAR) { + // nothing to do + } + else if (data->ref.type == VX_TYPE_STRING_AMD) { + // nothing to do + } + else if (data->ref.type == AGO_TYPE_MEANSTDDEV_DATA) { + // calculate other attributes and buffer size + data->size = sizeof(ago_meanstddev_data_t); + } + else if (data->ref.type == AGO_TYPE_MINMAXLOC_DATA) { + // calculate other attributes and buffer size + data->size = sizeof(ago_minmaxloc_data_t); + } + else if (data->ref.type == AGO_TYPE_CANNY_STACK) { + // calculate other attributes and buffer size + data->u.cannystack.stackTop = 0; + data->size = sizeof(ago_coord2d_ushort_t) * data->u.cannystack.count; + if (!data->size) + return -1; + } + else if (data->ref.type == AGO_TYPE_SCALE_MATRIX) { + // nothing to do + } + else return -1; + return 0; +} + +int agoAllocData(AgoData * data) +{ + if (data->buffer) { + // already allocated: nothing to do + return 0; + } + else if (agoDataSanityCheckAndUpdate(data)) { + // can't proceed further + return -1; + } + + if (data->ref.type == VX_TYPE_DELAY) { + for (vx_uint32 child = 0; child < data->numChildren; child++) { + if (data->children[child]) { + if (agoAllocData(data->children[child])) { + return -1; + } + } + } + } + else if (data->ref.type == VX_TYPE_PYRAMID) { + for (vx_uint32 child = 0; child < data->numChildren; child++) { + if (data->children[child]) { + if (agoAllocData(data->children[child])) { + return -1; + } + } + } + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + // initialize pyramid image information + ago_pyramid_u8_t * pyrInfo = (ago_pyramid_u8_t *) data->buffer; + for (vx_uint32 child = 0; child < data->numChildren; child++) { + if (data->children[child]) { + pyrInfo[child].width = data->children[child]->u.img.width; + pyrInfo[child].height = data->children[child]->u.img.height; + pyrInfo[child].strideInBytes = data->children[child]->u.img.stride_in_bytes; + pyrInfo[child].imageAlreadyComputed = vx_false_e; + pyrInfo[child].pImage = data->children[child]->buffer; + } + } + } + else if (data->ref.type == VX_TYPE_IMAGE) { + if (data->children) { + 
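// multi-plane image: recursively allocate the buffer of each plane (child) +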
for (vx_uint32 child = 0; child < data->numChildren; child++) { + if (data->children[child]) { + if (agoAllocData(data->children[child])) { + // TBD error handling + return -1; + } + } + } + } + else if (data->u.img.isROI) { + // make sure that the master image has been allocated + if (!data->u.img.roiMasterImage->buffer) { + if (agoAllocData(data->u.img.roiMasterImage) < 0) { + return -1; + } + } + // get the region from master image + data->buffer = data->u.img.roiMasterImage->buffer + + data->u.img.rect_roi.start_y * data->u.img.stride_in_bytes + + ((data->u.img.rect_roi.start_x * data->u.img.pixel_size_in_bits) >> 3); + } + else { + if (data->u.img.isUniform) { + // allocate buffer + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + // initialize image with uniform values + if (data->u.img.format == VX_DF_IMAGE_RGBX) { + vx_uint32 value = (data->u.img.uniform[0] & 0xff) | ((data->u.img.uniform[1] & 0xff) << 8) | ((data->u.img.uniform[2] & 0xff) << 16) | ((data->u.img.uniform[3] & 0xff) << 24); + HafCpu_MemSet_U32(data->size >> 2, (vx_uint32 *)data->buffer, value); + } + else if (data->u.img.format == VX_DF_IMAGE_RGB) { + vx_uint32 value = (data->u.img.uniform[0] & 0xff) | ((data->u.img.uniform[1] & 0xff) << 8) | ((data->u.img.uniform[2] & 0xff) << 16); + vx_uint8 * row = data->buffer; + for (vx_uint32 y = 0; y < data->u.img.height; y++, row += data->u.img.stride_in_bytes) { + HafCpu_MemSet_U24(data->u.img.width, row, value); + } + } + else if (data->u.img.format == VX_DF_IMAGE_UYVY) { + vx_uint32 value = (data->u.img.uniform[1] & 0xff) | ((data->u.img.uniform[0] & 0xff) << 8) | ((data->u.img.uniform[2] & 0xff) << 16) | ((data->u.img.uniform[0] & 0xff) << 24); + HafCpu_MemSet_U32(data->size >> 2, (vx_uint32 *)data->buffer, value); + } + else if (data->u.img.format == VX_DF_IMAGE_YUYV) { + vx_uint32 value = (data->u.img.uniform[0] & 0xff) | ((data->u.img.uniform[1] & 0xff) << 8) | ((data->u.img.uniform[0] & 0xff) << 16) | ((data->u.img.uniform[2] & 0xff) << 24); + HafCpu_MemSet_U32(data->size >> 2, (vx_uint32 *)data->buffer, value); + } + else if (data->u.img.format == VX_DF_IMAGE_U8) { + vx_uint8 value = (vx_uint8)data->u.img.uniform[0]; + HafCpu_MemSet_U8(data->size, data->buffer, value); + } + else if (data->u.img.format == VX_DF_IMAGE_U16 || data->u.img.format == VX_DF_IMAGE_S16) { + vx_uint16 value = (vx_uint16)data->u.img.uniform[0]; + HafCpu_MemSet_U16(data->size >> 1, (vx_uint16 *)data->buffer, value); + } + else if (data->u.img.format == VX_DF_IMAGE_U32 || data->u.img.format == VX_DF_IMAGE_S32) { + vx_uint32 value = (vx_uint32)data->u.img.uniform[0]; + HafCpu_MemSet_U32(data->size >> 2, (vx_uint32 *)data->buffer, value); + } + else if (data->u.img.format == VX_DF_IMAGE_U1_AMD) { + vx_uint8 value = data->u.img.uniform[0] ? 255 : 0; + HafCpu_MemSet_U8(data->size, data->buffer, value); + // make sure that the data->u.img.uniform[0] is 0 or 1 + data->u.img.uniform[0] = data->u.img.uniform[0] ? 
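For uniform images the code above pre-packs the per-channel values into a single 32-bit word so the whole plane can be filled with plain 32-bit stores (for the 4:2:2 formats one word covers two pixels that share chroma). A small sketch of that packing, assuming `HafCpu_MemSet_U32` is essentially a 32-bit fill loop:

```c++
// Sketch: build the 32-bit fill word used for uniform RGBX/UYVY/YUYV images;
// uniform[] holds the per-channel values as in AgoData::u.img.uniform.
#include <cstddef>
#include <cstdint>

constexpr uint32_t fourcc(char a, char b, char c, char d)
{
    return uint32_t(uint8_t(a)) | (uint32_t(uint8_t(b)) << 8) |
           (uint32_t(uint8_t(c)) << 16) | (uint32_t(uint8_t(d)) << 24);
}

static uint32_t packUniformFillWord(uint32_t format, const uint32_t uniform[4])
{
    if (format == fourcc('R', 'G', 'B', 'X'))        // one RGBX pixel per word
        return (uniform[0] & 0xff) | ((uniform[1] & 0xff) << 8) |
               ((uniform[2] & 0xff) << 16) | ((uniform[3] & 0xff) << 24);
    if (format == fourcc('U', 'Y', 'V', 'Y'))        // two pixels per word: U Y V Y
        return (uniform[1] & 0xff) | ((uniform[0] & 0xff) << 8) |
               ((uniform[2] & 0xff) << 16) | ((uniform[0] & 0xff) << 24);
    if (format == fourcc('Y', 'U', 'Y', 'V'))        // two pixels per word: Y U Y V
        return (uniform[0] & 0xff) | ((uniform[1] & 0xff) << 8) |
               ((uniform[0] & 0xff) << 16) | ((uniform[2] & 0xff) << 24);
    return uniform[0];                               // single-channel formats replicate the value
}

// stand-in for HafCpu_MemSet_U32: fill 'count' 32-bit words with 'value'
static void memsetU32(uint32_t * dst, size_t count, uint32_t value)
{
    for (size_t i = 0; i < count; i++) dst[i] = value;
}
```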
1 : 0; + } + else { + // TBD error handling + return -1; + } + } + else { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + } + } + } + else if (data->ref.type == VX_TYPE_ARRAY) { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + } + else if (data->ref.type == VX_TYPE_DISTRIBUTION) { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + data->reserved = data->reserved_allocated = (vx_uint8 *)agoAllocMemory(256 * sizeof(vx_uint32)); + if (!data->buffer_allocated || !data->reserved_allocated) + return -1; + } + else if (data->ref.type == VX_TYPE_LUT) { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + } + else if (data->ref.type == VX_TYPE_THRESHOLD) { + // nothing to do + } + else if (data->ref.type == VX_TYPE_CONVOLUTION) { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + // allocate reserved buffer to store float version of coefficients + data->reserved = data->reserved_allocated = (vx_uint8 *)agoAllocMemory(data->size << 1); + if (!data->reserved_allocated) + return -1; + } + else if (data->ref.type == VX_TYPE_MATRIX) { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + } + else if (data->ref.type == VX_TYPE_REMAP) { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + data->reserved = data->reserved_allocated = (vx_uint8 *)agoAllocMemory(data->u.remap.dst_width * data->u.remap.dst_height * sizeof(ago_coord2d_float_t)); + if (!data->buffer_allocated || !data->reserved_allocated) + return -1; + } + else if (data->ref.type == VX_TYPE_SCALAR) { + // nothing to do + } + else if (data->ref.type == AGO_TYPE_MEANSTDDEV_DATA) { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + } + else if (data->ref.type == AGO_TYPE_MINMAXLOC_DATA) { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + } + else if (data->ref.type == AGO_TYPE_CANNY_STACK) { + // allocate buffer and get aligned buffer with 16-byte alignment + data->buffer = data->buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size); + if (!data->buffer_allocated) + return -1; + } + else if (data->ref.type == AGO_TYPE_SCALE_MATRIX) { + // nothing to do + } + else return -1; + return 0; +} + +void agoRetainData(AgoGraph * graph, AgoData * data, bool isForExternalUse) +{ + if (isForExternalUse) { + data->ref.external_count++; + } + else { + data->ref.internal_count++; + } + if (graph && data->isVirtual) { + // if found in trash, move it to data list + bool 
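For convolutions the reserved buffer allocated above (`data->size << 1`) holds a float copy of the `vx_int16` coefficients, which is exactly twice the size because each 2-byte coefficient becomes a 4-byte float. A hedged sketch of filling that mirror; dividing by the convolution's power-of-two scale follows the OpenVX definition of fixed-point coefficients, but the exact conversion used by the AGO kernels is not shown in this hunk.

```c++
// Sketch (assumption): mirror vx_int16 convolution coefficients into the float
// buffer; 'scale' is the OpenVX convolution scale (a power of two).
#include <cstddef>
#include <cstdint>

static void mirrorConvolutionCoefficients(const int16_t * coeffs, float * coeffsF,
                                          size_t count, uint32_t scale)
{
    for (size_t i = 0; i < count; i++)
        coeffsF[i] = (float)coeffs[i] / (float)scale;
}
```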
foundInTrash = false; + if (data == graph->dataList.trash) { + graph->dataList.trash = data->next; + data->next = nullptr; + foundInTrash = true; + } + else if (graph->dataList.trash) { + for (AgoData * cur = graph->dataList.trash; cur->next; cur = cur->next) { + if (cur->next == data) { + cur->next = data->next; + data->next = nullptr; + foundInTrash = true; + break; + } + } + } + if (foundInTrash) { + // add the data into main part of the list + data->next = graph->dataList.tail; + graph->dataList.tail = data; + if (!graph->dataList.head) + graph->dataList.head = data; + } + } +} + +int agoReleaseData(AgoData * data, bool isForExternalUse) +{ + if (data->isVirtual) { + AgoGraph * graph = (AgoGraph *)data->ref.scope; + CAgoLock lock(graph->cs); + if (isForExternalUse) { + if (data->ref.external_count > 0) + data->ref.external_count--; + } + else { + if (data->ref.internal_count > 0) + data->ref.internal_count--; + } + if (data->ref.external_count == 0 && data->ref.internal_count == 0) { + // clear child link in it's paren link + if (data->parent) { + for (vx_uint32 i = 0; i < data->parent->numChildren; i++) { + if (data->parent->children[i] == data) { + data->parent->children[i] = NULL; + } + } + } + // remove all children of data + for (vx_uint32 i = 0; i < data->numChildren; i++) { + if (data->children[i]) { + // release the children + data->children[i]->ref.external_count = 0; + if (agoReleaseData(data->children[i], false)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoReleaseData: agoReleaseData(context,%s) failed for children[%d]\n", data->children[i]->name.c_str(), i); + return -1; + } + data->children[i] = NULL; + } + } + // remove the data from graph + if (agoRemoveData(&graph->dataList, data, nullptr)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoReleaseData: agoRemoveData(context,%s) failed\n", data->name.c_str()); + return -1; + } + } + } + else { + AgoContext * context = data->ref.context; + CAgoLock lock(context->cs); + if (isForExternalUse) { + if (data->ref.external_count > 0) + data->ref.external_count--; + } + else { + if (data->ref.internal_count > 0) + data->ref.internal_count--; + } + if (data->ref.external_count == 0 && data->ref.internal_count == 0) { + // clear child link in it's paren link + if (data->parent) { + for (vx_uint32 i = 0; i < data->parent->numChildren; i++) { + if (data->parent->children[i] == data) { + data->parent->children[i] = NULL; + } + } + } + // remove all children of data + for (vx_uint32 i = 0; i < data->numChildren; i++) { + if (data->children[i]) { + // release the children + data->children[i]->ref.external_count = 0; + data->children[i]->parent = NULL; // NOTE: this is needed to terminate recursion + if (agoReleaseData(data->children[i], false)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoReleaseData: agoReleaseData(context,%s) failed for children[%d]\n", data->children[i]->name.c_str(), i); + return -1; + } + data->children[i] = NULL; + } + } + // remove the data from context + if (agoRemoveData(&context->dataList, data, nullptr)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoReleaseData: agoRemoveData(context,%s) failed\n", data->name.c_str()); + return -1; + } + } + } + return 0; +} + +int agoReleaseKernel(AgoKernel * kernel, bool isForExternalUse) +{ + vx_context context = kernel->ref.context; + CAgoLock lock(context->cs); + if (isForExternalUse) { + if (kernel->ref.external_count > 0) + kernel->ref.external_count--; + } + else { + if (kernel->ref.internal_count > 0) + kernel->ref.internal_count--; + 
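agoRetainData and agoReleaseData implement the reference-counting convention used throughout this file: `external_count` tracks handles held by the application, `internal_count` tracks references held by the framework itself, and an object is only unlinked and destroyed once both reach zero. A minimal sketch of that convention:

```c++
// Minimal sketch of the two-counter convention: the boolean result tells the
// caller (agoReleaseData / agoReleaseKernel above) whether the object should
// now be removed from its list and destroyed.
struct RefCounts {
    unsigned external_count = 0;   // application handles
    unsigned internal_count = 0;   // references held by graphs, parents, delays
};

static bool releaseRef(RefCounts & ref, bool isForExternalUse)
{
    unsigned & count = isForExternalUse ? ref.external_count : ref.internal_count;
    if (count > 0)
        count--;
    return ref.external_count == 0 && ref.internal_count == 0;
}
```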
} + if (kernel->ref.external_count == 0 && kernel->ref.internal_count == 0 && kernel->external_kernel && !kernel->finalized) { + // only remove the kernels that are created externally + if (agoRemoveKernel(&context->kernelList, kernel) != kernel) { + agoAddLogEntry(&kernel->ref, VX_FAILURE, "ERROR: agoReleaseKernel: agoRemoveKernel(context,%s) failed\n", kernel->name); + return -1; + } + delete kernel; + } + return 0; +} + +AgoNode * agoCreateNode(AgoGraph * graph, AgoKernel * kernel) +{ + AgoNode * node = new AgoNode; + agoResetReference(&node->ref, VX_TYPE_NODE, graph->ref.context, &graph->ref); + node->attr_affinity = graph->attr_affinity; + node->ref.internal_count = 1; + node->akernel = kernel; + node->attr_border_mode.mode = VX_BORDER_MODE_UNDEFINED; + node->localDataSize = kernel->localDataSize; + node->localDataPtr = NULL; + node->paramCount = kernel->argCount; + memcpy(node->parameters, kernel->parameters, sizeof(node->parameters)); + for (vx_uint32 i = 0; i < node->paramCount; i++) { + agoResetReference(&node->parameters[i].ref, VX_TYPE_PARAMETER, graph->ref.context, &graph->ref); + node->parameters[i].scope = &node->ref; + } + agoAddNode(&graph->nodeList, node); + kernel->ref.internal_count++; + return node; +} + +AgoNode * agoCreateNode(AgoGraph * graph, vx_enum kernel_id) +{ + AgoNode * node = NULL; + AgoKernel * kernel = agoFindKernelByEnum(graph->ref.context, kernel_id); + if (kernel) { + node = agoCreateNode(graph, kernel); + } + return node; +} + +int agoReleaseNode(AgoNode * node) +{ + vx_graph graph = (vx_graph)node->ref.scope; + CAgoLock lock(graph->cs); + if (node->ref.external_count > 0) { + node->ref.external_count--; + } + if (node->ref.external_count == 0 && node->ref.internal_count == 0) { + // only remove the node if there are no internal references + if (agoRemoveNode(&graph->nodeList, node, true)) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: agoReleaseNode: agoRemoveNode(graph,%s) failed\n", node->akernel->name); + return -1; + } + } + return 0; +} + +void agoEvaluateIntegerExpression(char * expr) +{ + bool inValue = false; + char opStack[32]; + int valStack[32], valStackTop = 0, opStackTop = 0; + for (const char * s = expr; *s; s++) { + char c = *s; + if (c >= '0' && c <= '9') { + if (!inValue) { + inValue = true; + valStack[valStackTop++] = 0; + } + valStack[valStackTop-1] = valStack[valStackTop-1] * 10 + c - '0'; + } + else if (c == '+' || c == '-' || c == '*' || c == '/' || c == '(' || c == ')') { + if (c == '(') { + if (inValue) + return; // error + opStack[opStackTop++] = c; + } + else { + if (c == ')') { + bool valid = false; + for (; opStackTop-- > 0;) { + if (opStack[opStackTop] == '(') { + valid = true; + break; + } + if (valStackTop < 2) + return; // error + valStackTop--; + if (opStack[opStackTop] == '+') valStack[valStackTop - 1] += valStack[valStackTop]; + if (opStack[opStackTop] == '-') valStack[valStackTop - 1] -= valStack[valStackTop]; + if (opStack[opStackTop] == '*') valStack[valStackTop - 1] *= valStack[valStackTop]; + if (opStack[opStackTop] == '/') valStack[valStackTop - 1] /= valStack[valStackTop]; + } + if (!valid) + return; // error + } + else if (c == '+' || c == '-') { + for (; opStackTop > 0;) { + int op = opStack[opStackTop - 1]; + if (op == '+' || op == '-' || op == '*' || op == '/') { + if (valStackTop < 2) + return; // error + opStackTop--; + valStackTop--; + if (opStack[opStackTop] == '+') valStack[valStackTop - 1] += valStack[valStackTop]; + if (opStack[opStackTop] == '-') valStack[valStackTop - 1] -= 
valStack[valStackTop]; + if (opStack[opStackTop] == '*') valStack[valStackTop - 1] *= valStack[valStackTop]; + if (opStack[opStackTop] == '/') valStack[valStackTop - 1] /= valStack[valStackTop]; + } + else break; + } + opStack[opStackTop++] = c; + } + else if (c == '*' || c == '/') { + for (; opStackTop > 0;) { + int op = opStack[opStackTop - 1]; + if (op == '*' || op == '/') { + if (valStackTop < 2) + return; // error + opStackTop--; + valStackTop--; + if (opStack[opStackTop] == '*') valStack[valStackTop - 1] *= valStack[valStackTop]; + if (opStack[opStackTop] == '/') valStack[valStackTop - 1] /= valStack[valStackTop]; + } + else break; + } + opStack[opStackTop++] = c; + } + } + inValue = false; + } + else + return; // error + } + for (; opStackTop > 0;) { + int op = opStack[opStackTop - 1]; + if (op == '+' || op == '-' || op == '*' || op == '/') { + if (valStackTop < 2) + return; // error + opStackTop--; + valStackTop--; + if (opStack[opStackTop] == '+') valStack[valStackTop - 1] += valStack[valStackTop]; + if (opStack[opStackTop] == '-') valStack[valStackTop - 1] -= valStack[valStackTop]; + if (opStack[opStackTop] == '*') valStack[valStackTop - 1] *= valStack[valStackTop]; + if (opStack[opStackTop] == '/') valStack[valStackTop - 1] /= valStack[valStackTop]; + } + else + return; // error + } + if (valStackTop == 1) { + sprintf(expr, "%d", valStack[0]); + } +} + +void agoImportNodeConfig(AgoNode * childnode, AgoNode * anode) +{ + childnode->attr_border_mode = anode->attr_border_mode; + childnode->attr_affinity = anode->attr_affinity; + if (anode->callback) { + // TBD: need a mechanism to propagate callback changes later in the flow and + // and ability to have multiple callbacks for the same node as multiple original nodes + // can get mapped to one node after optimization and one can get mapped to several nodes + // after optimization + childnode->callback = anode->callback; + } +} + +void agoPerfCaptureReset(vx_perf_t * perf) +{ + memset(perf, 0, sizeof(*perf)); +} + +void agoPerfCaptureStart(vx_perf_t * perf) +{ + perf->beg = agoGetClockCounter(); +} + +void agoPerfCaptureStop(vx_perf_t * perf) +{ + perf->end = agoGetClockCounter(); + perf->tmp = perf->end - perf->beg; + perf->min = (perf->num == 0) ? perf->tmp : ((perf->tmp < perf->min) ? perf->tmp : perf->min); + perf->max = (perf->num == 0) ? perf->tmp : ((perf->tmp > perf->max) ? perf->tmp : perf->max); + perf->sum += perf->tmp; + perf->num++; + perf->avg = perf->sum / perf->num; +} + +void agoPerfCopyNormalize(AgoContext * context, vx_perf_t * perfDst, vx_perf_t * perfSrc) +{ + agoPerfCaptureReset(perfDst); + perfDst->num = perfSrc->num; + perfDst->beg = perfSrc->beg; + perfDst->end = perfSrc->end; + perfDst->tmp = perfSrc->tmp; + perfDst->sum = perfSrc->sum; + perfDst->avg = perfSrc->avg; + perfDst->min = perfSrc->min; + perfDst->max = perfSrc->max; +} + +void agoAddLogEntry(vx_reference ref, vx_status status, const char *message, ...) 
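agoEvaluateIntegerExpression rewrites its argument in place: it evaluates `+ - * /` with the usual precedence and parentheses over an operator stack and a value stack, then prints the integer result back into the same string; on a malformed expression it simply returns and leaves the string untouched. A short usage sketch:

```c++
// Usage sketch for the in-place integer expression evaluator defined above.
#include <cstdio>

void agoEvaluateIntegerExpression(char * expr); // defined in this file

int main()
{
    char expr[64] = "(2+3)*4-10/2";
    agoEvaluateIntegerExpression(expr);
    printf("%s\n", expr);   // prints "15": (2+3)*4 = 20, 10/2 = 5, 20 - 5 = 15
    return 0;
}
```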
+{ + va_list ap; + if (agoIsValidReference(ref) && ref->enable_logging && ref->context->callback_log) { + vx_char string[VX_MAX_LOG_MESSAGE_LEN]; + va_start(ap, message); + vsnprintf(string, VX_MAX_LOG_MESSAGE_LEN, message, ap); + string[VX_MAX_LOG_MESSAGE_LEN - 1] = 0; // for MSVC which is not C99 compliant + va_end(ap); + if (!ref->context->callback_reentrant) { + CAgoLock lock(ref->context->cs); // TBD: create a separate lock object for log_callback + ref->context->callback_log(ref->context, ref, status, string); + } + else { + ref->context->callback_log(ref->context, ref, status, string); + } + } +#if _DEBUG + else { + va_start(ap, message); + vprintf(message, ap); + va_end(ap); + } +#endif +} + +// constructor and destructors of basic data types +AgoReference::AgoReference() +: dispatchTbl{ nullptr }, magic{ AGO_MAGIC_VALID }, type{ VX_TYPE_REFERENCE }, context{ nullptr }, scope{ nullptr }, + external_count{ 0 }, internal_count{ 0 }, read_count{ 0 }, write_count{ 0 }, hint_serialize{ false }, enable_logging{ ENABLE_LOG_MESSAGES_DEFAULT }, + read_only{ false }, status{ VX_SUCCESS } +{ +} +AgoReference::~AgoReference() +{ + magic = AGO_MAGIC_INVALID; +} +AgoData::AgoData() + : next{ nullptr }, size{ 0 }, import_type{ VX_IMPORT_TYPE_NONE }, + buffer{ nullptr }, buffer_allocated{ nullptr }, reserved{ nullptr }, reserved_allocated{ nullptr }, buffer_sync_flags{ 0 }, +#if ENABLE_OPENCL + opencl_buffer{ nullptr }, opencl_buffer_allocated{ nullptr }, +#endif + opencl_svm_buffer{ nullptr }, opencl_svm_buffer_allocated{ nullptr }, opencl_buffer_offset{ 0 }, + isVirtual{ vx_false_e }, isDelayed{ vx_false_e }, isNotFullyConfigured{ vx_false_e }, isInitialized{ vx_false_e }, siblingIndex{ 0 }, + numChildren{ 0 }, children{ nullptr }, parent{ nullptr }, inputUsageCount{ 0 }, outputUsageCount{ 0 }, inoutUsageCount{ 0 }, + hierarchical_level{ 0 }, hierarchical_life_start{ 0 }, hierarchical_life_end{ 0 } +{ + memset(&u, 0, sizeof(u)); + memset(&delta, 0, sizeof(delta)); +} +AgoData::~AgoData() +{ +#if ENABLE_OPENCL + agoGpuOclReleaseData(this); +#endif + if (buffer_allocated) { + agoReleaseMemory(buffer_allocated); + buffer_allocated = nullptr; + } + if (reserved_allocated) { + agoReleaseMemory(reserved_allocated); + reserved_allocated = nullptr; + } +} +AgoParameter::AgoParameter() + : scope{ nullptr }, index{ 0 }, direction{ VX_INPUT }, type{ VX_TYPE_REFERENCE }, state{ VX_PARAMETER_STATE_REQUIRED } +{ +} +AgoParameter::~AgoParameter() +{ +} +AgoKernel::AgoKernel() + : next{ nullptr }, id{ VX_KERNEL_INVALID }, flags{ 0 }, func{ nullptr }, argCount{ 0 }, kernOpType{ 0 }, kernOpInfo{ 0 }, + localDataSize{ 0 }, localDataPtr{ nullptr }, external_kernel{ false }, finalized{ false }, + kernel_f{ nullptr }, input_validate_f{ nullptr }, output_validate_f{ nullptr }, initialize_f{ nullptr }, deinitialize_f{ nullptr }, + query_target_support_f{ nullptr }, opencl_codegen_callback_f{ nullptr }, regen_callback_f{ nullptr }, + importing_module_index_plus1{ 0 } +{ + memset(&name, 0, sizeof(name)); + memset(&argConfig, 0, sizeof(argConfig)); + memset(&argType, 0, sizeof(argType)); +} +AgoKernel::~AgoKernel() +{ +} +AgoSuperNode::AgoSuperNode() + : next{ nullptr }, group{ 0 }, width{ 0 }, height{ 0 }, launched{ false }, isGpuOclSuperNode{ false }, +#if ENABLE_OPENCL + opencl_cmdq{ nullptr }, opencl_program{ nullptr }, opencl_kernel{ nullptr }, opencl_event{ nullptr }, +#endif + status{ VX_SUCCESS } +{ +#if ENABLE_OPENCL + memset(&opencl_global_work, 0, sizeof(opencl_global_work)); +#endif + memset(&perf, 0, 
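agoAddLogEntry is the sink for the ERROR messages emitted throughout this file: it formats the message and hands it to the callback registered on the context, taking the context lock unless the callback was registered as re-entrant (and, in debug builds, printing to stdout when no callback is installed). On the application side the callback is registered through the standard OpenVX logging API; a small usage sketch, assuming the stock `vxRegisterLogCallback` entry point:

```c++
// Application-side sketch: route OpenVX log entries (including the ones emitted
// via agoAddLogEntry above) to stderr.
#include <VX/vx.h>
#include <cstdio>

static void VX_CALLBACK myLogCallback(vx_context context, vx_reference ref,
                                      vx_status status, const vx_char string[])
{
    fprintf(stderr, "[openvx status %d] %s", (int)status, string);
}

int main()
{
    vx_context context = vxCreateContext();
    vxRegisterLogCallback(context, myLogCallback, vx_false_e); // non re-entrant
    // ... build and execute graphs; failures are reported through myLogCallback ...
    vxReleaseContext(&context);
    return 0;
}
```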
sizeof(perf)); +} +AgoSuperNode::~AgoSuperNode() +{ +} +AgoNode::AgoNode() + : next{ nullptr }, akernel{ nullptr }, flags{ 0 }, localDataSize{ 0 }, localDataPtr{ nullptr }, localDataPtr_allocated{ nullptr }, + paramCount{ 0 }, callback{ nullptr }, supernode{ nullptr }, initialized{ false }, target_support_flags{ 0 }, hierarchical_level{ 0 }, status{ VX_SUCCESS } +#if ENABLE_OPENCL + , opencl_type{ 0 }, opencl_param_mem2reg_mask{ 0 }, opencl_param_discard_mask{ 0 }, + opencl_param_atomic_mask{ 0 }, opencl_local_buffer_usage_mask{ 0 }, opencl_local_buffer_size_in_bytes{ 0 }, opencl_work_dim{ 0 }, + opencl_compute_work_multiplier{ 0 }, opencl_compute_work_param_index{ 0 }, opencl_output_array_param_index_plus1{ 0 }, + opencl_program{ nullptr }, opencl_kernel{ nullptr }, opencl_event{ nullptr } +#endif +{ + memset(&attr_border_mode, 0, sizeof(attr_border_mode)); + memset(&attr_affinity, 0, sizeof(attr_affinity)); + memset(¶mList, 0, sizeof(paramList)); + memset(¶mListForAgeDelay, 0, sizeof(paramListForAgeDelay)); + memset(&funcExchange, 0, sizeof(funcExchange)); + memset(&rect_valid, 0, sizeof(rect_valid)); + memset(&perf, 0, sizeof(perf)); +#if ENABLE_OPENCL + memset(&opencl_name, 0, sizeof(opencl_name)); + memset(&opencl_scalar_array_output_sync, 0, sizeof(opencl_scalar_array_output_sync)); + memset(&opencl_global_work, 0, sizeof(opencl_global_work)); + memset(&opencl_local_work, 0, sizeof(opencl_local_work)); +#endif +} +AgoNode::~AgoNode() +{ + agoShutdownNode(this); +#if ENABLE_OPENCL + if (opencl_event) { + clReleaseEvent(opencl_event); + } + if (opencl_kernel) { + clReleaseKernel(opencl_kernel); + } + if (opencl_program) { + clReleaseProgram(opencl_program); + } +#endif +} +AgoGraph::AgoGraph() + : next{ nullptr }, hThread{ nullptr }, hSemToThread{ nullptr }, hSemFromThread{ nullptr }, + threadScheduleCount{ 0 }, threadExecuteCount{ 0 }, threadWaitCount{ 0 }, threadThreadTerminationState{ 0 }, + isReadyToExecute{ vx_false_e }, detectedInvalidNode{ false }, status{ VX_SUCCESS }, + virtualDataGenerationCount{ 0 }, optimizer_flags{ AGO_GRAPH_OPTIMIZER_FLAGS_DEFAULT }, verified{ false } +#if ENABLE_OPENCL + , supernodeList{ nullptr }, opencl_cmdq{ nullptr }, opencl_device{ nullptr } +#endif +{ + memset(&dataList, 0, sizeof(dataList)); + memset(&nodeList, 0, sizeof(nodeList)); + memset(&perf, 0, sizeof(perf)); + memset(&opencl_perf, 0, sizeof(opencl_perf)); + memset(&opencl_perf_total, 0, sizeof(opencl_perf_total)); + memset(&attr_affinity, 0, sizeof(attr_affinity)); + // critical section + InitializeCriticalSection(&cs); +} +AgoGraph::~AgoGraph() +{ + // move all virtual data to garbage data list + while (dataList.trash) { + agoRemoveData(&dataList, dataList.trash, &ref.context->graph_garbage_data); + } + while (dataList.head) { + agoRemoveData(&dataList, dataList.head, &ref.context->graph_garbage_data); + } + + agoResetNodeList(&nodeList); +#if ENABLE_OPENCL + agoResetSuperNodeList(supernodeList); + supernodeList = NULL; + agoGpuOclReleaseGraph(this); +#endif + + // critical section + DeleteCriticalSection(&cs); +} +AgoContext::AgoContext() + : perfNormFactor{ 0 }, dataGenerationCount{ 0 }, nextUserStructId{ VX_TYPE_USER_STRUCT_START }, + num_active_modules{ 0 }, num_active_references{ 0 }, callback_log{ nullptr }, callback_reentrant{ vx_false_e }, + thread_config{ CONFIG_THREAD_DEFAULT }, importing_module_index_plus1{ 0 }, graph_garbage_data{ nullptr }, graph_garbage_node{ nullptr }, graph_garbage_list{ nullptr } +#if ENABLE_OPENCL + , opencl_context_imported{ false }, opencl_context{ 
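The constructors and destructors above initialize and tear down the per-object `CRITICAL_SECTION` with InitializeCriticalSection/DeleteCriticalSection, and the rest of the file takes that lock through scoped `CAgoLock lock(...)` objects. CAgoLock itself is not part of this hunk; a minimal sketch of such an RAII guard, assuming it simply wraps Enter/LeaveCriticalSection (with an equivalent mapping on the Linux build):

```c++
// Sketch of a scoped critical-section guard in the spirit of CAgoLock
// (Windows flavor; the Linux build presumably maps these calls to pthreads).
#include <windows.h>

class ScopedCsLock {
public:
    explicit ScopedCsLock(CRITICAL_SECTION & cs) : cs_(cs) { EnterCriticalSection(&cs_); }
    ~ScopedCsLock() { LeaveCriticalSection(&cs_); }
    ScopedCsLock(const ScopedCsLock &) = delete;
    ScopedCsLock & operator=(const ScopedCsLock &) = delete;
private:
    CRITICAL_SECTION & cs_;
};
```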
nullptr }, opencl_cmdq{ nullptr }, opencl_config_flags{ 0 }, opencl_svmcaps{ 0 }, opencl_num_devices{ 0 } +#endif +{ + memset(&kernelList, 0, sizeof(kernelList)); + memset(&dataList, 0, sizeof(dataList)); + memset(&graphList, 0, sizeof(graphList)); + memset(&immediate_border_mode, 0, sizeof(immediate_border_mode)); + memset(&extensions, 0, sizeof(extensions)); +#if ENABLE_OPENCL + memset(&opencl_extensions, 0, sizeof(opencl_extensions)); + memset(&opencl_device_list, 0, sizeof(opencl_device_list)); + memset(&opencl_build_options, 0, sizeof(opencl_build_options)); +#endif + memset(&attr_affinity, 0, sizeof(attr_affinity)); + // critical section + InitializeCriticalSection(&cs); + // initialize constants as enumerations with name "!" + for (vx_uint32 i = 0; s_table_constants[i].name; i++) { + char word[64]; + sprintf(word, "scalar:ENUM,0x%08x", s_table_constants[i].value); + AgoData * data = agoCreateDataFromDescription(this, NULL, word, false); + if (!data) { + agoAddLogEntry(nullptr, VX_FAILURE, "ERROR: AgoContext::AgoContext: agoCreateDataFromDescription(*,%s) failed\n", word); + ref.status = VX_FAILURE; + } + else { + char name[256]; + name[0] = '!'; + strcpy(name + 1, s_table_constants[i].name); + data->name = name; + agoAddData(&dataList, data); + } + } +} +AgoContext::~AgoContext() +{ + for (AgoGraph * agraph = graphList.head; agraph;) { + AgoGraph * next = agraph->next; + agraph->ref.external_count = 1; + agraph->ref.internal_count = 0; + agoReleaseGraph(agraph); + agraph = next; + } + + agoResetDataList(&dataList); + for (AgoData * data = graph_garbage_data; data;) { + AgoData * item = data; + data = data->next; + delete item; + } + + for (AgoNode * node = graph_garbage_node; node;) { + AgoNode * item = node; + node = node->next; + delete item; + } + + for (AgoGraph * graph = graph_garbage_list; graph;) { + AgoGraph * item = graph; + graph = graph->next; + delete item; + } + + for (auto it = macros.begin(); it != macros.end(); ++it) { + if (it->text_allocated) + free(it->text_allocated); + } + agoResetKernelList(&kernelList); +#if ENABLE_OPENCL + agoGpuOclReleaseContext(this); +#endif + // critical section + DeleteCriticalSection(&cs); +} diff --git a/openvx/ago/ago_util_opencl.cpp b/openvx/ago/ago_util_opencl.cpp new file mode 100644 index 0000000..7e2499a --- /dev/null +++ b/openvx/ago/ago_util_opencl.cpp @@ -0,0 +1,1816 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#include "ago_internal.h" +#include "ago_haf_gpu.h" + +#define ENABLE_LOCAL_DEBUG_MESSAGES 0 +#define ENABLE_DEBUG_DUMP_CL_BUFFERS 0 + +#if ENABLE_DEBUG_DUMP_CL_BUFFERS +static void clDumpBuffer(const char * fileNameFormat, cl_command_queue opencl_cmdq, AgoData * data) +{ + if(!data->opencl_buffer) return; + static int dumpBufferCount = 0; dumpBufferCount++; + char fileName[1024]; sprintf(fileName, fileNameFormat, dumpBufferCount); + cl_mem opencl_buffer = data->opencl_buffer; + cl_uint opencl_buffer_offset = data->opencl_buffer_offset; + cl_uint size = (cl_uint)data->size; + FILE * fp = fopen(fileName, "wb"); if (!fp) { printf("ERROR: unable to create: %s\n", fileName); exit(1); } + clFinish(opencl_cmdq); + void * p = clEnqueueMapBuffer(opencl_cmdq, opencl_buffer, CL_TRUE, CL_MAP_READ, 0, opencl_buffer_offset + size, 0, NULL, NULL, NULL); + fwrite(p, 1, opencl_buffer_offset + size, fp); + clEnqueueUnmapMemObject(opencl_cmdq, opencl_buffer, p, 0, NULL, NULL); + if (data->ref.type == VX_TYPE_IMAGE) { + printf("OK: dumped buffer %4.4s %dx%d,%d (%d+%d bytes) into %s\n", &data->u.img.format, data->u.img.width, data->u.img.height, data->u.img.stride_in_bytes, opencl_buffer_offset, size, fileName); + } + else { + printf("OK: dumped buffer (%d+%d bytes) into %s\n", opencl_buffer_offset, size, fileName); + } + fclose(fp); +} +#endif + +#if ENABLE_OPENCL +int agoGpuOclReleaseContext(AgoContext * context) +{ + if (context->opencl_cmdq) { + cl_int status = clReleaseCommandQueue(context->opencl_cmdq); + if (status) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: agoGpuOclReleaseContext: clReleaseCommandQueue(%p) failed (%d)\n", context->opencl_cmdq, status); + return -1; + } + context->opencl_cmdq = NULL; + } + if (context->opencl_context && !context->opencl_context_imported) { + cl_int status = clReleaseContext(context->opencl_context); + if (status) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: agoGpuOclReleaseContext: clReleaseContext(%p) failed (%d)\n", context->opencl_context, status); + return -1; + } + } + context->opencl_context = NULL; + return 0; +} + +int agoGpuOclReleaseGraph(AgoGraph * graph) +{ + if (graph->opencl_cmdq) { + cl_int status = clReleaseCommandQueue(graph->opencl_cmdq); + if (status) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: agoGpuOclReleaseGraph: clReleaseCommandQueue(%p) failed (%d)\n", graph->opencl_cmdq, status); + return -1; + } + graph->opencl_cmdq = NULL; + } + return 0; +} + +int agoGpuOclReleaseSuperNode(AgoSuperNode * supernode) +{ + cl_int err; + if (supernode->opencl_kernel) { + err = clReleaseKernel(supernode->opencl_kernel); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clReleaseKernel(%p) failed(%d)\n", supernode->opencl_kernel, err); + return -1; + } + } + if (supernode->opencl_program) { + err = clReleaseProgram(supernode->opencl_program); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clReleaseProgram(%p) failed(%d)\n", supernode->opencl_program, err); + return -1; + } + } + if (supernode->opencl_event) { + clReleaseEvent(supernode->opencl_event); + } + return 0; +} + +int agoGpuOclReleaseData(AgoData * data) +{ + if (data->opencl_buffer_allocated) { + clReleaseMemObject(data->opencl_buffer_allocated); + data->opencl_buffer_allocated = NULL; + } + if (data->opencl_svm_buffer_allocated) { + if (data->ref.context->opencl_config_flags & CONFIG_OPENCL_SVM_AS_FGS) { + agoReleaseMemory(data->opencl_svm_buffer_allocated); + } + else { + clSVMFree(data->ref.context->opencl_context, 
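The release helpers that follow share a simple ownership rule: only handles stored in `*_allocated` members (and a context that was not imported) are actually released; the plain `opencl_buffer`/`opencl_svm_buffer` pointers may alias a master image, a user-supplied buffer, or an SVM allocation, so they are only cleared. A minimal sketch of that rule for a data buffer:

```c++
// Sketch of the ownership rule: release only what this object allocated.
#include <CL/cl.h>

struct GpuBufferRefs {
    cl_mem opencl_buffer = nullptr;            // may alias a master image or user buffer
    cl_mem opencl_buffer_allocated = nullptr;  // owned: released here
};

static void releaseGpuBuffer(GpuBufferRefs & d)
{
    if (d.opencl_buffer_allocated) {
        clReleaseMemObject(d.opencl_buffer_allocated);
        d.opencl_buffer_allocated = nullptr;
    }
    d.opencl_buffer = nullptr;   // never released directly: it might not be owned
}
```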
data->opencl_svm_buffer_allocated); + } + data->opencl_svm_buffer_allocated = NULL; + } + data->opencl_buffer = NULL; + data->opencl_svm_buffer = NULL; + data->opencl_buffer_offset = 0; + return 0; +} + +int agoGpuOclCreateContext(AgoContext * context, cl_context opencl_context) +{ + if (opencl_context) { + // use the given OpenCL context + context->opencl_context_imported = true; + context->opencl_context = opencl_context; + } + else { + // get AMD platform + cl_uint num_platforms; + cl_int status; + if ((status = clGetPlatformIDs(0, NULL, &num_platforms)) != CL_SUCCESS) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clGetPlatformIDs(0,0,*) => %d (failed)\n", status); + return -1; + } + cl_platform_id * platform_list = new cl_platform_id[num_platforms]; + if ((status = clGetPlatformIDs(num_platforms, platform_list, NULL)) != CL_SUCCESS) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clGetPlatformIDs(%d,*,0) => %d (failed)\n", num_platforms, status); + return -1; + } + cl_platform_id platform_id = 0; + for (int i = 0; i < (int)num_platforms; i++) { + char vendor[128] = { 0 }; + if ((status = clGetPlatformInfo(platform_list[i], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL)) != CL_SUCCESS) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clGetPlatformInfo([%d],...) => %d (failed)\n", i, status); + return -1; + } + if (!strcmp(vendor, "Advanced Micro Devices, Inc.")) { + platform_id = platform_list[i]; + break; + } + } + delete [] platform_list; + if (!platform_id) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: Could not find a valid AMD platform\n"); + return -1; + } + // set context properties + cl_context_properties ctxprop[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, + 0, 0 + }; + // create context + context->opencl_context_imported = false; + context->opencl_context = clCreateContextFromType(ctxprop, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); + if (!context || status != CL_SUCCESS) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clCreateContextFromType(CL_DEVICE_TYPE_GPU) => %d (failed)\n", status); + return -1; + } + } + // get the list of GPUs + size_t size; + cl_int status = clGetContextInfo(context->opencl_context, CL_CONTEXT_DEVICES, sizeof(context->opencl_device_list), context->opencl_device_list, &size); + if (status != CL_SUCCESS) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clGetContextInfo() => %d\n", status); + return -1; + } + context->opencl_num_devices = (int)(size / sizeof(cl_device_id)); + // select device id + int device_id = 0; + if (context->attr_affinity.device_type == AGO_TARGET_AFFINITY_GPU) { + if ((context->attr_affinity.device_info & AGO_TARGET_AFFINITY_GPU_INFO_DEVICE_MASK) < context->opencl_num_devices) { + device_id = context->attr_affinity.device_info & AGO_TARGET_AFFINITY_GPU_INFO_DEVICE_MASK; + } + } + // get device information + char deviceVersion[256] = { 0 }; + status = clGetDeviceInfo(context->opencl_device_list[device_id], CL_DEVICE_VERSION, sizeof(deviceVersion), deviceVersion, NULL); + if (status) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clGetDeviceInfo(%p,CL_DEVICE_VERSION) => %d\n", context->opencl_device_list[device_id], status); + return -1; + } + // check for OpenCL 1.2 version: force OpenCL 1.2 if environment variable AGO_OPENCL_VERSION_CHECK=1.2 + char opencl_version_check[64] = ""; + agoGetEnvironmentVariable("AGO_OPENCL_VERSION_CHECK", opencl_version_check, sizeof(opencl_version_check)); + if (deviceVersion[7] < '2' || !strcmp(opencl_version_check, "1.2")) { + // mark that kernels have to be OpenCL 
1.2 compatible + context->opencl_config_flags |= CONFIG_OPENCL_USE_1_2; + } + // get device capabilities + char deviceName[256] = { 0 }; + status = clGetDeviceInfo(context->opencl_device_list[device_id], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); + if (status) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clGetDeviceInfo(%p,CL_DEVICE_NAME) => %d\n", context->opencl_device_list[device_id], status); + return -1; + } + agoAddLogEntry(&context->ref, VX_SUCCESS, "OK: OpenVX using GPU device#%d (%s) [%s] [SvmCaps " VX_FMT_SIZE " %d]\n", device_id, deviceName, deviceVersion, context->opencl_svmcaps, context->opencl_config_flags); + memset(context->opencl_extensions, 0, sizeof(context->opencl_extensions)); + status = clGetDeviceInfo(context->opencl_device_list[device_id], CL_DEVICE_EXTENSIONS, sizeof(context->opencl_extensions), context->opencl_extensions, NULL); + if (status) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clGetDeviceInfo(%p,CL_DEVICE_EXTENSIONS) => %d\n", context->opencl_device_list[device_id], status); + return -1; + } + context->opencl_svmcaps = 0; + status = clGetDeviceInfo(context->opencl_device_list[device_id], CL_DEVICE_SVM_CAPABILITIES, sizeof(context->opencl_svmcaps), &context->opencl_svmcaps, NULL); + if (status) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clGetDeviceInfo(%p,CL_DEVICE_SVM_CAPABILITIES) => %d\n", context->opencl_device_list[device_id], status); + return -1; + } + // get default OpenCL build options + strcpy(context->opencl_build_options, (context->opencl_config_flags & CONFIG_OPENCL_USE_1_2) ? "-cl-std=CL1.2" : "-cl-std=CL2.0"); + // override build options with environment variable + agoGetEnvironmentVariable("AGO_OPENCL_BUILD_OPTIONS", context->opencl_build_options, sizeof(context->opencl_build_options)); + // override affinity device_info + char opencl_device_info[64] = ""; + agoGetEnvironmentVariable("AGO_OPENCL_DEVICE_INFO", opencl_device_info, sizeof(opencl_device_info)); + if (opencl_device_info[0] >= '0' && opencl_device_info[0] <= '9') { + context->attr_affinity.device_info = atoi(opencl_device_info); + } + + // decide SVM features + if (context->opencl_svmcaps & (CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) { + context->opencl_config_flags &= ~CONFIG_OPENCL_SVM_MASK; + if (context->attr_affinity.device_info & AGO_TARGET_AFFINITY_GPU_INFO_SVM_MASK) { + // set SVM flags based on device capabilities and affinity + context->opencl_config_flags |= CONFIG_OPENCL_SVM_ENABLE; + if (!(context->attr_affinity.device_info & AGO_TARGET_AFFINITY_GPU_INFO_SVM_NO_FGS)) { + if (context->opencl_svmcaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) { + context->opencl_config_flags |= CONFIG_OPENCL_SVM_AS_FGS; + } + } + if (context->attr_affinity.device_info & AGO_TARGET_AFFINITY_GPU_INFO_SVM_AS_CLMEM) { + if (!(context->opencl_config_flags & CONFIG_OPENCL_SVM_AS_FGS)) { + context->opencl_config_flags |= CONFIG_OPENCL_SVM_AS_CLMEM; + } + } + } + else { + // default: TBD (SVM not enabled, for now) + if (context->opencl_svmcaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) { + // context->opencl_config_flags |= (CONFIG_OPENCL_SVM_ENABLE | CONFIG_OPENCL_SVM_AS_FGS); + } + else { + // context->opencl_config_flags |= CONFIG_OPENCL_SVM_ENABLE; + } + } + } + // create command queue for buffer sync + context->opencl_cmdq = clCreateCommandQueueWithProperties(context->opencl_context, context->opencl_device_list[device_id], NULL, &status); + if (status) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: 
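Three environment variables read above let the OpenCL setup be steered without rebuilding: `AGO_OPENCL_VERSION_CHECK` forces OpenCL 1.2 compatible kernels, `AGO_OPENCL_BUILD_OPTIONS` replaces the default `-cl-std=CL1.2`/`-cl-std=CL2.0` build options, and `AGO_OPENCL_DEVICE_INFO` overrides the affinity `device_info` word (device index and SVM control bits, interpreted per the `AGO_TARGET_AFFINITY_GPU_INFO_*` masks). A small host-side sketch that sets them before the context is created (POSIX `setenv`; a Windows build would use `_putenv_s`), with example values only:

```c++
// Sketch: configure the AGO OpenCL backend through its environment variables
// before vxCreateContext() is called. Values shown are examples only.
#include <cstdlib>

static void configureAgoOpenCl()
{
    setenv("AGO_OPENCL_VERSION_CHECK", "1.2", 1);            // force OpenCL 1.2 kernels
    setenv("AGO_OPENCL_BUILD_OPTIONS", "-cl-std=CL1.2", 1);  // override build options
    setenv("AGO_OPENCL_DEVICE_INFO", "1", 1);                // override affinity device_info
}
```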
clCreateCommandQueueWithProperties(%p,%p,0,*) => %d\n", context->opencl_context, context->opencl_device_list[device_id], status); + return -1; + } + + return 0; +} + +int agoGpuOclAllocBuffer(AgoData * data) +{ + // make sure buffer is valid + if (agoDataSanityCheckAndUpdate(data)) { + return -1; + } + // allocate buffer + AgoContext * context = data->ref.context; + if (data->ref.type == VX_TYPE_IMAGE) { + AgoData * dataMaster = data->u.img.roiMasterImage ? data->u.img.roiMasterImage : data; // to handle image ROI + if (!dataMaster->opencl_buffer && !dataMaster->u.img.enableUserBufferOpenCL) { + cl_int err = CL_SUCCESS; + dataMaster->opencl_buffer_offset = 256 + dataMaster->u.img.stride_in_bytes; + if (!dataMaster->buffer && !dataMaster->u.img.isUniform) { + if (context->opencl_config_flags & CONFIG_OPENCL_SVM_ENABLE) { + if (context->opencl_config_flags & CONFIG_OPENCL_SVM_AS_FGS) { + // allocate SVM buffer for fine grain system access + dataMaster->opencl_svm_buffer = dataMaster->opencl_svm_buffer_allocated = (vx_uint8 *)agoAllocMemory(dataMaster->size + dataMaster->opencl_buffer_offset); + if (!dataMaster->opencl_svm_buffer_allocated) { + agoAddLogEntry(&dataMaster->ref, VX_FAILURE, "ERROR: agoAllocMemory(%d) => NULL\n", (int)dataMaster->size + dataMaster->opencl_buffer_offset); + return -1; + } + } + else { + // allocate SVM buffer + dataMaster->opencl_svm_buffer = dataMaster->opencl_svm_buffer_allocated = (vx_uint8 *)clSVMAlloc(context->opencl_context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, dataMaster->size + dataMaster->opencl_buffer_offset, 0); + if (!dataMaster->opencl_svm_buffer_allocated) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clSVMAlloc(%p,CL_MEM_READ_WRITE|CL_MEM_SVM_FINE_GRAIN_BUFFER,%d,0,*) => NULL\n", context->opencl_context, (int)dataMaster->size + dataMaster->opencl_buffer_offset); + return -1; + } + } + } + } + if (dataMaster->opencl_svm_buffer_allocated) { + // use svm buffer as buffer(CPU) + dataMaster->buffer = dataMaster->opencl_svm_buffer_allocated + dataMaster->opencl_buffer_offset; + if (context->opencl_config_flags & CONFIG_OPENCL_SVM_AS_CLMEM) { + // use svm buffer as opencl_buffer(GPU) + dataMaster->opencl_buffer = dataMaster->opencl_buffer_allocated = clCreateBuffer(context->opencl_context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, dataMaster->size + dataMaster->opencl_buffer_offset, dataMaster->opencl_svm_buffer_allocated, &err); + } + } + else { + // allocate normal opencl_buffer + dataMaster->opencl_buffer = dataMaster->opencl_buffer_allocated = clCreateBuffer(context->opencl_context, CL_MEM_READ_WRITE, dataMaster->size + dataMaster->opencl_buffer_offset, NULL, &err); + } + if (err) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clCreateBuffer(%p,CL_MEM_READ_WRITE,%d,0,*) => %d\n", context->opencl_context, (int)dataMaster->size + dataMaster->opencl_buffer_offset, err); + return -1; + } + if (dataMaster->u.img.isUniform) { + // make sure that CPU buffer is allocated + if (!dataMaster->buffer) { + if (agoAllocData(dataMaster)) { + return -1; + } + } + // copy the uniform image into OpenCL buffer because there won't be any commits happening to this buffer + cl_int err = clEnqueueWriteBuffer(context->opencl_cmdq, dataMaster->opencl_buffer, CL_TRUE, dataMaster->opencl_buffer_offset, dataMaster->size, dataMaster->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: agoGpuOclAllocBuffer: clEnqueueWriteBuffer() => %d\n", err); + return -1; + } + dataMaster->buffer_sync_flags |= 
AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + } + } + if (data != dataMaster) { + // special handling for image ROI + data->opencl_buffer = dataMaster->opencl_buffer; + data->opencl_svm_buffer = dataMaster->opencl_svm_buffer; + data->opencl_buffer_offset = data->u.img.rect_roi.start_y * data->u.img.stride_in_bytes + + ((data->u.img.rect_roi.start_x * (vx_uint32)data->u.img.pixel_size_in_bits) >> 3) + + dataMaster->opencl_buffer_offset; + } + } + else if (data->ref.type == VX_TYPE_ARRAY || data->ref.type == AGO_TYPE_CANNY_STACK) { + if (!data->opencl_buffer) { + data->opencl_buffer_offset = DATA_OPENCL_ARRAY_OFFSET; // first few bytes reserved for numitems/stacktop + cl_int err = CL_SUCCESS; + if (!data->buffer) { + if (context->opencl_config_flags & CONFIG_OPENCL_SVM_ENABLE) { + if (context->opencl_config_flags & CONFIG_OPENCL_SVM_AS_FGS) { + // allocate SVM buffer for fine grain system access + data->opencl_svm_buffer = data->opencl_svm_buffer_allocated = (vx_uint8 *)agoAllocMemory(data->size + data->opencl_buffer_offset); + if (!data->opencl_svm_buffer_allocated) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoAllocMemory(%d) => NULL\n", (int)data->size + data->opencl_buffer_offset); + return -1; + } + } + else { + // allocate SVM buffer + data->opencl_svm_buffer = data->opencl_svm_buffer_allocated = (vx_uint8 *)clSVMAlloc(context->opencl_context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, data->size + data->opencl_buffer_offset, 0); + if (!data->opencl_svm_buffer_allocated) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSVMAlloc(%p,CL_MEM_READ_WRITE|CL_MEM_SVM_FINE_GRAIN_BUFFER,%d,0,*) => NULL\n", context->opencl_context, (int)data->size + data->opencl_buffer_offset); + return -1; + } + } + // initialize array header which containts numitems + if (data->opencl_svm_buffer) + memset(data->opencl_svm_buffer, 0, data->opencl_buffer_offset); + } + } + if (data->opencl_svm_buffer_allocated) { + // use svm buffer as buffer(CPU) + data->buffer = data->opencl_svm_buffer_allocated + data->opencl_buffer_offset; + if (context->opencl_config_flags & CONFIG_OPENCL_SVM_AS_CLMEM) { + // use svm buffer as opencl_buffer(GPU) + data->opencl_buffer = data->opencl_buffer_allocated = clCreateBuffer(context->opencl_context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, data->size + data->opencl_buffer_offset, data->opencl_svm_buffer_allocated, &err); + } + } + else { + // normal opencl_buffer allocation + data->opencl_buffer = data->opencl_buffer_allocated = clCreateBuffer(context->opencl_context, CL_MEM_READ_WRITE, data->size + data->opencl_buffer_offset, NULL, &err); + if (data->opencl_buffer) { + // initialize array header which containts numitems + vx_uint32 zero = 0; + cl_event ev = nullptr; + err = clEnqueueFillBuffer(context->opencl_cmdq, data->opencl_buffer, &zero, sizeof(zero), 0, data->opencl_buffer_offset, 0, NULL, &ev); + if (!err) err = clWaitForEvents(1, &ev); + } + } + if (err) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clCreateBuffer(%p,CL_MEM_READ_WRITE,%d,0,*) => %d (array/cannystack)\n", context->opencl_context, (int)data->size, err); + return -1; + } + } + } + else if (data->ref.type == VX_TYPE_SCALAR || data->ref.type == VX_TYPE_THRESHOLD || data->ref.type == VX_TYPE_MATRIX || data->ref.type == VX_TYPE_CONVOLUTION) { + // nothing to do + } + else if (data->ref.type == VX_TYPE_LUT) { + if (!data->opencl_buffer) { + cl_int err = -1; + cl_image_format format = { CL_INTENSITY, CL_UNORM_INT8 }; + cl_image_desc desc = { CL_MEM_OBJECT_IMAGE1D, 256, 0, 0, 1, 0, 0, 0, 0, NULL }; + 
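The image-ROI case above shows the addressing scheme: an ROI never gets its own `cl_mem`; it reuses the master image's buffer and only carries the byte offset of its top-left pixel, added on top of the master's own offset (256 bytes plus one stride of header/padding placed in front of the pixel data). A small sketch of that computation:

```c++
// Sketch of the ROI byte-offset computation used above; the master's own
// opencl_buffer_offset (typically 256 + one stride) is added on top.
#include <cstdint>

static uint32_t roiByteOffset(uint32_t start_x, uint32_t start_y,
                              uint32_t stride_in_bytes, uint32_t pixel_size_in_bits,
                              uint32_t master_buffer_offset)
{
    return start_y * stride_in_bytes
         + ((start_x * pixel_size_in_bits) >> 3)   // bit-exact for sub-byte formats such as U1
         + master_buffer_offset;
}
```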
data->opencl_buffer = data->opencl_buffer_allocated = clCreateImage(context->opencl_context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); + if (err) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clCreateImage(%p,CL_MEM_READ_WRITE,1D/U8,256,0,*) => %d\n", context->opencl_context, err); + return -1; + } + data->opencl_buffer_offset = 0; + } + } + else if (data->ref.type == VX_TYPE_REMAP) { + if (!data->opencl_buffer) { + cl_int err = -1; + data->opencl_buffer = data->opencl_buffer_allocated = clCreateBuffer(context->opencl_context, CL_MEM_READ_WRITE, data->size, NULL, &err); + if (err) { + agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clCreateBuffer(%p,CL_MEM_READ_WRITE,%d,0,*) => %d\n", context->opencl_context, (int)data->size, err); + return -1; + } + data->opencl_buffer_offset = 0; + } + } + else if (data->numChildren > 0) { + for (vx_uint32 child = 0; child < data->numChildren; child++) { + if (agoGpuOclAllocBuffer(data->children[child]) < 0) { + return -1; + } + } + } + else { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclAllocBuffer: doesn't support object type %s of %s\n", agoEnum2Name(data->ref.type), data->name.length() ? "?" : data->name.c_str()); + return -1; + } + // allocate CPU buffer + if (agoAllocData(data)) { + return -1; + } + return 0; +} + +int agoGpuOclAllocBuffers(AgoGraph * graph, AgoNode * node) +{ + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data && !data->opencl_buffer) { + if (agoIsPartOfDelay(data)) { + int siblingTrace[AGO_MAX_DEPTH_FROM_DELAY_OBJECT], siblingTraceCount = 0; + data = agoGetSiblingTraceToDelay(data, siblingTrace, siblingTraceCount); + if (!data) return -1; + } + if (agoGpuOclAllocBuffer(data) < 0) { + return -1; + } + } + } + return 0; +} + +int agoGpuOclSuperNodeMerge(AgoGraph * graph, AgoSuperNode * supernode, AgoNode * node) +{ + // sanity check + if (!node->akernel->func && !node->akernel->opencl_codegen_callback_f) { + agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: agoGpuOclSuperNodeMerge: doesn't support kernel %s\n", node->akernel->name); + return -1; + } + // merge node into supernode + supernode->nodeList.push_back(node); + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data) { + size_t index = std::find(supernode->dataList.begin(), supernode->dataList.end(), data) - supernode->dataList.begin(); + if (index == supernode->dataList.size()) { + // add data with zero entries into the lists + AgoSuperNodeDataInfo info = { 0 }; + info.needed_as_a_kernel_argument = true; + supernode->dataInfo.push_back(info); + supernode->dataList.push_back(data); + supernode->dataListForAgeDelay.push_back(data); + } + // update count for data direction + supernode->dataInfo[index].argument_usage[node->parameters[i].direction]++; + } + } + return 0; +} + +static const char * agoGpuGetKernelFunctionName(AgoNode * node) +{ + const char * kname = node->akernel->name; + for (const char * p = kname; *p; p++) + if (*p == '.') + kname = p + 1; + return kname; +} + +static const char * agoGpuImageFormat2RegType(vx_df_image format) +{ + const char * reg_type = "?"; + if (format == VX_DF_IMAGE_U1_AMD) reg_type = "U1"; + else if (format == VX_DF_IMAGE_U8) reg_type = "U8"; + else if (format == VX_DF_IMAGE_S16) reg_type = "S16"; + else if (format == VX_DF_IMAGE_U16) reg_type = "U16"; + else if (format == VX_DF_IMAGE_U32) reg_type = "U32"; + else if (format == VX_DF_IMAGE_RGB) reg_type = "U24"; + else if (format == VX_DF_IMAGE_RGBX) 
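agoGpuGetKernelFunctionName above derives the OpenCL kernel function name by taking the last dot-separated component of the OpenVX kernel name. A tiny usage sketch (the kernel name shown follows the standard OpenVX naming scheme):

```c++
// Sketch: last dot-separated component of a kernel name, as in
// agoGpuGetKernelFunctionName above.
#include <cstdio>

static const char * kernelFunctionName(const char * kname)
{
    for (const char * p = kname; *p; p++)
        if (*p == '.')
            kname = p + 1;
    return kname;
}

int main()
{
    printf("%s\n", kernelFunctionName("org.khronos.openvx.box_3x3")); // prints "box_3x3"
    return 0;
}
```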
reg_type = "U32"; + else if (format == VX_DF_IMAGE_UYVY) reg_type = "U16"; + else if (format == VX_DF_IMAGE_YUYV) reg_type = "U16"; + else if (format == VX_DF_IMAGE_F32_AMD) reg_type = "F32"; + return reg_type; +} + +int agoGpuOclDataSetBufferAsKernelArg(AgoData * data, cl_kernel opencl_kernel, vx_uint32 kernelArgIndex, vx_uint32 group) +{ + cl_int err = CL_INVALID_MEM_OBJECT; + if (data->opencl_buffer) { + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(data->opencl_buffer), &data->opencl_buffer); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,buffer) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + } + else if (data->opencl_svm_buffer) { + err = clSetKernelArgSVMPointer(opencl_kernel, (cl_uint)kernelArgIndex, data->opencl_svm_buffer); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArgSVMPointer(supernode,%d,*,buffer) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + } + return err; +} + +static int agoGpuOclSetKernelArgs(cl_kernel opencl_kernel, vx_uint32& kernelArgIndex, AgoData * data, bool need_access, vx_uint32 dataFlags, vx_uint32 group) +{ + cl_int err; + if (data->ref.type == VX_TYPE_IMAGE) { + if (need_access) { // only use image objects that need read/write access + if (dataFlags & NODE_OPENCL_TYPE_NEED_IMGSIZE) { + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(data->u.img.width), &data->u.img.width); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,width) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(data->u.img.height), &data->u.img.height); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,height) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + kernelArgIndex++; + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(data->u.img.stride_in_bytes), &data->u.img.stride_in_bytes); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,stride) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(data->opencl_buffer_offset), &data->opencl_buffer_offset); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,offset) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + } + else if (data->ref.type == VX_TYPE_ARRAY) { + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + kernelArgIndex++; + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(data->opencl_buffer_offset), &data->opencl_buffer_offset); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,arr:offset) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + // NOTE: capacity is used when array is atomic output and numitems is used otherwise + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(vx_uint32), &data->u.arr.capacity); + if (err) { + 
agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,arr:capacity) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == AGO_TYPE_CANNY_STACK) { + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + kernelArgIndex++; + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(data->opencl_buffer_offset), &data->opencl_buffer_offset); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,cannystack:offset) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + // NOTE: count is used when cannystack is output and stacktop is used when cannystack is input + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(vx_uint32), &data->u.cannystack.count); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,cannystack:count) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_THRESHOLD) { + size_t size = sizeof(cl_uint); + cl_uint2 value; + value.s0 = data->u.thr.threshold_lower; + if (data->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) { + size = sizeof(cl_uint2); + value.s1 = data->u.thr.threshold_upper; + } + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, size, &value); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,%d,threshold) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, (int)size, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_SCALAR) { + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(cl_uint), &data->u.scalar.u.u); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,scalar) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_MATRIX) { + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, data->size, data->buffer); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,matrix) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_CONVOLUTION) { + agoAllocData(data); // make sure that the data has been allocated + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, data->size << 1, data->reserved); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,convolution) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_LUT) { + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_REMAP) { + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + kernelArgIndex++; + vx_uint32 stride = data->u.remap.dst_width * sizeof(vx_uint32); + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(stride), &stride); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,stride) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else if 
(data->ref.type == VX_TYPE_SCALAR) { + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(data->u.scalar.u.i), &data->u.scalar.u.i); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,scalar) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclSetKernelArgs: doesn't support object type %s in group#%d for kernel arg setting\n", agoEnum2Name(data->ref.type), group); + return -1; + } + return 0; +} + +static int agoGpuOclDataInputSync(AgoGraph * graph, cl_kernel opencl_kernel, vx_uint32& kernelArgIndex, AgoData * data, vx_uint32 dataFlags, vx_uint32 group, bool need_access, bool need_read_access, bool need_atomic_access) +{ + cl_int err; + if (data->ref.type == VX_TYPE_IMAGE) { + if (need_access) { // only use image objects that need read access + if (dataFlags & NODE_OPENCL_TYPE_NEED_IMGSIZE) { + kernelArgIndex += 2; + } + if (data->isDelayed) { + // needs to set opencl_buffer everytime when the buffer is part of a delay object + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + } + else if (data->u.img.enableUserBufferOpenCL && data->opencl_buffer) { + // need to set opencl_buffer and opencl_buffer_offset everytime if enableUserBufferOpenCL is true + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex + 2, sizeof(data->opencl_buffer_offset), &data->opencl_buffer_offset); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,offset) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + } + kernelArgIndex += 3; + if (need_read_access) { + auto dataToSync = data->u.img.isROI ? 
data->u.img.roiMasterImage : data; + if (!(dataToSync->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) { + if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT)) { + int64_t stime = agoGetClockCounter(); + if (dataToSync->opencl_buffer) { + cl_int err = clEnqueueWriteBuffer(graph->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->opencl_buffer_offset, dataToSync->size, dataToSync->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d\n", err); + return -1; + } + } + dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.buffer_write += etime - stime; +#if ENABLE_DEBUG_DUMP_CL_BUFFERS + char fileName[128]; sprintf(fileName, "input_%%04d_%dx%d.yuv", dataToSync->u.img.width, dataToSync->u.img.height); + clDumpBuffer(fileName, graph->opencl_cmdq, dataToSync); +#endif + } + } + } + } + } + else if (data->ref.type == VX_TYPE_ARRAY) { + if (data->isDelayed) { + // needs to set opencl_buffer everytime when the buffer is part of a delay object + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + } + kernelArgIndex += 3; + if (need_read_access) { + if (!(data->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) { + if (data->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT)) { + int64_t stime = agoGetClockCounter(); + vx_size size = data->u.arr.numitems * data->u.arr.itemsize; + if (size > 0 && data->opencl_buffer) { + cl_int err = clEnqueueWriteBuffer(graph->opencl_cmdq, data->opencl_buffer, CL_TRUE, data->opencl_buffer_offset, size, data->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d (array)\n", err); + return -1; + } + } + data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.buffer_write += etime - stime; +#if ENABLE_DEBUG_DUMP_CL_BUFFERS + clDumpBuffer("input_%04d.bin", graph->opencl_cmdq, data); +#endif + } + } + } + if (need_read_access || !need_atomic_access) { + // set numitems of the array + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex - 1, sizeof(vx_uint32), &data->u.arr.numitems); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,numitems) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex - 1, err, group); + return -1; + } + } + } + else if (data->ref.type == AGO_TYPE_CANNY_STACK) { + if (data->isDelayed) { + // needs to set opencl_buffer everytime when the buffer is part of a delay object + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + } + kernelArgIndex += 3; + if (need_read_access) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclDataSyncInputs: doesn't support object type %s for read-access in group#%d for kernel arg setting\n", agoEnum2Name(data->ref.type), group); + return -1; + } + } + else if (data->ref.type == VX_TYPE_THRESHOLD) { + size_t size = sizeof(cl_uint); + cl_uint2 value; + value.s0 = data->u.thr.threshold_lower; + if (data->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) { + size = sizeof(cl_uint2); + value.s1 = data->u.thr.threshold_upper; + } + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, size, &value); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: 
clSetKernelArg(supernode,%d,%d,threshold) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, (int)size, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_SCALAR) { + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(data->u.scalar.u.i), &data->u.scalar.u.i); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,scalar) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_MATRIX) { + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, data->size, data->buffer); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,matrix) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_CONVOLUTION) { + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, data->size << 1, data->reserved); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,convolution) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; + } + else if (data->ref.type == VX_TYPE_LUT) { + if (need_access) { // only use lut objects that need read access + if (data->isDelayed) { + // needs to set opencl_buffer everytime when the buffer is part of a delay object + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + } + kernelArgIndex += 1; + if (need_read_access) { + if (!(data->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) { + if (data->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT)) { + int64_t stime = agoGetClockCounter(); + size_t origin[3] = { 0, 0, 0 }; + size_t region[3] = { 256, 1, 1 }; + err = clEnqueueWriteImage(graph->opencl_cmdq, data->opencl_buffer, CL_TRUE, origin, region, 256, 0, data->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err); + return -1; + } + data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.buffer_write += etime - stime; + } + } + } + } + } + else if (data->ref.type == VX_TYPE_REMAP) { + if (need_access) { // only use image objects that need read access + if (data->isDelayed) { + // needs to set opencl_buffer everytime when the buffer is part of a delay object + if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) + return -1; + } + kernelArgIndex += 2; + if (need_read_access) { + if (!(data->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) { + if (data->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT)) { + int64_t stime = agoGetClockCounter(); + cl_int err = clEnqueueWriteBuffer(graph->opencl_cmdq, data->opencl_buffer, CL_TRUE, data->opencl_buffer_offset, data->size, data->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d\n", err); + return -1; + } + data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.buffer_write += etime - stime; + } + } + } + } + } + else { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclDataSyncInputs: doesn't support object type %s in group#%d for kernel arg setting\n", 
agoEnum2Name(data->ref.type), group); + return -1; + } + return 0; +} + +static int agoGpuOclDataOutputMarkDirty(AgoGraph * graph, AgoData * data, bool need_access, bool need_write_access) +{ + if (data->ref.type == VX_TYPE_IMAGE) { + if (need_access) { // only use image objects that need write access + if (need_write_access) { + auto dataToSync = data->u.img.isROI ? data->u.img.roiMasterImage : data; + dataToSync->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK; + dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL; + } + } + } + else if (data->ref.type == VX_TYPE_ARRAY) { + if (need_access) { // only use image objects that need write access + if (need_write_access) { + data->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK; + data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL; + } + } + } + return 0; +} + +static int agoGpuOclDataOutputAtomicSync(AgoGraph * graph, AgoData * data) +{ + if (data->ref.type == VX_TYPE_ARRAY) { +#if ENABLE_DEBUG_DUMP_CL_BUFFERS + clDumpBuffer("output_%04d_array.bin", graph->opencl_cmdq, data); + //printf("Press ENTER to continue... "); char line[256]; gets(line); +#endif + // update number of items + cl_int err = CL_SUCCESS; + int64_t stime = agoGetClockCounter(); + vx_uint32 * pNumItems = (vx_uint32 *)data->opencl_svm_buffer; + if (data->opencl_buffer) { + pNumItems = (vx_uint32 *)clEnqueueMapBuffer(graph->opencl_cmdq, data->opencl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(vx_uint32), 0, NULL, NULL, &err); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clEnqueueMapBuffer() for numitems => %d\n", err); + return -1; + } + } + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.buffer_read += etime - stime; + // read and reset the counter + data->u.arr.numitems = *pNumItems; + *pNumItems = 0; + if (data->opencl_buffer) { + // unmap + stime = agoGetClockCounter(); + err = clEnqueueUnmapMemObject(graph->opencl_cmdq, data->opencl_buffer, pNumItems, 0, NULL, NULL); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clEnqueueUnmapMemObject() for numitems => %d\n", err); + return -1; + } + etime = agoGetClockCounter(); + graph->opencl_perf.buffer_write += etime - stime; + } + } + else if (data->ref.type == AGO_TYPE_CANNY_STACK) { + // update number of items and reset it for next use + int64_t stime = agoGetClockCounter(); + cl_int err = CL_SUCCESS; + vx_uint8 * stack = data->opencl_svm_buffer; + if (data->opencl_buffer) { + stack = (vx_uint8 *)clEnqueueMapBuffer(graph->opencl_cmdq, data->opencl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(vx_uint32), 0, NULL, NULL, &err); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clEnqueueMapBuffer() for stacktop => %d\n", err); + return -1; + } + } + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.buffer_read += etime - stime; + data->u.cannystack.stackTop = *(vx_uint32 *)stack; + *(vx_uint32 *)stack = 0; + if (data->opencl_buffer) { + stime = agoGetClockCounter(); + err = clEnqueueUnmapMemObject(graph->opencl_cmdq, data->opencl_buffer, stack, 0, NULL, NULL); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clEnqueueUnmapMemObject() for stacktop => %d\n", err); + return -1; + } + etime = agoGetClockCounter(); + graph->opencl_perf.buffer_write += etime - stime; + // read data + if (data->u.cannystack.stackTop > 0) { + int64_t stime = agoGetClockCounter(); + err = clEnqueueReadBuffer(graph->opencl_cmdq, data->opencl_buffer, CL_TRUE, data->opencl_buffer_offset, data->u.cannystack.stackTop * 
sizeof(ago_coord2d_ushort_t), data->buffer, 0, NULL, NULL); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d (stacktop)\n", err); + return -1; + } + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.buffer_read += etime - stime; + } + } + } + return 0; +} + +static std::string agoGpuOclData2Decl(AgoData * data, vx_uint32 index, vx_uint32 dataFlags, vx_uint32 group) +{ + std::string code; + char item[256]; + // add the object to argument + if (data->ref.type == VX_TYPE_IMAGE) { + if (dataFlags & NODE_OPENCL_TYPE_NEED_IMGSIZE) { + sprintf(item, "uint p%d_width, uint p%d_height, ", index, index); + code += item; + } + sprintf(item, "__global uchar * p%d_buf, uint p%d_stride, uint p%d_offset", index, index, index); + code += item; + if (dataFlags & DATA_OPENCL_FLAG_NEED_LOCAL) { + sprintf(item, ", __local uchar * p%d_lbuf", index); + code += item; + } + } + else if (data->ref.type == VX_TYPE_ARRAY) { + sprintf(item, "__global uchar * p%d_buf, uint p%d_offset, uint p%d_numitems", index, index, index); + code += item; + } + else if (data->ref.type == VX_TYPE_SCALAR) { + sprintf(item, "%s p%d", (data->u.scalar.type == VX_TYPE_FLOAT32) ? "float" : "uint", index); + code += item; + } + else if (data->ref.type == VX_TYPE_THRESHOLD) { + sprintf(item, "%s p%d", (data->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) ? "uint2" : "uint", index); + code += item; + } + else if (data->ref.type == VX_TYPE_MATRIX) { + if (data->u.mat.columns == 2 && data->u.mat.rows == 3) { + sprintf(item, "ago_affine_matrix_t p%d", index); + code += item; + } + else if (data->u.mat.columns == 3 && data->u.mat.rows == 3) { + sprintf(item, "ago_perspective_matrix_t p%d", index); + code += item; + } + else { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclData2Decl: doesn't support " VX_FMT_SIZE "x" VX_FMT_SIZE " matrix in group#%d for kernel declaration\n", data->u.mat.columns, data->u.mat.rows, group); + } + } + else if (data->ref.type == VX_TYPE_CONVOLUTION) { + sprintf(item, "COEF_" VX_FMT_SIZE "x" VX_FMT_SIZE " p%d", data->u.conv.columns, data->u.conv.rows, index); + code += item; + } + else if (data->ref.type == VX_TYPE_LUT) { + sprintf(item, "__read_only image1d_t p%d", index); + code += item; + } + else if (data->ref.type == VX_TYPE_REMAP) { + sprintf(item, "__global uchar * p%d_buf, uint p%d_stride", index, index); + code += item; + } + else { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclData2Decl: doesn't support object type %s in group#%d for kernel declaration\n", agoEnum2Name(data->ref.type), group); + } + return code; +} + +int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode) +{ + // make sure that all output images have same dimensions + // check to make sure that max input hierarchy level is less than min output hierarchy level + vx_uint32 width = 0, height = 0; + vx_uint32 max_input_hierarchical_level = 0, min_output_hierarchical_level = (1 << 30); + for (size_t index = 0; index < supernode->dataList.size(); index++) { + AgoData * data = supernode->dataList[index]; + if (data->ref.type == VX_TYPE_IMAGE && supernode->dataInfo[index].argument_usage[VX_INPUT] == 0) { + if (!width || !height) { + width = data->u.img.width; + height = data->u.img.height; + } + else if (width != data->u.img.width || height != data->u.img.height) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclSuperNodeFinalize: doesn't support different image dimensions inside same group#%d\n", supernode->group); + return -1; + } + } + if 
(data->isVirtual && data->ref.type != VX_TYPE_SCALAR && + data->inputUsageCount == supernode->dataInfo[index].argument_usage[VX_INPUT] && + data->outputUsageCount == supernode->dataInfo[index].argument_usage[VX_OUTPUT] && + data->inoutUsageCount == supernode->dataInfo[index].argument_usage[VX_BIDIRECTIONAL]) + { + // no need of this parameter as an argument into the kernel + // mark that this will be an internal variable for the kernel + supernode->dataInfo[index].needed_as_a_kernel_argument = false; + // TBD: mark this the buffer doesn't need allocation + } + if (data->hierarchical_level > min_output_hierarchical_level) min_output_hierarchical_level = data->hierarchical_level; + if (data->hierarchical_level < max_input_hierarchical_level) max_input_hierarchical_level = data->hierarchical_level; + } + if (max_input_hierarchical_level > min_output_hierarchical_level) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: agoGpuOclSuperNodeFinalize: doesn't support mix of hierarchical levels inside same group#%d\n", supernode->group); + return -1; + } + // decide work group dimensions (256 work-items) + vx_uint32 work_group_width = AGO_OPENCL_WORKGROUP_SIZE_0; + vx_uint32 work_group_height = AGO_OPENCL_WORKGROUP_SIZE_1; + // save image size and compute global work + // - each work item processes 8x1 pixels + supernode->width = width; + supernode->height = height; + supernode->opencl_global_work[0] = (((width + 7) >> 3) + (work_group_width - 1)) & ~(work_group_width - 1); + supernode->opencl_global_work[1] = ( height + (work_group_height - 1)) & ~(work_group_height - 1); + for (size_t index = 0; index < supernode->dataList.size(); index++) { + AgoData * data = supernode->dataList[index]; + } + // clear the data flags + for (size_t index = 0; index < supernode->dataList.size(); index++) { + supernode->dataInfo[index].data_type_flags = 0; + } + for (size_t index = 0; index < supernode->dataList.size(); index++) { + AgoData * data = supernode->dataList[index]; + } + // generate code: node functions in OpenCL + char item[256]; + std::string code = OPENCL_FORMAT( + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable\n" + "float4 amd_unpack(uint src)\n" + "{\n" + " return (float4)(amd_unpack0(src), amd_unpack1(src), amd_unpack2(src), amd_unpack3(src));\n" + "}\n" + "\n" + "///////////////////////////////////////////////////////////////////////////////\n" + "// Data Types\n" + "typedef uchar U1x8;\n" + "typedef uint2 U8x8;\n" + "typedef int4 S16x8;\n" + "typedef uint4 U16x8;\n" + "typedef uint8 U24x8;\n" + "typedef uint8 U32x8;\n" + "typedef float8 F32x8;\n" + "typedef struct {\n" + " float M[3][2];\n" + "} ago_affine_matrix_t;\n" + "typedef struct {\n" + " float M[3][3];\n" + "} ago_perspective_matrix_t;\n" + "\n" + "///////////////////////////////////////////////////////////////////////////////\n" + "// load/store data\n" + "void load_U1x8(U1x8 * r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + (x >> 3);\n" + " *r = *((__global U1x8 *) p);\n" + "}\n" + "\n" + "void load_U8x8(U8x8 * r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + x;\n" + " *r = *((__global U8x8 *) p);\n" + "}\n" + "\n" + "void load_S16x8(S16x8 * r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + x + x;\n" + " *r = *((__global S16x8 *) p);\n" + "}\n" + "\n" + "void load_U16x8(U16x8 * r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + x + x;\n" + " 
*r = *((__global U16x8 *) p);\n" + "}\n" + "\n" + "void load_U24x8(U24x8 * r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + x * 3;\n" + " (*r).s012 = *((__global uint3 *)(p + 0));\n" + " (*r).s345 = *((__global uint3 *)(p + 12));\n" + "}\n" + "\n" + "void load_U32x8(U32x8 * r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + (x << 2);\n" + " *r = *((__global U32x8 *) p);\n" + "}\n" + "\n" + "void load_F32x8(F32x8 * r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + (x << 2);\n" + " *r = *((__global F32x8 *) p);\n" + "}\n" + "\n" + "void store_U1x8(U1x8 r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + (x >> 3);\n" + " *((__global U1x8 *)p) = r;\n" + "}\n" + "\n" + "void store_U8x8(U8x8 r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + x;\n" + " *((__global U8x8 *)p) = r;\n" + "}\n" + "\n" + "void store_S16x8(S16x8 r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + x + x;\n" + " *((__global S16x8 *)p) = r;\n" + "}\n" + "\n" + "void store_U16x8(U16x8 r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + x + x;\n" + " *((__global U16x8 *)p) = r;\n" + "}\n" + "\n" + "void store_U24x8(U24x8 r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + x * 3;\n" + " *((__global uint3 *)(p + 0)) = r.s012;\n" + " *((__global uint3 *)(p + 12)) = r.s345;\n" + "}\n" + "\n" + "void store_U32x8(U32x8 r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + (x << 2);\n" + " *((__global U32x8 *)p) = r;\n" + "}\n" + "\n" + "void store_F32x8(F32x8 r, uint x, uint y, __global uchar * p, uint stride)\n" + "{\n" + " p += y*stride + (x << 2);\n" + " *((__global F32x8 *)p) = r;\n" + "}\n" + "\n" + "void Convert_U8_U1 (U8x8 * p0, U1x8 p1)\n" + "{\n" + " U8x8 r;\n" + " r.s0 = (-(p1 & 1)) & 0x000000ff;\n" + " r.s0 |= (-(p1 & 2)) & 0x0000ff00;\n" + " r.s0 |= (-(p1 & 4)) & 0x00ff0000;\n" + " r.s0 |= (-(p1 & 8)) & 0xff000000;\n" + " r.s1 = (-((p1 >> 4) & 1)) & 0x000000ff;\n" + " r.s1 |= (-(p1 & 32)) & 0x0000ff00;\n" + " r.s1 |= (-(p1 & 64)) & 0x00ff0000;\n" + " r.s1 |= (-(p1 & 128)) & 0xff000000;\n" + " *p0 = r;\n" + "}\n" + "\n" + "void Convert_U1_U8 (U1x8 * p0, U8x8 p1)\n" + "{\n" + " U1x8 r;\n" + " r = p1.s0 & 1;\n" + " r |= (p1.s0 >> 7) & 2;\n" + " r |= (p1.s0 >> 14) & 4;\n" + " r |= (p1.s0 >> 21) & 8;\n" + " r |= (p1.s1 << 4) & 16;\n" + " r |= (p1.s1 >> 3) & 32;\n" + " r |= (p1.s1 >> 10) & 64;\n" + " r |= (p1.s1 >> 17) & 128;\n" + " *p0 = r;\n" + "}\n" + ); + for (size_t index = 0; index < supernode->nodeList.size(); index++) { + // get node and set node name + AgoNode * node = supernode->nodeList[index]; + sprintf(node->opencl_name, "_n7%04d6f", (int)index ^ 3123); + // generate kernel function code + int status = VX_ERROR_NOT_IMPLEMENTED; + if (node->akernel->func) { + node->opencl_code = ""; + status = node->akernel->func(node, ago_kernel_cmd_opencl_codegen); + } + else if (node->akernel->opencl_codegen_callback_f) { + // generation function declaration + std::string code2; + char item[256]; + sprintf(item, "void %s(", node->opencl_name); code2 = item; + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data) { + if (i) code2 += ", "; + size_t data_index = std::find(supernode->dataList.begin(), supernode->dataList.end(), data) - supernode->dataList.begin(); + if (data->ref.type == VX_TYPE_IMAGE) { 
+ if (node->akernel->argConfig[i] & AGO_KERNEL_ARG_INPUT_FLAG) { + code2 += "uint x, uint y"; + sprintf(item, ", __global uchar * p%d_buf, uint p%d_stride", (int)data_index, (int)data_index); + code2 += item; + sprintf(item, ", uint p%d_width, uint p%d_height", (int)data_index, (int)data_index); + code2 += item; + } + else { + const char * reg_type = agoGpuImageFormat2RegType(data->u.img.format); + sprintf(item, "%s p%d", reg_type, (int)data_index); + code2 += item; + } + } + else if (data->ref.type == VX_TYPE_REMAP) { + sprintf(item, "__global uchar * p%d_buf, uint p%d_stride", (int)data_index, (int)data_index); + code2 += item; + } + else { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclSuperNodeFinalize: doesn't support object type %s in group#%d for kernel declaration\n", agoEnum2Name(data->ref.type), supernode->group); + return -1; + } + } + } + code2 += "\n"; + // generate function code + node->opencl_code = code2; + node->opencl_type = NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE; + node->opencl_param_mem2reg_mask = 0; + node->opencl_param_discard_mask = 0; + node->opencl_param_atomic_mask = 0; + node->opencl_compute_work_multiplier = 0; + node->opencl_compute_work_param_index = 0; + node->opencl_output_array_param_index_plus1 = 0; + node->opencl_local_buffer_usage_mask = 0; + node->opencl_local_buffer_size_in_bytes = 0; + vx_uint32 work_dim = 2; + vx_size global_work[3] = { supernode->opencl_global_work[0], supernode->opencl_global_work[1], 1 }; + vx_size local_work[3] = { work_group_width, work_group_height, 1 }; + status = node->akernel->opencl_codegen_callback_f(node, true, node->opencl_name, node->opencl_code, node->opencl_build_options, work_dim, global_work, + local_work, node->opencl_local_buffer_usage_mask, node->opencl_local_buffer_size_in_bytes); + } + if (status != VX_SUCCESS) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: agoGpuOclSuperNodeFinalize: kernel %s in group#%d is not supported yet\n", node->akernel->name, supernode->group); + return -1; + } + code += node->opencl_code; + // update dataFlags[] if needed + if (node->opencl_type & (NODE_OPENCL_TYPE_REG2REG | NODE_OPENCL_TYPE_MEM2REG)) { + node->opencl_param_mem2reg_mask = 0; + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data) { + if (node->opencl_param_discard_mask & (1 << i)) { + // when code generator asked to discard this argument, mark that this argument is not needed anymore + size_t data_index = std::find(supernode->dataList.begin(), supernode->dataList.end(), data) - supernode->dataList.begin(); + supernode->dataInfo[data_index].data_type_flags |= DATA_OPENCL_FLAG_DISCARD_PARAM; + } + else if (data->ref.type == VX_TYPE_IMAGE) { + if (node->parameters[i].direction != VX_OUTPUT) { + size_t data_index = std::find(supernode->dataList.begin(), supernode->dataList.end(), data) - supernode->dataList.begin(); + supernode->dataInfo[data_index].data_type_flags |= (node->opencl_type & (NODE_OPENCL_TYPE_REG2REG | NODE_OPENCL_TYPE_MEM2REG | NODE_OPENCL_TYPE_NEED_IMGSIZE)); + if (i > 0) { + if ((node->opencl_local_buffer_size_in_bytes > 0) && (node->opencl_local_buffer_usage_mask & (1 << i))) { + // mark that local data buffer is needed and specify the buffer size + supernode->dataInfo[data_index].data_type_flags |= DATA_OPENCL_FLAG_NEED_LOCAL; + supernode->dataInfo[data_index].local_buffer_size_in_bytes = node->opencl_local_buffer_size_in_bytes; + } + if (node->opencl_type & NODE_OPENCL_TYPE_MEM2REG) { + // mark that the image has 
NODE_OPENCL_TYPE_MEM2REG + node->opencl_param_mem2reg_mask = (1 << i); + } + } + } + } + } + } + } + } + // generate code: kernel declaration + sprintf(item, OPENCL_FORMAT("__kernel __attribute__((reqd_work_group_size(%d, %d, 1)))\nvoid %s(uint width, uint height"), work_group_width, work_group_height, NODE_OPENCL_KERNEL_NAME); + code += item; +#if ENABLE_LOCAL_DEBUG_MESSAGES + printf("===> *** supernode-%d has dataList.size()=%d\n", supernode->group, supernode->dataList.size()); +#endif + for (size_t index = 0, line_length = 0; index < supernode->dataList.size(); index++) { + AgoData * data = supernode->dataList[index]; +#if ENABLE_LOCAL_DEBUG_MESSAGES + printf("===> karg[%d] = { %d, 0x%08x, [ %2d %2d %2d ], %5d } -- %s\n", index, supernode->dataInfo[index].needed_as_a_kernel_argument, supernode->dataInfo[index].data_type_flags, supernode->dataInfo[index].argument_usage[0], supernode->dataInfo[index].argument_usage[1], supernode->dataInfo[index].argument_usage[2], supernode->dataInfo[index].local_buffer_size_in_bytes, data->name.c_str()); +#endif + if (supernode->dataInfo[index].needed_as_a_kernel_argument && !(supernode->dataInfo[index].data_type_flags & DATA_OPENCL_FLAG_DISCARD_PARAM)) { // only use objects that need read/write access + // add the object to argument + std::string arg = agoGpuOclData2Decl(data, (vx_uint32)index, supernode->dataInfo[index].data_type_flags & ~DATA_OPENCL_FLAG_NEED_LOCAL, supernode->group); + if (arg.length() > 0) { + line_length += arg.length(); + if (line_length > 800) { + // make sure that lines never exceed 1000 characters: assumption made by the CObfuscator + code += "\n "; + line_length = 0; + } + code += ", "; + code += arg; + if (data->ref.type == VX_TYPE_IMAGE) { + supernode->dataInfo[index].data_type_flags |= DATA_OPENCL_FLAG_BUFFER; + } + } + else { + return -1; + } + } + } + code += ")\n"; + // generate code: workitem (x,y) computation + code += "{\n\tuint x = get_global_id(0) * 8;\n\tuint y = get_global_id(1);\n\tbool valid = (x < width) && (y < height);\n\n"; + // generate code: add offset to image address + bool uses_local_memory = false; + for (size_t index = 0; index < supernode->dataList.size(); index++) { + AgoData * data = supernode->dataList[index]; + if (data->ref.type == VX_TYPE_IMAGE) { + if (supernode->dataInfo[index].data_type_flags & DATA_OPENCL_FLAG_NEED_LOCAL) { + sprintf(item, "\t__local uchar p%d_lbuf[%d];\n", (int)index, supernode->dataInfo[index].local_buffer_size_in_bytes); + code += item; + uses_local_memory = true; + } + if (supernode->dataInfo[index].data_type_flags & DATA_OPENCL_FLAG_BUFFER) { + sprintf(item, "\tp%d_buf += p%d_offset;\n", (int)index, (int)index); + code += item; + } + if (supernode->dataInfo[index].needed_as_a_kernel_argument) { // only use objects that need read/write access + if (supernode->dataInfo[index].argument_usage[VX_INPUT] || supernode->dataInfo[index].argument_usage[VX_BIDIRECTIONAL]) { + // mark that load is needed + supernode->dataInfo[index].data_type_flags |= (DATA_OPENCL_FLAG_NEED_LOAD_R2R | DATA_OPENCL_FLAG_NEED_LOAD_M2R); + } + } + } + } + if (!uses_local_memory) { + code += "\tif (valid) {\n"; + } + // generate code: declara register variables for images + for (size_t index = 0; index < supernode->dataList.size(); index++) { + AgoData * data = supernode->dataList[index]; + if (data->ref.type == VX_TYPE_IMAGE) { + const char * reg_type = agoGpuImageFormat2RegType(data->u.img.format); + sprintf(item, "\t\t%sx8 p%d;\n", reg_type, (int)index); + code += item; + if 
(supernode->dataInfo[index].needed_as_a_kernel_argument) { // only use objects that need read/write access + if (supernode->dataInfo[index].argument_usage[VX_OUTPUT]) { + // mark that load is not needed + supernode->dataInfo[index].data_type_flags &= ~DATA_OPENCL_FLAG_NEED_LOAD_R2R; + } + } + } + } + // generate code: actual computation + for (size_t index = 0; index < supernode->nodeList.size(); index++) { + AgoNode * node = supernode->nodeList[index]; + // issues all required loads + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data) { + size_t data_index = std::find(supernode->dataList.begin(), supernode->dataList.end(), data) - supernode->dataList.begin(); + if ((supernode->dataInfo[data_index].data_type_flags & NODE_OPENCL_TYPE_REG2REG) && (supernode->dataInfo[data_index].data_type_flags & DATA_OPENCL_FLAG_NEED_LOAD_R2R)) { + const char * reg_type = agoGpuImageFormat2RegType(data->u.img.format); + sprintf(item, "\t\tload_%sx8(&p%d, x, y, p%d_buf, p%d_stride);\n", reg_type, (int)data_index, (int)data_index, (int)data_index); + code += item; + // mark that load has been issued + supernode->dataInfo[data_index].data_type_flags &= ~DATA_OPENCL_FLAG_NEED_LOAD_R2R; + } + } + } + // generate computation + sprintf(item, "\t\t%s(", node->opencl_name); code += item; + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data) { + size_t data_index = std::find(supernode->dataList.begin(), supernode->dataList.end(), data) - supernode->dataList.begin(); + if (!(supernode->dataInfo[data_index].data_type_flags & DATA_OPENCL_FLAG_DISCARD_PARAM)) { + if ((supernode->dataInfo[data_index].data_type_flags & NODE_OPENCL_TYPE_MEM2REG) && + (supernode->dataInfo[data_index].data_type_flags & DATA_OPENCL_FLAG_NEED_LOAD_M2R) && + (node->opencl_param_mem2reg_mask & (1 << i))) + { + code += ", x, y"; + if (node->opencl_local_buffer_usage_mask & (1 << i)) { + sprintf(item, ", p%d_lbuf", (int)data_index); + code += item; + } + sprintf(item, ", p%d_buf, p%d_stride", (int)data_index, (int)data_index); + code += item; + if (supernode->dataInfo[data_index].data_type_flags & NODE_OPENCL_TYPE_NEED_IMGSIZE) { + sprintf(item, ", p%d_width, p%d_height", (int)data_index, (int)data_index); + code += item; + } + // mark that load has been issued + supernode->dataInfo[data_index].data_type_flags &= ~DATA_OPENCL_FLAG_NEED_LOAD_M2R; + } + else if (data->ref.type == VX_TYPE_REMAP) { + sprintf(item, ", p%d_buf, p%d_stride", (int)data_index, (int)data_index); + code += item; + } + else { + sprintf(item, "%s%sp%d", i ? ", " : "", (node->akernel->argConfig[i] & AGO_KERNEL_ARG_OUTPUT_FLAG) ? 
"&" : "", (int)data_index); + code += item; + } + } + } + } + // end of function call with actual kernel name as a comment for debug + code += "); // "; + code += agoGpuGetKernelFunctionName(node); + code += "\n"; + } + if (uses_local_memory) { + code += "\tif (valid) {\n"; + } + // generate code: issue stores + for (size_t index = 0; index < supernode->dataList.size(); index++) { + AgoData * data = supernode->dataList[index]; + if (data->ref.type == VX_TYPE_IMAGE) { + if (supernode->dataInfo[index].needed_as_a_kernel_argument && + (supernode->dataInfo[index].argument_usage[VX_OUTPUT] || supernode->dataInfo[index].argument_usage[VX_BIDIRECTIONAL])) + { // only use objects that need write access + const char * reg_type = agoGpuImageFormat2RegType(data->u.img.format); + sprintf(item, "\t\tstore_%sx8(p%d, x, y, p%d_buf, p%d_stride);\n", reg_type, (int)index, (int)index, (int)index); + code += item; + } + } + } + // generate code: end of function and save + code += "\t}\n}\n"; + supernode->opencl_code = code; + const char * opencl_code = supernode->opencl_code.c_str(); + + // dump OpenCL kernel if environment variable AGO_DUMP_GPU is specified with dump file path prefix + // the output file name will be "$(AGO_DUMP_GPU)-.cl" + char textBuffer[1024]; + if (agoGetEnvironmentVariable("AGO_DUMP_GPU", textBuffer, sizeof(textBuffer))) { + char fileName[1024]; + sprintf(fileName, "%s-%d.cl", textBuffer, supernode->group); + FILE * fp = fopen(fileName, "w"); + if (!fp) agoAddLogEntry(NULL, VX_FAILURE, "ERROR: unable to create: %s\n", fileName); + else { + fprintf(fp, "%s", opencl_code); + fclose(fp); + agoAddLogEntry(NULL, VX_SUCCESS, "OK: created %s\n", fileName); + } + } + + // create compile the OpenCL code into OpenCL kernel object + supernode->opencl_cmdq = graph->opencl_cmdq; + cl_int err; + supernode->opencl_program = clCreateProgramWithSource(graph->ref.context->opencl_context, 1, &opencl_code, NULL, &err); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clCreateProgramWithSource(%p,1,*,NULL,*) failed(%d) for group#%d\n", graph->ref.context->opencl_context, err, supernode->group); + return -1; + } + std::string opencl_build_options = graph->ref.context->opencl_build_options; + err = clBuildProgram(supernode->opencl_program, 1, &graph->opencl_device, opencl_build_options.c_str(), NULL, NULL); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clBuildProgram(%p,%s) failed(%d) for group#%d\n", supernode->opencl_program, graph->ref.context->opencl_build_options, err, supernode->group); +#if _DEBUG // dump warnings/errors to console in debug build mode + size_t logSize = 1024 * 1024; char * log = new char[logSize]; memset(log, 0, logSize); + clGetProgramBuildInfo(supernode->opencl_program, graph->opencl_device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL); + printf("<<<<\n%s\n>>>>\n", log); + delete[] log; +#endif + return -1; + } + supernode->opencl_kernel = clCreateKernel(supernode->opencl_program, NODE_OPENCL_KERNEL_NAME, &err); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clCreateKernel(%p,supernode) failed(%d) for group#%d\n", supernode->opencl_program, err, supernode->group); + return -1; + } + // set all kernel objects + vx_uint32 kernelArgIndex = 0; + err = clSetKernelArg(supernode->opencl_kernel, (cl_uint)kernelArgIndex, sizeof(cl_uint), &width); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,width) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, supernode->group); + return -1; + } + 
kernelArgIndex++; + err = clSetKernelArg(supernode->opencl_kernel, (cl_uint)kernelArgIndex, sizeof(cl_uint), &height); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,height) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, supernode->group); + return -1; + } + kernelArgIndex++; + for (size_t index = 0; index < supernode->dataList.size(); index++) { + if (!(supernode->dataInfo[index].data_type_flags & DATA_OPENCL_FLAG_DISCARD_PARAM)) { + bool need_access = supernode->dataInfo[index].needed_as_a_kernel_argument; + if (agoGpuOclSetKernelArgs(supernode->opencl_kernel, kernelArgIndex, supernode->dataList[index], need_access, supernode->dataInfo[index].data_type_flags, supernode->group) < 0) { + return -1; + } + } + } + return 0; +} + +int agoGpuOclSuperNodeLaunch(AgoGraph * graph, AgoSuperNode * supernode) +{ + // make sure that all input buffers are synched and other arguments are updated + vx_uint32 kernelArgIndex = 2; + for (size_t index = 0; index < supernode->dataList.size(); index++) { + if (!(supernode->dataInfo[index].data_type_flags & DATA_OPENCL_FLAG_DISCARD_PARAM)) { + bool need_access = supernode->dataInfo[index].needed_as_a_kernel_argument; + bool need_read_access = supernode->dataInfo[index].argument_usage[VX_INPUT] || supernode->dataInfo[index].argument_usage[VX_BIDIRECTIONAL]; + if (agoGpuOclDataInputSync(graph, supernode->opencl_kernel, kernelArgIndex, supernode->dataList[index], supernode->dataInfo[index].data_type_flags, supernode->group, need_access, need_read_access, false) < 0) { + return -1; + } + } + } + // launch the kernel + int64_t stime = agoGetClockCounter(); + cl_int err; + err = clEnqueueNDRangeKernel(supernode->opencl_cmdq, supernode->opencl_kernel, 2, NULL, supernode->opencl_global_work, NULL, 0, NULL, &supernode->opencl_event); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clEnqueueNDRangeKernel(supernode,2,*,%dx%d,...) 
failed(%d) for group#%d\n", (cl_uint)supernode->opencl_global_work[0], (cl_uint)supernode->opencl_global_work[1], err, supernode->group); + return -1; + } + err = clFlush(supernode->opencl_cmdq); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clFlush(supernode) failed(%d) for group#%d\n", err, supernode->group); + return -1; + } + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.kernel_enqueue += etime - stime; + // mark that supernode outputs are dirty + for (size_t index = 0; index < supernode->dataList.size(); index++) { + if (!(supernode->dataInfo[index].data_type_flags & DATA_OPENCL_FLAG_DISCARD_PARAM)) { + bool need_access = supernode->dataInfo[index].needed_as_a_kernel_argument; + bool need_write_access = supernode->dataInfo[index].argument_usage[VX_OUTPUT] || supernode->dataInfo[index].argument_usage[VX_BIDIRECTIONAL]; + if (agoGpuOclDataOutputMarkDirty(graph, supernode->dataList[index], need_access, need_write_access) < 0) { + return -1; + } + } + } + return 0; +} + +int agoGpuOclSuperNodeWait(AgoGraph * graph, AgoSuperNode * supernode) +{ + // wait for completion + int64_t stime = agoGetClockCounter(); + cl_int err; + err = clWaitForEvents(1, &supernode->opencl_event); + if (err) { + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clWaitForEvents(1,%p) failed(%d) for group#%d\n", supernode->opencl_event, err, supernode->group); + return -1; + } + clReleaseEvent(supernode->opencl_event); + supernode->opencl_event = NULL; + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.kernel_wait += etime - stime; +#if ENABLE_DEBUG_DUMP_CL_BUFFERS + // dump supernode outputs + for (size_t index = 0; index < supernode->dataList.size(); index++) { + if (!(supernode->dataInfo[index].data_type_flags & DATA_OPENCL_FLAG_DISCARD_PARAM)) { + bool need_access = supernode->dataInfo[index].needed_as_a_kernel_argument; + bool need_write_access = supernode->dataInfo[index].argument_usage[VX_OUTPUT] || supernode->dataInfo[index].argument_usage[VX_BIDIRECTIONAL]; + auto data = supernode->dataList[index]; + if (data->ref.type == VX_TYPE_IMAGE) { + if (need_access) { // only use image objects that need write access + if (need_write_access) { + auto dataToSync = data->u.img.isROI ? data->u.img.roiMasterImage : data; + char fileName[128]; sprintf(fileName, "output_%%04d_%dx%d.yuv", dataToSync->u.img.width, dataToSync->u.img.height); + clDumpBuffer(fileName, graph->opencl_cmdq, dataToSync); + //printf("Press ENTER to continue... 
"); char line[256]; gets(line); + } + } + } + } + } +#endif + return 0; +} + +int agoGpuOclSingleNodeFinalize(AgoGraph * graph, AgoNode * node) +{ + const char * opencl_code = node->opencl_code.c_str(); + + // dump OpenCL kernel if environment variable AGO_DUMP_GPU is specified with dump file path prefix + // the output file name will be "$(AGO_DUMP_GPU)-0..cl" + char textBuffer[1024]; + if (agoGetEnvironmentVariable("AGO_DUMP_GPU", textBuffer, sizeof(textBuffer))) { + char fileName[1024]; static int counter = 0; + sprintf(fileName, "%s-0.%04d.cl", textBuffer, counter++); + FILE * fp = fopen(fileName, "w"); + if (!fp) agoAddLogEntry(NULL, VX_FAILURE, "ERROR: unable to create: %s\n", fileName); + else { + fprintf(fp, "%s", opencl_code); + fclose(fp); + agoAddLogEntry(NULL, VX_SUCCESS, "OK: created %s\n", fileName); + } + } + + // create compile the OpenCL code into OpenCL kernel object + vx_context context = graph->ref.context; + cl_int err; + node->opencl_program = clCreateProgramWithSource(context->opencl_context, 1, &opencl_code, NULL, &err); + if (err) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clCreateProgramWithSource(%p,1,*,NULL,*) failed(%d) for %s\n", context->opencl_context, err, node->akernel->name); + return -1; + } + err = clBuildProgram(node->opencl_program, 1, &graph->opencl_device, node->opencl_build_options.c_str(), NULL, NULL); + if (err) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clBuildProgram(%p,%s) failed(%d) for %s\n", node->opencl_program, node->opencl_build_options.c_str(), err, node->akernel->name); +#if _DEBUG // dump warnings/errors to console in debug build mode + size_t logSize = 1024 * 1024; char * log = new char[logSize]; memset(log, 0, logSize); + clGetProgramBuildInfo(node->opencl_program, graph->opencl_device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL); + printf("<<<<\n%s\n>>>>\n", log); + delete[] log; +#endif + return -1; + } + node->opencl_kernel = clCreateKernel(node->opencl_program, node->opencl_name, &err); + if (err) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clCreateKernel(%p,supernode) failed(%d) for %s\n", node->opencl_program, err, node->akernel->name); + return -1; + } + // set all kernel objects + vx_uint32 kernelArgIndex = 0; + for (size_t index = 0; index < node->paramCount; index++) { + if (node->paramList[index] && !(node->opencl_param_discard_mask & (1 << index))) { + vx_uint32 dataFlags = 0; + if (node->paramList[index]->ref.type == VX_TYPE_IMAGE) { + dataFlags |= NODE_OPENCL_TYPE_NEED_IMGSIZE; + } + else if (node->paramList[index]->ref.type == VX_TYPE_ARRAY) { + if (node->opencl_param_atomic_mask & (1 << index)) { + dataFlags |= NODE_OPENCL_TYPE_ATOMIC; + } + } + if (agoGpuOclSetKernelArgs(node->opencl_kernel, kernelArgIndex, node->paramList[index], true, dataFlags, 0) < 0) { + return -1; + } + } + } + return 0; +} + +int agoGpuOclSingleNodeLaunch(AgoGraph * graph, AgoNode * node) +{ + // compute global work (if requested) and set numitems of output array (if requested further) + if (node->opencl_compute_work_multiplier > 0) { + AgoData * data = node->paramList[node->opencl_compute_work_param_index]; + if (data->ref.type == VX_TYPE_ARRAY) { + // derive global_work[0] from numitems of array + node->opencl_global_work[0] = data->u.arr.numitems * node->opencl_compute_work_multiplier; + if (node->opencl_local_work[0] > 0) { + size_t mask = node->opencl_local_work[0] - 1; + node->opencl_global_work[0] = (node->opencl_global_work[0] + mask) & ~mask; + } + // set numitems of output array param index (if requested) + if 
(node->opencl_output_array_param_index_plus1 > 0) { + AgoData * arr = node->paramList[node->opencl_output_array_param_index_plus1 - 1]; + if (arr->ref.type == VX_TYPE_ARRAY) { + arr->u.arr.numitems = data->u.arr.numitems; + } + } + } + else { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: agoGpuOclSingleNodeLaunch: invalid opencl_compute_work_multiplier=%d\n", node->opencl_compute_work_multiplier); + return -1; + } + } + // make sure that all input buffers are synched and other arguments are updated + vx_uint32 kernelArgIndex = 0; + for (size_t index = 0; index < node->paramCount; index++) { + if (node->paramList[index] && !(node->opencl_param_discard_mask & (1 << index))) { + bool need_read_access = node->parameters[index].direction != VX_OUTPUT ? true : false; + bool need_atomic_access = (node->opencl_param_atomic_mask & (1 << index)) ? true : false; + if (agoGpuOclDataInputSync(graph, node->opencl_kernel, kernelArgIndex, node->paramList[index], NODE_OPENCL_TYPE_NEED_IMGSIZE, 0, true, need_read_access, need_atomic_access) < 0) { + return -1; + } + } + } + // launch the kernel + int64_t stime = agoGetClockCounter(); + cl_int err; + err = clEnqueueNDRangeKernel(graph->opencl_cmdq, node->opencl_kernel, node->opencl_work_dim, NULL, node->opencl_global_work, NULL, 0, NULL, &node->opencl_event); + if (err) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clEnqueueNDRangeKernel(supernode,%d,*,{%d,%d,%d},...) failed(%d) for %s\n", (cl_uint)node->opencl_work_dim, (cl_uint)node->opencl_global_work[0], (cl_uint)node->opencl_global_work[1], (cl_uint)node->opencl_global_work[2], err, node->akernel->name); + return -1; + } + err = clFlush(graph->opencl_cmdq); + if (err) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clFlush(supernode) failed(%d) for %s\n", err, node->akernel->name); + return -1; + } + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.kernel_enqueue += etime - stime; + // mark that node outputs are dirty + for (size_t index = 0; index < node->paramCount; index++) { + if (node->paramList[index]) { + bool need_write_access = node->parameters[index].direction != VX_INPUT ? true : false; + if (agoGpuOclDataOutputMarkDirty(graph, node->paramList[index], true, need_write_access) < 0) { + return -1; + } + } + } + return 0; +} + +int agoGpuOclSingleNodeWait(AgoGraph * graph, AgoNode * node) +{ + // wait for completion + int64_t stime = agoGetClockCounter(); + cl_int err; + err = clWaitForEvents(1, &node->opencl_event); + if (err) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clWaitForEvents(1,%p) failed(%d) for %s\n", node->opencl_event, err, node->akernel->name); + return -1; + } + clReleaseEvent(node->opencl_event); + node->opencl_event = NULL; + int64_t etime = agoGetClockCounter(); + graph->opencl_perf.kernel_wait += etime - stime; + // sync the outputs + for (size_t index = 0; index < node->paramCount; index++) { + if (node->paramList[index]) { + bool need_write_access = node->parameters[index].direction != VX_INPUT ? true : false; + if (need_write_access && node->opencl_param_atomic_mask & (1 << index)) { + if (agoGpuOclDataOutputAtomicSync(graph, node->paramList[index]) < 0) { + return -1; + } + } +#if ENABLE_DEBUG_DUMP_CL_BUFFERS + else if (node->paramList[index]->ref.type == VX_TYPE_IMAGE) { + if (need_write_access) { + auto dataToSync = node->paramList[index]->u.img.isROI ? 
node->paramList[index]->u.img.roiMasterImage : node->paramList[index]; + char fileName[128]; sprintf(fileName, "input_%%04d_%dx%d.yuv", dataToSync->u.img.width, dataToSync->u.img.height); + clDumpBuffer(fileName, graph->opencl_cmdq, node->paramList[index]); + //printf("Press ENTER to continue... "); char line[256]; gets(line); + } + } +#endif + } + } + if (node->opencl_scalar_array_output_sync.enable && + node->paramList[node->opencl_scalar_array_output_sync.paramIndexScalar] && + node->paramList[node->opencl_scalar_array_output_sync.paramIndexArray]) + { + // updated scalar with numitems of array + node->paramList[node->opencl_scalar_array_output_sync.paramIndexScalar]->u.scalar.u.s = + node->paramList[node->opencl_scalar_array_output_sync.paramIndexArray]->u.arr.numitems; + } + + // The num items in an array should not exceed the capacity unless kernels need it for reporting number of items detected (ex. FAST corners) + for (size_t index = 0; index < node->paramCount; index++) { + if (node->paramList[index]) { + bool need_write_access = node->parameters[index].direction != VX_INPUT ? true : false; + if (need_write_access && node->opencl_param_atomic_mask & (1 << index)) { + if (node->paramList[index]->ref.type == VX_TYPE_ARRAY) { + node->paramList[index]->u.arr.numitems = min(node->paramList[index]->u.arr.numitems, node->paramList[index]->u.arr.capacity); + } + } + } + } + return 0; +} + +#endif diff --git a/openvx/api/vx_api.cpp b/openvx/api/vx_api.cpp new file mode 100644 index 0000000..679c6ce --- /dev/null +++ b/openvx/api/vx_api.cpp @@ -0,0 +1,4934 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "ago_internal.h" + +vx_uint32 vxComputePatchOffset(vx_uint32 x, vx_uint32 y, const vx_imagepatch_addressing_t *addr) +{ + // TBD: division can be changed to right-shift + return ((addr->stride_y * ((addr->scale_y * y) / VX_SCALE_UNITY)) + + (addr->stride_x * ((addr->scale_x * x) / VX_SCALE_UNITY))); +} + +/*! \brief Creates a \ref vx_context. +* \details This creates a top-level object context for OpenVX. +* \note This is required to do anything else. +* \returns The reference to the implementation context. +* \retval 0 No context was created. +* \retval * A context reference. +* \ingroup group_context +* \post \ref vxReleaseContext +*/ +VX_API_ENTRY vx_context VX_API_CALL vxCreateContext() +{ + vx_context context = agoCreateContext(); + return context; +} + +/*! \brief Releases the OpenVX object context. 
+* \details All reference counted objects are garbage-collected by the return of this call.
+* No calls are possible using the parameter context after the context has been
+* released until a new reference from \ref vxCreateContext is returned.
+* All outstanding references to OpenVX objects from this context are invalid
+* after this call.
+* \param [in] context The pointer to the reference to the context.
+* \post After returning from this function the reference is zeroed.
+* \return A \ref vx_status_e enumeration.
+* \retval VX_SUCCESS No errors.
+* \retval VX_ERROR_INVALID_REFERENCE If context is not a \ref vx_context.
+* \ingroup group_context
+* \pre \ref vxCreateContext
+*/
+VX_API_ENTRY vx_status VX_API_CALL vxReleaseContext(vx_context *context)
+{
+    vx_status status = VX_ERROR_INVALID_REFERENCE;
+    if (context && !agoReleaseContext(*context)) {
+        *context = NULL;
+        status = VX_SUCCESS;
+    }
+    return status;
+}
+
+/*! \brief Retrieves the context from any reference from within a context.
+* \param [in] reference The reference from which to extract the context.
+* \ingroup group_context
+* \return The overall context that created the particular
+* reference.
+*/
+VX_API_ENTRY vx_context VX_API_CALL vxGetContext(vx_reference reference)
+{
+    vx_context context = NULL;
+    if (agoIsValidReference(reference)) {
+        context = reference->context;
+    }
+    return context;
+}
+
+/*! \brief Queries the context for some specific information.
+* \param [in] context The reference to the context.
+* \param [in] attribute The attribute to query. Use a \ref vx_context_attribute_e.
+* \param [out] ptr The location at which to store the resulting value.
+* \param [in] size The size of the container to which \a ptr points.
+* \return A \ref vx_status_e enumeration.
+* \retval VX_SUCCESS No errors.
+* \retval VX_ERROR_INVALID_REFERENCE If the context is not a \ref vx_context.
+* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect.
+* \retval VX_ERROR_NOT_SUPPORTED If the attribute is not supported on this implementation.
+* \ingroup group_context
+*/
+VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum attribute, void *ptr, vx_size size)
+{
+    vx_status status = VX_ERROR_INVALID_REFERENCE;
+    if (agoIsValidContext(context)) {
+        status = VX_ERROR_INVALID_PARAMETERS;
+        if (ptr) {
+            CAgoLock lock(context->cs);
+            switch (attribute)
+            {
+            case VX_CONTEXT_ATTRIBUTE_VENDOR_ID:
+                if (size == sizeof(vx_uint16)) {
+                    *(vx_uint16 *)ptr = VX_ID_AMD;
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_VERSION:
+                if (size == sizeof(vx_uint16)) {
+                    *(vx_uint16 *)ptr = (vx_uint16)VX_VERSION;
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_MODULES:
+                if (size == sizeof(vx_uint32)) {
+                    *(vx_uint32 *)ptr = (vx_uint32)context->num_active_modules;
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_REFERENCES:
+                if (size == sizeof(vx_uint32)) {
+                    *(vx_uint32 *)ptr = (vx_uint32)context->num_active_references;
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_IMPLEMENTATION:
+                if (size <= VX_MAX_IMPLEMENTATION_NAME) {
+                    strncpy((char *)ptr, "AMD OpenVX " AGO_VERSION, VX_MAX_IMPLEMENTATION_NAME);
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_EXTENSIONS_SIZE:
+                if (size == sizeof(vx_size)) {
+                    *(vx_size *)ptr = strlen(context->extensions) + 1;
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_EXTENSIONS:
+                if (size >= strlen(context->extensions) + 1) {
+                    strcpy((char *)ptr, context->extensions);
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_CONVOLUTION_MAXIMUM_DIMENSION:
+                if (size == sizeof(vx_size)) {
+                    *(vx_size *)ptr = AGO_MAX_CONVOLUTION_DIM;
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_OPTICAL_FLOW_WINDOW_MAXIMUM_DIMENSION:
+                if (size == sizeof(vx_size)) {
+                    *(vx_size *)ptr = AGO_OPTICALFLOWPYRLK_MAX_DIM;
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_IMMEDIATE_BORDER_MODE:
+                if (size == sizeof(vx_border_mode_t)) {
+                    *(vx_border_mode_t *)ptr = context->immediate_border_mode;
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_UNIQUE_KERNELS:
+                if (size == sizeof(vx_uint32)) {
+                    *(vx_uint32 *)ptr = (vx_uint32)context->kernelList.count;
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_UNIQUE_KERNEL_TABLE:
+                if (size == (context->kernelList.count * sizeof(vx_kernel_info_t))) {
+                    vx_kernel_info_t * table = (vx_kernel_info_t *)ptr;
+                    for (AgoKernel * kernel = context->kernelList.head; kernel; kernel = kernel->next, table++) {
+                        table->enumeration = kernel->id;
+                        strncpy(table->name, kernel->name, VX_MAX_KERNEL_NAME);
+                    }
+                    status = VX_SUCCESS;
+                }
+                break;
+            case VX_CONTEXT_ATTRIBUTE_AMD_AFFINITY:
+                if (size == sizeof(AgoTargetAffinityInfo_)) {
+                    *(AgoTargetAffinityInfo_ *)ptr = context->attr_affinity;
+                    status = VX_SUCCESS;
+                }
+                break;
+#if ENABLE_OPENCL
+            case VX_CONTEXT_ATTRIBUTE_AMD_OPENCL_CONTEXT:
+                if (size == sizeof(cl_context)) {
+                    if (!context->opencl_context && agoGpuOclCreateContext(context, nullptr) != VX_SUCCESS) {
+                        status = VX_FAILURE;
+                    }
+                    else {
+                        *(cl_context *)ptr = context->opencl_context;
+                        status = VX_SUCCESS;
+                    }
+                }
+                break;
+#endif
+            default:
+                status = VX_ERROR_NOT_SUPPORTED;
+                break;
+            }
+        }
+    }
+    return status;
+}
+
+/*! \brief Sets an attribute on the context.
+* \param [in] context The handle to the overall context.
+* \param [in] attribute The attribute to set from \ref vx_context_attribute_e.
+* \param [in] ptr The pointer to the data to which to set the attribute.
+* \param [in] size The size in bytes of the data to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If the context is not a \ref vx_context. +* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. +* \retval VX_ERROR_NOT_SUPPORTED If the attribute is not settable. +* \ingroup group_context +*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetContextAttribute(vx_context context, vx_enum attribute, const void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidContext(context)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + CAgoLock lock(context->cs); + switch (attribute) + { + case VX_CONTEXT_ATTRIBUTE_IMMEDIATE_BORDER_MODE: + if (size == sizeof(vx_border_mode_t)) { + vx_border_mode_t immediate_border_mode = *(vx_border_mode_t *)ptr; + if (immediate_border_mode.mode == VX_BORDER_MODE_UNDEFINED || immediate_border_mode.mode == VX_BORDER_MODE_CONSTANT || immediate_border_mode.mode == VX_BORDER_MODE_REPLICATE) { + context->immediate_border_mode = immediate_border_mode; + if (immediate_border_mode.mode == VX_BORDER_MODE_UNDEFINED || immediate_border_mode.mode == VX_BORDER_MODE_REPLICATE) + context->immediate_border_mode.constant_value = 0; + status = VX_SUCCESS; + } + } + break; + case VX_CONTEXT_ATTRIBUTE_AMD_SET_TEXT_MACRO: + if (size == sizeof(AgoContextTextMacroInfo)) { + status = VX_SUCCESS; + AgoContextTextMacroInfo * info = (AgoContextTextMacroInfo *)ptr; + for (auto it = context->macros.begin(); it != context->macros.end(); ++it) { + if (!strcmp(it->name, info->macroName)) { + status = VX_FAILURE; + agoAddLogEntry(&context->ref, status, "ERROR: vxSetContextAttribute: macro already exists: %s\n", info->macroName); + break; + } + } + if (status == VX_SUCCESS) { + MacroData macro; + macro.text = macro.text_allocated = (char *)calloc(1, strlen(info->text) + 1); + if (!macro.text) { + status = VX_ERROR_NO_MEMORY; + } + else { + strncpy(macro.name, info->macroName, sizeof(macro.name) - 1); + strcpy(macro.text, info->text); + context->macros.push_back(macro); + } + } + } + break; + case VX_CONTEXT_ATTRIBUTE_AMD_SET_MERGE_RULE: + if (size == sizeof(AgoNodeMergeRule)) { + status = VX_SUCCESS; + context->merge_rules.push_back(*(AgoNodeMergeRule *)ptr); + } + break; + case VX_CONTEXT_ATTRIBUTE_AMD_AFFINITY: + if (size == sizeof(AgoTargetAffinityInfo_)) { + status = VX_SUCCESS; + context->attr_affinity = *(AgoTargetAffinityInfo_ *)ptr; + } + break; +#if ENABLE_OPENCL + case VX_CONTEXT_ATTRIBUTE_AMD_OPENCL_CONTEXT: + if (size == sizeof(cl_context)) { + if (!context->opencl_context) { + status = agoGpuOclCreateContext(context, *(cl_context *)ptr); + } + else { + status = VX_FAILURE; + } + } + break; +#endif + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Provides a generic API to give platform-specific hints to the implementation. +* \param [in] context The reference to the implementation context. +* \param [in] reference The reference to the object to hint at. +* This could be \ref vx_context, \ref vx_graph, \ref vx_node, \ref vx_image, \ref vx_array, or any other reference. +* \param [in] hint A \ref vx_hint_e \a hint to give the OpenVX context. This is a platform-specific optimization or implementation mechanism. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No error. +* \retval VX_ERROR_INVALID_REFERENCE If context or reference is invalid. 
+* \retval VX_ERROR_NOT_SUPPORTED If the hint is not supported. +* \ingroup group_hint +*/ +VX_API_ENTRY vx_status VX_API_CALL vxHint(vx_reference reference, vx_enum hint) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidReference(reference)) { + vx_context context = reference->context; + if (agoIsValidContext(context)) { + CAgoLock lock(context->cs); + status = VX_SUCCESS; + switch (hint) + { + case VX_HINT_SERIALIZE: + reference->hint_serialize = true; + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Provides a generic API to give platform-specific directives to the implementations. +* \param [in] context The reference to the implementation context. +* \param [in] reference The reference to the object to set the directive on. +* This could be \ref vx_context, \ref vx_graph, \ref vx_node, \ref vx_image, \ref vx_array, or any other reference. +* \param [in] directive The directive to set. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No error. +* \retval VX_ERROR_INVALID_REFERENCE If context or reference is invalid. +* \retval VX_ERROR_NOT_SUPPORTED If the directive is not supported. +* \ingroup group_directive +*/ +VX_API_ENTRY vx_status VX_API_CALL vxDirective(vx_reference reference, vx_enum directive) +{ + return agoDirective(reference, directive); +} + +/*! \brief Provides a generic API to return status values from Object constructors if they +* fail. +* \note Users do not need to strictly check every object creator as the errors +* should properly propogate and be detected during verification time or run-time. +* \code +* vx_image img = vxCreateImage(context, 639, 480, VX_DF_IMAGE_UYVY); +* vx_status status = vxGetStatus((vx_reference)img); +* // status == VX_ERROR_INVALID_DIMENSIONS +* vxReleaseImage(&img); +* \endcode +* \pre Appropriate Object Creator function. +* \post Appropriate Object Release function. +* \param [in] reference The reference to check for construction errors. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No error. +* \retval * Some error occurred, please check enumeration list and constructor. +* \ingroup group_basic_features +*/ +VX_API_ENTRY vx_status VX_API_CALL vxGetStatus(vx_reference reference) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidReference(reference)) { + status = reference->status; + } + return status; +} + +/*! +* \brief Registers user-defined structures to the context. +* \param [in] context The reference to the implementation context. +* \param [in] size The size of user struct in bytes. +* \return A \ref vx_enum value that is a type given to the User +* to refer to their custom structure when declaring a \ref vx_array +* of that structure. +* \retval VX_TYPE_INVALID If the namespace of types has been exhausted. +* \note This call should only be used once within the lifetime of a context for +* a specific structure. +* +* \snippet vx_arrayrange.c array define +* \ingroup group_adv_array +*/ +VX_API_ENTRY vx_enum VX_API_CALL vxRegisterUserStruct(vx_context context, vx_size size) +{ + vx_enum type = VX_TYPE_INVALID; + if (agoIsValidContext(context) && (size > 0)) { + CAgoLock lock(context->cs); + type = agoAddUserStruct(context, size, NULL); + } + return type; +} + +/*============================================================================== +IMAGE +=============================================================================*/ + +/*! \brief Creates an opaque reference to an image buffer. 
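+* A minimal usage sketch (illustrative only; assumes a valid \c context):
+* \code
+* vx_image img = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
+* if (vxGetStatus((vx_reference)img) == VX_SUCCESS) {
+*     // ... use the image ...
+*     vxReleaseImage(&img);
+* }
+* \endcode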
+* \details Not guaranteed to exist until the \ref vx_graph containing it has been verified. +* \param [in] context The reference to the implementation context. +* \param [in] width The image width in pixels. +* \param [in] height The image height in pixels. +* \param [in] color The VX_DF_IMAGE (\ref vx_df_image_e) code that represents the format of the image and the color space. +* \return An image reference or zero when an error is encountered. +* \see vxAccessImagePatch to obtain direct memory access to the image data. +* \ingroup group_image +*/ +VX_API_ENTRY vx_image VX_API_CALL vxCreateImage(vx_context context, vx_uint32 width, vx_uint32 height, vx_df_image color) +{ + AgoData * data = NULL; + if (agoIsValidContext(context)) { + CAgoLock lock(context->cs); + char desc[128]; sprintf(desc, "image:%4.4s,%d,%d", FORMAT_STR(color), width, height); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "image", data->name); + agoAddData(&context->dataList, data); + // if data has children, add them too + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&context->dataList, data->children[i]); + } + } + } + } + return (vx_image)data; +} + +/*! \brief Creates an image from another image given a rectangle. This second +* reference refers to the data in the original image. Updates to this image +* updates the parent image. The rectangle must be defined within the pixel space +* of the parent image. +* \param [in] img The reference to the parent image. +* \param [in] rect The region of interest rectangle. Must contain points within +* the parent image pixel space. +* \return The reference to the sub-image or zero if the rectangle is invalid. +* \ingroup group_image +*/ +VX_API_ENTRY vx_image VX_API_CALL vxCreateImageFromROI(vx_image img, const vx_rectangle_t *rect) +{ + AgoData * master_img = (AgoData *)img; + AgoData * data = NULL; + if (agoIsValidData(master_img, VX_TYPE_IMAGE)) { + vx_context context = master_img->ref.context; + CAgoLock lock(context->cs); + char desc[128]; sprintf(desc, "image-roi:%s,%d,%d,%d,%d", master_img->name.c_str(), rect->start_x, rect->start_y, rect->end_x, rect->end_y); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "image-roi", data->name); + agoAddData(&context->dataList, data); + // if data has children, add them too + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&context->dataList, data->children[i]); + } + } + } + } + return (vx_image)data; +} + +/*! \brief Creates a reference to an image object that has a singular, +* uniform value in all pixels. +* \details The value pointer must reflect the specific format of the desired +* image. For example: +* | Color | Value Ptr | +* |:------------|:-----------| +* | \ref VX_DF_IMAGE_U8 | vx_uint8 * | +* | \ref VX_DF_IMAGE_S16 | vx_int16 * | +* | \ref VX_DF_IMAGE_U16 | vx_uint16 *| +* | \ref VX_DF_IMAGE_S32 | vx_int32 * | +* | \ref VX_DF_IMAGE_U32 | vx_uint32 *| +* | \ref VX_DF_IMAGE_RGB | vx_uint8 pixel[3] in R, G, B order | +* | \ref VX_DF_IMAGE_RGBX | vx_uint8 pixels[4] | +* | Any YUV | vx_uint8 pixel[3] in Y, U, V order | +* +* \param [in] context The reference to the implementation context. +* \param [in] width The image width in pixels. +* \param [in] height The image height in pixels. +* \param [in] color The VX_DF_IMAGE (\ref vx_df_image_e) code that represents the format of the image and the color space. 
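+* A minimal usage sketch (illustrative only; assumes a valid \c context):
+* \code
+* vx_uint8 gray = 128;
+* vx_image uniform = vxCreateUniformImage(context, 640, 480, VX_DF_IMAGE_U8, &gray);
+* \endcode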
+* \param [in] value The pointer to the pixel value to which to set all pixels. +* \return An image reference or zero when an error is encountered. +* \see vxAccessImagePatch to obtain direct memory access to the image data. +* \note \ref vxAccessImagePatch and \ref vxCommitImagePatch may be called with +* a uniform image reference. +* \ingroup group_image +*/ +VX_API_ENTRY vx_image VX_API_CALL vxCreateUniformImage(vx_context context, vx_uint32 width, vx_uint32 height, vx_df_image color, const void *value) +{ + AgoData * data = NULL; + if (agoIsValidContext(context)) { + CAgoLock lock(context->cs); + char desc[128]; + if (color == VX_DF_IMAGE_S16) { + sprintf(desc, "image-uniform:%4.4s,%d,%d,%d", FORMAT_STR(color), width, height, *(vx_int16 *)value); + } + else if (color == VX_DF_IMAGE_U16) { + sprintf(desc, "image-uniform:%4.4s,%d,%d,%d", FORMAT_STR(color), width, height, *(vx_uint16 *)value); + } + else if (color == VX_DF_IMAGE_S32) { + sprintf(desc, "image-uniform:%4.4s,%d,%d,%d", FORMAT_STR(color), width, height, *(vx_int32 *)value); + } + else if (color == VX_DF_IMAGE_U32) { + sprintf(desc, "image-uniform:%4.4s,%d,%d,%u", FORMAT_STR(color), width, height, *(vx_uint32 *)value); + } + else { + sprintf(desc, "image-uniform:%4.4s,%d,%d,%d,%d,%d,%d", FORMAT_STR(color), width, height, ((vx_uint8 *)value)[0], ((vx_uint8 *)value)[1], ((vx_uint8 *)value)[2], ((vx_uint8 *)value)[3]); + } + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "image-uniform", data->name); + agoAddData(&context->dataList, data); + // if data has children, add them too + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&context->dataList, data->children[i]); + } + } + } + } + return (vx_image)data; +} + +/*! \brief Creates an opaque reference to an image buffer with no direct +* user access. This function allows setting the image width, height, or format. +* \details Virtual data objects allow users to connect various nodes within a +* graph via data references without access to that data, but they also permit the +* implementation to take maximum advantage of possible optimizations. Use this +* API to create a data reference to link two or more nodes together when the +* intermediate data are not required to be accessed by outside entities. This API +* in particular allows the user to define the image format of the data without +* requiring the exact dimensions. Virtual objects are scoped within the graph +* they are declared a part of, and can't be shared outside of this scope. +* All of the following constructions of virtual images are valid. +* \code +* vx_context context = vxCreateContext(); +* vx_graph graph = vxCreateGraph(context); +* vx_image virt[] = { +* vxCreateVirtualImage(graph, 0, 0, VX_DF_IMAGE_U8), // no specified dimension +* vxCreateVirtualImage(graph, 320, 240, VX_DF_IMAGE_VIRT), // no specified format +* vxCreateVirtualImage(graph, 640, 480, VX_DF_IMAGE_U8), // no user access +* }; +* \endcode +* \param [in] graph The reference to the parent graph. +* \param [in] width The width of the image in pixels. A value of zero informs the interface that the value is unspecified. +* \param [in] height The height of the image in pixels. A value of zero informs the interface that the value is unspecified. +* \param [in] color The VX_DF_IMAGE (\ref vx_df_image_e) code that represents the format of the image and the color space. 
A value of \ref VX_DF_IMAGE_VIRT informs the interface that the format is unspecified. +* \return An image reference or zero when an error is encountered. +* \note Passing this reference to \ref vxAccessImagePatch will return an error. +* \ingroup group_image +*/ +VX_API_ENTRY vx_image VX_API_CALL vxCreateVirtualImage(vx_graph graph, vx_uint32 width, vx_uint32 height, vx_df_image color) +{ + AgoData * data = NULL; + if (agoIsValidGraph(graph)) { + vx_context context = graph->ref.context; + CAgoLock lock(graph->cs); + char desc[128]; sprintf(desc, "image-virtual:%4.4s,%d,%d", FORMAT_STR(color), width, height); + data = agoCreateDataFromDescription(context, graph, desc, true); + if (data) { + agoGenerateVirtualDataName(graph, "image", data->name); + agoAddData(&graph->dataList, data); + // if data has children, add them too + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&graph->dataList, data->children[i]); + } + } + } + } + return (vx_image)data; +} + +/*! \brief Creates a reference to an image object that was externally allocated. +* \param [in] context The reference to the implementation context. +* \param [in] color See the \ref vx_df_image_e codes. This mandates the +* number of planes needed to be valid in the \a addrs and \a ptrs arrays based on the format given. +* \param [in] addrs[] The array of image patch addressing structures that +* define the dimension and stride of the array of pointers. +* \param [in] ptrs[] The array of platform-defined references to each plane. +* \param [in] import_type \ref vx_import_type_e. When giving \ref VX_IMPORT_TYPE_HOST +* the \a ptrs array is assumed to be HOST accessible pointers to memory. +* \return \ref vx_image. +* \retval 0 Image could not be created. +* \retval * Valid Image reference. +* \ingroup group_image +*/ +VX_API_ENTRY vx_image VX_API_CALL vxCreateImageFromHandle(vx_context context, vx_df_image color, vx_imagepatch_addressing_t addrs[], void *ptrs[], vx_enum import_type) +{ + AgoData * data = NULL; + if (agoIsValidContext(context)) { + if (import_type == VX_IMPORT_TYPE_HOST) { + char desc[128]; sprintf(desc, "image:%4.4s,%d,%d", FORMAT_STR(color), addrs[0].dim_x, addrs[0].dim_y); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "image-host", data->name); + agoAddData(&context->dataList, data); + // if data has children, add them too + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&context->dataList, data->children[i]); + } + } + // set host allocated pointers + // TBD: check for errors in addrs[] + if (data->children) { + for (vx_uint32 i = 0; i < data->numChildren; i++) { + data->children[i]->import_type = VX_IMPORT_TYPE_HOST; + data->children[i]->buffer = (vx_uint8 *)ptrs[i]; + data->children[i]->u.img.stride_in_bytes = addrs[i].stride_y; + } + } + else { + data->import_type = VX_IMPORT_TYPE_HOST; + data->buffer = (vx_uint8 *)ptrs[0]; + data->u.img.stride_in_bytes = addrs[0].stride_y; + } + } + } + } + return (vx_image)data; +} + +/*! \brief Retrieves various attributes of an image. +* \param [in] image The reference to the image to query. +* \param [in] attribute The attribute to query. Use a \ref vx_image_attribute_e. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. 
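+* A minimal usage sketch (illustrative only; assumes \c img is a valid \ref vx_image):
+* \code
+* vx_uint32 width = 0, height = 0;
+* vxQueryImage(img, VX_IMAGE_ATTRIBUTE_WIDTH, &width, sizeof(width));
+* vxQueryImage(img, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height));
+* \endcode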
+* \retval VX_ERROR_INVALID_REFERENCE If the image is not a \ref vx_image. +* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. +* \retval VX_ERROR_NOT_SUPPORTED If the attribute is not supported on this implementation. +* \ingroup group_image +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryImage(vx_image image_, vx_enum attribute, void *ptr, vx_size size) +{ + AgoData * image = (AgoData *)image_; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(image, VX_TYPE_IMAGE)) { + CAgoLock lock(image->ref.context->cs); + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_IMAGE_ATTRIBUTE_WIDTH: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = image->u.img.width; + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_HEIGHT: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = image->u.img.height; + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_FORMAT: + if (size == sizeof(vx_uint32)) { + *(vx_df_image *)ptr = image->u.img.format; + status = VX_SUCCESS; + } + case VX_IMAGE_ATTRIBUTE_PLANES: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = image->u.img.planes; + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_SPACE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = image->u.img.color_space; + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_RANGE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = image->u.img.channel_range; + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_SIZE: + if (size == sizeof(vx_size)) { + status = VX_SUCCESS; + if (image->numChildren) { + size = 0; + for (vx_uint32 plane = 0; plane < image->u.img.planes; plane++) { + if (!image->children[plane]->size) { + if (image->children[plane]->isNotFullyConfigured || agoDataSanityCheckAndUpdate(image->children[plane])) { + status = VX_ERROR_INVALID_REFERENCE; + } + } + size += image->children[plane]->size; + } + if (status == VX_SUCCESS) + *(vx_size *)ptr = size; + } + else { + if (!image->size) { + if (image->isNotFullyConfigured || agoDataSanityCheckAndUpdate(image)) { + status = VX_ERROR_INVALID_REFERENCE; + } + } + if (status == VX_SUCCESS) + *(vx_size *)ptr = image->size; + } + } + break; +#if ENABLE_OPENCL + case VX_IMAGE_ATTRIBUTE_AMD_OPENCL_BUFFER: + if (size == sizeof(cl_mem)) { + if (image->opencl_buffer) { + *(cl_mem *)ptr = image->opencl_buffer; + } + else { + *(vx_uint8 **)ptr = image->opencl_svm_buffer; + } + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_AMD_OPENCL_BUFFER_OFFSET: + if (size == sizeof(cl_uint)) { + *(cl_uint *)ptr = image->opencl_buffer_offset; + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_AMD_ENABLE_USER_BUFFER_OPENCL: + if (size == sizeof(vx_bool)) { + *(vx_bool *)ptr = image->u.img.enableUserBufferOpenCL; + status = VX_SUCCESS; + } + break; +#endif + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Allows setting attributes on the image. +* \param [in] image The reference to the image on which to set the attribute. +* \param [in] attribute The attribute to set. Use a \ref vx_image_attribute_e enumeration. +* \param [in] out The pointer to the location from which to read the value. +* \param [in] size The size of the object pointed to by \a out. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If the image is not a \ref vx_image. 
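+* A minimal usage sketch (illustrative only; assumes \c img is a valid \ref vx_image):
+* \code
+* vx_enum space = VX_COLOR_SPACE_BT709;
+* vx_status status = vxSetImageAttribute(img, VX_IMAGE_ATTRIBUTE_SPACE, &space, sizeof(space));
+* \endcode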
+* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. +* \ingroup group_image +*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetImageAttribute(vx_image image_, vx_enum attribute, const void *ptr, vx_size size) +{ + AgoData * image = (AgoData *)image_; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(image, VX_TYPE_IMAGE)) { + CAgoLock lock(image->ref.context->cs); + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_IMAGE_ATTRIBUTE_SPACE: + if (size == sizeof(vx_enum)) { + image->u.img.color_space = *(vx_color_space_e *)ptr; + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_RANGE: + if (size == sizeof(vx_enum)) { + image->u.img.channel_range = *(vx_channel_range_e *)ptr; + status = VX_SUCCESS; + } + break; +#if ENABLE_OPENCL + case VX_IMAGE_ATTRIBUTE_AMD_OPENCL_BUFFER: + if (size == sizeof(cl_mem) && image->u.img.enableUserBufferOpenCL) { + image->opencl_buffer = *(cl_mem *)ptr; + if (image->opencl_buffer) { + image->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK; + image->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL; + } + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_AMD_OPENCL_BUFFER_OFFSET: + if (size == sizeof(cl_uint) && image->u.img.enableUserBufferOpenCL) { + image->opencl_buffer_offset = *(cl_uint *)ptr; + status = VX_SUCCESS; + } + break; +#endif + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Releases a reference to an image object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] image The pointer to the image to release. +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +* \ingroup group_image +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseImage(vx_image *image) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (image && agoIsValidData((AgoData*)*image, VX_TYPE_IMAGE)) { + if (!agoReleaseData((AgoData*)*image, true)) { + *image = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief This computes the size needed to retrieve an image patch from an image. +* \param [in] image The reference to the image from which to extract the patch. +* \param [in] rect The coordinates. Must be 0 <= start < end <= dimension where +* dimension is width for x and height for y. +* \param [in] plane_index The plane index from which to get the data. +* \return vx_size +* \ingroup group_image +*/ +VX_API_ENTRY vx_size VX_API_CALL vxComputeImagePatchSize(vx_image image_, + const vx_rectangle_t *rect, + vx_uint32 plane_index) +{ + AgoData * image = (AgoData *)image_; + vx_size size = 0; + if (agoIsValidData(image, VX_TYPE_IMAGE) && !image->isVirtual && rect && (plane_index < image->u.img.planes)) { + AgoData * img = image; + if (image->children) { + img = image->children[plane_index]; + } + size = (((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2) * + ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2) * img->u.img.pixel_size_in_bits) >> 3; + } + return size; +} + +/*! \brief Allows the User to extract a rectangular patch (subset) of an image from a single plane. +* \param [in] image The reference to the image from which to extract the patch. +* \param [in] rect The coordinates from which to get the patch. Must be 0 <= start < end. 
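+* A minimal access/commit sketch (illustrative only; assumes \c img is a valid 640x480 \ref VX_DF_IMAGE_U8 image):
+* \code
+* vx_rectangle_t rect = { 0, 0, 640, 480 };
+* vx_imagepatch_addressing_t addr;
+* void * base = NULL; // NULL requests internal memory or a mapping from the implementation
+* if (vxAccessImagePatch(img, &rect, 0, &addr, &base, VX_READ_AND_WRITE) == VX_SUCCESS) {
+*     // ... read or modify pixels through base and addr ...
+*     vxCommitImagePatch(img, &rect, 0, &addr, base);
+* }
+* \endcode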
+* \param [in] plane_index The plane index from which to get the data. +* \param [out] addr The addressing information for the image patch to be written into the data structure. +* \param [out] ptr The pointer to a pointer of a location to store the data. +* \arg If the user passes in a NULL, an error occurs. +* \arg If the user passes in a pointer to a NULL, the function returns internal memory, map, or allocates a buffer and returns it. +* \arg If the user passes in a pointer to a non-NULL pointer, the function attempts to +* copy to the location provided by the user. +* +* (*ptr) must be given to \ref vxCommitImagePatch. +* \param [in] usage This declares the intended usage of the pointer using the \ref vx_accessor_e enumeration. +* \return A \ref vx_status_e enumeration. +* \retval VX_ERROR_OPTIMIZED_AWAY The reference is a virtual image and cannot be accessed or committed. +* \retval VX_ERROR_INVALID_PARAMETERS The \a start, \a end, \a plane_index, \a stride_x, or \a stride_y pointer is incorrect. +* \retval VX_ERROR_INVALID_REFERENCE The image reference is not actually an image reference. +* \note The user may ask for data outside the bounds of the valid region, but +* such data has an undefined value. +* \note Users must be cautious to prevent passing in \e uninitialized pointers or +* addresses of uninitialized pointers to this function. +* \pre \ref vxComputeImagePatchSize if users wish to allocate their own memory. +* \post \ref vxCommitImagePatch with same (*ptr) value. +* \ingroup group_image +* \include vx_imagepatch.c +*/ +VX_API_ENTRY vx_status VX_API_CALL vxAccessImagePatch(vx_image image_, + const vx_rectangle_t *rect, + vx_uint32 plane_index, + vx_imagepatch_addressing_t *addr, + void **ptr, + vx_enum usage) +{ + AgoData * image = (AgoData *)image_; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(image, VX_TYPE_IMAGE)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (image->isVirtual && !image->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if ((plane_index < image->u.img.planes) && addr && ptr && rect && + rect->start_x < rect->end_x && rect->start_y < rect->end_y && + rect->end_x <= image->u.img.width && rect->end_y <= image->u.img.height && + (!image->u.img.isUniform || usage == VX_READ_ONLY) && !image->isNotFullyConfigured) + { + AgoData * img = image; + if (image->children) { + img = image->children[plane_index]; + } + if (!img->buffer) { + CAgoLock lock(img->ref.context->cs); + if (agoAllocData(img)) { + return VX_FAILURE; + } + } + if (!*ptr) { + addr->dim_x = rect->end_x - rect->start_x; + addr->dim_y = rect->end_y - rect->start_y; + addr->scale_x = VX_SCALE_UNITY >> img->u.img.x_scale_factor_is_2; + addr->scale_y = VX_SCALE_UNITY >> img->u.img.y_scale_factor_is_2; + addr->step_x = 1 << img->u.img.x_scale_factor_is_2; + addr->step_y = 1 << img->u.img.y_scale_factor_is_2; + addr->stride_x = ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3; + addr->stride_y = img->u.img.stride_in_bytes; + } + vx_uint8 * ptr_internal = img->buffer + + (rect->start_y >> img->u.img.y_scale_factor_is_2) * img->u.img.stride_in_bytes + + (((rect->start_x >> img->u.img.x_scale_factor_is_2) * img->u.img.pixel_size_in_bits) >> 3); + vx_uint8 * ptr_returned = *ptr ? 
(vx_uint8 *)*ptr : ptr_internal; + // save the pointer and usage for use in vxCommitImagePatch + status = VX_SUCCESS; + for (auto i = img->mapped.begin(); i != img->mapped.end(); i++) { + if (i->ptr == ptr_returned) { + // can't support vxAccessImagePatch() more than once with same pointer + // the application needs to call vxCommitImagePatch() before calling vxAccessImagePatch() + status = VX_FAILURE; + } + } + if (status == VX_SUCCESS) { + MappedData item = { ptr_returned, usage, (ptr_returned != ptr_internal) ? true : false }; + img->mapped.push_back(item); + *ptr = ptr_returned; + if (usage == VX_READ_ONLY || usage == VX_READ_AND_WRITE) { +#if ENABLE_OPENCL + auto dataToSync = img->u.img.isROI ? img->u.img.roiMasterImage : img; + if (dataToSync->opencl_buffer && !(dataToSync->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) { + // make sure dirty OpenCL buffers are synched before giving access for read + if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL)) { + cl_int err = clEnqueueReadBuffer(dataToSync->ref.context->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->opencl_buffer_offset, dataToSync->size, dataToSync->buffer, 0, NULL, NULL); + if (err) { + status = VX_FAILURE; + agoAddLogEntry(&image->ref, status, "ERROR: vxAccessImagePatch: clEnqueueReadBuffer() => %d\n", err); + return status; + } + dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + } + } +#endif + if (item.used_external_ptr) { + // copy if read is requested with explicit external buffer + if (addr->stride_x == ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3) + HafCpu_ChannelCopy_U8_U8((addr->dim_x >> img->u.img.x_scale_factor_is_2) * addr->stride_x, (addr->dim_y >> img->u.img.y_scale_factor_is_2), + ptr_returned, addr->stride_y, ptr_internal, img->u.img.stride_in_bytes); + else + HafCpu_BufferCopyDisperseInDst((addr->dim_x >> img->u.img.x_scale_factor_is_2), (addr->dim_y >> img->u.img.y_scale_factor_is_2), + ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3, ptr_returned, addr->stride_y, addr->stride_x, ptr_internal, img->u.img.stride_in_bytes); + } + } + } + } + } + return status; +} + +/*! \brief This allows the User to commit a rectangular patch (subset) of an image from a single plane. +* \param [in] image The reference to the image from which to extract the patch. +* \param [in] rect The coordinates to which to set the patch. Must be 0 <= start <= end. +* This may be 0 or a rectangle of zero area in order to indicate that the commit +* must only decrement the reference count. +* \param [in] plane_index The plane index to which to set the data. +* \param [in] addr The addressing information for the image patch. +* \param [in] ptr The pointer of a location from which to read the data. If the +* user allocated the pointer they must free it. If the pointer +* was set by \ref vxAccessImagePatch, the user may not access the pointer after +* this call completes. +* \return A \ref vx_status_e enumeration. +* \retval VX_ERROR_OPTIMIZED_AWAY The reference is a virtual image and cannot be accessed or committed. +* \retval VX_ERROR_INVALID_PARAMETERS The \a start, \a end, \a plane_index, \a stride_x, or \a stride_y pointer is incorrect. +* \retval VX_ERROR_INVALID_REFERENCE The image reference is not actually an image reference. +* \ingroup group_image +* \include vx_imagepatch.c +* \note If the implementation gives the client a pointer from +* \ref vxAccessImagePatch then implementation-specific behavior may occur. 
+* If not, then a copy occurs from the users pointer to the internal data of the object. +* \note If the rectangle intersects bounds of the current valid region, the +* valid region grows to the union of the two rectangles as long as they occur +* within the bounds of the original image dimensions. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxCommitImagePatch(vx_image image_, + vx_rectangle_t *rect, + vx_uint32 plane_index, + vx_imagepatch_addressing_t *addr, + const void *ptr) +{ + AgoData * image = (AgoData *)image_; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(image, VX_TYPE_IMAGE)) { + // check for ZERO AREA and mark rect as NULL for ZERO AREA + if (rect && ((rect->start_x == rect->end_x) || (rect->start_y == rect->end_y))) + rect = NULL; + // check for valid arguments + status = VX_ERROR_INVALID_PARAMETERS; + if (image->isVirtual && !image->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if ((plane_index < image->u.img.planes) && addr && ptr && + (!rect || (rect->start_x < rect->end_x && rect->start_y < rect->end_y && rect->end_x <= image->u.img.width && rect->end_y <= image->u.img.height))) + { + status = VX_SUCCESS; + AgoData * img = image; + if (image->children) { + img = image->children[plane_index]; + } + if (!img->buffer) { + status = VX_FAILURE; + } + else if (!img->mapped.empty()) { + vx_enum usage = VX_READ_ONLY; + bool used_external_ptr = false; + for (auto i = img->mapped.begin(); i != img->mapped.end(); i++) { + if (i->ptr == ptr) { + if (rect) { + usage = i->usage; + used_external_ptr = i->used_external_ptr; + } + img->mapped.erase(i); + break; + } + } + if (usage == VX_WRITE_ONLY || usage == VX_READ_AND_WRITE) { + // mark valid region + img->u.img.rect_valid.start_x = rect->start_x >> img->u.img.x_scale_factor_is_2; + img->u.img.rect_valid.start_y = rect->start_y >> img->u.img.y_scale_factor_is_2; + img->u.img.rect_valid.end_x = rect->end_x >> img->u.img.x_scale_factor_is_2; + img->u.img.rect_valid.end_y = rect->end_y >> img->u.img.y_scale_factor_is_2; + if (used_external_ptr) { + // copy from external buffer + vx_uint8 * buffer = img->buffer + (rect->start_y >> img->u.img.y_scale_factor_is_2) * img->u.img.stride_in_bytes + + (((rect->start_x >> img->u.img.x_scale_factor_is_2) * img->u.img.pixel_size_in_bits) >> 3); + + if (addr->stride_x == ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3) + HafCpu_ChannelCopy_U8_U8(((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2) * addr->stride_x, ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2), + buffer, img->u.img.stride_in_bytes, (vx_uint8 *)ptr, addr->stride_y); + else + HafCpu_BufferCopyDisperseInSrc(((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2) * addr->stride_x, ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2), + ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3, buffer, img->u.img.stride_in_bytes, (vx_uint8 *)ptr, addr->stride_y, addr->stride_x); + } + // update sync flags + auto dataToSync = img->u.img.isROI ? img->u.img.roiMasterImage : img; + dataToSync->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK; + dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT; + } + } + } + } + return status; +} + +/*! +* \brief Accesses a specific indexed pixel in an image patch. +* \param [in] ptr The base pointer of the patch as returned from \ref vxAccessImagePatch. +* \param [in] index The 0 based index of the pixel count in the patch. Indexes increase horizontally by 1 then wrap around to the next row. 
+* \param [in] addr The pointer to the addressing mode information returned from \ref vxAccessImagePatch. +* \return void * Returns the pointer to the specified pixel. +* \pre \ref vxAccessImagePatch +* \include vx_imagepatch.c +* \ingroup group_image +*/ +VX_API_ENTRY void * VX_API_CALL vxFormatImagePatchAddress1d(void *ptr, vx_uint32 index, const vx_imagepatch_addressing_t *addr) +{ + vx_uint8 *new_ptr = NULL; + if (ptr && index < addr->dim_x*addr->dim_y) + { + vx_uint32 x = index % addr->dim_x; + vx_uint32 y = index / addr->dim_x; + vx_uint32 offset = vxComputePatchOffset(x, y, addr); + new_ptr = (vx_uint8 *)ptr; + new_ptr = &new_ptr[offset]; + } + return new_ptr; +} + +/*! +* \brief Accesses a specific pixel at a 2d coordinate in an image patch. +* \param [in] ptr The base pointer of the patch as returned from \ref vxAccessImagePatch. +* \param [in] x The x dimension within the patch. +* \param [in] y The y dimension within the patch. +* \param [in] addr The pointer to the addressing mode information returned from \ref vxAccessImagePatch. +* \return void * Returns the pointer to the specified pixel. +* \pre \ref vxAccessImagePatch +* \include vx_imagepatch.c +* \ingroup group_image +*/ +VX_API_ENTRY void * VX_API_CALL vxFormatImagePatchAddress2d(void *ptr, vx_uint32 x, vx_uint32 y, const vx_imagepatch_addressing_t *addr) +{ + vx_uint8 *new_ptr = NULL; + if (ptr && x < addr->dim_x && y < addr->dim_y) + { + vx_uint32 offset = vxComputePatchOffset(x, y, addr); + new_ptr = (vx_uint8 *)ptr; + new_ptr = &new_ptr[offset]; + } + return new_ptr; +} + +/*! \brief Retrieves the valid region of the image as a rectangle. +* \details After the image is allocated but has not been written to this +* returns the full rectangle of the image so that functions do not have to manage +* a case for uninitialized data. The image still retains an uninitialized +* value, but once the image is written to via any means such as \ref vxCommitImagePatch, +* the valid region is altered to contain the maximum bounds of the written +* area. +* \param [in] image The image from which to retrieve the valid region. +* \param [out] rect The destination rectangle. +* \return vx_status +* \retval VX_ERROR_INVALID_REFERENCE Invalid image. +* \retval VX_ERROR_INVALID_PARAMETERS Invalid rect. +* \retval VX_STATUS Valid image. +* \note This rectangle can be passed directly to \ref vxAccessImagePatch to get +* the full valid region of the image. Modifications from \ref vxCommitImagePatch +* grows the valid region. +* \ingroup group_image +*/ +VX_API_ENTRY vx_status VX_API_CALL vxGetValidRegionImage(vx_image image_, vx_rectangle_t *rect) +{ + AgoData * image = (AgoData *)image_; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(image, VX_TYPE_IMAGE)) + { + status = VX_ERROR_INVALID_PARAMETERS; + if (rect) { + *rect = image->u.img.rect_valid; + status = VX_SUCCESS; + } + } + return status; +} + +/*============================================================================== +KERNEL +=============================================================================*/ + +/*! \brief Loads one or more kernels into the OpenVX context. This is the interface +* by which OpenVX is extensible. Once the set of kernels is loaded new kernels +* and their parameters can be queried. +* \note When all references to loaded kernels are released, the module +* may be automatically unloaded. +* \param [in] context The reference to the implementation context. +* \param [in] module The short name of the module to load. 
On systems where +* there are specific naming conventions for modules, the name passed +* should ignore such conventions. For example: \c libxyz.so should be +* passed as just \c xyz and the implementation will do the right thing that +* the platform requires. +* \note This API uses the system pre-defined paths for modules. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If the context is not a \ref vx_context. +* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. +* \ingroup group_user_kernels +* \see vxGetKernelByName +*/ +VX_API_ENTRY vx_status VX_API_CALL vxLoadKernels(vx_context context, const vx_char *module) +{ + return agoLoadModule(context, module); +} + +/*! \brief Obtains a reference to a kernel using a string to specify the name. +* \param [in] context The reference to the implementation context. +* \param [in] name The string of the name of the kernel to get. +* \return A kernel reference or zero if an error occurred. +* \retval 0 The kernel name is not found in the context. +* \ingroup group_kernel +* \pre \ref vxLoadKernels if the kernel is not provided by the +* OpenVX implementation. +* \note User Kernels should follow a "dotted" heirarchical syntax. For example: +* "com.company.example.xyz". +*/ +VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByName(vx_context context, const vx_char *name) +{ + vx_kernel akernel = NULL; + if (agoIsValidContext(context)) { + CAgoLock lock(context->cs); + akernel = agoFindKernelByName(context, name); + if (akernel) { + akernel->ref.external_count++; + } + } + return akernel; +} + +/*! \brief Obtains a reference to the kernel using the \ref vx_kernel_e enumeration. +* \details Enum values above the standard set are assumed to apply to +* loaded libraries. +* \param [in] context The reference to the implementation context. +* \param [in] kernel A value from \ref vx_kernel_e or a vendor or client-defined value. +* \return A \ref vx_kernel. +* \retval 0 The kernel enumeration is not found in the context. +* \ingroup group_kernel +* \pre \ref vxLoadKernels if the kernel is not provided by the +* OpenVX implementation. +*/ +VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByEnum(vx_context context, vx_enum kernel) +{ + vx_kernel akernel = NULL; + if (agoIsValidContext(context)) { + CAgoLock lock(context->cs); + akernel = agoFindKernelByEnum(context, kernel); + if (akernel) { + akernel->ref.external_count++; + } + } + return akernel; +} + +/*! \brief This allows the client to query the kernel to get information about +* the number of parameters, enum values, etc. +* \param [in] kernel The kernel reference to query. +* \param [in] attribute The attribute to query. Use a \ref vx_kernel_attribute_e. +* \param [out] ptr The pointer to the location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If the kernel is not a \ref vx_kernel. +* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. +* \retval VX_ERROR_NOT_SUPPORTED If the attribute value is not supported in this implementation. 
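+* A minimal usage sketch (illustrative only; assumes a valid \c context and the standard Sobel 3x3 kernel name):
+* \code
+* vx_kernel kernel = vxGetKernelByName(context, "org.khronos.openvx.sobel_3x3");
+* vx_uint32 num_params = 0;
+* vxQueryKernel(kernel, VX_KERNEL_ATTRIBUTE_PARAMETERS, &num_params, sizeof(num_params));
+* vxReleaseKernel(&kernel);
+* \endcode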
+* \ingroup group_kernel +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryKernel(vx_kernel kernel, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidKernel(kernel)) { + CAgoLock lock(kernel->ref.context->cs); + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_KERNEL_ATTRIBUTE_PARAMETERS: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = kernel->argCount; + status = VX_SUCCESS; + } + break; + case VX_KERNEL_ATTRIBUTE_NAME: + if (ptr != NULL && size >= VX_MAX_KERNEL_NAME) { + strncpy((char *)ptr, kernel->name, size); + status = VX_SUCCESS; + } + break; + case VX_KERNEL_ATTRIBUTE_ENUM: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = kernel->id; + status = VX_SUCCESS; + } + case VX_KERNEL_ATTRIBUTE_LOCAL_DATA_SIZE: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = kernel->localDataSize; + status = VX_SUCCESS; + } + break; + case VX_KERNEL_ATTRIBUTE_LOCAL_DATA_PTR: + if (size == sizeof(void *)) { + *(void **)ptr = kernel->localDataPtr; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Release the reference to the kernel. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] kernel The pointer to the kernel reference to release. +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +* \ingroup group_kernel +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseKernel(vx_kernel *kernel) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (kernel && agoIsValidKernel(*kernel)) { + if (!agoReleaseKernel(*kernel, true)) { + *kernel = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Allows users to add custom kernels to the known kernel +* database in OpenVX at run-time. This would primarily be used by the module function +* \c vxPublishKernels. +* \param [in] context The reference to the implementation context. +* \param [in] name The string to use to match the kernel. +* \param [in] enumeration The enumerated value of the kernel to be used by clients. +* \param [in] func_ptr The process-local function pointer to be invoked. +* \param [in] numParams The number of parameters for this kernel. +* \param [in] input The pointer to \ref vx_kernel_input_validate_f, which validates the +* input parameters to this kernel. +* \param [in] output The pointer to \ref vx_kernel_output_validate_f , which validates the +* output parameters to this kernel. +* \param [in] init The kernel initialization function. +* \param [in] deinit The kernel de-initialization function. +* \ingroup group_user_kernels +* \return \ref vx_kernel +* \retval 0 Indicates that an error occurred when adding the kernel. +* \retval * Kernel added to OpenVX. 
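+* A minimal registration sketch (illustrative only; \c MY_KERNEL_ENUM, \c myKernelFunc, \c myInputValidator,
+* and \c myOutputValidator are hypothetical user-supplied values and callbacks):
+* \code
+* vx_kernel kernel = vxAddKernel(context, "com.company.example.xyz", MY_KERNEL_ENUM,
+*     myKernelFunc, 2, myInputValidator, myOutputValidator, NULL, NULL);
+* vxAddParameterToKernel(kernel, 0, VX_INPUT,  VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED);
+* vxAddParameterToKernel(kernel, 1, VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED);
+* vxFinalizeKernel(kernel);
+* \endcode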
+*/ +VX_API_ENTRY vx_kernel VX_API_CALL vxAddKernel(vx_context context, + const vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f init, + vx_kernel_deinitialize_f deinit) +{ + vx_kernel kernel = NULL; + if (agoIsValidContext(context) && numParams > 0 && numParams <= AGO_MAX_PARAMS && func_ptr && input && output) { + CAgoLock lock(context->cs); + // make sure there are no kernels with the same name + if (!agoFindKernelByEnum(context, enumeration) && !agoFindKernelByName(context, name)) { + kernel = new AgoKernel; + // initialize references + agoResetReference(&kernel->ref, VX_TYPE_KERNEL, context, NULL); + for (vx_uint32 index = 0; index < AGO_MAX_PARAMS; index++) { + agoResetReference(&kernel->parameters[index].ref, VX_TYPE_PARAMETER, kernel->ref.context, &kernel->ref); + kernel->parameters[index].scope = &kernel->ref; + } + // add kernel object to context + kernel->external_kernel = true; + kernel->ref.internal_count = 1; + kernel->ref.external_count = 1; + kernel->id = enumeration; + kernel->flags = AGO_KERNEL_FLAG_GROUP_USER | AGO_KERNEL_FLAG_DEVICE_CPU; + strcpy(kernel->name, name); + kernel->argCount = numParams; + kernel->kernel_f = func_ptr; + kernel->input_validate_f = input; + kernel->output_validate_f = output; + kernel->initialize_f = init; + kernel->deinitialize_f = deinit; + kernel->importing_module_index_plus1 = context->importing_module_index_plus1; + agoAddKernel(&context->kernelList, kernel); + // update reference count + kernel->ref.context->num_active_references++; + } + } + return kernel; +} + +/*! \brief This API is called after all parameters have been added to the +* kernel and the kernel is \e ready to be used. +* \param [in] kernel The reference to the loaded kernel from \ref vxAddKernel. +* \return A \ref vx_status_e enumeration. If an error occurs, the kernel is not available +* for usage by the clients of OpenVX. Typically this is due to a mismatch +* between the number of parameters requested and given. +* \pre \ref vxAddKernel and \ref vxAddParameterToKernel +* \ingroup group_user_kernels +*/ +VX_API_ENTRY vx_status VX_API_CALL vxFinalizeKernel(vx_kernel kernel) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidKernel(kernel)) { + CAgoLock lock(kernel->ref.context->cs); + if (kernel->external_kernel && !kernel->finalized && kernel->argCount > 0) { + status = VX_SUCCESS; + // check if kernel has been initialized properly + for (vx_uint32 i = 0; i < kernel->argCount; i++) { + if (!kernel->argType[i] || !kernel->argConfig[i] || !kernel->parameters[i].scope) { + status = VX_ERROR_INVALID_REFERENCE; + break; + } + } + if (status == VX_SUCCESS) { + // mark that kernel has been finalized + kernel->finalized = true; + } + } + } + return status; +} + +/*! \brief Allows users to set the signatures of the custom kernel. +* \param [in] kernel The reference to the kernel added with \ref vxAddKernel. +* \param [in] index The index of the parameter to add. +* \param [in] dir The direction of the parameter. This must be a value from \ref vx_direction_e. +* \param [in] data_type The type of parameter. This must be a value from \ref vx_type_e. +* \param [in] state The state of the parameter (required or not). This must be a value from \ref vx_parameter_state_e. +* \return A \ref vx_status_e enumerated value. +* \retval VX_SUCCESS Parameter is successfully set on kernel. 
+* \retval VX_ERROR_INVALID_REFERENCE The value passed as kernel was not a \c vx_kernel. +* \pre \ref vxAddKernel +* \ingroup group_user_kernels +*/ +VX_API_ENTRY vx_status VX_API_CALL vxAddParameterToKernel(vx_kernel kernel, vx_uint32 index, vx_enum dir, vx_enum data_type, vx_enum state) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidKernel(kernel)) { + CAgoLock lock(kernel->ref.context->cs); + status = VX_ERROR_INVALID_PARAMETERS; + // add parameter if the kernel is not finalized and not a built-in kernel and not initialized earlier + if (kernel->external_kernel && !kernel->finalized && + index < AGO_MAX_PARAMS && + (dir == VX_INPUT || dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) && + (state == VX_PARAMETER_STATE_REQUIRED || state == VX_PARAMETER_STATE_OPTIONAL)) + { + status = VX_SUCCESS; + // save parameter details + kernel->parameters[index].index = index; + kernel->parameters[index].direction = (vx_direction_e)dir; + kernel->argConfig[index] = (dir == VX_INPUT) ? AGO_KERNEL_ARG_INPUT_FLAG : + ((dir == VX_OUTPUT) ? AGO_KERNEL_ARG_OUTPUT_FLAG : (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)); + kernel->parameters[index].type = data_type; + kernel->argType[index] = data_type; + kernel->parameters[index].state = (vx_parameter_state_e)state; + if (state == VX_PARAMETER_STATE_OPTIONAL) + kernel->argConfig[index] |= AGO_KERNEL_ARG_OPTIONAL_FLAG; + kernel->parameters[index].scope = &kernel->ref; + // update argument count + if (index >= kernel->argCount) + kernel->argCount = index + 1; + } + } + return status; +} + +/*! \brief Removes a non-finalized \ref vx_kernel from the \ref vx_context. +* Once a \ref vx_kernel has been finalized it cannot be removed. +* \param [in] kernel The reference to the kernel to remove. Returned from \ref vxAddKernel. +* \note Any kernel enumerated in the base standard +* cannot be removed; only kernels added through \ref vxAddKernel can +* be removed. +* \return A \ref vx_status_e enumeration. +* \retval VX_ERROR_INVALID_REFERENCE If an invalid kernel is passed in. +* \retval VX_ERROR_INVALID_PARAMETER If a base kernel is passed in. +* \ingroup group_user_kernels +*/ +VX_API_ENTRY vx_status VX_API_CALL vxRemoveKernel(vx_kernel kernel) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidKernel(kernel)) { + status = VX_ERROR_INVALID_PARAMETERS; + // release if the kernel is not finalized and not a built-in kernel + if (!kernel->finalized) { + CAgoLock lock(kernel->ref.context->cs); + if (!agoReleaseKernel(kernel, true)) { + status = VX_SUCCESS; + } + } + } + return status; +} + +/*! \brief Sets kernel attributes. +* \param [in] kernel The reference to the kernel. +* \param [in] attribute The enumeration of the attributes. See \ref vx_kernel_attribute_e. +* \param [in] ptr The pointer to the location from which to read the attribute. +* \param [in] size The size of the data area indicated by \a ptr in bytes. +* \note After a kernel has been passed to \ref vxFinalizeKernel, no attributes +* can be altered. +* \return A \ref vx_status_e enumeration. 
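+* A minimal usage sketch (illustrative only; assumes \c kernel is a user kernel that has not yet been finalized):
+* \code
+* vx_size local_size = 1024;
+* vx_status status = vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_LOCAL_DATA_SIZE, &local_size, sizeof(local_size));
+* \endcode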
+* \ingroup group_user_kernels +*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetKernelAttribute(vx_kernel kernel, vx_enum attribute, const void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidKernel(kernel)) { + CAgoLock lock(kernel->ref.context->cs); + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_KERNEL_ATTRIBUTE_LOCAL_DATA_SIZE: + if (size == sizeof(vx_size)) { + kernel->localDataSize = *(vx_size *)ptr; + status = VX_SUCCESS; + } + break; + case VX_KERNEL_ATTRIBUTE_LOCAL_DATA_PTR: + if (size == sizeof(void *)) { + kernel->localDataPtr = (vx_uint8 *)ptr; + status = VX_SUCCESS; + } + break; + case VX_KERNEL_ATTRIBUTE_AMD_NODE_REGEN_CALLBACK: + if (size == sizeof(void *)) { + if (!kernel->finalized) { + *((void **)&kernel->regen_callback_f) = *(void **)ptr; + status = VX_SUCCESS; + } + else { + status = VX_ERROR_NOT_SUPPORTED; + } + } + break; +#if ENABLE_OPENCL + case VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT: + if (size == sizeof(void *)) { + if (!kernel->finalized) { + *((void **)&kernel->query_target_support_f) = *(void **)ptr; + status = VX_SUCCESS; + } + else { + status = VX_ERROR_NOT_SUPPORTED; + } + } + break; + case VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK: + if (size == sizeof(void *)) { + if (!kernel->finalized) { + *((void **)&kernel->opencl_codegen_callback_f) = *(void **)ptr; + status = VX_SUCCESS; + } + else { + status = VX_ERROR_NOT_SUPPORTED; + } + } + break; +#endif + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Retrieves a \ref vx_parameter from a \ref vx_kernel. +* \param [in] kernel The reference to the kernel. +* \param [in] index The index of the parameter. +* \return A \ref vx_parameter. +* \retval 0 Either the kernel or index is invalid. +* \retval * The parameter reference. +* \ingroup group_parameter +*/ +VX_API_ENTRY vx_parameter VX_API_CALL vxGetKernelParameterByIndex(vx_kernel kernel, vx_uint32 index) +{ + vx_parameter parameter = NULL; + if (agoIsValidKernel(kernel) && index < kernel->argCount) { + parameter = &kernel->parameters[index]; + parameter->ref.external_count++; + } + return parameter; +} + +/*============================================================================== +GRAPH +=============================================================================*/ + +/*! \brief Creates an empty graph. +* \param [in] context The reference to the implementation context. +* \return A graph reference. +* \retval 0 if an error occurred. +* \ingroup group_graph +*/ +VX_API_ENTRY vx_graph VX_API_CALL vxCreateGraph(vx_context context) +{ + vx_graph graph = NULL; + if (agoIsValidContext(context)) { + graph = agoCreateGraph(context); + } + return graph; +} + +/*! \brief Releases a reference to a graph. +* The object may not be garbage collected until its total reference count is zero. +* Once the reference count is zero, all node references in the graph are automatically +* released as well. Data referenced by those nodes may not be released as +* the user may have external references to the data. +* \param [in] graph The pointer to the graph to release. +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. 
+* \ingroup group_graph +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseGraph(vx_graph *graph) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (graph && agoIsValidGraph(*graph)) { + if (!agoReleaseGraph(*graph)) { + *graph = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Verifies the state of the graph before it is executed. +* This is useful to catch programmer errors and contract errors. If not verified, +* the graph verifies before being processed. +* \pre Memory for data objects is not guarenteed to exist before +* this call. \post After this call data objects exist unless +* the implementation optimized them out. +* \param [in] graph The reference to the graph to verify. +* \return A status code for graphs with more than one error; it is +* undefined which error will be returned. Register a log callback using \ref vxRegisterLogCallback +* to receive each specific error in the graph. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +* \retval VX_ERROR_MULTIPLE_WRITERS If the graph contains more than one writer +* to any data object. +* \retval VX_ERROR_INVALID_NODE If a node in the graph is invalid or failed be created. +* \retval VX_ERROR_INVALID_GRAPH If the graph contains cycles or some other invalid topology. +* \retval VX_ERROR_INVALID_TYPE If any parameter on a node is given the wrong type. +* \retval VX_ERROR_INVALID_VALUE If any value of any parameter is out of bounds of specification. +* \retval VX_ERROR_INVALID_FORMAT If the image format is not compatible. +* \ingroup group_graph +* \see vxConvertReference +* \see vxProcessGraph +*/ +VX_API_ENTRY vx_status VX_API_CALL vxVerifyGraph(vx_graph graph) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidGraph(graph)) { + CAgoLock lock(graph->cs); + CAgoLock lock2(graph->ref.context->cs); + + // mark that graph is not verified and can't be executed + graph->verified = vx_false_e; + graph->isReadyToExecute = vx_false_e; + + // check to see if user requested for graph dump + vx_uint32 ago_graph_dump = 0; + char textBuffer[256]; + if (agoGetEnvironmentVariable("AGO_DUMP_GRAPH", textBuffer, sizeof(textBuffer))) { + ago_graph_dump = atoi(textBuffer); + } + if (ago_graph_dump) { + agoWriteGraph(graph, NULL, 0, stdout, "*INPUT*"); + } + + // verify graph per OpenVX specification + status = agoVerifyGraph(graph); + if (status == VX_SUCCESS) { + graph->verified = vx_true_e; + // run graph optimizer + if (agoOptimizeGraph(graph)) { + status = VX_FAILURE; + } + // initialize graph + else if (agoInitializeGraph(graph)) { + status = VX_FAILURE; + } + // graph is ready to execute + else { + graph->isReadyToExecute = vx_true_e; + } + } + + if (ago_graph_dump) { + if (status == VX_SUCCESS) { + agoWriteGraph(graph, NULL, 0, stdout, "*FINAL*"); + } + } + } + + return status; +} + +/*! \brief This function causes the synchronous processing of a graph. If the graph +* has not been verified, then the implementation verifies the graph +* immediately. If verification fails this function returns a status +* identical to what \ref vxVerifyGraph would return. After +* the graph verfies successfully then processing occurs. If the graph was +* previously verified via \ref vxVerifyGraph or \ref vxProcessGraph +* then the graph is processed. This function blocks until the graph is completed. +* \param [in] graph The graph to execute. +* \return A \ref vx_status_e enumeration. 
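+* A minimal usage sketch (illustrative only; assumes \c graph has already been populated with nodes):
+* \code
+* if (vxVerifyGraph(graph) == VX_SUCCESS) {
+*     vx_status status = vxProcessGraph(graph); // blocks until the graph completes
+* }
+* \endcode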
+* \retval VX_SUCCESS Graph has been processed. +* \retval VX_FAILURE A catastrophic error occurred during processing. +* \retval * See \ref vxVerifyGraph. +* \pre \ref vxVerifyGraph must return \ref VX_SUCCESS before this function will pass. +* \ingroup group_graph +* \see vxVerifyGraph +*/ +VX_API_ENTRY vx_status VX_API_CALL vxProcessGraph(vx_graph graph) +{ + return agoProcessGraph(graph); +} + +/*! \brief Schedules a graph for future execution. +* \param [in] graph The graph to schedule. +* \return A \ref vx_status_e enumeration. +* \retval VX_ERROR_NO_RESOURCES The graph cannot be scheduled now. +* \retval VX_ERROR_NOT_SUFFICIENT The graph is not verified and has failed +forced verification. +* \retval VX_SUCCESS The graph has been scheduled. +* \pre \ref vxVerifyGraph must return \ref VX_SUCCESS before this function will pass. +* \ingroup group_graph +*/ +VX_API_ENTRY vx_status VX_API_CALL vxScheduleGraph(vx_graph graph) +{ + return agoScheduleGraph(graph); +} + +/*! \brief Waits for a specific graph to complete. +* \param [in] graph The graph to wait on. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS The graph has completed. +* \retval VX_FAILURE The graph has not completed yet. +* \pre \ref vxScheduleGraph +* \ingroup group_graph +*/ +VX_API_ENTRY vx_status VX_API_CALL vxWaitGraph(vx_graph graph) +{ + return agoWaitGraph(graph); +} + +/*! \brief Allows the user to query attributes of the Graph. +* \param [in] graph The reference to the created graph. +* \param [in] attribute The \ref vx_graph_attribute_e type needed. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_graph +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryGraph(vx_graph graph, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidGraph(graph)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + CAgoLock lock(graph->cs); + switch (attribute) + { + case VX_GRAPH_ATTRIBUTE_NUMNODES: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = graph->nodeList.count; + status = VX_SUCCESS; + } + break; + case VX_GRAPH_ATTRIBUTE_STATUS: + if (size == sizeof(vx_status)) { + *(vx_status *)ptr = graph->status; + status = VX_SUCCESS; + } + break; + case VX_GRAPH_ATTRIBUTE_PERFORMANCE: + if (size == sizeof(vx_perf_t)) { + agoPerfCopyNormalize(graph->ref.context, (vx_perf_t *)ptr, &graph->perf); + status = VX_SUCCESS; + } + case VX_GRAPH_ATTRIBUTE_NUMPARAMETERS: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = (vx_uint32)graph->parameters.size(); + status = VX_SUCCESS; + } + break; + case VX_GRAPH_ATTRIBUTE_AMD_OPTIMIZER_FLAGS: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = graph->optimizer_flags; + status = VX_SUCCESS; + } + break; + case VX_GRAPH_ATTRIBUTE_AMD_AFFINITY: + if (size == sizeof(AgoTargetAffinityInfo_)) { + *(AgoTargetAffinityInfo_ *)ptr = graph->attr_affinity; + status = VX_SUCCESS; + } + break; + case VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_LAST: + if (size == sizeof(AgoGraphPerfInternalInfo)) { +#if ENABLE_OPENCL + ((AgoGraphPerfInternalInfo *)ptr)->kernel_enqueue = graph->opencl_perf.kernel_enqueue; + ((AgoGraphPerfInternalInfo *)ptr)->kernel_wait = graph->opencl_perf.kernel_wait; + ((AgoGraphPerfInternalInfo *)ptr)->buffer_read = graph->opencl_perf.buffer_read; + ((AgoGraphPerfInternalInfo *)ptr)->buffer_write = graph->opencl_perf.buffer_write; +#else + memset(ptr, 
0, size); +#endif + status = VX_SUCCESS; + } + break; + case VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_AVG: + if (size == sizeof(AgoGraphPerfInternalInfo)) { +#if ENABLE_OPENCL + if (graph->perf.num > 0) { + ((AgoGraphPerfInternalInfo *)ptr)->kernel_enqueue = graph->opencl_perf_total.kernel_enqueue / graph->perf.num; + ((AgoGraphPerfInternalInfo *)ptr)->kernel_wait = graph->opencl_perf_total.kernel_wait / graph->perf.num; + ((AgoGraphPerfInternalInfo *)ptr)->buffer_read = graph->opencl_perf_total.buffer_read / graph->perf.num; + ((AgoGraphPerfInternalInfo *)ptr)->buffer_write = graph->opencl_perf_total.buffer_write / graph->perf.num; + } + else +#endif + { + memset(ptr, 0, size); + } + status = VX_SUCCESS; + } + break; + case VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_PROFILE: + if (graph->perf.num > 0) { + status = agoGraphDumpPerformanceProfile(graph, (const char *)ptr); + } + break; +#if ENABLE_OPENCL + case VX_GRAPH_ATTRIBUTE_AMD_OPENCL_COMMAND_QUEUE: + if (size == sizeof(cl_command_queue)) { + *(cl_command_queue *)ptr = graph->opencl_cmdq; + status = VX_SUCCESS; + } + break; +#endif + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Allows the set to attributes on the Graph. +* \param [in] graph The reference to the graph. +* \param [in] attribute The \ref vx_graph_attribute_e type needed. +* \param [in] ptr The location from which to read the value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_graph +*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetGraphAttribute(vx_graph graph, vx_enum attribute, const void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidGraph(graph)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + CAgoLock lock(graph->cs); + switch (attribute) + { + case VX_GRAPH_ATTRIBUTE_AMD_IMPORT_FROM_TEXT: + if (size == sizeof(AgoGraphImportInfo)) { + status = VX_SUCCESS; + AgoGraphImportInfo * info = (AgoGraphImportInfo *)ptr; + if (agoReadGraphFromString(graph, info->ref, info->num_ref, info->data_registry_callback_f, info->data_registry_callback_obj, info->text, info->dumpToConsole)) { + status = VX_FAILURE; + } + } + break; + case VX_GRAPH_ATTRIBUTE_AMD_EXPORT_TO_TEXT: + if (size == sizeof(AgoGraphExportInfo)) { + status = VX_SUCCESS; + AgoGraphExportInfo * info = (AgoGraphExportInfo *)ptr; + FILE * fp = stdout; + if (strcmp(info->fileName, "stdout") != 0) { + fp = fopen(info->fileName, "w"); + if (!fp) { + status = VX_FAILURE; + agoAddLogEntry(&graph->ref, status, "ERROR: vxSetGraphAttribute: unable to create: %s\n", info->fileName); + } + } + else if (agoWriteGraph(graph, info->ref, info->num_ref, fp, info->comment)) { + status = VX_FAILURE; + } + if (fp && fp != stdout) { + fclose(fp); + } + } + break; + case VX_GRAPH_ATTRIBUTE_AMD_OPTIMIZER_FLAGS: + if (size == sizeof(vx_uint32)) { + graph->optimizer_flags = *(vx_uint32 *)ptr; + status = VX_SUCCESS; + } + break; + case VX_GRAPH_ATTRIBUTE_AMD_AFFINITY: + if (size == sizeof(AgoTargetAffinityInfo_)) { + status = VX_SUCCESS; + graph->attr_affinity = *(AgoTargetAffinityInfo_ *)ptr; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Adds the given parameter extracted from a \ref vx_node to the graph. +* \param [in] graph The graph reference that contains the node. +* \param [in] parameter The parameter reference to add to the graph from the node. 
+* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS Parameter added to Graph. +* \retval VX_ERROR_INVALID_REFERENCE The parameter is not a valid \ref vx_parameter. +* \retval VX_ERROR_INVALID_PARAMETER The parameter is of a node not in this +* graph. +* \ingroup group_graph_parameters +*/ +VX_API_ENTRY vx_status VX_API_CALL vxAddParameterToGraph(vx_graph graph, vx_parameter parameter) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidGraph(graph) && !graph->verified) { + status = VX_ERROR_INVALID_PARAMETERS; + if (!parameter || (agoIsValidParameter(parameter) && parameter->scope->type == VX_TYPE_NODE)) { + graph->parameters.push_back(parameter); + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Sets a reference to the parameter on the graph. The implementation +* must set this parameter on the originating node as well. +* \param [in] graph The graph reference. +* \param [in] index The parameter index. +* \param [in] value The reference to set to the parameter. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS Parameter set to Graph. +* \retval VX_ERROR_INVALID_REFERENCE The value is not a valid \ref vx_reference. +* \retval VX_ERROR_INVALID_PARAMETER The parameter index is out of bounds or the +* dir parameter is incorrect. +* \ingroup group_graph_parameters +*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetGraphParameterByIndex(vx_graph graph, vx_uint32 index, vx_reference value) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidGraph(graph) && !graph->verified) { + status = VX_ERROR_INVALID_PARAMETERS; + if ((index < graph->parameters.size()) && graph->parameters[index] && (!value || agoIsValidReference(value))) { + vx_parameter parameter = graph->parameters[index]; + if (((vx_node)parameter->scope)->paramList[parameter->index]) { + agoReleaseData(((vx_node)parameter->scope)->paramList[parameter->index], false); + } + ((vx_node)parameter->scope)->paramList[parameter->index] = (AgoData *)value; + if (((vx_node)parameter->scope)->paramList[parameter->index]) { + agoRetainData(graph, ((vx_node)parameter->scope)->paramList[parameter->index], false); + } + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Retrieves a \ref vx_parameter from a \ref vx_graph. +* \param [in] graph The graph. +* \param [in] index The index of the parameter. +* \return \ref vx_parameter reference. +* \retval 0 if the index is out of bounds. +* \retval * The parameter reference. +* \ingroup group_graph_parameters +*/ +VX_API_ENTRY vx_parameter VX_API_CALL vxGetGraphParameterByIndex(vx_graph graph, vx_uint32 index) +{ + vx_parameter parameter = NULL; + if (agoIsValidGraph(graph) && (index < graph->parameters.size())) { + parameter = graph->parameters[index]; + parameter->ref.external_count++; + } + return parameter; +} + +/*! \brief Returns a Boolean to indicate the state of graph verification. +* \param [in] graph The reference to the graph to check. +* \return A \ref vx_bool value. +* \retval vx_true_e The graph is verified. +* \retval vx_false_e The graph is not verified. It must be verified before +* execution either through \ref vxVerifyGraph or automatically through +* \ref vxProcessGraph or \ref vxScheduleGraph. +* \ingroup group_graph +*/ +VX_API_ENTRY vx_bool VX_API_CALL vxIsGraphVerified(vx_graph graph) +{ + vx_bool verified = vx_false_e; + if (agoIsValidGraph(graph)) { + verified = graph->verified ? 
vx_true_e : vx_false_e; + } + return verified; +} + +/*============================================================================== +NODE +=============================================================================*/ + +/*! \brief Creates a reference to a node object for a given kernel. +* \details This node has no references assigned as parameters after completion. +* The client is then required to set these parameters manually by \ref vxSetParameterByIndex. +* When clients supply their own node creation functions (for use with User Kernels), this is the API +* to use along with the parameter setting API. +* \param [in] graph The reference to the graph in which this node exists. +* \param [in] kernel The kernel reference to associate with this new node. +* \return vx_node +* \retval 0 The node failed to create. +* \retval * A node was created. +* \ingroup group_adv_node +* \post Call \ref vxSetParameterByIndex for as many parameters as needed to be set. +*/ +VX_API_ENTRY vx_node VX_API_CALL vxCreateGenericNode(vx_graph graph, vx_kernel kernel) +{ + vx_node node = NULL; + if (agoIsValidGraph(graph) && agoIsValidKernel(kernel) && !graph->verified && kernel->finalized) { + CAgoLock lock(graph->cs); + node = agoCreateNode(graph, kernel); + node->ref.external_count++; + } + return node; +} + +/*! \brief Allows a user to query information out of a node. +* \param [in] node The reference to the node to query. +* \param [in] attribute Use \ref vx_node_attribute_e value to query for information. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS Successful +* \retval VX_ERROR_INVALID_PARAMETERS The type or size is incorrect. 
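+* \par Example
+* A minimal usage sketch (illustrative only; it assumes \c node is an existing, valid \ref vx_node):
+* \code
+* vx_perf_t perf = { 0 };
+* if (vxQueryNode(node, VX_NODE_ATTRIBUTE_PERFORMANCE, &perf, sizeof(perf)) == VX_SUCCESS) {
+*     // perf.avg holds the average execution time reported for the node
+* }
+* \endcode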
+* \ingroup group_node +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryNode(vx_node node, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidNode(node)) { + CAgoLock lock(((vx_graph)node->ref.scope)->cs); + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_NODE_ATTRIBUTE_STATUS: + if (size == sizeof(vx_status)) { + *(vx_status *)ptr = node->status; + status = VX_SUCCESS; + } + break; + case VX_NODE_ATTRIBUTE_PERFORMANCE: + if (size == sizeof(vx_perf_t)) { + vx_perf_t * perf = &node->perf; + if (node->perf.num == 0) { + // TBD: need mapping of node performance into its subsets or superset + // For now, nodes that doesn't exist in the graph will report the overall graph + // performance because the nodes might have got morphed into other nodes have + // no accountability + perf = &((AgoGraph *)node->ref.scope)->perf; + } + agoPerfCopyNormalize(node->ref.context, (vx_perf_t *)ptr, perf); + status = VX_SUCCESS; + } + break; + case VX_NODE_ATTRIBUTE_BORDER_MODE: + if (size == sizeof(vx_border_mode_t)) { + *(vx_border_mode_t *)ptr = node->attr_border_mode; + status = VX_SUCCESS; + } + break; + case VX_NODE_ATTRIBUTE_LOCAL_DATA_SIZE: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = node->localDataSize; + status = VX_SUCCESS; + } + break; + case VX_NODE_ATTRIBUTE_LOCAL_DATA_PTR: + if (size == sizeof(void *)) { + *(void **)ptr = node->localDataPtr; + status = VX_SUCCESS; + } + break; + case VX_NODE_ATTRIBUTE_AMD_AFFINITY: + if (size == sizeof(AgoTargetAffinityInfo_)) { + *(AgoTargetAffinityInfo_ *)ptr = node->attr_affinity; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Allows a user to set attribute of a node before Graph Validation. +* \param [in] node The reference to the node to set. +* \param [in] attribute Use \ref vx_node_attribute_e value to query for information. +* \param [out] ptr The output pointer to where to send the value. +* \param [in] size The size of the objects to which \a ptr points. +* \note Some attributes are inherited from the \ref vx_kernel, which was used +* to create the node. Some of these can be overridden using this API, notably +* \ref VX_NODE_ATTRIBUTE_LOCAL_DATA_SIZE and \ref VX_NODE_ATTRIBUTE_LOCAL_DATA_PTR. +* \ingroup group_node +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS The attribute was set. +* \retval VX_ERROR_INVALID_REFERENCE node is not a vx_node. +* \retval VX_ERROR_INVALID_PARAMETER size is not correct for the type needed. 
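+* \par Example
+* A minimal usage sketch (illustrative only; it assumes \c node is an existing, valid \ref vx_node):
+* \code
+* vx_border_mode_t border = { VX_BORDER_MODE_CONSTANT, 0 };
+* vx_status status = vxSetNodeAttribute(node, VX_NODE_ATTRIBUTE_BORDER_MODE, &border, sizeof(border));
+* \endcode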
+*/
+VX_API_ENTRY vx_status VX_API_CALL vxSetNodeAttribute(vx_node node, vx_enum attribute, const void *ptr, vx_size size)
+{
+ vx_status status = VX_ERROR_INVALID_REFERENCE;
+ if (agoIsValidNode(node)) {
+ CAgoLock lock(((vx_graph)node->ref.scope)->cs);
+ status = VX_ERROR_INVALID_PARAMETERS;
+ if (ptr) {
+ switch (attribute)
+ {
+ case VX_NODE_ATTRIBUTE_BORDER_MODE:
+ if (size == sizeof(vx_border_mode_t)) {
+ node->attr_border_mode = *(vx_border_mode_t *)ptr;
+ status = VX_SUCCESS;
+ }
+ break;
+ case VX_NODE_ATTRIBUTE_LOCAL_DATA_SIZE:
+ if (size == sizeof(vx_size)) {
+ node->localDataSize = *(vx_size *)ptr;
+ status = VX_SUCCESS;
+ }
+ break;
+ case VX_NODE_ATTRIBUTE_LOCAL_DATA_PTR:
+ if (size == sizeof(void *)) {
+ node->localDataPtr = *(vx_uint8 **)ptr;
+ status = VX_SUCCESS;
+ }
+ break;
+ case VX_NODE_ATTRIBUTE_AMD_AFFINITY:
+ if (size == sizeof(AgoTargetAffinityInfo_)) {
+ node->attr_affinity = *(AgoTargetAffinityInfo_ *)ptr;
+ status = VX_SUCCESS;
+ }
+ break;
+ default:
+ status = VX_ERROR_NOT_SUPPORTED;
+ break;
+ }
+ }
+ }
+ return status;
+}
+
+/*! \brief Releases a reference to a Node object.
+* The object may not be garbage collected until its total reference count is zero.
+* \param [in] node The pointer to the reference of the node to release.
+* \ingroup group_node
+* \post After returning from this function the reference is zeroed.
+* \return A \ref vx_status_e enumeration.
+* \retval VX_SUCCESS No errors.
+* \retval VX_ERROR_INVALID_REFERENCE If node is not a \ref vx_node.
+*/
+VX_API_ENTRY vx_status VX_API_CALL vxReleaseNode(vx_node *node)
+{
+ vx_status status = VX_ERROR_INVALID_REFERENCE;
+ if (node && agoIsValidNode(*node)) {
+ if (!agoReleaseNode(*node)) {
+ *node = NULL;
+ status = VX_SUCCESS;
+ }
+ }
+ return status;
+}
+
+/*! \brief Removes a Node from its parent Graph and releases it.
+* \param [in] node The pointer to the node to remove and release.
+* \return A \ref vx_status_e enumeration.
+* \ingroup group_node
+* \post After returning from this function the reference is zeroed.
+*/
+VX_API_ENTRY vx_status VX_API_CALL vxRemoveNode(vx_node *node)
+{
+ vx_status status = VX_ERROR_INVALID_REFERENCE;
+ if (node && agoIsValidNode(*node)) {
+ vx_node anode = *node;
+ vx_graph graph = (vx_graph)anode->ref.scope;
+ CAgoLock lock(graph->cs);
+ if (!graph->verified && anode->ref.external_count == 1) {
+ // only remove nodes that were created externally
+ if (agoRemoveNode(&graph->nodeList, anode, true)) {
+ status = VX_FAILURE;
+ agoAddLogEntry(&anode->ref, status, "ERROR: vxRemoveNode: failed for %s\n", anode->akernel->name);
+ }
+ else {
+ *node = NULL;
+ status = VX_SUCCESS;
+ }
+ }
+ }
+ return status;
+}
+
+/*! \brief Assigns a callback to a node.
+* If a callback already exists in this node, this function must return an error
+* and the user may clear the callback by passing a NULL pointer as the callback.
+* \param [in] node The reference to the node.
+* \param [in] callback The callback to associate with completion of this
+* specific node.
+* \warning This must be used with extreme caution as it can \e ruin
+* optimizations in the power/performance efficiency of a graph.
+* \return A \ref vx_status_e enumeration.
+* \retval VX_SUCCESS Callback assigned.
+* \retval VX_ERROR_INVALID_REFERENCE The value passed as node was not a \ref vx_node. 
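+* \par Example
+* A minimal usage sketch (illustrative only; \c my_node_done is a placeholder name and \c node an existing, valid \ref vx_node):
+* \code
+* static vx_action VX_CALLBACK my_node_done(vx_node node)
+* {
+*     return VX_ACTION_CONTINUE; // keep executing the remaining nodes
+* }
+* // ...
+* vxAssignNodeCallback(node, my_node_done);   // install the completion callback
+* vxAssignNodeCallback(node, NULL);           // clear a previously assigned callback
+* \endcode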
+* \ingroup group_node_callback +*/ +VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeCallback(vx_node node, vx_nodecomplete_f callback) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidNode(node)) { + node->callback = callback; + status = VX_SUCCESS; + } + return status; +} + +/*! \brief Retrieves the current node callback function pointer set on the node. +* \param [in] node The reference to the \ref vx_node object. +* \ingroup group_node_callback +* \return vx_nodecomplete_f The pointer to the callback function. +* \retval NULL No callback is set. +* \retval * The node callback function. +*/ +VX_API_ENTRY vx_nodecomplete_f VX_API_CALL vxRetrieveNodeCallback(vx_node node) +{ + vx_nodecomplete_f callback = NULL; + if (agoIsValidNode(node)) { + callback = node->callback; + } + return callback; +} + +/*============================================================================== +PARAMETER +=============================================================================*/ + +/*! \brief Retrieves a \ref vx_parameter from a \ref vx_node. +* \param [in] node The node from which to extract the parameter. +* \param [in] index The index of the parameter to which to get a reference. +* \return \ref vx_parameter +* \ingroup group_parameter +*/ +VX_API_ENTRY vx_parameter VX_API_CALL vxGetParameterByIndex(vx_node node, vx_uint32 index) +{ + vx_parameter parameter = NULL; + if (agoIsValidNode(node) && (index < node->paramCount)) { + parameter = &node->parameters[index]; + parameter->ref.external_count++; + } + return parameter; +} + +/*! \brief Releases a reference to a parameter object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] param The pointer to the parameter to release. +* \ingroup group_parameter +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseParameter(vx_parameter *param) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (param && agoIsValidParameter(*param)) { + if ((*param)->ref.external_count > 0) { + (*param)->ref.external_count--; + *param = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Sets the specified parameter data for a kernel on the node. +* \param [in] node The node that contains the kernel. +* \param [in] index The index of the parameter desired. +* \param [in] value The reference to the parameter. +* \return A \ref vx_status_e enumeration. +* \ingroup group_parameter +* \see vxSetParameterByReference +*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetParameterByIndex(vx_node node, vx_uint32 index, vx_reference value) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidNode(node)) { + status = VX_ERROR_INVALID_PARAMETERS; + vx_graph graph = (AgoGraph *)node->ref.scope; + if (!graph->verified && (index < node->paramCount) && (!node->parameters[index].type || node->parameters[index].type == value->type)) { + if (node->paramList[index]) { + agoReleaseData(node->paramList[index], false); + } + node->paramList[index] = (AgoData *)value; + if (node->paramList[index]) { + agoRetainData((AgoGraph *)node->ref.scope, node->paramList[index], false); + } + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Associates a parameter reference and a data reference with a kernel +* on a node. +* \param [in] parameter The reference to the kernel parameter. 
+* \param [in] value The value to associate with the kernel parameter. +* \return A \ref vx_status_e enumeration. +* \ingroup group_parameter +* \see vxGetParameterByIndex +*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetParameterByReference(vx_parameter parameter, vx_reference value) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidParameter(parameter) && parameter->scope->type == VX_TYPE_NODE && parameter->ref.external_count > 0) { + status = VX_ERROR_INVALID_PARAMETERS; + vx_node node = (vx_node)parameter->scope; + vx_uint32 index = parameter->index; + if ((index < node->paramCount) && (!node->parameters[index].type || node->parameters[index].type == value->type)) { + if (node->paramList[index]) { + agoReleaseData(node->paramList[index], false); + } + node->paramList[index] = (AgoData *)value; + if (node->paramList[index]) { + agoRetainData((AgoGraph *)node->ref.scope, node->paramList[index], false); + } + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Allows the client to query a parameter to determine its meta-information. +* \param [in] param The reference to the parameter. +* \param [in] attribute The attribute to query. Use a \ref vx_parameter_attribute_e. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_parameter +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryParameter(vx_parameter param, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidParameter(param)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_PARAMETER_ATTRIBUTE_DIRECTION: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = param->direction; + status = VX_SUCCESS; + } + break; + case VX_PARAMETER_ATTRIBUTE_INDEX: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = param->index; + status = VX_SUCCESS; + } + break; + case VX_PARAMETER_ATTRIBUTE_TYPE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = param->type; + status = VX_SUCCESS; + } + break; + case VX_PARAMETER_ATTRIBUTE_STATE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = param->state; + status = VX_SUCCESS; + } + break; + case VX_PARAMETER_ATTRIBUTE_REF: + if (size == sizeof(vx_reference)) { + vx_node node = (vx_node)param->scope; + if (agoIsValidNode(node)) { + if (param->index < node->paramCount) { + vx_reference ref = (vx_reference)node->paramList[param->index]; + *(vx_reference *)ptr = ref; + // TBD: handle optimized buffers and kernels + if (ref) { + ref->external_count++; + } + status = VX_SUCCESS; + } + } + else { + status = VX_ERROR_NOT_SUPPORTED; + } + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*============================================================================== +SCALAR +=============================================================================*/ + +/*! \brief Creates a reference to a scalar object. Also see \ref sub_node_parameters. +* \param [in] context The reference to the system context. +* \param [in] data_type The \ref vx_type_e of the scalar. Must be greater than +* \ref VX_TYPE_INVALID and less than \ref VX_TYPE_SCALAR_MAX. +* \param [in] ptr The pointer to the initial value of the scalar. +* \ingroup group_scalar +* \return A \ref vx_scalar reference. +* \retval 0 The scalar could not be created. +* \retval * The scalar was created. 
Check for further errors with \ref vxGetStatus.
+*/
+VX_API_ENTRY vx_scalar VX_API_CALL vxCreateScalar(vx_context context, vx_enum data_type, const void *ptr)
+{
+ AgoData * data = NULL;
+ if (agoIsValidContext(context)) {
+ CAgoLock lock(context->cs);
+ data = agoCreateDataFromDescription(context, NULL, "scalar:UINT32,0", true);
+ if (data) {
+ agoAddData(&context->dataList, data);
+ data->u.scalar.type = data_type;
+ switch (data_type) {
+ case VX_TYPE_ENUM:
+ data->u.scalar.itemsize = sizeof(vx_enum);
+ if (ptr) data->u.scalar.u.e = *(vx_enum *)ptr;
+ break;
+ case VX_TYPE_UINT32:
+ data->u.scalar.itemsize = sizeof(vx_uint32);
+ if (ptr) data->u.scalar.u.u = *(vx_uint32 *)ptr;
+ break;
+ case VX_TYPE_INT32:
+ data->u.scalar.itemsize = sizeof(vx_int32);
+ if (ptr) data->u.scalar.u.i = *(vx_int32 *)ptr;
+ break;
+ case VX_TYPE_UINT16:
+ data->u.scalar.itemsize = sizeof(vx_uint16);
+ if (ptr) data->u.scalar.u.u = *(vx_uint16 *)ptr;
+ break;
+ case VX_TYPE_INT16:
+ data->u.scalar.itemsize = sizeof(vx_int16);
+ if (ptr) data->u.scalar.u.i = *(vx_int16 *)ptr;
+ break;
+ case VX_TYPE_UINT8:
+ data->u.scalar.itemsize = sizeof(vx_uint8);
+ if (ptr) data->u.scalar.u.u = *(vx_uint8 *)ptr;
+ break;
+ case VX_TYPE_INT8:
+ data->u.scalar.itemsize = sizeof(vx_int8);
+ if (ptr) data->u.scalar.u.i = *(vx_int8 *)ptr;
+ break;
+ case VX_TYPE_CHAR:
+ data->u.scalar.itemsize = sizeof(vx_char);
+ if (ptr) data->u.scalar.u.i = *(vx_char *)ptr;
+ break;
+ case VX_TYPE_FLOAT32:
+ data->u.scalar.itemsize = sizeof(vx_float32);
+ if (ptr) data->u.scalar.u.f = *(vx_float32 *)ptr;
+ break;
+ case VX_TYPE_SIZE:
+ data->u.scalar.itemsize = sizeof(vx_size);
+ if (ptr) data->u.scalar.u.s = *(vx_size *)ptr;
+ break;
+ case VX_TYPE_BOOL:
+ data->u.scalar.itemsize = sizeof(vx_bool);
+ if (ptr) data->u.scalar.u.u = *(vx_bool *)ptr;
+ break;
+ case VX_TYPE_DF_IMAGE:
+ data->u.scalar.itemsize = sizeof(vx_df_image);
+ if (ptr) data->u.scalar.u.df = *(vx_df_image *)ptr;
+ break;
+ case VX_TYPE_FLOAT64:
+ data->u.scalar.itemsize = sizeof(vx_float64);
+ if (ptr) data->u.scalar.u.f64 = *(vx_float64 *)ptr;
+ break;
+ case VX_TYPE_INT64:
+ data->u.scalar.itemsize = sizeof(vx_int64);
+ if (ptr) data->u.scalar.u.i64 = *(vx_int64 *)ptr;
+ break;
+ case VX_TYPE_UINT64:
+ data->u.scalar.itemsize = sizeof(vx_uint64);
+ if (ptr) data->u.scalar.u.u64 = *(vx_uint64 *)ptr;
+ break;
+ case VX_TYPE_STRING_AMD: {
+ data->u.scalar.itemsize = sizeof(char *);
+ data->size = VX_MAX_STRING_BUFFER_SIZE_AMD;
+ data->buffer_allocated = data->buffer = (vx_uint8 *)agoAllocMemory(data->size);
+ if (data->buffer_allocated) {
+ data->buffer[0] = 0;
+ if (ptr) {
+ strncpy((char *)data->buffer, (const char *)ptr, VX_MAX_STRING_BUFFER_SIZE_AMD);
+ data->buffer[VX_MAX_STRING_BUFFER_SIZE_AMD - 1] = 0; // NUL terminate string in case of overflow
+ }
+ data->isInitialized = vx_true_e;
+ }
+ else {
+ agoReleaseData(data, true);
+ data = NULL;
+ }
+ }
+ break;
+ default:
+ agoReleaseData(data, true);
+ data = NULL;
+ break;
+ }
+ }
+ }
+ return (vx_scalar)data;
+}
+
+/*! \brief Releases a reference to a scalar object.
+* The object may not be garbage collected until its total reference count is zero.
+* \param [in] scalar The pointer to the scalar to release.
+* \ingroup group_scalar
+* \post After returning from this function the reference is zeroed.
+* \return A \ref vx_status_e enumeration.
+* \retval VX_SUCCESS No errors.
+* \retval VX_ERROR_INVALID_REFERENCE If scalar is not a \ref vx_scalar. 
+*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseScalar(vx_scalar *scalar) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (scalar && agoIsValidData((AgoData *)*scalar, VX_TYPE_SCALAR)) { + if (!agoReleaseData((AgoData *)*scalar, true)) { + *scalar = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Queries attributes from a scalar. +* \param [in] scalar The scalar object. +* \param [in] attribute The enumeration to query. Use a \ref vx_scalar_attribute_e enumeration. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_scalar +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryScalar(vx_scalar scalar, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)scalar; + if (agoIsValidData(data, VX_TYPE_SCALAR)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_SCALAR_ATTRIBUTE_TYPE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = data->u.scalar.type; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Gets the scalar value out of a reference. +* \note Use this in conjunction with Query APIs that return references which +* should be converted into values. +* \ingroup group_scalar +* \param [in] ref The reference from which to get the scalar value. +* \param [out] ptr An appropriate typed pointer that points to a location to which to copy +* the scalar value. +* \return A \ref vx_status_e enumeration. +* \retval VX_ERROR_INVALID_REFERENCE If the ref is not a valid +* reference. +* \retval VX_ERROR_INVALID_PARAMETERS If \a ptr is NULL. +* \retval VX_ERROR_INVALID_TYPE If the type does not match the type in the reference or is a bad value. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReadScalarValue(vx_scalar ref, void *ptr) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)ref; + if (agoIsValidData(data, VX_TYPE_SCALAR)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + status = VX_SUCCESS; + switch (data->u.scalar.type) + { + case VX_TYPE_ENUM: + *(vx_enum *)ptr = data->u.scalar.u.e; + break; + case VX_TYPE_UINT32: + *(vx_uint32 *)ptr = data->u.scalar.u.u; + break; + case VX_TYPE_INT32: + *(vx_int32 *)ptr = data->u.scalar.u.i; + break; + case VX_TYPE_UINT16: + *(vx_uint16 *)ptr = data->u.scalar.u.u; + break; + case VX_TYPE_INT16: + *(vx_int16 *)ptr = data->u.scalar.u.i; + break; + case VX_TYPE_UINT8: + *(vx_uint8 *)ptr = data->u.scalar.u.u; + break; + case VX_TYPE_INT8: + *(vx_int8 *)ptr = data->u.scalar.u.i; + break; + case VX_TYPE_CHAR: + *(vx_char *)ptr = data->u.scalar.u.i; + break; + case VX_TYPE_FLOAT32: + *(vx_float32 *)ptr = data->u.scalar.u.f; + break; + case VX_TYPE_SIZE: + *(vx_size *)ptr = data->u.scalar.u.s; + break; + case VX_TYPE_BOOL: + *(vx_bool *)ptr = data->u.scalar.u.u ? 
vx_true_e : vx_false_e; + break; + case VX_TYPE_DF_IMAGE: + *(vx_df_image *)ptr = data->u.scalar.u.df; + break; + case VX_TYPE_FLOAT64: + *(vx_float64 *)ptr = data->u.scalar.u.f64; + break; + case VX_TYPE_UINT64: + *(vx_uint64 *)ptr = data->u.scalar.u.u64; + break; + case VX_TYPE_INT64: + *(vx_int64 *)ptr = data->u.scalar.u.i64; + break; + case VX_TYPE_STRING_AMD: + strcpy((char *)ptr, (const char *)data->buffer); + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Sets the scalar value in a reference. +* \note Use this in conjunction with Parameter APIs that return references +* to parameters that need to be altered. +* \ingroup group_scalar +* \param [in] ref The reference from which to get the scalar value. +* \param [in] ptr An appropriately typed pointer that points to a location to which to copy +* the scalar value. +* \return A \ref vx_status_e enumeration. +* \retval VX_ERROR_INVALID_REFERENCE If the ref is not a valid +* reference. +* \retval VX_ERROR_INVALID_PARAMETERS If \a ptr is NULL. +* \retval VX_ERROR_INVALID_TYPE If the type does not match the type in the reference or is a bad value. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxWriteScalarValue(vx_scalar ref, const void *ptr) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)ref; + if (agoIsValidData(data, VX_TYPE_SCALAR) && !data->isVirtual) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + // TBD: need sem-lock for thread safety + status = VX_SUCCESS; + switch (data->u.scalar.type) + { + case VX_TYPE_ENUM: + data->u.scalar.u.e = *(vx_enum *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_UINT32: + data->u.scalar.u.u = *(vx_uint32 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_INT32: + data->u.scalar.u.i = *(vx_int32 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_UINT16: + data->u.scalar.u.u = *(vx_uint16 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_INT16: + data->u.scalar.u.i = *(vx_int16 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_UINT8: + data->u.scalar.u.u = *(vx_uint8 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_INT8: + data->u.scalar.u.i = *(vx_int8 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_CHAR: + data->u.scalar.u.i = *(vx_char *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_FLOAT32: + data->u.scalar.u.f = *(vx_float32 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_SIZE: + data->u.scalar.u.s = *(vx_size *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_BOOL: + data->u.scalar.u.u = *(vx_bool *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_DF_IMAGE: + data->u.scalar.u.df = *(vx_df_image *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_FLOAT64: + data->u.scalar.u.f64 = *(vx_float64 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_UINT64: + data->u.scalar.u.u64 = *(vx_uint64 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_INT64: + data->u.scalar.u.i64 = *(vx_int64 *)ptr; + data->isInitialized = vx_true_e; + break; + case VX_TYPE_STRING_AMD: + strncpy((char *)data->buffer, (const char *)ptr, VX_MAX_STRING_BUFFER_SIZE_AMD); + data->buffer[VX_MAX_STRING_BUFFER_SIZE_AMD - 1] = 0; // NUL terminate string in case of overflow + data->isInitialized = vx_true_e; + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + 
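+/* Illustrative usage sketch for the scalar APIs above; context is assumed to be an
+* existing, valid vx_context and the variable names are placeholders:
+*
+*     vx_float32 init = 0.5f, value = 0.0f;
+*     vx_scalar scalar = vxCreateScalar(context, VX_TYPE_FLOAT32, &init);
+*     if (vxGetStatus((vx_reference)scalar) == VX_SUCCESS) {
+*         init = 0.75f;
+*         vxWriteScalarValue(scalar, &init);  // update the scalar value
+*         vxReadScalarValue(scalar, &value);  // value is now 0.75f
+*         vxReleaseScalar(&scalar);
+*     }
+*/
+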
+/*============================================================================== +REFERENCE +=============================================================================*/ + +/*! \brief Queries any reference type for some basic information (count, type). +* \param [in] ref The reference to query. +* \param [in] attribute The value for which to query. Use \ref vx_reference_attribute_e. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_reference +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryReference(vx_reference ref, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidReference(ref)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_REF_ATTRIBUTE_COUNT: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = ref->external_count; + status = VX_SUCCESS; + } + break; + case VX_REF_ATTRIBUTE_TYPE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = ref->type; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*============================================================================== +DELAY +=============================================================================*/ + +/*! \brief Queries a \ref vx_delay object attribute. +* \param [in] delay The coordinates object to set. +* \param [in] attribute The attribute to query. Use a \ref vx_delay_attribute_e enumeration. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_delay +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryDelay(vx_delay delay, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)delay; + if (agoIsValidData(data, VX_TYPE_DELAY)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_DELAY_ATTRIBUTE_TYPE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = data->u.delay.type; + status = VX_SUCCESS; + } + break; + case VX_DELAY_ATTRIBUTE_SLOTS: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.delay.count; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Releases a reference to a delay object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] delay The pointer to the delay to release. +* \post After returning from this function the reference is zeroed. +* \ingroup group_delay +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseDelay(vx_delay *delay) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (delay && agoIsValidData((AgoData*)*delay, VX_TYPE_DELAY)) { + if (!agoReleaseData((AgoData*)*delay, true)) { + *delay = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Creates a Delay object. +* \details This function uses only the metadata from the exemplar, ignoring the object +* data. It does not alter the exemplar or keep or release the reference to the +* exemplar. +* \param [in] context The reference to the system context. 
+* \param [in] exemplar The exemplar object. +* \param [in] count The number of reference in the delay. +* \return \ref vx_delay +* \ingroup group_delay +*/ +VX_API_ENTRY vx_delay VX_API_CALL vxCreateDelay(vx_context context, + vx_reference exemplar, + vx_size slots) +{ + AgoData * data = NULL; + if (agoIsValidContext(context) && agoIsValidReference(exemplar) && slots > 0) { + CAgoLock lock(context->cs); + char desc_exemplar[512]; agoGetDescriptionFromData(context, desc_exemplar, (AgoData *)exemplar); + char desc[512]; sprintf(desc, "delay:" VX_FMT_SIZE ",[%s]", slots, desc_exemplar); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "delay", data->name); + agoAddData(&context->dataList, data); + // add the children too + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&context->dataList, data->children[i]); + for (vx_uint32 j = 0; j < data->children[i]->numChildren; j++) { + if (data->children[i]->children[j]) { + agoAddData(&context->dataList, data->children[i]->children[j]); + } + } + } + } + } + return (vx_delay)data; +} + +/*! \brief Retrieves a reference from a delay object. +* \param [in] delay The reference to the delay object. +* \param [in] index An index into the delay from which to extract the +* reference. +* \return \ref vx_reference +* \note The delay index is in the range \f$ [-count+1,0] \f$. 0 is always the +* \e current object. +* \ingroup group_delay +* \note A reference from a delay object must not be given to its associated +* release API (e.g. \ref vxReleaseImage). Use the \ref vxReleaseDelay only. +*/ +VX_API_ENTRY vx_reference VX_API_CALL vxGetReferenceFromDelay(vx_delay delay, vx_int32 index) +{ + AgoData * data = (AgoData *)delay; + AgoData * item = NULL; + if (agoIsValidData(data, VX_TYPE_DELAY)) { + // convert the index from 0..-(N-1) to 0..N-1 + vx_uint32 index_inverted = (vx_uint32)-index; + if (index_inverted < data->u.delay.count) { + item = data->children[index_inverted]; + } + } + return (vx_reference)item; +} + +/*! \brief Ages the internal delay ring by one. This means that once this API is +* called the reference from index 0 will go to index -1 and so forth until +* \f$ -count+1 \f$ is reached. This last object will become 0. Once the delay has +* been aged, it updates the reference in any associated nodes. +* \param [in] delay +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS Delay was aged. +* \retval VX_ERROR_INVALID_REFERENCE The value passed as delay was not a \ref vx_delay. +* \ingroup group_delay +*/ +VX_API_ENTRY vx_status VX_API_CALL vxAgeDelay(vx_delay delay) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)delay; + if (agoIsValidData(data, VX_TYPE_DELAY)) { + status = VX_SUCCESS; + // cycle through all the pointers by swapping + CAgoLock lock(data->ref.context->cs); + AgoData * childLast = data->children[data->u.delay.count - 1]; + for (vx_int32 i = (vx_int32)data->u.delay.count-1; i > 0; i--) { + data->children[i] = data->children[i-1]; + } + data->children[0] = childLast; + } + return status; +} + + +/*============================================================================== +LOGGING +=============================================================================*/ + +/*! \brief Adds a line to the log. +* \param [in] ref The reference to add the log entry against. Some valid value must be provided. +* \param [in] status The status code. \ref VX_SUCCESS status entries are ignored and not added. 
+* \param [in] message The human readable message to add to the log. +* \param [in] ... a list of variable arguments to the message. +* \note Messages may not exceed \ref VX_MAX_LOG_MESSAGE_LEN bytes and will be truncated in the log if they exceed this limit. +* \ingroup group_log +*/ +VX_API_ENTRY void VX_API_CALL vxAddLogEntry(vx_reference ref, vx_status status, const char *message, ...) +{ + va_list ap; + if (agoIsValidReference(ref) && ref->enable_logging && ref->context->callback_log) { + vx_char string[VX_MAX_LOG_MESSAGE_LEN]; + va_start(ap, message); + vsnprintf(string, VX_MAX_LOG_MESSAGE_LEN, message, ap); + string[VX_MAX_LOG_MESSAGE_LEN - 1] = 0; // for MSVC which is not C99 compliant + va_end(ap); + if (!ref->context->callback_reentrant) { + CAgoLock lock(ref->context->cs); // TBD: create a separate lock object for log_callback + ref->context->callback_log(ref->context, ref, status, string); + } + else { + ref->context->callback_log(ref->context, ref, status, string); + } + } +} + +/*! \brief Registers a callback facility to the OpenVX implementation to receive error logs. +* \param [in] context The overall context to OpenVX. +* \param [in] callback The callback function. If NULL, the previous callback is removed. +* \param [in] reentrant If reentrancy flag is \ref vx_true_e, then the callback may be entered from multiple +* simultaneous tasks or threads (if the host OS supports this). +* \ingroup group_log +*/ +VX_API_ENTRY void VX_API_CALL vxRegisterLogCallback(vx_context context, vx_log_callback_f callback, vx_bool reentrant) +{ + if (agoIsValidContext(context)) { + context->callback_log = callback; + context->callback_reentrant = reentrant; + } +} + +/*============================================================================== +LUT +=============================================================================*/ + +/*! \brief Creates LUT object of a given type. +* \param [in] context The reference to the context. +* \param [in] data_type The type of data stored in the LUT. +* \param [in] count The number of entries desired. +* \if OPENVX_STRICT_1_0 +* \note For OpenVX 1.0, count must be equal to 256 and data_type can only be \ref VX_TYPE_UINT8. +* \endif +* \return \ref vx_lut +* \ingroup group_lut +*/ +VX_API_ENTRY vx_lut VX_API_CALL vxCreateLUT(vx_context context, vx_enum data_type, vx_size count) +{ + AgoData * data = NULL; + if (agoIsValidContext(context) && data_type == VX_TYPE_UINT8 && count == 256) { + CAgoLock lock(context->cs); + char desc[512]; sprintf(desc, "lut:%s," VX_FMT_SIZE "", agoEnum2Name(data_type), count); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "lut", data->name); + agoAddData(&context->dataList, data); + } + } + return (vx_lut)data; +} + +/*! \brief Releases a reference to a LUT object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] lut The pointer to the LUT to release. +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +* \ingroup group_lut +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseLUT(vx_lut *lut) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (lut && agoIsValidData((AgoData*)*lut, VX_TYPE_LUT)) { + if (!agoReleaseData((AgoData*)*lut, true)) { + *lut = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! 
\brief Queries attributes from a LUT. +* \param [in] lut The LUT to query. +* \param [in] attribute The attribute to query. Use a \ref vx_lut_attribute_e enumeration. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_lut +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryLUT(vx_lut lut, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)lut; + if (agoIsValidData(data, VX_TYPE_LUT)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_LUT_ATTRIBUTE_TYPE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = data->u.lut.type; + status = VX_SUCCESS; + } + break; + case VX_LUT_ATTRIBUTE_COUNT: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.lut.count; + status = VX_SUCCESS; + } + break; + case VX_LUT_ATTRIBUTE_SIZE: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->size; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Gets direct access to the LUT table data. +* \details There are several variations of call methodology: +* \arg If \a ptr is NULL (which means the current data of the LUT is not desired), +* the LUT reference count is incremented. +* \arg If \a ptr is not NULL but (*ptr) is NULL, (*ptr) will contain the address of the LUT data when the function returns and +* the reference count will be incremented. Whether the (*ptr) address is mapped +* or allocated is undefined. (*ptr) must be returned to \ref vxCommitLUT. +* \arg If \a ptr is not NULL and (*ptr) is not NULL, the user is signalling the implementation to copy the LUT data into the location specified +* by (*ptr). Users must use \ref vxQueryLUT with \ref VX_LUT_ATTRIBUTE_SIZE to +* determine how much memory to allocate for the LUT data. +* +* In any case, \ref vxCommitLUT must be called after LUT access is complete. +* \param [in] lut The LUT from which to get the data. +* \param [in,out] ptr The address of the location to store the pointer to the LUT memory. +* \param [in] usage This declares the intended usage of the pointer using the * \ref vx_accessor_e enumeration. +* \return A \ref vx_status_e enumeration. +* \post \ref vxCommitLUT +* \ingroup group_lut +*/ +VX_API_ENTRY vx_status VX_API_CALL vxAccessLUT(vx_lut lut, void **ptr, vx_enum usage) +{ + AgoData * data = (AgoData *)lut; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_LUT)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if (ptr) { + if (!data->buffer) { + CAgoLock lock(data->ref.context->cs); + if (agoAllocData(data)) { + return VX_FAILURE; + } + } + vx_uint8 * ptr_internal = data->buffer; + vx_uint8 * ptr_returned = *ptr ? (vx_uint8 *)*ptr : ptr_internal; + // save the pointer and usage for use in vxCommitXXX + status = VX_SUCCESS; + for (auto i = data->mapped.begin(); i != data->mapped.end(); i++) { + if (i->ptr == ptr_returned) { + // can't support vxAccessXXX() more than once with same pointer, the application + // needs to call vxCommitXXX() before calling vxAccessXXX() + status = VX_FAILURE; + } + } + if (status == VX_SUCCESS) { + MappedData item = { ptr_returned, usage, (ptr_returned != ptr_internal) ? 
true : false }; + data->mapped.push_back(item); + *ptr = ptr_returned; + if (usage == VX_READ_ONLY || usage == VX_READ_AND_WRITE) { +#if ENABLE_OPENCL + if (data->opencl_buffer && !(data->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) { + // make sure dirty OpenCL buffers are synched before giving access for read + if (data->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL)) { + size_t origin[3] = { 0, 0, 0 }; + size_t region[3] = { 256, 1, 1 }; + cl_int err = clEnqueueReadImage(data->ref.context->opencl_cmdq, data->opencl_buffer, CL_TRUE, origin, region, 256, 0, data->buffer, 0, NULL, NULL); + if (err) { + status = VX_FAILURE; + agoAddLogEntry(&data->ref, status, "ERROR: vxAccessLUT: clEnqueueWriteImage() => %d\n", err); + return status; + } + data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + } + } +#endif + if (item.used_external_ptr) { + // copy if read is requested with explicit external buffer + HafCpu_BinaryCopy_U8_U8(data->size, ptr_returned, ptr_internal); + } + } + } + } + } + return status; +} + +/*! \brief Commits the Lookup Table. +* \details Commits the data back to the LUT object and decrements the reference count. +* There are several variations of call methodology: +* \arg If a user should allocated their own memory for the LUT data copy, the user is +* obligated to free this memory. +* \arg If \a ptr is not NULL and the (*ptr) for \ref vxAccessLUT was NULL, +* it is undefined whether the implementation will unmap or copy and free the memory. +* \param [in] lut The LUT to modify. +* \param [in] ptr The pointer used with \ref vxAccessLUT. This cannot be NULL. +* \return A \ref vx_status_e enumeration. +* \pre \ref vxAccessLUT. +* \ingroup group_lut +*/ +VX_API_ENTRY vx_status VX_API_CALL vxCommitLUT(vx_lut lut, const void *ptr) +{ + AgoData * data = (AgoData *)lut; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_LUT)) { + // check for valid arguments + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if (ptr) { + status = VX_SUCCESS; + if (!data->buffer) { + status = VX_FAILURE; + } + else if (!data->mapped.empty()) { + vx_enum usage = VX_READ_ONLY; + bool used_external_ptr = false; + for (auto i = data->mapped.begin(); i != data->mapped.end(); i++) { + if (i->ptr == ptr) { + usage = i->usage; + used_external_ptr = i->used_external_ptr; + data->mapped.erase(i); + break; + } + } + if (usage == VX_WRITE_ONLY || usage == VX_READ_AND_WRITE) { + if (used_external_ptr) { + // copy from external buffer + HafCpu_BinaryCopy_U8_U8(data->size, data->buffer, (vx_uint8 *)ptr); + } + // update sync flags + data->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK; + data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT; + } + } + } + } + return status; +} + +/*============================================================================== +DISTRIBUTION +=============================================================================*/ + +/*! \brief Creates a reference to a 1D Distribution with a start offset, valid range, and number of equally weighted bins. +* \param [in] context The reference to the overall context. +* \param [in] numBins The number of bins in the distribution. +* \param [in] offset The offset into the range value. +* \param [in] range The total range of the values. 
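+* \par Example
+* A minimal creation sketch (illustrative only; \c context is assumed to be an existing, valid \ref vx_context):
+* \code
+* // 16 equally sized bins covering pixel values [0, 255]
+* vx_distribution dist = vxCreateDistribution(context, 16, 0, 256);
+* \endcode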
+* \return \ref vx_distribution +* \ingroup group_distribution +*/ +VX_API_ENTRY vx_distribution VX_API_CALL vxCreateDistribution(vx_context context, vx_size numBins, vx_int32 offset, vx_uint32 range) +{ + AgoData * data = NULL; + if (agoIsValidContext(context) && numBins > 0 && range > 0) { + CAgoLock lock(context->cs); + char desc[512]; sprintf(desc, "distribution:" VX_FMT_SIZE ",%d,%u", numBins, offset, range); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "dist", data->name); + agoAddData(&context->dataList, data); + } + } + return (vx_distribution)data; +} + +/*! \brief Releases a reference to a distribution object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] distribution The reference to the distribution to release. +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +* \ingroup group_distribution +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseDistribution(vx_distribution *distribution) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (distribution && agoIsValidData((AgoData*)*distribution, VX_TYPE_DISTRIBUTION)) { + if (!agoReleaseData((AgoData*)*distribution, true)) { + *distribution = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Queries a Distribution object. +* \param [in] distribution The reference to the distribution to query. +* \param [in] attribute The attribute to query. Use a \ref vx_distribution_attribute_e enumeration. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_distribution +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryDistribution(vx_distribution distribution, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)distribution; + if (agoIsValidData(data, VX_TYPE_DISTRIBUTION)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_DISTRIBUTION_ATTRIBUTE_DIMENSIONS: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = 1; + status = VX_SUCCESS; + } + break; + case VX_DISTRIBUTION_ATTRIBUTE_OFFSET: + if (size == sizeof(vx_int32)) { + *(vx_int32 *)ptr = data->u.dist.offset; + status = VX_SUCCESS; + } + break; + case VX_DISTRIBUTION_ATTRIBUTE_RANGE: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = data->u.dist.range; + status = VX_SUCCESS; + } + break; + case VX_DISTRIBUTION_ATTRIBUTE_BINS: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.dist.numbins; + status = VX_SUCCESS; + } + break; + case VX_DISTRIBUTION_ATTRIBUTE_WINDOW: + if (size == sizeof(vx_uint32)) { + vx_uint32 window = (data->u.dist.window * data->u.dist.numbins == data->u.dist.range) ? data->u.dist.window : 0; + *(vx_uint32 *)ptr = window; + status = VX_SUCCESS; + } + break; + case VX_DISTRIBUTION_ATTRIBUTE_SIZE: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->size; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Gets direct access to a Distribution in memory. +* \param [in] distribution The reference to the distribution to access. 
+* \param [out] ptr The address of the location to store the pointer to the +* Distribution memory. +* \arg If (*ptr) is not NULL, the Distribution will be copied to that address. +* \arg If (*ptr) is NULL, the pointer will be allocated, mapped, or use internal memory. +* +* In any case, \ref vxCommitDistribution must be called with (*ptr). +* \param [in] usage The \ref vx_accessor_e value to describe the access of the object. +* \return A \ref vx_status_e enumeration. +* \post \ref vxCommitDistribution +* \ingroup group_distribution +*/ +VX_API_ENTRY vx_status VX_API_CALL vxAccessDistribution(vx_distribution distribution, void **ptr, vx_enum usage) +{ + AgoData * data = (AgoData *)distribution; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_DISTRIBUTION)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if (ptr) { + if (!data->buffer) { + CAgoLock lock(data->ref.context->cs); + if (agoAllocData(data)) { + return VX_FAILURE; + } + } + vx_uint8 * ptr_internal = data->buffer; + vx_uint8 * ptr_returned = *ptr ? (vx_uint8 *)*ptr : ptr_internal; + // save the pointer and usage for use in vxCommitXXX + status = VX_SUCCESS; + for (auto i = data->mapped.begin(); i != data->mapped.end(); i++) { + if (i->ptr == ptr_returned) { + // can't support vxAccessXXX() more than once with same pointer, the application + // needs to call vxCommitXXX() before calling vxAccessXXX() + status = VX_FAILURE; + } + } + if (status == VX_SUCCESS) { + MappedData item = { ptr_returned, usage, (ptr_returned != ptr_internal) ? true : false }; + data->mapped.push_back(item); + *ptr = ptr_returned; + if (item.used_external_ptr && (usage == VX_READ_ONLY || usage == VX_READ_AND_WRITE)) { + // copy if read is requested with explicit external buffer + HafCpu_BinaryCopy_U8_U8(data->size, ptr_returned, ptr_internal); + } + } + } + } + return status; +} + +/*! \brief Sets the Distribution back to the memory. The memory must be +* a vx_uint32 array of a value at least as big as the value returned via \ref VX_DISTRIBUTION_ATTRIBUTE_RANGE. +* \param [in] distribution The Distribution to modify. +* \param [in] ptr The pointer returned from (or not modified by) \ref vxAccessDistribution. +* \return A \ref vx_status_e enumeration. +* \pre \ref vxAccessDistribution. 
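+* \par Example
+* A minimal access/commit sketch (illustrative only; \c dist is assumed to be an existing, valid \ref vx_distribution):
+* \code
+* vx_uint32 * bins = NULL;   // let the implementation hand back its own pointer
+* if (vxAccessDistribution(dist, (void **)&bins, VX_READ_ONLY) == VX_SUCCESS) {
+*     // ... inspect the bin counts ...
+*     vxCommitDistribution(dist, bins);
+* }
+* \endcode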
+* \ingroup group_distribution +*/ +VX_API_ENTRY vx_status VX_API_CALL vxCommitDistribution(vx_distribution distribution, const void * ptr) +{ + AgoData * data = (AgoData *)distribution; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_DISTRIBUTION)) { + // check for valid arguments + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if (ptr) { + status = VX_SUCCESS; + if (!data->buffer) { + status = VX_FAILURE; + } + else if (!data->mapped.empty()) { + vx_enum usage = VX_READ_ONLY; + bool used_external_ptr = false; + for (auto i = data->mapped.begin(); i != data->mapped.end(); i++) { + if (i->ptr == ptr) { + usage = i->usage; + used_external_ptr = i->used_external_ptr; + data->mapped.erase(i); + break; + } + } + if (usage == VX_WRITE_ONLY || usage == VX_READ_AND_WRITE) { + if (used_external_ptr) { + // copy from external buffer + HafCpu_BinaryCopy_U8_U8(data->size, data->buffer, (vx_uint8 *)ptr); + } + } + } + } + } + return status; +} + +/*============================================================================== +THRESHOLD +=============================================================================*/ + +/*! \brief Creates a reference to a threshold object of a given type. +* \param [in] c The reference to the overall context. +* \param [in] thresh_type The type of threshold to create. +* \param [in] data_type The data type of the threshold's value(s). +* \if OPENVX_STRICT_1_0 +* \note For OpenVX 1.0, data_type can only be \ref VX_TYPE_UINT8. +* \endif +* \return \ref vx_threshold +* \ingroup group_threshold +*/ +VX_API_ENTRY vx_threshold VX_API_CALL vxCreateThreshold(vx_context context, vx_enum thresh_type, vx_enum data_type) +{ + AgoData * data = NULL; + if (agoIsValidContext(context) && (thresh_type == VX_THRESHOLD_TYPE_BINARY || thresh_type == VX_THRESHOLD_TYPE_RANGE) && + (data_type >= VX_TYPE_INT8) && (data_type <= VX_TYPE_INT32)) + { + CAgoLock lock(context->cs); + char desc[512]; sprintf(desc, "threshold:%s,%s", agoEnum2Name(thresh_type), agoEnum2Name(data_type)); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "thr", data->name); + agoAddData(&context->dataList, data); + } + } + return (vx_threshold)data; +} + +/*! \brief Releases a reference to a threshold object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] thresh The pointer to the threshold to release. +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +* \ingroup group_threshold +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseThreshold(vx_threshold *thresh) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (thresh && agoIsValidData((AgoData*)*thresh, VX_TYPE_THRESHOLD)) { + if (!agoReleaseData((AgoData*)*thresh, true)) { + *thresh = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Sets attributes on the threshold object. +* \param [in] thresh The threshold object to set. +* \param [in] attribute The attribute to modify. Use a \ref vx_threshold_attribute_e enumeration. +* \param [in] ptr The pointer to the value to which to set the attribute. +* \param [in] size The size of the data pointed to by \a ptr. +* \return A \ref vx_status_e enumeration. 
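+* \par Example
+* A minimal usage sketch (illustrative only; \c context is assumed to be an existing, valid \ref vx_context):
+* \code
+* vx_threshold thresh = vxCreateThreshold(context, VX_THRESHOLD_TYPE_BINARY, VX_TYPE_UINT8);
+* vx_int32 value = 128;
+* vxSetThresholdAttribute(thresh, VX_THRESHOLD_ATTRIBUTE_THRESHOLD_VALUE, &value, sizeof(value));
+* \endcode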
+* \ingroup group_threshold +*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetThresholdAttribute(vx_threshold thresh, vx_enum attribute, const void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)thresh; + if (agoIsValidData(data, VX_TYPE_THRESHOLD)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_THRESHOLD_ATTRIBUTE_THRESHOLD_VALUE: + if (size == sizeof(vx_int32) && data->u.thr.thresh_type == VX_THRESHOLD_TYPE_BINARY) { + data->u.thr.threshold_lower = *(vx_int32 *)ptr; + status = VX_SUCCESS; + } + break; + case VX_THRESHOLD_ATTRIBUTE_THRESHOLD_LOWER: + if (size == sizeof(vx_int32) && data->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) { + data->u.thr.threshold_lower = *(vx_int32 *)ptr; + status = VX_SUCCESS; + } + break; + case VX_THRESHOLD_ATTRIBUTE_THRESHOLD_UPPER: + if (size == sizeof(vx_int32) && data->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) { + data->u.thr.threshold_upper = *(vx_int32 *)ptr; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Queries an attribute on the threshold object. +* \param [in] thresh The threshold object to set. +* \param [in] attribute The attribute to query. Use a \ref vx_threshold_attribute_e enumeration. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_threshold +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryThreshold(vx_threshold thresh, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)thresh; + if (agoIsValidData(data, VX_TYPE_THRESHOLD)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_THRESHOLD_ATTRIBUTE_TYPE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = data->u.thr.thresh_type; + status = VX_SUCCESS; + } + break; + case VX_THRESHOLD_ATTRIBUTE_DATA_TYPE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = data->u.thr.data_type; + status = VX_SUCCESS; + } + break; + case VX_THRESHOLD_ATTRIBUTE_THRESHOLD_VALUE: + if (size == sizeof(vx_int32) && data->u.thr.thresh_type == VX_THRESHOLD_TYPE_BINARY) { + *(vx_int32 *)ptr = data->u.thr.threshold_lower; + status = VX_SUCCESS; + } + break; + case VX_THRESHOLD_ATTRIBUTE_THRESHOLD_LOWER: + if (size == sizeof(vx_int32) && data->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) { + *(vx_int32 *)ptr = data->u.thr.threshold_lower; + status = VX_SUCCESS; + } + break; + case VX_THRESHOLD_ATTRIBUTE_THRESHOLD_UPPER: + if (size == sizeof(vx_int32) && data->u.thr.thresh_type == VX_THRESHOLD_TYPE_RANGE) { + *(vx_int32 *)ptr = data->u.thr.threshold_upper; + status = VX_SUCCESS; + } + break; + case VX_THRESHOLD_ATTRIBUTE_TRUE_VALUE: + if (size == sizeof(vx_int32)) { + *(vx_int32 *)ptr = data->u.thr.true_value; + status = VX_SUCCESS; + } + break; + case VX_THRESHOLD_ATTRIBUTE_FALSE_VALUE: + if (size == sizeof(vx_int32)) { + *(vx_int32 *)ptr = data->u.thr.false_value; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*============================================================================== +MATRIX +=============================================================================*/ + +/*! \brief Creates a reference to a matrix object. +* \param [in] c The reference to the overall context. 
+* \param [in] data_type The unit format of the matrix. \ref VX_TYPE_INT32 or \ref VX_TYPE_FLOAT32. +* \param [in] columns The first dimensionality. +* \param [in] rows The second dimensionality. +* \return \ref vx_matrix +* \ingroup group_matrix +*/ +VX_API_ENTRY vx_matrix VX_API_CALL vxCreateMatrix(vx_context context, vx_enum data_type, vx_size columns, vx_size rows) +{ + AgoData * data = NULL; + if (agoIsValidContext(context) && (data_type == VX_TYPE_INT32 || data_type == VX_TYPE_FLOAT32) && columns > 0 && rows > 0) { + CAgoLock lock(context->cs); + char desc[512]; sprintf(desc, "matrix:%s," VX_FMT_SIZE "," VX_FMT_SIZE "", agoEnum2Name(data_type), columns, rows); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "matrix", data->name); + agoAddData(&context->dataList, data); + } + } + return (vx_matrix)data; +} + +/*! \brief Releases a reference to a matrix object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] mat The matrix reference to release. +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +* \ingroup group_matrix +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseMatrix(vx_matrix *mat) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (mat && agoIsValidData((AgoData*)*mat, VX_TYPE_MATRIX)) { + if (!agoReleaseData((AgoData*)*mat, true)) { + *mat = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Queries an attribute on the matrix object. +* \param [in] mat The matrix object to set. +* \param [in] attribute The attribute to query. Use a \ref vx_matrix_attribute_e enumeration. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_matrix +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryMatrix(vx_matrix mat, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)mat; + if (agoIsValidData(data, VX_TYPE_MATRIX)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_MATRIX_ATTRIBUTE_TYPE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = data->u.mat.type; + status = VX_SUCCESS; + } + break; + case VX_MATRIX_ATTRIBUTE_ROWS: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.mat.rows; + status = VX_SUCCESS; + } + break; + case VX_MATRIX_ATTRIBUTE_COLUMNS: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.mat.columns; + status = VX_SUCCESS; + } + break; + case VX_MATRIX_ATTRIBUTE_SIZE: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->size; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Gets the matrix data (copy). +* \param [in] mat The reference to the matrix. +* \param [out] array The array in which to place the matrix. +* \see vxQueryMatrix and \ref VX_MATRIX_ATTRIBUTE_COLUMNS and \ref VX_MATRIX_ATTRIBUTE_ROWS +* to get the needed number of elements of the array. +* \return A \ref vx_status_e enumeration. 
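+* A brief sketch (assumes mat is a 3x3 matrix of \ref VX_TYPE_FLOAT32 elements):
+* \code
+* vx_float32 coeffs[3 * 3];
+* if (vxReadMatrix(mat, coeffs) == VX_SUCCESS) {
+*     // coeffs now holds a copy of the matrix data
+* }
+* \endcode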
+* \post \ref vxCommitMatrix +* \ingroup group_matrix +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReadMatrix(vx_matrix mat, void *array) +{ + AgoData * data = (AgoData *)mat; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_MATRIX)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else { + if (array) { + if (!data->buffer) { + CAgoLock lock(data->ref.context->cs); + if (agoAllocData(data)) { + return VX_FAILURE; + } + } + // copy to external buffer + HafCpu_BinaryCopy_U8_U8(data->size, (vx_uint8 *)array, data->buffer); + } + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Sets the matrix data (copy) +* \param [in] mat The reference to the matrix. +* \param [out] array The array to read the matrix. +* \see vxQueryMatrix and \ref VX_MATRIX_ATTRIBUTE_COLUMNS and \ref VX_MATRIX_ATTRIBUTE_ROWS +* to get the needed number of elements of the array.' +* \return A \ref vx_status_e enumeration. +* \pre \ref vxAccessMatrix +* \ingroup group_matrix +*/ +VX_API_ENTRY vx_status VX_API_CALL vxWriteMatrix(vx_matrix mat, const void *array) +{ + AgoData * data = (AgoData *)mat; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_MATRIX)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if (data->ref.read_only) { + status = VX_ERROR_NOT_SUPPORTED; + } + else { + if (array) { + if (!data->buffer) { + CAgoLock lock(data->ref.context->cs); + if (agoAllocData(data)) { + return VX_FAILURE; + } + } + // copy from external buffer + HafCpu_BinaryCopy_U8_U8(data->size, data->buffer, (vx_uint8 *)array); + } + status = VX_SUCCESS; + } + } + return status; +} + +/*============================================================================== +CONVOLUTION +=============================================================================*/ + +/*! \brief Creates a reference to a convolution matrix object. +* \param [in] context The reference to the overall context. +* \param [in] columns The columns dimension of the convolution. +* Must be odd and greater than or equal to 3 and less than the value returned +* from \ref VX_CONTEXT_ATTRIBUTE_CONVOLUTION_MAXIMUM_DIMENSION. +* \param [in] rows The rows dimension of the convolution. +* Must be odd and greater than or equal to 3 and less than the value returned +* from \ref VX_CONTEXT_ATTRIBUTE_CONVOLUTION_MAXIMUM_DIMENSION. +* \return \ref vx_convolution +* \ingroup group_convolution +*/ +VX_API_ENTRY vx_convolution VX_API_CALL vxCreateConvolution(vx_context context, vx_size columns, vx_size rows) +{ + AgoData * data = NULL; + if (agoIsValidContext(context) && columns > 0 && rows > 0) { + CAgoLock lock(context->cs); + char desc[512]; sprintf(desc, "convolution:" VX_FMT_SIZE "," VX_FMT_SIZE "", columns, rows); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "conv", data->name); + agoAddData(&context->dataList, data); + } + } + return (vx_convolution)data; +} + +/*! \brief Releases the reference to a convolution matrix. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] conv The pointer to the convolution matrix to release. +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. 
+* \ingroup group_convolution +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseConvolution(vx_convolution *conv) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (conv && agoIsValidData((AgoData*)*conv, VX_TYPE_CONVOLUTION)) { + if (!agoReleaseData((AgoData*)*conv, true)) { + *conv = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Queries an attribute on the convolution matrix object. +* \param [in] conv The convolution matrix object to set. +* \param [in] attribute The attribute to query. Use a \ref vx_convolution_attribute_e enumeration. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_convolution +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryConvolution(vx_convolution conv, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)conv; + if (agoIsValidData(data, VX_TYPE_CONVOLUTION)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_CONVOLUTION_ATTRIBUTE_ROWS: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.conv.rows; + status = VX_SUCCESS; + } + break; + case VX_CONVOLUTION_ATTRIBUTE_COLUMNS: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.conv.columns; + status = VX_SUCCESS; + } + break; + case VX_CONVOLUTION_ATTRIBUTE_SCALE: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = 1u << data->u.conv.shift; + status = VX_SUCCESS; + } + break; + case VX_CONVOLUTION_ATTRIBUTE_SIZE: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->size; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Sets attributes on the convolution object. +* \param [in] conv The coordinates object to set. +* \param [in] attribute The attribute to modify. Use a \ref vx_convolution_attribute_e enumeration. +* \param [in] ptr The pointer to the value to which to set the attribute. +* \param [in] size The size of the data pointed to by \a ptr. +* \return A \ref vx_status_e enumeration. +* \ingroup group_convolution +*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetConvolutionAttribute(vx_convolution conv, vx_enum attribute, const void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)conv; + if (agoIsValidData(data, VX_TYPE_CONVOLUTION)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_CONVOLUTION_ATTRIBUTE_SCALE: + if (size == sizeof(vx_uint32)) { + status = VX_ERROR_INVALID_VALUE; + vx_uint32 scale = *(vx_uint32 *)ptr; + for (vx_uint32 shift = 0; shift < 32; shift++) { + if (scale == (1u << shift)) { + data->u.conv.shift = shift; + status = VX_SUCCESS; + if (data->buffer && data->reserved) { + // update float values + vx_uint32 N = (vx_uint32)data->u.conv.columns * (vx_uint32)data->u.conv.rows; + float scale = 1.0f / (float)(1 << data->u.conv.shift); + short * ps = (short *)data->buffer; + float * pf = (float *)data->reserved; + for (vx_uint32 i = 0; i < N; i++) + pf[N - 1 - i] = scale * ps[i]; // NOTE: the reversing of coefficients order required to be able to re-use linear filter + } + break; + } + } + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! \brief Gets the convolution data (copy). +* \param [in] conv The reference to the convolution. 
+* \param [out] array The array to place the convolution. +* \see vxQueryConvolution and \ref VX_CONVOLUTION_ATTRIBUTE_SIZE to get the +* needed number of bytes of the array. +* \return A \ref vx_status_e enumeration. +* \post \ref vxWriteConvolutionCoefficients +* \ingroup group_convolution +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReadConvolutionCoefficients(vx_convolution conv, vx_int16 *array) +{ + AgoData * data = (AgoData *)conv; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_CONVOLUTION)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else { + if (array) { + if (!data->buffer) { + CAgoLock lock(data->ref.context->cs); + if (agoAllocData(data)) { + return VX_FAILURE; + } + } + // copy to external buffer + HafCpu_BinaryCopy_U8_U8(data->size, (vx_uint8 *)array, data->buffer); + } + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Sets the convolution data (copy), +* \param [in] conv The reference to the convolution. +* \param [out] array The array to read the convolution. +* \see \ref vxQueryConvolution and \ref VX_CONVOLUTION_ATTRIBUTE_SIZE to get the +* needed number of bytes of the array. +* \return A \ref vx_status_e enumeration. +* \pre \ref vxReadConvolutionCoefficients +* \ingroup group_convolution +*/ +VX_API_ENTRY vx_status VX_API_CALL vxWriteConvolutionCoefficients(vx_convolution conv, const vx_int16 *array) +{ + AgoData * data = (AgoData *)conv; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_CONVOLUTION)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if (data->ref.read_only) { + status = VX_ERROR_NOT_SUPPORTED; + } + else { + if (array) { + if (!data->buffer) { + CAgoLock lock(data->ref.context->cs); + if (agoAllocData(data)) { + return VX_FAILURE; + } + } + // copy from external buffer + HafCpu_BinaryCopy_U8_U8(data->size, data->buffer, (vx_uint8 *)array); + // update float values + vx_uint32 N = (vx_uint32)data->u.conv.columns * (vx_uint32)data->u.conv.rows; + float scale = 1.0f / (float)(1 << data->u.conv.shift); + if (data->buffer && data->reserved) { + short * ps = (short *)data->buffer; + float * pf = (float *)data->reserved; + for (vx_uint32 i = 0; i < N; i++) + pf[N - 1 - i] = scale * ps[i]; // NOTE: the reversing of coefficients order required to be able to re-use linear filter + } + } + status = VX_SUCCESS; + } + } + return status; +} + +/*============================================================================== +PYRAMID +=============================================================================*/ + +/*! \brief Creates a reference to a pyramid object of the supplied number of levels. +* \param [in] context The reference to the overall context. +* \param [in] levels The number of levels desired. This is required to be a non-zero value. +* \param [in] scale Used to indicate the scale between pyramid levels. This is required to be a non-zero positive value. +* \if OPENVX_STRICT_1_0 +* In OpenVX 1.0, the only permissible values are \ref VX_SCALE_PYRAMID_HALF or \ref VX_SCALE_PYRAMID_ORB. +* \endif +* \param [in] width The width of the 0th level image in pixels. +* \param [in] height The height of the 0th level image in pixels. +* \param [in] format The format of all images in the pyramid. +* \return \ref vx_pyramid +* \retval 0 No pyramid was created. +* \retval * A pyramid reference. 
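+* A minimal creation sketch (context is an existing \ref vx_context; the sizes are placeholders):
+* \code
+* vx_pyramid pyr = vxCreatePyramid(context, 4, VX_SCALE_PYRAMID_HALF, 640, 480, VX_DF_IMAGE_U8);
+* vx_image level0 = vxGetPyramidLevel(pyr, 0);
+* \endcode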
+* \ingroup group_pyramid +*/ +VX_API_ENTRY vx_pyramid VX_API_CALL vxCreatePyramid(vx_context context, vx_size levels, vx_float32 scale, vx_uint32 width, vx_uint32 height, vx_df_image format) +{ + AgoData * data = NULL; + if (agoIsValidContext(context)) { + CAgoLock lock(context->cs); + char desc_scale[64]; + if (scale == VX_SCALE_PYRAMID_HALF) sprintf(desc_scale, "HALF"); + else if (scale == VX_SCALE_PYRAMID_ORB) sprintf(desc_scale, "ORB"); + else sprintf(desc_scale, "%.12g", scale); + char desc[512]; sprintf(desc, "pyramid:%4.4s,%d,%d," VX_FMT_SIZE ",%s", FORMAT_STR(format), width, height, levels, desc_scale); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "pyramid", data->name); + agoAddData(&context->dataList, data); + // add the children too + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&context->dataList, data->children[i]); + for (vx_uint32 j = 0; j < data->children[i]->numChildren; j++) { + if (data->children[i]->children[j]) { + agoAddData(&context->dataList, data->children[i]->children[j]); + } + } + } + } + } + return (vx_pyramid)data; +} + +/*! \brief Creates a reference to a virtual pyramid object of the supplied number of levels. +* \details Virtual Pyramids can be used to connect Nodes together when the contents of the pyramids will +* not be accessed by the user of the API. +* All of the following constructions are valid: +* \code +* vx_context context = vxCreateContext(); +* vx_graph graph = vxCreateGraph(context); +* vx_pyramid virt[] = { +* vxCreateVirtualPyramid(graph, 4, VX_SCALE_PYRAMID_HALF, 0, 0, VX_DF_IMAGE_VIRT), // no dimension and format specified for level 0 +* vxCreateVirtualPyramid(graph, 4, VX_SCALE_PYRAMID_HALF, 640, 480, VX_DF_IMAGE_VIRT), // no format specified. +* vxCreateVirtualPyramid(graph, 4, VX_SCALE_PYRAMID_HALF, 640, 480, VX_DF_IMAGE_U8), // no access +* }; +* \endcode +* \param [in] graph The reference to the parent graph. +* \param [in] levels The number of levels desired. This is required to be a non-zero value. +* \param [in] scale Used to indicate the scale between pyramid levels. This is required to be a non-zero positive value. +* \if OPENVX_STRICT_1_0 +* In OpenVX 1.0, the only permissible values are \ref VX_SCALE_PYRAMID_HALF or \ref VX_SCALE_PYRAMID_ORB. +* \endif +* \param [in] width The width of the 0th level image in pixels. This may be set to zero to indicate to the interface that the value is unspecified. +* \param [in] height The height of the 0th level image in pixels. This may be set to zero to indicate to the interface that the value is unspecified. +* \param [in] format The format of all images in the pyramid. This may be set to \ref VX_DF_IMAGE_VIRT to indicate that the format is unspecified. +* \return A \ref vx_pyramid reference. +* \note Images extracted with \ref vxGetPyramidLevel behave as Virtual Images and +* cause \ref vxAccessImagePatch to return errors. +* \retval 0 No pyramid was created. +* \retval * A pyramid reference. 
+* \ingroup group_pyramid +*/ +VX_API_ENTRY vx_pyramid VX_API_CALL vxCreateVirtualPyramid(vx_graph graph, vx_size levels, vx_float32 scale, vx_uint32 width, vx_uint32 height, vx_df_image format) +{ + AgoData * data = NULL; + if (agoIsValidGraph(graph)) { + CAgoLock lock(graph->cs); + char desc_scale[64]; + if (scale == VX_SCALE_PYRAMID_HALF) sprintf(desc_scale, "HALF"); + else if (scale == VX_SCALE_PYRAMID_ORB) sprintf(desc_scale, "ORB"); + else sprintf(desc_scale, "%.12g", scale); + char desc[512]; sprintf(desc, "pyramid-virtual:%4.4s,%d,%d," VX_FMT_SIZE ",%s", FORMAT_STR(format), width, height, levels, desc_scale); + data = agoCreateDataFromDescription(graph->ref.context, graph, desc, true); + if (data) { + agoGenerateVirtualDataName(graph, "pyramid", data->name); + agoAddData(&graph->dataList, data); + // add the children too + for (vx_uint32 i = 0; i < data->numChildren; i++) { + agoAddData(&graph->dataList, data->children[i]); + for (vx_uint32 j = 0; j < data->children[i]->numChildren; j++) { + if (data->children[i]->children[j]) { + agoAddData(&graph->dataList, data->children[i]->children[j]); + } + } + } + } + } + return (vx_pyramid)data; +} + + +/*! \brief Releases a reference to a pyramid object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] pyr The pointer to the pyramid to release. +* \ingroup group_pyramid +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +* \post After returning from this function the reference is zeroed. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleasePyramid(vx_pyramid *pyr) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (pyr && agoIsValidData((AgoData*)*pyr, VX_TYPE_PYRAMID)) { + if (!agoReleaseData((AgoData*)*pyr, true)) { + *pyr = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Queries an attribute from an image pyramid. +* \param [in] pyr The pyramid to query. +* \param [in] attribute The attribute for which to query. Use a \ref vx_pyramid_attribute_e enumeration. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. +* \ingroup group_pyramid +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryPyramid(vx_pyramid pyr, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)pyr; + if (agoIsValidData(data, VX_TYPE_PYRAMID)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_PYRAMID_ATTRIBUTE_LEVELS: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.pyr.levels; + status = VX_SUCCESS; + } + break; + case VX_PYRAMID_ATTRIBUTE_SCALE: + if (size == sizeof(vx_float32)) { + *(vx_float32 *)ptr = data->u.pyr.scale; + status = VX_SUCCESS; + } + break; + case VX_PYRAMID_ATTRIBUTE_WIDTH: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = data->u.pyr.width; + status = VX_SUCCESS; + } + break; + case VX_PYRAMID_ATTRIBUTE_HEIGHT: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = data->u.pyr.height; + status = VX_SUCCESS; + } + break; + case VX_PYRAMID_ATTRIBUTE_FORMAT: + if (size == sizeof(vx_df_image)) { + *(vx_df_image *)ptr = data->u.pyr.format; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! 
\brief Retrieves a level of the pyramid as a \ref vx_image, which can be used +* elsewhere in OpenVX. +* \param [in] pyr The pyramid object. +* \param [in] index The index of the level, such that index is less than levels. +* \return A \ref vx_image reference. +* \retval 0 Indicates that the index or the object is invalid. +* \ingroup group_pyramid +*/ +VX_API_ENTRY vx_image VX_API_CALL vxGetPyramidLevel(vx_pyramid pyr, vx_uint32 index) +{ + AgoData * data = (AgoData *)pyr; + AgoData * img = NULL; + if (agoIsValidData(data, VX_TYPE_PYRAMID) && (index < data->u.pyr.levels) && !data->isNotFullyConfigured) { + img = data->children[index]; + agoRetainData((AgoGraph *)data->ref.scope, img, true); + } + return (vx_image)img; +} + +/*============================================================================== +REMAP +=============================================================================*/ + +/*! \brief Creates a remap table object. +* \param [in] context The reference to the overall context. +* \param [in] src_width Width of the source image in pixel. +* \param [in] src_height Height of the source image in pixels. +* \param [in] dst_width Width of the destination image in pixels. +* \param [in] dst_height Height of the destination image in pixels. +* \ingroup group_remap +* \return \ref vx_remap +* \retval 0 Object could not be created. +* \retval * Object was created. +*/ +VX_API_ENTRY vx_remap VX_API_CALL vxCreateRemap(vx_context context, + vx_uint32 src_width, + vx_uint32 src_height, + vx_uint32 dst_width, + vx_uint32 dst_height) +{ + AgoData * data = NULL; + if (agoIsValidContext(context) && src_width > 0 && src_height > 0 && dst_width > 0 && dst_height > 0) { + CAgoLock lock(context->cs); + char desc[512]; sprintf(desc, "remap:%u,%u,%u,%u", src_width, src_height, dst_width, dst_height); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "remap", data->name); + agoAddData(&context->dataList, data); + } + } + return (vx_remap)data; +} + +/*! \brief Releases a reference to a remap table object. The object may not be +* garbage collected until its total reference count is zero. +* \param [in] table The pointer to the remap table to release. +* \post After returning from this function the reference is zeroed. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. +* \ingroup group_remap +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseRemap(vx_remap *table) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (table && agoIsValidData((AgoData*)*table, VX_TYPE_REMAP)) { + if (!agoReleaseData((AgoData*)*table, true)) { + *table = NULL; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Assigns a destination pixel mapping to the source pixel. +* \param [in] table The remap table reference. +* \param [in] dst_x The destination x coordinate. +* \param [in] dst_y The destination y coordinate. +* \param [in] src_x The source x coordinate in float representation to allow interpolation. +* \param [in] src_y The source y coordinate in float representation to allow interpolation. +* \ingroup group_remap +* \return A \ref vx_status_e enumeration. 
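+* A minimal sketch that fills a table with an identity mapping (table, dst_width and dst_height are placeholders):
+* \code
+* for (vx_uint32 y = 0; y < dst_height; y++)
+*     for (vx_uint32 x = 0; x < dst_width; x++)
+*         vxSetRemapPoint(table, x, y, (vx_float32)x, (vx_float32)y);
+* \endcode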
+*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetRemapPoint(vx_remap table, + vx_uint32 dst_x, vx_uint32 dst_y, + vx_float32 src_x, vx_float32 src_y) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)table; + if (agoIsValidData(data, VX_TYPE_REMAP)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (!data->buffer) { + CAgoLock lock(data->ref.context->cs); + if (agoAllocData(data)) { + return VX_FAILURE; + } + } + if (dst_x < data->u.remap.dst_width && dst_y < data->u.remap.dst_height && data->buffer && data->reserved) { + ago_coord2d_ushort_t * item_fixed = ((ago_coord2d_ushort_t *)data->buffer) + (dst_y * data->u.remap.dst_width) + dst_x; + ago_coord2d_float_t * item_float = ((ago_coord2d_float_t *)data->reserved) + (dst_y * data->u.remap.dst_width) + dst_x; + item_float->x = src_x; + item_float->y = src_y; + item_fixed->x = (vx_uint16)(src_x * (vx_float32)(1 << data->u.remap.remap_fractional_bits) + 0.5f); // convert to fixed-point with rounding + item_fixed->y = (vx_uint16)(src_y * (vx_float32)(1 << data->u.remap.remap_fractional_bits) + 0.5f); // convert to fixed-point with rounding + // special handing for border cases + if (src_x < 0.0f || src_y < 0.0f || src_x >= (vx_float32)(data->u.remap.src_width-1) || src_y >= (vx_float32)(data->u.remap.src_height-1)) { + item_fixed->x = 0xffff; + item_fixed->y = 0xffff; + } + status = VX_SUCCESS; + // update sync flags + data->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK; + data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT; + } + } + return status; +} + +/*! \brief Retrieves the source pixel point from a destination pixel. +* \param [in] table The remap table reference. +* \param [in] dst_x The destination x coordinate. +* \param [in] dst_y The destination y coordinate. +* \param [out] src_x The pointer to the location to store the source x coordinate in float representation to allow interpolation. +* \param [out] src_y The pointer to the location to store the source y coordinate in float representation to allow interpolation. +* \ingroup group_remap +* \return A \ref vx_status_e enumeration. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxGetRemapPoint(vx_remap table, + vx_uint32 dst_x, vx_uint32 dst_y, + vx_float32 *src_x, vx_float32 *src_y) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)table; + if (agoIsValidData(data, VX_TYPE_REMAP) && data->buffer && data->reserved) { + status = VX_ERROR_INVALID_PARAMETERS; + if (src_x && src_y && dst_x < data->u.remap.dst_width && dst_y < data->u.remap.dst_height) { + ago_coord2d_float_t * item = ((ago_coord2d_float_t *)data->reserved) + (dst_y * data->u.remap.dst_width) + dst_x; + *src_x = item->x; + *src_y = item->y; + status = VX_SUCCESS; + } + } + return status; +} + +/*! \brief Queries attributes from a Remap table. +* \param [in] r The remap to query. +* \param [in] attribute The attribute to query. Use a \ref vx_remap_attribute_e enumeration. +* \param [out] ptr The location at which to store the resulting value. +* \param [in] size The size of the container to which \a ptr points. +* \return A \ref vx_status_e enumeration. 
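+* Example query (r is an existing remap table):
+* \code
+* vx_uint32 dst_width = 0;
+* vxQueryRemap(r, VX_REMAP_ATTRIBUTE_DESTINATION_WIDTH, &dst_width, sizeof(dst_width));
+* \endcode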
+* \ingroup group_remap +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryRemap(vx_remap r, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)r; + if (agoIsValidData(data, VX_TYPE_REMAP)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_REMAP_ATTRIBUTE_SOURCE_WIDTH: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = data->u.remap.src_width; + status = VX_SUCCESS; + } + break; + case VX_REMAP_ATTRIBUTE_SOURCE_HEIGHT: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = data->u.remap.src_height; + status = VX_SUCCESS; + } + break; + case VX_REMAP_ATTRIBUTE_DESTINATION_WIDTH: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = data->u.remap.dst_width; + status = VX_SUCCESS; + } + break; + case VX_REMAP_ATTRIBUTE_DESTINATION_HEIGHT: + if (size == sizeof(vx_uint32)) { + *(vx_uint32 *)ptr = data->u.remap.dst_height; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*============================================================================== +ARRAY +=============================================================================*/ + +/*! +* \brief Creates a reference to an Array object. +* +* User must specify the Array capacity (i.e., the maximal number of items that the array can hold). +* +* \param [in] context The reference to the overall Context. +* \param [in] item_type The type of objects to hold. Use: +* \arg \ref VX_TYPE_RECTANGLE for \ref vx_rectangle_t. +* \arg \ref VX_TYPE_KEYPOINT for \ref vx_keypoint_t. +* \arg \ref VX_TYPE_COORDINATES2D for \ref vx_coordinates2d_t. +* \arg \ref VX_TYPE_COORDINATES3D for \ref vx_coordinates3d_t. +* \arg \ref vx_enum Returned from \ref vxRegisterUserStruct. +* \param [in] capacity The maximal number of items that the array can hold. +* +* \return \ref vx_array. +* \retval 0 No Array was created. +* \retval * An Array was created. +* +* \ingroup group_array +*/ +VX_API_ENTRY vx_array VX_API_CALL vxCreateArray(vx_context context, vx_enum item_type, vx_size capacity) +{ + AgoData * data = NULL; + if (agoIsValidContext(context) && capacity > 0) { + CAgoLock lock(context->cs); + const char * desc_type = agoEnum2Name(item_type); + if (!desc_type) { + desc_type = agoGetUserStructName(context, item_type); + } + if (desc_type) { + char desc[512]; sprintf(desc, "array:%s," VX_FMT_SIZE "", desc_type, capacity); + data = agoCreateDataFromDescription(context, NULL, desc, true); + if (data) { + agoGenerateDataName(context, "array", data->name); + agoAddData(&context->dataList, data); + } + } + } + return (vx_array)data; +} + +/*! +* \brief Creates an opaque reference to a virtual Array with no direct user access. +* +* Virtual Arrays are useful when item type or capacity are unknown ahead of time +* and the Array is used as internal graph edge. Virtual arrays are scoped within the parent graph only. +* +* All of the following constructions are allowed. +* \code +* vx_context context = vxCreateContext(); +* vx_graph graph = vxCreateGraph(context); +* vx_array virt[] = { +* vxCreateVirtualArray(graph, 0, 0), // totally unspecified +* vxCreateVirtualArray(graph, VX_TYPE_KEYPOINT, 0), // unspecified capacity +* vxCreateVirtualArray(graph, VX_TYPE_KEYPOINT, 1000), // no access +* }; +* \endcode +* +* \param [in] graph The reference to the parent graph. +* \param [in] item_type The type of objects to hold. 
+* This may be set to zero to indicate an unspecified item type.
+* \param [in] capacity The maximal number of items that the array can hold.
+* This may be set to zero to indicate an unspecified capacity.
+* \see vxCreateArray for a type list.
+* \return \ref vx_array.
+* \retval 0 No Array was created.
+* \retval * An Array was created or an error occurred. Use \ref vxGetStatus to determine.
+*
+* \ingroup group_array
+*/
+VX_API_ENTRY vx_array VX_API_CALL vxCreateVirtualArray(vx_graph graph, vx_enum item_type, vx_size capacity)
+{
+	AgoData * data = NULL;
+	if (agoIsValidGraph(graph)) {
+		CAgoLock lock(graph->cs);
+		const char * desc_type = agoEnum2Name(item_type);
+		if (item_type && !desc_type) {
+			desc_type = agoGetUserStructName(graph->ref.context, item_type);
+		}
+		if (!item_type || desc_type) {
+			char desc[512];
+			if (desc_type) sprintf(desc, "array-virtual:%s," VX_FMT_SIZE "", desc_type, capacity);
+			else sprintf(desc, "array-virtual:0," VX_FMT_SIZE "", capacity);
+			data = agoCreateDataFromDescription(graph->ref.context, graph, desc, true);
+			if (data) {
+				agoGenerateVirtualDataName(graph, "array", data->name);
+				agoAddData(&graph->dataList, data);
+			}
+		}
+	}
+	return (vx_array)data;
+}
+
+/*!
+* \brief Releases a reference of an Array object.
+* The object may not be garbage collected until its total reference count is zero.
+* After returning from this function the reference is zeroed.
+* \param [in] arr The pointer to the Array to release.
+* \return A \ref vx_status_e enumeration.
+* \retval VX_SUCCESS No errors.
+* \retval VX_ERROR_INVALID_REFERENCE If \a arr is not a \ref vx_array.
+* \ingroup group_array
+*/
+VX_API_ENTRY vx_status VX_API_CALL vxReleaseArray(vx_array *arr)
+{
+	vx_status status = VX_ERROR_INVALID_REFERENCE;
+	if (arr && agoIsValidData((AgoData*)*arr, VX_TYPE_ARRAY)) {
+		if (!agoReleaseData((AgoData*)*arr, true)) {
+			*arr = NULL;
+			status = VX_SUCCESS;
+		}
+	}
+	return status;
+}
+
+/*!
+* \brief Queries the Array for some specific information.
+*
+* \param [in] arr The reference to the Array.
+* \param [in] attribute The attribute to query. Use a \ref vx_array_attribute_e.
+* \param [out] ptr The location at which to store the resulting value.
+* \param [in] size The size of the container to which \a ptr points.
+*
+* \return A \ref vx_status_e enumeration.
+* \retval VX_SUCCESS No errors.
+* \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array.
+* \retval VX_ERROR_NOT_SUPPORTED If the \a attribute is not a value supported on this implementation.
+* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect.
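+* A short sketch (arr is an existing \ref vx_array):
+* \code
+* vx_size num_items = 0;
+* vxQueryArray(arr, VX_ARRAY_ATTRIBUTE_NUMITEMS, &num_items, sizeof(num_items));
+* \endcode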
+* +* \ingroup group_array +*/ +VX_API_ENTRY vx_status VX_API_CALL vxQueryArray(vx_array arr, vx_enum attribute, void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)arr; + if (agoIsValidData(data, VX_TYPE_ARRAY)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + case VX_ARRAY_ATTRIBUTE_ITEMTYPE: + if (size == sizeof(vx_enum)) { + *(vx_enum *)ptr = data->u.arr.itemtype; + status = VX_SUCCESS; + } + break; + case VX_ARRAY_ATTRIBUTE_NUMITEMS: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.arr.numitems; + status = VX_SUCCESS; + } + break; + case VX_ARRAY_ATTRIBUTE_CAPACITY: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.arr.capacity; + status = VX_SUCCESS; + } + break; + case VX_ARRAY_ATTRIBUTE_ITEMSIZE: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->u.arr.itemsize; + status = VX_SUCCESS; + } + break; + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +/*! +* \brief Adds items to the Array. +* +* This function increases the container size. +* +* By default, the function does not reallocate memory, +* so if the container is already full (number of elements is equal to capacity) +* or it doesn't have enough space, +* the function returns \ref VX_FAILURE error code. +* +* \param [in] arr The reference to the Array. +* \param [in] count The total number of elements to insert. +* \param [in] ptr The location at which to store the input values. +* \param [in] stride The stride in bytes between elements. User can pass 0, which means that stride is equal to item size. +* +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array. +* \retval VX_FAILURE If the Array is full. +* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. +* +* \ingroup group_array +*/ +VX_API_ENTRY vx_status VX_API_CALL vxAddArrayItems(vx_array arr, vx_size count, const void *ptr, vx_size stride) +{ + AgoData * data = (AgoData *)arr; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_ARRAY)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if (ptr && (data->u.arr.numitems + count <= data->u.arr.capacity)) { + if (!data->buffer) { + CAgoLock lock(data->ref.context->cs); + if (agoAllocData(data)) { + return VX_FAILURE; + } + } + if (stride == 0) stride = data->u.arr.itemsize; // TBD remove -- this needs to be removed. The spec does not specify to compute stride here, conformance expects it though + if (count > 0) { + // add items at the end of the array + vx_uint8 * pSrc = (vx_uint8 *)ptr; + vx_uint8 * pDst = data->buffer + data->u.arr.itemsize * data->u.arr.numitems; + if (stride == data->u.arr.itemsize) { + HafCpu_BinaryCopy_U8_U8(data->u.arr.itemsize * count, pDst, pSrc); + } + else { + for (vx_size i = 0; i < count; i++, pSrc += stride, pDst += data->u.arr.itemsize) { + HafCpu_BinaryCopy_U8_U8(data->u.arr.itemsize, pDst, pSrc); + } + } + data->u.arr.numitems += count; + } + status = VX_SUCCESS; + } + } + return status; +} + +/*! +* \brief Truncates an Array (remove items from the end). +* +* \param [in,out] arr The reference to the Array. +* \param [in] new_num_items The new number of items for the Array. +* +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. 
+* \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array. +* \retval VX_ERROR_INVALID_PARAMETERS The \a new_size is greater than the current size. +* +* \ingroup group_array +*/ +VX_API_ENTRY vx_status VX_API_CALL vxTruncateArray(vx_array arr, vx_size new_num_items) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + AgoData * data = (AgoData *)arr; + if (agoIsValidData(data, VX_TYPE_ARRAY)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (new_num_items <= data->u.arr.numitems) { + data->u.arr.numitems = new_num_items; + status = VX_SUCCESS; + } + } + return status; +} + +/*! +* \brief Grants access to a sub-range of an Array. +* +* \param [in] arr The reference to the Array. +* \param [in] start The start index. +* \param [in] end The end index. +* \param [out] stride The stride in bytes between elements. +* \param [out] ptr The user-supplied pointer to a pointer, via which the requested contents are returned. +* If (*ptr) is non-NULL, data is copied to it, else (*ptr) is set to the address of existing internal memory, allocated, or mapped memory. +* (*ptr) must be given to \ref vxCommitArrayRange. +* Use a \ref vx_rectangle_t for \ref VX_TYPE_RECTANGLE +* and a \ref vx_keypoint_t for \ref VX_TYPE_KEYPOINT. +* \param [in] usage This declares the intended usage of the pointer using the \ref vx_accessor_e enumeration. +* +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_OPTIMIZED_AWAY If the reference is a virtual array and cannot be accessed or committed. +* \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array. +* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. +* \post \ref vxCommitArrayRange +* \ingroup group_array +*/ +VX_API_ENTRY vx_status VX_API_CALL vxAccessArrayRange(vx_array arr, vx_size start, vx_size end, vx_size *stride, void **ptr, vx_enum usage) +{ + AgoData * data = (AgoData *)arr; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_ARRAY)) { + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if (ptr && stride && start < end && end <= data->u.arr.numitems) { + if (!data->buffer) { + CAgoLock lock(data->ref.context->cs); + if (agoAllocData(data)) { + return VX_FAILURE; + } + } + vx_uint8 * ptr_internal = data->buffer + data->u.arr.itemsize * start; + vx_uint8 * ptr_returned = *ptr ? (vx_uint8 *)*ptr : ptr_internal; + // save the pointer and usage for use in vxCommitXXX + status = VX_SUCCESS; + for (auto i = data->mapped.begin(); i != data->mapped.end(); i++) { + if (i->ptr == ptr_returned) { + // can't support vxAccessXXX() more than once with same pointer + // the application needs to call vxCommitXXX() before calling vxAccessXXX() + status = VX_FAILURE; + } + } + if (status == VX_SUCCESS) { + MappedData item = { ptr_returned, usage, (ptr_returned != ptr_internal) ? true : false, (ptr_returned != ptr_internal) ? 
*stride : data->u.arr.itemsize }; + data->mapped.push_back(item); + *ptr = ptr_returned; + *stride = item.stride; +#if ENABLE_OPENCL + if (data->opencl_buffer && !(data->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) { + // make sure dirty OpenCL buffers are synched before giving access for read + if (data->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL)) { + // transfer only valid data + vx_size size = data->u.arr.itemsize * data->u.arr.numitems; + if (size > 0) { + cl_int err = clEnqueueReadBuffer(data->ref.context->opencl_cmdq, data->opencl_buffer, CL_TRUE, data->opencl_buffer_offset, size, data->buffer, 0, NULL, NULL); + if (err) { + status = VX_FAILURE; + agoAddLogEntry(&data->ref, status, "ERROR: vxAccessArrayRange: clEnqueueReadBuffer() => %d\n", err); + return status; + } + } + data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED; + } + } +#endif + if (item.used_external_ptr && (usage == VX_READ_ONLY || usage == VX_READ_AND_WRITE)) { + // copy if read is requested with explicit external buffer + vx_uint8 * pSrc = ptr_internal; + vx_uint8 * pDst = ptr_returned; + if (item.stride == data->u.arr.itemsize) { + HafCpu_BinaryCopy_U8_U8(data->u.arr.itemsize * (end - start), ptr_returned, ptr_internal); + } + else { + for (vx_size i = start; i < end; i++, pSrc += data->u.arr.itemsize, pDst += item.stride) { + HafCpu_BinaryCopy_U8_U8(data->u.arr.itemsize, pDst, pSrc); + } + } + } + } + } + } + return status; +} + +/*! +* \brief Commits data back to the Array object. +* +* \details This allows a user to commit data to a sub-range of an Array. +* +* \param [in] arr The reference to the Array. +* \param [in] start The start index. +* \param [in] end The end index. +* \param [in] ptr The user supplied pointer. +* +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_OPTIMIZED_AWAY If the reference is a virtual array and cannot be accessed or committed. +* \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array. +* \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. 
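+* A typical access/commit pair (assumes arr holds num_items \ref vx_keypoint_t elements, with num_items > 0):
+* \code
+* vx_size stride = 0;
+* void * base = NULL;
+* if (vxAccessArrayRange(arr, 0, num_items, &stride, &base, VX_READ_ONLY) == VX_SUCCESS) {
+*     for (vx_size i = 0; i < num_items; i++) {
+*         vx_keypoint_t * kp = (vx_keypoint_t *)((vx_uint8 *)base + i * stride);
+*         // ... inspect kp ...
+*     }
+*     vxCommitArrayRange(arr, 0, num_items, base);
+* }
+* \endcode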
+* +* \ingroup group_array +*/ +VX_API_ENTRY vx_status VX_API_CALL vxCommitArrayRange(vx_array arr, vx_size start, vx_size end, const void *ptr) +{ + AgoData * data = (AgoData *)arr; + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidData(data, VX_TYPE_ARRAY)) { + // check for valid arguments + status = VX_ERROR_INVALID_PARAMETERS; + if (data->isVirtual && !data->buffer) { + status = VX_ERROR_OPTIMIZED_AWAY; + } + else if (ptr && start <= end && end <= data->u.arr.numitems) { + status = VX_SUCCESS; + if (!data->buffer) { + status = VX_FAILURE; + } + else if (!data->mapped.empty()) { + vx_enum usage = VX_READ_ONLY; + bool used_external_ptr = false; + vx_size stride = data->u.arr.itemsize; + for (auto i = data->mapped.begin(); i != data->mapped.end(); i++) { + if (i->ptr == ptr) { + if (start < end) { + usage = i->usage; + used_external_ptr = i->used_external_ptr; + stride = i->stride; + } + data->mapped.erase(i); + break; + } + } + if ((start < end) && (usage == VX_WRITE_ONLY || usage == VX_READ_AND_WRITE)) { + if (used_external_ptr) { + // copy from external buffer + vx_uint8 * pSrc = (vx_uint8 *)ptr; + vx_uint8 * pDst = data->buffer + start * data->u.arr.itemsize; + if (stride == data->u.arr.itemsize) { + HafCpu_BinaryCopy_U8_U8(data->u.arr.itemsize * (end - start), pDst, pSrc); + } + else { + for (vx_size i = start; i < end; i++, pSrc += stride, pDst += data->u.arr.itemsize) { + HafCpu_BinaryCopy_U8_U8(data->u.arr.itemsize, pDst, pSrc); + } + } + } + // update sync flags + data->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK; + data->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT; + } + } + } + } + return status; +} + +/*============================================================================== +META FORMAT +=============================================================================*/ + +/*! \brief Allows a user to set the attributes of a \ref vx_meta_format object in a kernel output validator. +* \param [in] meta The reference to the \ref vx_meta_format object to set. +* \param [in] attribute Use attributes from other objects that match the parameter type or from \ref vx_meta_format_attribute_e. +* \param [in] ptr The input pointer of the value to set on the meta format object. +* \param [in] size The size of the object to which \a ptr points. +* \ingroup group_user_kernels +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS The attribute was set. +* \retval VX_ERROR_INVALID_REFERENCE meta was not a \ref vx_meta_format. +* \retval VX_ERROR_INVALID_PARAMETER size was not correct for the type needed. +* \retval VX_ERROR_NOT_SUPPORTED the object attribute was not supported on the meta format object. +* \retval VX_ERROR_INVALID_TYPE attribute type did not match known meta format type. 
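+* A sketch of typical use inside a user kernel output validator (meta, width and height come from the validator itself):
+* \code
+* vx_df_image format = VX_DF_IMAGE_U8;
+* vxSetMetaFormatAttribute(meta, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format));
+* vxSetMetaFormatAttribute(meta, VX_IMAGE_ATTRIBUTE_WIDTH, &width, sizeof(width));
+* vxSetMetaFormatAttribute(meta, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height));
+* \endcode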
+*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetMetaFormatAttribute(vx_meta_format meta, vx_enum attribute, const void *ptr, vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (meta && meta->type == VX_TYPE_META_FORMAT && agoIsValidReference(&meta->data.ref)) { + AgoData * data = &meta->data; + status = VX_ERROR_INVALID_PARAMETERS; + if (ptr) { + switch (attribute) + { + /**********************************************************************/ + case VX_META_FORMAT_ATTRIBUTE_DELTA_RECTANGLE: + if (size == sizeof(vx_delta_rectangle_t)) { + data->delta = *(vx_delta_rectangle_t *)ptr; + status = VX_SUCCESS; + } + break; + /**********************************************************************/ + case VX_IMAGE_ATTRIBUTE_FORMAT: + if (size == sizeof(vx_df_image) && data->ref.type == VX_TYPE_IMAGE) { + data->u.img.format = *(vx_df_image *)ptr; + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_HEIGHT: + if (size == sizeof(vx_uint32) && data->ref.type == VX_TYPE_IMAGE) { + data->u.img.height = *(vx_uint32 *)ptr; + status = VX_SUCCESS; + } + break; + case VX_IMAGE_ATTRIBUTE_WIDTH: + if (size == sizeof(vx_uint32) && data->ref.type == VX_TYPE_IMAGE) { + data->u.img.width = *(vx_uint32 *)ptr; + status = VX_SUCCESS; + } + break; +#if ENABLE_OPENCL + case VX_IMAGE_ATTRIBUTE_AMD_ENABLE_USER_BUFFER_OPENCL: + if (size == sizeof(vx_bool) && data->ref.type == VX_TYPE_IMAGE) { + data->u.img.enableUserBufferOpenCL = *(vx_bool *)ptr; + status = VX_SUCCESS; + } + break; +#endif + /**********************************************************************/ + case VX_ARRAY_ATTRIBUTE_CAPACITY: + if (size == sizeof(vx_size) && data->ref.type == VX_TYPE_ARRAY) { + data->u.arr.capacity = *(vx_size *)ptr; + status = VX_SUCCESS; + } + break; + case VX_ARRAY_ATTRIBUTE_ITEMTYPE: + if (size == sizeof(vx_enum) && data->ref.type == VX_TYPE_ARRAY) { + data->u.arr.itemtype = *(vx_enum *)ptr; + status = VX_SUCCESS; + } + break; + /**********************************************************************/ + case VX_PYRAMID_ATTRIBUTE_FORMAT: + if (size == sizeof(vx_df_image) && data->ref.type == VX_TYPE_PYRAMID) { + data->u.pyr.format = *(vx_df_image *)ptr; + status = VX_SUCCESS; + } + break; + case VX_PYRAMID_ATTRIBUTE_HEIGHT: + if (size == sizeof(vx_uint32) && data->ref.type == VX_TYPE_PYRAMID) { + data->u.pyr.height = *(vx_uint32 *)ptr; + status = VX_SUCCESS; + } + break; + case VX_PYRAMID_ATTRIBUTE_WIDTH: + if (size == sizeof(vx_uint32) && data->ref.type == VX_TYPE_PYRAMID) { + data->u.pyr.width = *(vx_uint32 *)ptr; + status = VX_SUCCESS; + } + break; + case VX_PYRAMID_ATTRIBUTE_LEVELS: + if (size == sizeof(vx_size) && data->ref.type == VX_TYPE_PYRAMID) { + data->u.pyr.levels = *(vx_size *)ptr; + status = VX_SUCCESS; + } + break; + case VX_PYRAMID_ATTRIBUTE_SCALE: + if (size == sizeof(vx_float32) && data->ref.type == VX_TYPE_PYRAMID) { + data->u.pyr.scale = *(vx_float32 *)ptr; + status = VX_SUCCESS; + } + break; + /**********************************************************************/ + case VX_SCALAR_ATTRIBUTE_TYPE: + if (size == sizeof(vx_enum) && data->ref.type == VX_TYPE_SCALAR) { + data->u.scalar.type = *(vx_enum *)ptr; + status = VX_SUCCESS; + } + break; + /**********************************************************************/ + default: + status = VX_ERROR_NOT_SUPPORTED; + break; + } + } + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxSetReferenceName(vx_reference ref, const char *name) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidReference(ref) && 
((ref->type >= VX_TYPE_DELAY && ref->type <= VX_TYPE_REMAP) || (ref->type >= VX_TYPE_VENDOR_OBJECT_START && ref->type <= VX_TYPE_VENDOR_OBJECT_END))) { + AgoData * data = (AgoData *)ref; + data->name = name; + status = VX_SUCCESS; + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxGetReferenceName(vx_reference ref, vx_char name[], vx_size size) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidReference(ref)) { + if ((ref->type >= VX_TYPE_DELAY && ref->type <= VX_TYPE_REMAP) || (ref->type >= VX_TYPE_VENDOR_OBJECT_START && ref->type <= VX_TYPE_VENDOR_OBJECT_END)) { + strncpy(name, ((AgoData *)ref)->name.c_str(), size); + status = VX_SUCCESS; + } + else if (ref->type == VX_TYPE_KERNEL) { + strncpy(name, ((AgoKernel *)ref)->name, size); + status = VX_SUCCESS; + } + else if (ref->type == VX_TYPE_NODE) { + strncpy(name, ((AgoNode *)ref)->akernel->name, size); + status = VX_SUCCESS; + } + } + return status; +} diff --git a/openvx/api/vx_nodes.cpp b/openvx/api/vx_nodes.cpp new file mode 100644 index 0000000..9b4a2b7 --- /dev/null +++ b/openvx/api/vx_nodes.cpp @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2012-2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + */ + +/*! + * \file + * \brief The Graph Mode Interface for all Base Kernels. 
+ * \author Erik Rainey + */ + +#include "ago_internal.h" + +static vx_node vxCreateNodeByStructure(vx_graph graph, + vx_enum kernelenum, + vx_reference params[], + vx_uint32 num) +{ + vx_status status = VX_SUCCESS; + vx_node node = 0; + vx_context context = vxGetContext((vx_reference)graph); + vx_kernel kernel = vxGetKernelByEnum(context, kernelenum); + if (kernel) + { + node = vxCreateGenericNode(graph, kernel); + if (node) + { + vx_uint32 p = 0; + for (p = 0; p < num; p++) + { + if (params[p]) { + status = vxSetParameterByIndex(node, + p, + params[p]); + if (status != VX_SUCCESS) + { + vxAddLogEntry((vx_reference)graph, status, "Kernel %d Parameter %u is invalid.\n", kernelenum, p); + vxReleaseNode(&node); + node = 0; + break; + } + } + } + } + else + { + vxAddLogEntry((vx_reference)graph, VX_ERROR_INVALID_PARAMETERS, "Failed to create node with kernel enum %d\n", kernelenum); + status = VX_ERROR_NO_MEMORY; + } + vxReleaseKernel(&kernel); + } + else + { + vxAddLogEntry((vx_reference)graph, VX_ERROR_INVALID_PARAMETERS, "failed to retrieve kernel enum %d\n", kernelenum); + status = VX_ERROR_NOT_SUPPORTED; + } + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxColorConvertNode(vx_graph graph, vx_image input, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, VX_KERNEL_COLOR_CONVERT, params, dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxChannelExtractNode(vx_graph graph, + vx_image input, + vx_enum channelNum, + vx_image output) +{ + vx_context context = vxGetContext((vx_reference)graph); + vx_scalar scalar = vxCreateScalar(context, VX_TYPE_ENUM, &channelNum); + vx_reference params[] = { + (vx_reference)input, + (vx_reference)scalar, + (vx_reference)output, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_CHANNEL_EXTRACT, + params, + dimof(params)); + vxReleaseScalar(&scalar); // node hold reference + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxChannelCombineNode(vx_graph graph, + vx_image plane0, + vx_image plane1, + vx_image plane2, + vx_image plane3, + vx_image output) +{ + vx_reference params[] = { + (vx_reference)plane0, + (vx_reference)plane1, + (vx_reference)plane2, + (vx_reference)plane3, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_CHANNEL_COMBINE, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxSobel3x3Node(vx_graph graph, vx_image input, vx_image output_x, vx_image output_y) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output_x, + (vx_reference)output_y, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_SOBEL_3x3, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxMagnitudeNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image mag) +{ + vx_reference params[] = { + (vx_reference)grad_x, + (vx_reference)grad_y, + (vx_reference)mag, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_MAGNITUDE, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxPhaseNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image orientation) +{ + vx_reference params[] = { + (vx_reference)grad_x, + (vx_reference)grad_y, + (vx_reference)orientation, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_PHASE, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxScaleImageNode(vx_graph graph, vx_image src, vx_image dst, vx_enum type) +{ + vx_context context = vxGetContext((vx_reference)graph); + vx_scalar stype = 
vxCreateScalar(context, VX_TYPE_ENUM, &type); + vx_reference params[] = { + (vx_reference)src, + (vx_reference)dst, + (vx_reference)stype, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_SCALE_IMAGE, + params, + dimof(params)); + vxReleaseScalar(&stype); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxTableLookupNode(vx_graph graph, vx_image input, vx_lut lut, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)lut, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_TABLE_LOOKUP, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxHistogramNode(vx_graph graph, vx_image input, vx_distribution distribution) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)distribution, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_HISTOGRAM, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxEqualizeHistNode(vx_graph graph, vx_image input, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_EQUALIZE_HISTOGRAM, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxAbsDiffNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out) +{ + vx_reference params[] = { + (vx_reference)in1, + (vx_reference)in2, + (vx_reference)out, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_ABSDIFF, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxMeanStdDevNode(vx_graph graph, vx_image input, vx_scalar mean, vx_scalar stddev) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)mean, + (vx_reference)stddev, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_MEAN_STDDEV, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxThresholdNode(vx_graph graph, vx_image input, vx_threshold thesh, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)thesh, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_THRESHOLD, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxIntegralImageNode(vx_graph graph, vx_image input, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_INTEGRAL_IMAGE, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxErode3x3Node(vx_graph graph, vx_image input, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_ERODE_3x3, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxDilate3x3Node(vx_graph graph, vx_image input, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_DILATE_3x3, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxMedian3x3Node(vx_graph graph, vx_image input, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_MEDIAN_3x3, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxBox3x3Node(vx_graph graph, vx_image input, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_BOX_3x3, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node 
VX_API_CALL vxGaussian3x3Node(vx_graph graph, vx_image input, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_GAUSSIAN_3x3, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxConvolveNode(vx_graph graph, vx_image input, vx_convolution conv, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)conv, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_CUSTOM_CONVOLUTION, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxGaussianPyramidNode(vx_graph graph, vx_image input, vx_pyramid gaussian) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)gaussian, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_GAUSSIAN_PYRAMID, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxAccumulateImageNode(vx_graph graph, vx_image input, vx_image accum) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)accum, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_ACCUMULATE, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxAccumulateWeightedImageNode(vx_graph graph, vx_image input, vx_scalar alpha, vx_image accum) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)alpha, + (vx_reference)accum, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_ACCUMULATE_WEIGHTED, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxAccumulateSquareImageNode(vx_graph graph, vx_image input, vx_scalar scalar, vx_image accum) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)scalar, + (vx_reference)accum, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_ACCUMULATE_SQUARE, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxMinMaxLocNode(vx_graph graph, + vx_image input, + vx_scalar minVal, vx_scalar maxVal, + vx_array minLoc, vx_array maxLoc, + vx_scalar minCount, vx_scalar maxCount) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)minVal, + (vx_reference)maxVal, + (vx_reference)minLoc, + (vx_reference)maxLoc, + (vx_reference)minCount, + (vx_reference)maxCount, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_MINMAXLOC, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxConvertDepthNode(vx_graph graph, vx_image input, vx_image output, vx_enum policy, vx_scalar shift) +{ + vx_scalar pol = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_ENUM, &policy); + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + (vx_reference)pol, + (vx_reference)shift, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_CONVERTDEPTH, + params, + dimof(params)); + vxReleaseScalar(&pol); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxCannyEdgeDetectorNode(vx_graph graph, vx_image input, vx_threshold hyst, + vx_int32 gradient_size, vx_enum norm_type, + vx_image output) +{ + vx_scalar gs = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_INT32, &gradient_size); + vx_scalar nt = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_ENUM, &norm_type); + vx_reference params[] = { + (vx_reference)input, + (vx_reference)hyst, + (vx_reference)gs, + (vx_reference)nt, + (vx_reference)output, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_CANNY_EDGE_DETECTOR, + params, + dimof(params)); + vxReleaseScalar(&gs); + vxReleaseScalar(&nt); + return node; +} + +VX_API_ENTRY 
vx_node VX_API_CALL vxAndNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out) +{ + vx_reference params[] = { + (vx_reference)in1, + (vx_reference)in2, + (vx_reference)out, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_AND, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxOrNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out) +{ + vx_reference params[] = { + (vx_reference)in1, + (vx_reference)in2, + (vx_reference)out, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_OR, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxXorNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out) +{ + vx_reference params[] = { + (vx_reference)in1, + (vx_reference)in2, + (vx_reference)out, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_XOR, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxNotNode(vx_graph graph, vx_image input, vx_image output) +{ + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + }; + return vxCreateNodeByStructure(graph, + VX_KERNEL_NOT, + params, + dimof(params)); +} + +VX_API_ENTRY vx_node VX_API_CALL vxMultiplyNode(vx_graph graph, vx_image in1, vx_image in2, vx_scalar scale, vx_enum overflow_policy, vx_enum rounding_policy, vx_image out) +{ + vx_context context = vxGetContext((vx_reference)graph); + vx_scalar spolicy = vxCreateScalar(context, VX_TYPE_ENUM, &overflow_policy); + vx_scalar rpolicy = vxCreateScalar(context, VX_TYPE_ENUM, &rounding_policy); + vx_reference params[] = { + (vx_reference)in1, + (vx_reference)in2, + (vx_reference)scale, + (vx_reference)spolicy, + (vx_reference)rpolicy, + (vx_reference)out, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_MULTIPLY, + params, + dimof(params)); + vxReleaseScalar(&spolicy); + vxReleaseScalar(&rpolicy); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxAddNode(vx_graph graph, vx_image in1, vx_image in2, vx_enum policy, vx_image out) +{ + vx_context context = vxGetContext((vx_reference)graph); + vx_scalar spolicy = vxCreateScalar(context, VX_TYPE_ENUM, &policy); + vx_reference params[] = { + (vx_reference)in1, + (vx_reference)in2, + (vx_reference)spolicy, + (vx_reference)out, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_ADD, + params, + dimof(params)); + vxReleaseScalar(&spolicy); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxSubtractNode(vx_graph graph, vx_image in1, vx_image in2, vx_enum policy, vx_image out) +{ + vx_context context = vxGetContext((vx_reference)graph); + vx_scalar spolicy = vxCreateScalar(context, VX_TYPE_ENUM, &policy); + vx_reference params[] = { + (vx_reference)in1, + (vx_reference)in2, + (vx_reference)spolicy, + (vx_reference)out, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_SUBTRACT, + params, + dimof(params)); + vxReleaseScalar(&spolicy); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxWarpAffineNode(vx_graph graph, vx_image input, vx_matrix matrix, vx_enum type, vx_image output) +{ + vx_context context = vxGetContext((vx_reference)graph); + vx_scalar stype = vxCreateScalar(context, VX_TYPE_ENUM, &type); + vx_reference params[] = { + (vx_reference)input, + (vx_reference)matrix, + (vx_reference)stype, + (vx_reference)output, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_WARP_AFFINE, + params, + dimof(params)); + vxReleaseScalar(&stype); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxWarpPerspectiveNode(vx_graph graph, vx_image input, vx_matrix matrix, vx_enum type, 
vx_image output) +{ + vx_context context = vxGetContext((vx_reference)graph); + vx_scalar stype = vxCreateScalar(context, VX_TYPE_ENUM, &type); + vx_reference params[] = { + (vx_reference)input, + (vx_reference)matrix, + (vx_reference)stype, + (vx_reference)output, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_WARP_PERSPECTIVE, + params, + dimof(params)); + vxReleaseScalar(&stype); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxHarrisCornersNode(vx_graph graph, + vx_image input, + vx_scalar strength_thresh, + vx_scalar min_distance, + vx_scalar sensitivity, + vx_int32 gradient_size, + vx_int32 block_size, + vx_array corners, + vx_scalar num_corners) +{ + vx_scalar win = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_INT32, &gradient_size); + vx_scalar blk = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_INT32, &block_size); + vx_reference params[] = { + (vx_reference)input, + (vx_reference)strength_thresh, + (vx_reference)min_distance, + (vx_reference)sensitivity, + (vx_reference)win, + (vx_reference)blk, + (vx_reference)corners, + (vx_reference)num_corners, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_HARRIS_CORNERS, + params, + dimof(params)); + vxReleaseScalar(&win); + vxReleaseScalar(&blk); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxFastCornersNode(vx_graph graph, vx_image input, vx_scalar strength_thresh, vx_bool nonmax_supression, vx_array corners, vx_scalar num_corners) +{ + vx_scalar nonmax = vxCreateScalar(vxGetContext((vx_reference)graph),VX_TYPE_BOOL, &nonmax_supression); + vx_reference params[] = { + (vx_reference)input, + (vx_reference)strength_thresh, + (vx_reference)nonmax, + (vx_reference)corners, + (vx_reference)num_corners, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_FAST_CORNERS, + params, + dimof(params)); + vxReleaseScalar(&nonmax); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxOpticalFlowPyrLKNode(vx_graph graph, + vx_pyramid old_images, + vx_pyramid new_images, + vx_array old_points, + vx_array new_points_estimates, + vx_array new_points, + vx_enum termination, + vx_scalar epsilon, + vx_scalar num_iterations, + vx_scalar use_initial_estimate, + vx_size window_dimension) +{ + vx_scalar term = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_ENUM, &termination); + vx_scalar winsize = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_SIZE, &window_dimension); + vx_reference params[] = { + (vx_reference)old_images, + (vx_reference)new_images, + (vx_reference)old_points, + (vx_reference)new_points_estimates, + (vx_reference)new_points, + (vx_reference)term, + (vx_reference)epsilon, + (vx_reference)num_iterations, + (vx_reference)use_initial_estimate, + (vx_reference)winsize, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_OPTICAL_FLOW_PYR_LK, + params, + dimof(params)); + vxReleaseScalar(&term); + vxReleaseScalar(&winsize); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxRemapNode(vx_graph graph, + vx_image input, + vx_remap table, + vx_enum policy, + vx_image output) +{ + vx_scalar spolicy = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_ENUM, &policy); + vx_reference params[] = { + (vx_reference)input, + (vx_reference)table, + (vx_reference)spolicy, + (vx_reference)output, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_REMAP, + params, + dimof(params)); + vxReleaseScalar(&spolicy); + return node; +} + +VX_API_ENTRY vx_node VX_API_CALL vxHalfScaleGaussianNode(vx_graph graph, vx_image 
input, vx_image output, vx_int32 kernel_size) +{ + vx_scalar ksize = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_INT32, &kernel_size); + vx_reference params[] = { + (vx_reference)input, + (vx_reference)output, + (vx_reference)ksize, + }; + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_HALFSCALE_GAUSSIAN, + params, + dimof(params)); + vxReleaseScalar(&ksize); + return node; +} diff --git a/openvx/api/vxu.cpp b/openvx/api/vxu.cpp new file mode 100644 index 0000000..ace91da --- /dev/null +++ b/openvx/api/vxu.cpp @@ -0,0 +1,958 @@ +/* + * Copyright (c) 2012-2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + */ + +#include +#include + +VX_API_ENTRY vx_status VX_API_CALL vxuColorConvert(vx_context context, vx_image src, vx_image dst) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxColorConvertNode(graph, src, dst); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuChannelExtract(vx_context context, vx_image src, vx_enum channel, vx_image dst) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxChannelExtractNode(graph, src, channel, dst); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuChannelCombine(vx_context context, + vx_image plane0, + vx_image plane1, + vx_image plane2, + vx_image plane3, + vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxChannelCombineNode(graph, plane0, plane1, plane2, plane3, output); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +static vx_status vx_useImmediateBorderMode(vx_context context, vx_node node) +{ + vx_border_mode_t border; + vx_status status = vxQueryContext(context, VX_CONTEXT_ATTRIBUTE_IMMEDIATE_BORDER_MODE, &border, sizeof(border)); + if (status == VX_SUCCESS) + status = vxSetNodeAttribute(node, VX_NODE_ATTRIBUTE_BORDER_MODE, 
&border, sizeof(border)); + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuSobel3x3(vx_context context, vx_image src, vx_image output_x, vx_image output_y) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxSobel3x3Node(graph, src, output_x, output_y); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuMagnitude(vx_context context, vx_image grad_x, vx_image grad_y, vx_image dst) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxMagnitudeNode(graph, grad_x, grad_y, dst); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuPhase(vx_context context, vx_image grad_x, vx_image grad_y, vx_image dst) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxPhaseNode(graph, grad_x, grad_y, dst); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuScaleImage(vx_context context, vx_image src, vx_image dst, vx_enum type) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxScaleImageNode(graph, src, dst, type); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuTableLookup(vx_context context, vx_image input, vx_lut lut, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxTableLookupNode(graph, input, lut, output); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuHistogram(vx_context context, vx_image input, vx_distribution distribution) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxHistogramNode(graph, input, distribution); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuEqualizeHist(vx_context context, vx_image input, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxEqualizeHistNode(graph, input, output); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL 
vxuAbsDiff(vx_context context, vx_image in1, vx_image in2, vx_image out) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxAbsDiffNode(graph, in1, in2, out); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuMeanStdDev(vx_context context, vx_image input, vx_float32 *mean, vx_float32 *stddev) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_scalar s_mean = vxCreateScalar(context, VX_TYPE_FLOAT32, NULL); + vx_scalar s_stddev = vxCreateScalar(context, VX_TYPE_FLOAT32, NULL); + vx_node node = vxMeanStdDevNode(graph, input, s_mean, s_stddev); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + if(mean) vxReadScalarValue(s_mean, mean); + if(stddev) vxReadScalarValue(s_stddev, stddev); + } + vxReleaseNode(&node); + } + vxReleaseScalar(&s_mean); + vxReleaseScalar(&s_stddev); + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuThreshold(vx_context context, vx_image input, vx_threshold thresh, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxThresholdNode(graph, input, thresh, output); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuIntegralImage(vx_context context, vx_image input, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxIntegralImageNode(graph, input, output); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuErode3x3(vx_context context, vx_image input, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxErode3x3Node(graph, input, output); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuDilate3x3(vx_context context, vx_image input, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxDilate3x3Node(graph, input, output); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuMedian3x3(vx_context context, vx_image input, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxMedian3x3Node(graph, input, output); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == 
VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuBox3x3(vx_context context, vx_image input, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxBox3x3Node(graph, input, output); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuGaussian3x3(vx_context context, vx_image input, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxGaussian3x3Node(graph, input, output); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuConvolve(vx_context context, vx_image input, vx_convolution conv, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxConvolveNode(graph, input, conv, output); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuGaussianPyramid(vx_context context, vx_image input, vx_pyramid gaussian) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxGaussianPyramidNode(graph, input, gaussian); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuAccumulateImage(vx_context context, vx_image input, vx_image accum) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxAccumulateImageNode(graph, input, accum); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuAccumulateWeightedImage(vx_context context, vx_image input, vx_scalar scale, vx_image accum) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxAccumulateWeightedImageNode(graph, input, scale, accum); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuAccumulateSquareImage(vx_context context, vx_image input, vx_scalar scale, vx_image accum) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if 
(graph) + { + vx_node node = vxAccumulateSquareImageNode(graph, input, scale, accum); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuMinMaxLoc(vx_context context, vx_image input, + vx_scalar minVal, vx_scalar maxVal, + vx_array minLoc, vx_array maxLoc, + vx_scalar minCount, vx_scalar maxCount) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxMinMaxLocNode(graph, input, minVal, maxVal, minLoc, maxLoc, minCount, maxCount); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuConvertDepth(vx_context context, vx_image input, vx_image output, vx_enum policy, vx_int32 shift) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + vx_scalar sshift = vxCreateScalar(context, VX_TYPE_INT32, &shift); + if (graph) + { + vx_node node = vxConvertDepthNode(graph, input, output, policy, sshift); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + vxReleaseScalar(&sshift); + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuCannyEdgeDetector(vx_context context, vx_image input, vx_threshold hyst, + vx_int32 gradient_size, vx_enum norm_type, + vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxCannyEdgeDetectorNode(graph, input, hyst, gradient_size, norm_type, output); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuHalfScaleGaussian(vx_context context, vx_image input, vx_image output, vx_int32 kernel_size) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxHalfScaleGaussianNode(graph, input, output, kernel_size); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuAnd(vx_context context, vx_image in1, vx_image in2, vx_image out) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxAndNode(graph, in1, in2, out); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuOr(vx_context context, vx_image in1, vx_image in2, vx_image out) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxOrNode(graph, in1, in2, out); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY 
vx_status VX_API_CALL vxuXor(vx_context context, vx_image in1, vx_image in2, vx_image out) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxXorNode(graph, in1, in2, out); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuNot(vx_context context, vx_image input, vx_image out) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxNotNode(graph, input, out); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuMultiply(vx_context context, vx_image in1, vx_image in2, vx_float32 scale, vx_enum overflow_policy, vx_enum rounding_policy, vx_image out) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + vx_scalar sscale = vxCreateScalar(context, VX_TYPE_FLOAT32, &scale); + if (graph) + { + vx_node node = vxMultiplyNode(graph, in1, in2, sscale, overflow_policy, rounding_policy, out); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + vxReleaseScalar(&sscale); + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuAdd(vx_context context, vx_image in1, vx_image in2, vx_enum policy, vx_image out) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxAddNode(graph, in1, in2, policy, out); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuSubtract(vx_context context, vx_image in1, vx_image in2, vx_enum policy, vx_image out) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxSubtractNode(graph, in1, in2, policy, out); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuWarpAffine(vx_context context, vx_image input, vx_matrix matrix, vx_enum type, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxWarpAffineNode(graph, input, matrix, type, output); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuWarpPerspective(vx_context context, vx_image input, vx_matrix matrix, vx_enum type, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxWarpPerspectiveNode(graph, input, matrix, type, output); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { 
+ status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuHarrisCorners(vx_context context, vx_image input, + vx_scalar strength_thresh, + vx_scalar min_distance, + vx_scalar sensitivity, + vx_int32 gradient_size, + vx_int32 block_size, + vx_array corners, + vx_scalar num_corners) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxHarrisCornersNode(graph, input, strength_thresh, min_distance, sensitivity, gradient_size, block_size, corners, num_corners); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuFastCorners(vx_context context, vx_image input, vx_scalar sens, vx_bool nonmax, vx_array corners, vx_scalar num_corners) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxFastCornersNode(graph, input, sens, nonmax, corners, num_corners); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuOpticalFlowPyrLK(vx_context context, vx_pyramid old_images, + vx_pyramid new_images, + vx_array old_points, + vx_array new_points_estimates, + vx_array new_points, + vx_enum termination, + vx_scalar epsilon, + vx_scalar num_iterations, + vx_scalar use_initial_estimate, + vx_size window_dimension) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxOpticalFlowPyrLKNode(graph, old_images, new_images, old_points,new_points_estimates, new_points, + termination,epsilon,num_iterations,use_initial_estimate,window_dimension); + if (node) + { + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} + +VX_API_ENTRY vx_status VX_API_CALL vxuRemap(vx_context context, vx_image input, vx_remap table, vx_enum policy, vx_image output) +{ + vx_status status = VX_FAILURE; + vx_graph graph = vxCreateGraph(context); + if (graph) + { + vx_node node = vxRemapNode(graph, input, table, policy, output); + if (node) + { + status = vx_useImmediateBorderMode(context, node); + if (status == VX_SUCCESS) + status = vxVerifyGraph(graph); + if (status == VX_SUCCESS) + { + status = vxProcessGraph(graph); + } + vxReleaseNode(&node); + } + vxReleaseGraph(&graph); + } + return status; +} diff --git a/openvx/include/VX/vx.h b/openvx/include/VX/vx.h new file mode 100644 index 0000000..b393644 --- /dev/null +++ b/openvx/include/VX/vx.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2012-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + */ + +#ifndef _OPENVX_H_ +#define _OPENVX_H_ + +/*! + * \file + * \brief The top level OpenVX Header. + */ + +/*! \brief Defines the maximum number of characters in a implementation string. + * \ingroup group_context + */ +#define VX_MAX_IMPLEMENTATION_NAME (64) + +/*! \brief Defines the maximum string length of a kernel name to be added to OpenVX. + * \ingroup group_kernel + */ +#define VX_MAX_KERNEL_NAME (256) + +/*! \brief Defines the maximum length of a message buffer to copy from the log. + * \ingroup group_basic_features + */ +#define VX_MAX_LOG_MESSAGE_LEN (1024) + +#include +#include +#include +#include +#include + +/*! Defines the major version number macro. + * \ingroup group_basic_features + */ +#define VX_VERSION_MAJOR(x) ((x & 0xFF) << 8) + +/*! Defines the minor version number macro. + * \ingroup group_basic_features + */ +#define VX_VERSION_MINOR(x) ((x & 0xFF) << 0) + +/*! \brief Defines the predefined version number for 1.0. + * \ingroup group_basic_features + */ +#define VX_VERSION_1_0 (VX_VERSION_MAJOR(1) | VX_VERSION_MINOR(0)) + +/*! Defines the OpenVX Version Number. + * \ingroup group_basic_features + */ +#define VX_VERSION VX_VERSION_1_0 + +#endif diff --git a/openvx/include/VX/vx_api.h b/openvx/include/VX/vx_api.h new file mode 100644 index 0000000..9758f40 --- /dev/null +++ b/openvx/include/VX/vx_api.h @@ -0,0 +1,1902 @@ +/* + * Copyright (c) 2012-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + */ + +#ifndef _OPENVX_API_H_ +#define _OPENVX_API_H_ + +/*! + * \file + * \brief The API definition for OpenVX. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================== + CONTEXT + =============================================================================*/ + +/*! \brief Creates a \ref vx_context. + * \details This creates a top-level object context for OpenVX. + * \note This is required to do anything else. + * \returns The reference to the implementation context \ref vx_context. Any possible errors + * preventing a successful creation should be checked using \ref vxGetStatus. + * \ingroup group_context + * \post \ref vxReleaseContext + */ +VX_API_ENTRY vx_context VX_API_CALL vxCreateContext(); + +/*! \brief Releases the OpenVX object context. + * \details All reference counted objects are garbage-collected by the return of this call. + * No calls are possible using the parameter context after the context has been + * released until a new reference from \ref vxCreateContext is returned. + * All outstanding references to OpenVX objects from this context are invalid + * after this call. + * \param [in] context The pointer to the reference to the context. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If context is not a \ref vx_context. + * \ingroup group_context + * \pre \ref vxCreateContext + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseContext(vx_context *context); + +/*! \brief Retrieves the context from any reference from within a context. + * \param [in] reference The reference from which to extract the context. + * \ingroup group_context + * \return The overall context that created the particular + * reference. + */ +VX_API_ENTRY vx_context VX_API_CALL vxGetContext(vx_reference reference); + +/*! \brief Queries the context for some specific information. + * \param [in] context The reference to the context. + * \param [in] attribute The attribute to query. Use a \ref vx_context_attribute_e. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the context is not a \ref vx_context. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \retval VX_ERROR_NOT_SUPPORTED If the attribute is not supported on this implementation. + * \ingroup group_context + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Sets an attribute on the context. + * \param [in] context The handle to the overall context. + * \param [in] attribute The attribute to set from \ref vx_context_attribute_e. + * \param [in] ptr The pointer to the data to which to set the attribute. + * \param [in] size The size in bytes of the data to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. 
+ * \retval VX_ERROR_INVALID_REFERENCE If the context is not a \ref vx_context. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \retval VX_ERROR_NOT_SUPPORTED If the attribute is not settable. + * \ingroup group_context + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetContextAttribute(vx_context context, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Provides a generic API to give platform-specific hints to the implementation. + * \param [in] reference The reference to the object to hint at. + * This could be \ref vx_context, \ref vx_graph, \ref vx_node, \ref vx_image, \ref vx_array, or any other reference. + * \param [in] hint A \ref vx_hint_e \a hint to give the OpenVX context. This is a platform-specific optimization or implementation mechanism. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No error. + * \retval VX_ERROR_INVALID_REFERENCE If context or reference is invalid. + * \retval VX_ERROR_NOT_SUPPORTED If the hint is not supported. + * \ingroup group_hint + */ +VX_API_ENTRY vx_status VX_API_CALL vxHint(vx_reference reference, vx_enum hint); + +/*! \brief Provides a generic API to give platform-specific directives to the implementations. + * \param [in] reference The reference to the object to set the directive on. + * This could be \ref vx_context, \ref vx_graph, \ref vx_node, \ref vx_image, \ref vx_array, or any other reference. + * \param [in] directive The directive to set. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No error. + * \retval VX_ERROR_INVALID_REFERENCE If context or reference is invalid. + * \retval VX_ERROR_NOT_SUPPORTED If the directive is not supported. + * \ingroup group_directive + */ +VX_API_ENTRY vx_status VX_API_CALL vxDirective(vx_reference reference, vx_enum directive); + +/*! \brief Provides a generic API to return status values from Object constructors if they + * fail. + * \note Users do not need to strictly check every object creator as the errors + * should properly propogate and be detected during verification time or run-time. + * \code + * vx_image img = vxCreateImage(context, 639, 480, VX_DF_IMAGE_UYVY); + * vx_status status = vxGetStatus((vx_reference)img); + * // status == VX_ERROR_INVALID_DIMENSIONS + * vxReleaseImage(&img); + * \endcode + * \pre Appropriate Object Creator function. + * \post Appropriate Object Release function. + * \param [in] reference The reference to check for construction errors. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No error. + * \retval * Some error occurred, please check enumeration list and constructor. + * \ingroup group_basic_features + */ +VX_API_ENTRY vx_status VX_API_CALL vxGetStatus(vx_reference reference); + +/*! + * \brief Registers user-defined structures to the context. + * \param [in] context The reference to the implementation context. + * \param [in] size The size of user struct in bytes. + * \return A \ref vx_enum value that is a type given to the User + * to refer to their custom structure when declaring a \ref vx_array + * of that structure. + * \retval VX_TYPE_INVALID If the namespace of types has been exhausted. + * \note This call should only be used once within the lifetime of a context for + * a specific structure. 
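+ *
+ * A minimal sketch of the intended pattern (the struct layout and capacity below are
+ * illustrative assumptions, not part of the original header):
+ * \code
+ * typedef struct { vx_int32 x, y; vx_float32 score; } my_point_t;
+ * vx_enum my_point_type = vxRegisterUserStruct(context, sizeof(my_point_t));
+ * vx_array points = vxCreateArray(context, my_point_type, 100);  // array of user structs
+ * \endcode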
+ * + * \snippet vx_arrayrange.c array define + * \ingroup group_adv_array + */ +VX_API_ENTRY vx_enum VX_API_CALL vxRegisterUserStruct(vx_context context, vx_size size); + +/*============================================================================== + IMAGE + =============================================================================*/ + +/*! \brief Creates an opaque reference to an image buffer. + * \details Not guaranteed to exist until the \ref vx_graph containing it has been verified. + * \param [in] context The reference to the implementation context. + * \param [in] width The image width in pixels. + * \param [in] height The image height in pixels. + * \param [in] color The VX_DF_IMAGE (\ref vx_df_image_e) code that represents the format of the image and the color space. + * \returns An image reference \ref vx_image. Any possible errors preventing a successful + * creation should be checked using \ref vxGetStatus. + * \see vxAccessImagePatch to obtain direct memory access to the image data. + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateImage(vx_context context, vx_uint32 width, vx_uint32 height, vx_df_image color); + +/*! \brief Creates an image from another image given a rectangle. This second + * reference refers to the data in the original image. Updates to this image + * updates the parent image. The rectangle must be defined within the pixel space + * of the parent image. + * \param [in] img The reference to the parent image. + * \param [in] rect The region of interest rectangle. Must contain points within + * the parent image pixel space. + * \returns An image reference \ref vx_image to the sub-image. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateImageFromROI(vx_image img, const vx_rectangle_t *rect); + +/*! \brief Creates a reference to an image object that has a singular, + * uniform value in all pixels. + * \details The value pointer must reflect the specific format of the desired + * image. For example: + * | Color | Value Ptr | + * |:------------|:-----------| + * | \ref VX_DF_IMAGE_U8 | vx_uint8 * | + * | \ref VX_DF_IMAGE_S16 | vx_int16 * | + * | \ref VX_DF_IMAGE_U16 | vx_uint16 *| + * | \ref VX_DF_IMAGE_S32 | vx_int32 * | + * | \ref VX_DF_IMAGE_U32 | vx_uint32 *| + * | \ref VX_DF_IMAGE_RGB | vx_uint8 pixel[3] in R, G, B order | + * | \ref VX_DF_IMAGE_RGBX | vx_uint8 pixels[4] | + * | Any YUV | vx_uint8 pixel[3] in Y, U, V order | + * + * \param [in] context The reference to the implementation context. + * \param [in] width The image width in pixels. + * \param [in] height The image height in pixels. + * \param [in] color The VX_DF_IMAGE (\ref vx_df_image_e) code that represents the format of the image and the color space. + * \param [in] value The pointer to the pixel value to which to set all pixels. + * \returns An image reference \ref vx_image. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \see vxAccessImagePatch to obtain direct memory access to the image data. + * \note \ref vxAccessImagePatch and \ref vxCommitImagePatch may be called with + * a uniform image reference. + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateUniformImage(vx_context context, vx_uint32 width, vx_uint32 height, vx_df_image color, const void *value); + +/*! \brief Creates an opaque reference to an image buffer with no direct + * user access. 
This function allows setting the image width, height, or format. + * \details Virtual data objects allow users to connect various nodes within a + * graph via data references without access to that data, but they also permit the + * implementation to take maximum advantage of possible optimizations. Use this + * API to create a data reference to link two or more nodes together when the + * intermediate data are not required to be accessed by outside entities. This API + * in particular allows the user to define the image format of the data without + * requiring the exact dimensions. Virtual objects are scoped within the graph + * they are declared a part of, and can't be shared outside of this scope. + * All of the following constructions of virtual images are valid. + * \code + * vx_context context = vxCreateContext(); + * vx_graph graph = vxCreateGraph(context); + * vx_image virt[] = { + * vxCreateVirtualImage(graph, 0, 0, VX_DF_IMAGE_U8), // no specified dimension + * vxCreateVirtualImage(graph, 320, 240, VX_DF_IMAGE_VIRT), // no specified format + * vxCreateVirtualImage(graph, 640, 480, VX_DF_IMAGE_U8), // no user access + * }; + * \endcode + * \param [in] graph The reference to the parent graph. + * \param [in] width The width of the image in pixels. A value of zero informs the interface that the value is unspecified. + * \param [in] height The height of the image in pixels. A value of zero informs the interface that the value is unspecified. + * \param [in] color The VX_DF_IMAGE (\ref vx_df_image_e) code that represents the format of the image and the color space. A value of \ref VX_DF_IMAGE_VIRT informs the interface that the format is unspecified. + * \returns An image reference \ref vx_image. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \note Passing this reference to \ref vxAccessImagePatch will return an error. + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateVirtualImage(vx_graph graph, vx_uint32 width, vx_uint32 height, vx_df_image color); + +/*! \brief Creates a reference to an image object that was externally allocated. + * \param [in] context The reference to the implementation context. + * \param [in] color See the \ref vx_df_image_e codes. This mandates the + * number of planes needed to be valid in the \a addrs and \a ptrs arrays based on the format given. + * \param [in] addrs[] The array of image patch addressing structures that + * define the dimension and stride of the array of pointers. + * \param [in] ptrs[] The array of platform-defined references to each plane. + * \param [in] import_type \ref vx_import_type_e. When giving \ref VX_IMPORT_TYPE_HOST + * the \a ptrs array is assumed to be HOST accessible pointers to memory. + * \returns An image reference \ref vx_image. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \note The user must call vxAccessImagePatch prior to accessing the pixels of an image, even if the + * image was created via \ref vxCreateImageFromHandle. Reads or writes to memory referenced + * by ptrs[ ] after calling \ref vxCreateImageFromHandle without first calling + * \ref vxAccessImagePatch will result in undefined behavior. + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateImageFromHandle(vx_context context, vx_df_image color, vx_imagepatch_addressing_t addrs[], void *ptrs[], vx_enum import_type); + +/*! \brief Retrieves various attributes of an image. 
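+ *
+ * A typical query, shown here only as a hedged illustration of the attribute/size
+ * calling convention:
+ * \code
+ * vx_uint32 width = 0, height = 0;
+ * vxQueryImage(image, VX_IMAGE_ATTRIBUTE_WIDTH,  &width,  sizeof(width));
+ * vxQueryImage(image, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height));
+ * \endcode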
+ * \param [in] image The reference to the image to query. + * \param [in] attribute The attribute to query. Use a \ref vx_image_attribute_e. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the image is not a \ref vx_image. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \retval VX_ERROR_NOT_SUPPORTED If the attribute is not supported on this implementation. + * \ingroup group_image + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryImage(vx_image image, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Allows setting attributes on the image. + * \param [in] image The reference to the image on which to set the attribute. + * \param [in] attribute The attribute to set. Use a \ref vx_image_attribute_e enumeration. + * \param [in] ptr The pointer to the location from which to read the value. + * \param [in] size The size in bytes of the object pointed to by \a ptr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the image is not a \ref vx_image. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \ingroup group_image + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetImageAttribute(vx_image image, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Releases a reference to an image object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] image The pointer to the image to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If image is not a \ref vx_image. + * \ingroup group_image + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseImage(vx_image *image); + +/*! \brief This computes the size needed to retrieve an image patch from an image. + * \param [in] image The reference to the image from which to extract the patch. + * \param [in] rect The coordinates. Must be 0 <= start < end <= dimension where + * dimension is width for x and height for y. + * \param [in] plane_index The plane index from which to get the data. + * \return vx_size + * \ingroup group_image + */ +VX_API_ENTRY vx_size VX_API_CALL vxComputeImagePatchSize(vx_image image, + const vx_rectangle_t *rect, + vx_uint32 plane_index); + +/*! \brief Allows the User to extract a rectangular patch (subset) of an image from a single plane. + * \param [in] image The reference to the image from which to extract the patch. + * \param [in] rect The coordinates from which to get the patch. Must be 0 <= start < end. + * \param [in] plane_index The plane index from which to get the data. + * \param [in, out] addr A pointer to a structure describing the addressing information of the + * image patch to accessed. + * \arg Input case: ptr is a pointer to a non-NULL pointer. The addr parameter must be the + * address of an addressing + * structure that describes how the user will access the requested image data at address (*ptr). + * \arg Output case: ptr is a pointer to a NULL pointer. The function fills the structure pointed by + * addr with the + * addressing information that the user must consult to access the pixel data at address (*ptr). 
+ * If the image being accessed was created via \ref vxCreateImageFromHandle, then the + * returned addressing information will be the identical to that of the addressing structure provided + * when \ref vxCreateImageFromHandle was called. + + * \param [in, out] ptr A pointer to a pointer to a location to store the requested data. + * \arg Input case: ptr is a pointer to a non-NULL pointer to a valid pixel buffer. This buffer + * will be used in one + * of two ways, depending on the value of the usage parameter. If usage is VX_WRITE_ONLY, then the + * buffer must contain pixel data that the user wants to replace the image's pixel data with. + * Otherwise (i.e., usage is not VX_WRITE_ONLY), the image's current pixel data will be written to the + * memory starting at address (*ptr) as storage memory for the access request. The caller must ensure + * enough memory has been allocated for the requested patch with the requested addressing. + * If image was created via \ref vxCreateImageFromHandle, and the pixel buffer pointed to by (*ptr) overlaps + * the original pixel buffer provided when image was created, then the results of such a call to + * \ref vxAccessImagePatch are undefined. + * \arg Output case: ptr is a pointer to a NULL pointer. This NULL pointer will be overwritten + * with a pointer to the + * address where the requested data can be accessed. If image was created via + * \ref vxCreateImageFromHandle + * then the overwriting pointer must be within the original pixel buffer provided when image was created. + * \arg (*ptr) must eventually be provided as the ptr parameter of a call to + * \ref vxCommitImagePatch. + + * \param [in] usage This declares the intended usage of the pointer using the \ref vx_accessor_e enumeration. For uniform images Only VX_READ_ONLY is supported. + * \note The addr and ptr parameters must both be input, or both be output, otherwise the behavior is undefined. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_OPTIMIZED_AWAY The reference is a virtual image and cannot be accessed or committed. + * \retval VX_ERROR_INVALID_PARAMETERS The \a start, \a end, \a plane_index, \a stride_x, or \a stride_y pointer is incorrect. + * \retval VX_ERROR_INVALID_REFERENCE The image reference is not actually an image reference. + * \note The user may ask for data outside the bounds of the valid region, but + * such data has an undefined value. + * \note Users must be cautious to prevent passing in \e uninitialized pointers or + * addresses of uninitialized pointers to this function. + * \pre \ref vxComputeImagePatchSize if users wish to allocate their own memory. + * \post \ref vxCommitImagePatch with same (*ptr) value. + * \ingroup group_image + * \include vx_imagepatch.c + */ +VX_API_ENTRY vx_status VX_API_CALL vxAccessImagePatch(vx_image image, + const vx_rectangle_t *rect, + vx_uint32 plane_index, + vx_imagepatch_addressing_t *addr, + void **ptr, + vx_enum usage); + +/*! \brief This allows the User to commit a rectangular patch (subset) of an image from a single plane. + * \param [in] image The reference to the image from which to extract the patch. + * \param [in] rect The coordinates to which to set the patch. Must be 0 <= start <= end. + * This may be 0 or a rectangle of zero area in order to indicate that the commit + * must only decrement the reference count. + * \param [in] plane_index The plane index to which to set the data. + * \param [in] addr The addressing information for the image patch. 
+ * \param [in] ptr A pointer to a pixel buffer to be committed. If the user previously provided a + * pointer to this buffer to \ref vxAccessImagePatch, the buffer can be + * freed or re-used after \ref vxCommitImagePatch completes. If the pointer was returned by + * \ref vxAccessImagePatch, reads or writes to the location pointed by ptr after + * \ref vxCommitImagePatch completes will result in undefined behavior. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_OPTIMIZED_AWAY The reference is a virtual image and cannot be accessed or committed. + * \retval VX_ERROR_INVALID_PARAMETERS The \a start, \a end, \a plane_index, \a stride_x, or \a stride_y pointer is incorrect. + * \retval VX_ERROR_INVALID_REFERENCE The image reference is not actually an image reference. + * \ingroup group_image + * \include vx_imagepatch.c + * \note If the implementation gives the client a pointer from + * \ref vxAccessImagePatch then implementation-specific behavior may occur. + * If not, then a copy occurs from the users pointer to the internal data of the object. + * \note If the rectangle intersects bounds of the current valid region, the + * valid region grows to the union of the two rectangles as long as they occur + * within the bounds of the original image dimensions. + */ +VX_API_ENTRY vx_status VX_API_CALL vxCommitImagePatch(vx_image image, + vx_rectangle_t *rect, + vx_uint32 plane_index, + vx_imagepatch_addressing_t *addr, + const void *ptr); + +/*! + * \brief Accesses a specific indexed pixel in an image patch. + * \param [in] ptr The base pointer of the patch as returned from \ref vxAccessImagePatch. + * \param [in] index The 0 based index of the pixel count in the patch. Indexes increase horizontally by 1 then wrap around to the next row. + * \param [in] addr The pointer to the addressing mode information returned from \ref vxAccessImagePatch. + * \return void * Returns the pointer to the specified pixel. + * \pre \ref vxAccessImagePatch + * \include vx_imagepatch.c + * \ingroup group_image + */ +VX_API_ENTRY void * VX_API_CALL vxFormatImagePatchAddress1d(void *ptr, vx_uint32 index, const vx_imagepatch_addressing_t *addr); + +/*! + * \brief Accesses a specific pixel at a 2d coordinate in an image patch. + * \param [in] ptr The base pointer of the patch as returned from \ref vxAccessImagePatch. + * \param [in] x The x dimension within the patch. + * \param [in] y The y dimension within the patch. + * \param [in] addr The pointer to the addressing mode information returned from \ref vxAccessImagePatch. + * \return void * Returns the pointer to the specified pixel. + * \pre \ref vxAccessImagePatch + * \include vx_imagepatch.c + * \ingroup group_image + */ +VX_API_ENTRY void * VX_API_CALL vxFormatImagePatchAddress2d(void *ptr, vx_uint32 x, vx_uint32 y, const vx_imagepatch_addressing_t *addr); + +/*! \brief Retrieves the valid region of the image as a rectangle. + * \details After the image is allocated but has not been written to this + * returns the full rectangle of the image so that functions do not have to manage + * a case for uninitialized data. The image still retains an uninitialized + * value, but once the image is written to via any means such as \ref vxCommitImagePatch, + * the valid region is altered to contain the maximum bounds of the written + * area. + * \param [in] image The image from which to retrieve the valid region. + * \param [out] rect The destination rectangle. + * \return vx_status + * \retval VX_ERROR_INVALID_REFERENCE Invalid image. 
+ * \retval VX_ERROR_INVALID_PARAMETERS Invalid rect. + * \retval VX_SUCCESS Valid image. + * \note This rectangle can be passed directly to \ref vxAccessImagePatch to get + * the full valid region of the image. Modifications from \ref vxCommitImagePatch + * grow the valid region. + * \ingroup group_image + */ +VX_API_ENTRY vx_status VX_API_CALL vxGetValidRegionImage(vx_image image, vx_rectangle_t *rect); + +/*============================================================================== + KERNEL + =============================================================================*/ + +/*! \brief Loads one or more kernels into the OpenVX context. This is the interface + * by which OpenVX is extensible. Once the set of kernels is loaded, new kernels + * and their parameters can be queried. + * \note When all references to loaded kernels are released, the module + * may be automatically unloaded. + * \param [in] context The reference to the implementation context. + * \param [in] module The short name of the module to load. On systems where + * there are specific naming conventions for modules, the name passed + * should ignore such conventions. For example: \c libxyz.so should be + * passed as just \c xyz and the implementation will do the right thing that + * the platform requires. + * \note This API uses the system pre-defined paths for modules. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the context is not a \ref vx_context. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \ingroup group_user_kernels + * \see vxGetKernelByName + */ +VX_API_ENTRY vx_status VX_API_CALL vxLoadKernels(vx_context context, const vx_char *module); + +/*! \brief Obtains a reference to a kernel using a string to specify the name. + * \details User Kernels follow a "dotted" hierarchical syntax. For example: + * "com.company.example.xyz".
The following are strings specifying the kernel names: + + * org.khronos.openvx.color_convert + + * org.khronos.openvx.channel_extract + + * org.khronos.openvx.channel_combine + + * org.khronos.openvx.sobel_3x3 + + * org.khronos.openvx.magnitude + + * org.khronos.openvx.phase + + * org.khronos.openvx.scale_image + + * org.khronos.openvx.table_lookup + + * org.khronos.openvx.histogram + + * org.khronos.openvx.equalize_histogram + + * org.khronos.openvx.absdiff + + * org.khronos.openvx.mean_stddev + + * org.khronos.openvx.threshold + + * org.khronos.openvx.integral_image + + * org.khronos.openvx.dilate_3x3 + + * org.khronos.openvx.erode_3x3 + + * org.khronos.openvx.median_3x3 + + * org.khronos.openvx.box_3x3 + + * org.khronos.openvx.gaussian_3x3 + + * org.khronos.openvx.custom_convolution + + * org.khronos.openvx.gaussian_pyramid + + * org.khronos.openvx.accumulate + + * org.khronos.openvx.accumulate_weighted + + * org.khronos.openvx.accumulate_square + + * org.khronos.openvx.minmaxloc + + * org.khronos.openvx.convertdepth + + * org.khronos.openvx.canny_edge_detector + + * org.khronos.openvx.and + + * org.khronos.openvx.or + + * org.khronos.openvx.xor + + * org.khronos.openvx.not + + * org.khronos.openvx.multiply + + * org.khronos.openvx.add + + * org.khronos.openvx.subtract + + * org.khronos.openvx.warp_affine + + * org.khronos.openvx.warp_perspective + + * org.khronos.openvx.harris_corners + + * org.khronos.openvx.fast_corners + + * org.khronos.openvx.optical_flow_pyr_lk + + * org.khronos.openvx.remap + + * org.khronos.openvx.halfscale_gaussian + + * \param [in] context The reference to the implementation context. + * \param [in] name The string of the name of the kernel to get. + * \return A kernel reference or zero if an error occurred. + * \retval 0 The kernel name is not found in the context. + * \ingroup group_kernel + * \pre \ref vxLoadKernels if the kernel is not provided by the + * OpenVX implementation. + * \note User Kernels should follow a "dotted" heirarchical syntax. For example: + * "com.company.example.xyz". + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByName(vx_context context, const vx_char *name); + +/*! \brief Obtains a reference to the kernel using the \ref vx_kernel_e enumeration. + * \details Enum values above the standard set are assumed to apply to + * loaded libraries. + * \param [in] context The reference to the implementation context. + * \param [in] kernel A value from \ref vx_kernel_e or a vendor or client-defined value. + * \return A \ref vx_kernel. + * \retval 0 The kernel enumeration is not found in the context. + * \ingroup group_kernel + * \pre \ref vxLoadKernels if the kernel is not provided by the + * OpenVX implementation. + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByEnum(vx_context context, vx_enum kernel); + +/*! \brief This allows the client to query the kernel to get information about + * the number of parameters, enum values, etc. + * \param [in] kernel The kernel reference to query. + * \param [in] attribute The attribute to query. Use a \ref vx_kernel_attribute_e. + * \param [out] ptr The pointer to the location at which to store the resulting value. + * \param [in] size The size of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the kernel is not a \ref vx_kernel. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. 
+ * \retval VX_ERROR_NOT_SUPPORTED If the attribute value is not supported in this implementation. + * \ingroup group_kernel + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryKernel(vx_kernel kernel, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Release the reference to the kernel. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] kernel The pointer to the kernel reference to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If kernel is not a \ref vx_kernel. + * \ingroup group_kernel + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseKernel(vx_kernel *kernel); + +/*! \brief Allows users to add custom kernels to the known kernel + * database in OpenVX at run-time. This would primarily be used by the module function + * \c vxPublishKernels. + * \param [in] context The reference to the implementation context. + * \param [in] name The string to use to match the kernel. + * \param [in] enumeration The enumerated value of the kernel to be used by clients. + * \param [in] func_ptr The process-local function pointer to be invoked. + * \param [in] numParams The number of parameters for this kernel. + * \param [in] input The pointer to \ref vx_kernel_input_validate_f, which validates the + * input parameters to this kernel. + * \param [in] output The pointer to \ref vx_kernel_output_validate_f , which validates the + * output parameters to this kernel. + * \param [in] init The kernel initialization function. + * \param [in] deinit The kernel de-initialization function. + * \ingroup group_user_kernels + * \return \ref vx_kernel + * \retval 0 Indicates that an error occurred when adding the kernel. + * \retval * Kernel added to OpenVX. + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxAddKernel(vx_context context, + const vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f init, + vx_kernel_deinitialize_f deinit); + +/*! \brief This API is called after all parameters have been added to the + * kernel and the kernel is \e ready to be used. Notice that the reference to the kernel created + * by vxAddKernel is still valid after the call to vxFinalizeKernel. + * \param [in] kernel The reference to the loaded kernel from \ref vxAddKernel. + * \return A \ref vx_status_e enumeration. If an error occurs, the kernel is not available + * for usage by the clients of OpenVX. Typically this is due to a mismatch + * between the number of parameters requested and given. + * \pre \ref vxAddKernel and \ref vxAddParameterToKernel + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxFinalizeKernel(vx_kernel kernel); + +/*! \brief Allows users to set the signatures of the custom kernel. + * \param [in] kernel The reference to the kernel added with \ref vxAddKernel. + * \param [in] index The index of the parameter to add. + * \param [in] dir The direction of the parameter. This must be either \ref VX_INPUT or + * \ref VX_OUTPUT. \ref VX_BIDIRECTIONAL is not supported for this function. + * \param [in] data_type The type of parameter. This must be a value from \ref vx_type_e. + * \param [in] state The state of the parameter (required or not). This must be a value from \ref vx_parameter_state_e. + * \return A \ref vx_status_e enumerated value. 
+ * \retval VX_SUCCESS Parameter is successfully set on kernel. + * \retval VX_ERROR_INVALID_REFERENCE The value passed as kernel was not a \c vx_kernel. + * \pre \ref vxAddKernel + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxAddParameterToKernel(vx_kernel kernel, vx_uint32 index, vx_enum dir, vx_enum data_type, vx_enum state); + +/*! \brief Removes a non-finalized \ref vx_kernel from the \ref vx_context + * and releases it. Once a \ref vx_kernel has been finalized it cannot be removed. + * \param [in] kernel The reference to the kernel to remove. Returned from \ref vxAddKernel. + * \note Any kernel enumerated in the base standard + * cannot be removed; only kernels added through \ref vxAddKernel can + * be removed. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_INVALID_REFERENCE If an invalid kernel is passed in. + * \retval VX_ERROR_INVALID_PARAMETER If a base kernel is passed in. + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxRemoveKernel(vx_kernel kernel); + +/*! \brief Sets kernel attributes. + * \param [in] kernel The reference to the kernel. + * \param [in] attribute The enumeration of the attributes. See \ref vx_kernel_attribute_e. + * \param [in] ptr The pointer to the location from which to read the attribute. + * \param [in] size The size in bytes of the data area indicated by \a ptr in bytes. + * \note After a kernel has been passed to \ref vxFinalizeKernel, no attributes + * can be altered. + * \return A \ref vx_status_e enumeration. + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetKernelAttribute(vx_kernel kernel, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Retrieves a \ref vx_parameter from a \ref vx_kernel. + * \param [in] kernel The reference to the kernel. + * \param [in] index The index of the parameter. + * \return A \ref vx_parameter. + * \retval 0 Either the kernel or index is invalid. + * \retval * The parameter reference. + * \ingroup group_parameter + */ +VX_API_ENTRY vx_parameter VX_API_CALL vxGetKernelParameterByIndex(vx_kernel kernel, vx_uint32 index); + +/*============================================================================== + GRAPH + =============================================================================*/ + +/*! \brief Creates an empty graph. + * \param [in] context The reference to the implementation context. + * \returns A graph reference \ref vx_graph. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_graph + */ +VX_API_ENTRY vx_graph VX_API_CALL vxCreateGraph(vx_context context); + +/*! \brief Releases a reference to a graph. + * The object may not be garbage collected until its total reference count is zero. + * Once the reference count is zero, all node references in the graph are automatically + * released as well. Data referenced by those nodes may not be released as + * the user may have external references to the data. + * \param [in] graph The pointer to the graph to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. + * \ingroup group_graph + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseGraph(vx_graph *graph); + +/*! \brief Verifies the state of the graph before it is executed. + * This is useful to catch programmer errors and contract errors. 
If not verified, + * the graph verifies before being processed. + * \pre Memory for data objects is not guaranteed to exist before + * this call. \post After this call data objects exist unless + * the implementation optimized them out. + * \param [in] graph The reference to the graph to verify. + * \return A status code for graphs with more than one error; it is + * undefined which error will be returned. Register a log callback using \ref vxRegisterLogCallback + * to receive each specific error in the graph. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If graph is not a \ref vx_graph. + * \retval VX_ERROR_MULTIPLE_WRITERS If the graph contains more than one writer + * to any data object. + * \retval VX_ERROR_INVALID_NODE If a node in the graph is invalid or failed to be created. + * \retval VX_ERROR_INVALID_GRAPH If the graph contains cycles or some other invalid topology. + * \retval VX_ERROR_INVALID_TYPE If any parameter on a node is given the wrong type. + * \retval VX_ERROR_INVALID_VALUE If any value of any parameter is out of bounds of specification. + * \retval VX_ERROR_INVALID_FORMAT If the image format is not compatible. + * \ingroup group_graph + * \see vxProcessGraph + */ +VX_API_ENTRY vx_status VX_API_CALL vxVerifyGraph(vx_graph graph); + +/*! \brief This function causes the synchronous processing of a graph. If the graph + * has not been verified, then the implementation verifies the graph + * immediately. If verification fails this function returns a status + * identical to what \ref vxVerifyGraph would return. After + * the graph verifies successfully then processing occurs. If the graph was + * previously verified via \ref vxVerifyGraph or \ref vxProcessGraph + * then the graph is processed. This function blocks until the graph is completed. + * \param [in] graph The graph to execute. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Graph has been processed. + * \retval VX_FAILURE A catastrophic error occurred during processing. + * \retval * See \ref vxVerifyGraph. + * \pre \ref vxVerifyGraph must return \ref VX_SUCCESS before this function will pass. + * \ingroup group_graph + * \see vxVerifyGraph + */ +VX_API_ENTRY vx_status VX_API_CALL vxProcessGraph(vx_graph graph); + +/*! \brief Schedules a graph for future execution. + * \param [in] graph The graph to schedule. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_NO_RESOURCES The graph cannot be scheduled now. + * \retval VX_ERROR_NOT_SUFFICIENT The graph is not verified and has failed + * forced verification. + * \retval VX_SUCCESS The graph has been scheduled. + * \pre \ref vxVerifyGraph must return \ref VX_SUCCESS before this function will pass. + * \ingroup group_graph + */ +VX_API_ENTRY vx_status VX_API_CALL vxScheduleGraph(vx_graph graph); + +/*! \brief Waits for a specific graph to complete. If the graph has been scheduled multiple + * times since the last call to vxWaitGraph, then vxWaitGraph returns only when the last + * scheduled execution completes. + * \param [in] graph The graph to wait on. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS The graph has successfully completed execution and its outputs are the + * valid results of the most recent execution. + * \retval VX_FAILURE An error occurred or the graph was never scheduled. Use \ref vxQueryGraph + * for the VX_GRAPH_ATTRIBUTE_STATUS attribute to determine the details. Output data of the + * graph is undefined.
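+ * A minimal, non-normative usage sketch, assuming \c graph has already been verified successfully: + * \code + * vx_status status = vxScheduleGraph(graph); + * if (status == VX_SUCCESS) + *     status = vxWaitGraph(graph); // blocks until the scheduled execution completes + * \endcode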
+ * \pre \ref vxScheduleGraph + * \ingroup group_graph + */ +VX_API_ENTRY vx_status VX_API_CALL vxWaitGraph(vx_graph graph); + +/*! \brief Allows the user to query attributes of the Graph. + * \param [in] graph The reference to the created graph. + * \param [in] attribute The \ref vx_graph_attribute_e type needed. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_graph + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryGraph(vx_graph graph, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Allows setting attributes on the Graph. + * \param [in] graph The reference to the graph. + * \param [in] attribute The \ref vx_graph_attribute_e type needed. + * \param [in] ptr The location from which to read the value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_graph + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetGraphAttribute(vx_graph graph, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Adds the given parameter extracted from a \ref vx_node to the graph. + * \param [in] graph The graph reference that contains the node. + * \param [in] parameter The parameter reference to add to the graph from the node. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Parameter added to Graph. + * \retval VX_ERROR_INVALID_REFERENCE The parameter is not a valid \ref vx_parameter. + * \retval VX_ERROR_INVALID_PARAMETER The parameter is of a node not in this + * graph. + * \ingroup group_graph_parameters + */ +VX_API_ENTRY vx_status VX_API_CALL vxAddParameterToGraph(vx_graph graph, vx_parameter parameter); + +/*! \brief Sets a reference to the parameter on the graph. The implementation + * must set this parameter on the originating node as well. + * \param [in] graph The graph reference. + * \param [in] index The parameter index. + * \param [in] value The reference to set to the parameter. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Parameter set to Graph. + * \retval VX_ERROR_INVALID_REFERENCE The value is not a valid \ref vx_reference. + * \retval VX_ERROR_INVALID_PARAMETER The parameter index is out of bounds or the + * dir parameter is incorrect. + * \ingroup group_graph_parameters + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetGraphParameterByIndex(vx_graph graph, vx_uint32 index, vx_reference value); + +/*! \brief Retrieves a \ref vx_parameter from a \ref vx_graph. + * \param [in] graph The graph. + * \param [in] index The index of the parameter. + * \return \ref vx_parameter reference. + * \retval 0 if the index is out of bounds. + * \retval * The parameter reference. + * \ingroup group_graph_parameters + */ +VX_API_ENTRY vx_parameter VX_API_CALL vxGetGraphParameterByIndex(vx_graph graph, vx_uint32 index); + +/*! \brief Returns a Boolean to indicate the state of graph verification. + * \param [in] graph The reference to the graph to check. + * \return A \ref vx_bool value. + * \retval vx_true_e The graph is verified. + * \retval vx_false_e The graph is not verified. It must be verified before + * execution either through \ref vxVerifyGraph or automatically through + * \ref vxProcessGraph or \ref vxScheduleGraph.
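+ * A minimal, non-normative usage sketch, assuming \c graph is a valid \ref vx_graph: + * \code + * vx_status status = VX_SUCCESS; + * if (vxIsGraphVerified(graph) == vx_false_e) + *     status = vxVerifyGraph(graph); + * if (status == VX_SUCCESS) + *     status = vxProcessGraph(graph); + * \endcode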
+ * \ingroup group_graph + */ +VX_API_ENTRY vx_bool VX_API_CALL vxIsGraphVerified(vx_graph graph); + +/*============================================================================== + NODE + =============================================================================*/ + +/*! \brief Creates a reference to a node object for a given kernel. + * \details This node has no references assigned as parameters after completion. + * The client is then required to set these parameters manually by \ref vxSetParameterByIndex. + * When clients supply their own node creation functions (for use with User Kernels), this is the API + * to use along with the parameter setting API. + * \param [in] graph The reference to the graph in which this node exists. + * \param [in] kernel The kernel reference to associate with this new node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_adv_node + * \post Call \ref vxSetParameterByIndex for as many parameters as needed to be set. + */ +VX_API_ENTRY vx_node VX_API_CALL vxCreateGenericNode(vx_graph graph, vx_kernel kernel); + +/*! \brief Allows a user to query information out of a node. + * \param [in] node The reference to the node to query. + * \param [in] attribute Use a \ref vx_node_attribute_e value to query for information. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Successful. + * \retval VX_ERROR_INVALID_PARAMETERS The type or size is incorrect. + * \ingroup group_node + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryNode(vx_node node, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Allows a user to set an attribute of a node before Graph Validation. + * \param [in] node The reference to the node to set. + * \param [in] attribute Use a \ref vx_node_attribute_e value to set. + * \param [in] ptr The pointer to the location from which to read the value. + * \param [in] size The size in bytes of the objects to which \a ptr points. + * \note Some attributes are inherited from the \ref vx_kernel, which was used + * to create the node. Some of these can be overridden using this API, notably + * \ref VX_NODE_ATTRIBUTE_LOCAL_DATA_SIZE and \ref VX_NODE_ATTRIBUTE_LOCAL_DATA_PTR. + * \ingroup group_node + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS The attribute was set. + * \retval VX_ERROR_INVALID_REFERENCE node is not a vx_node. + * \retval VX_ERROR_INVALID_PARAMETER size is not correct for the type needed. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetNodeAttribute(vx_node node, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Releases a reference to a Node object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] node The pointer to the reference of the node to release. + * \ingroup group_node + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If node is not a \ref vx_node. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseNode(vx_node *node); + +/*! \brief Removes a Node from its parent Graph and releases it. + * \param [in] node The pointer to the node to remove and release.
+ * \ingroup group_node + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If node is not a \ref vx_node. + */ +VX_API_ENTRY vx_status VX_API_CALL vxRemoveNode(vx_node *node); + +/*! \brief Assigns a callback to a node. + * If a callback already exists in this node, this function must return an error + * and the user may clear the callback by passing a NULL pointer as the callback. + * \param [in] node The reference to the node. + * \param [in] callback The callback to associate with completion of this + * specific node. + * \warning This must be used with extreme caution as it can \e ruin + * optimizations in the power/performance efficiency of a graph. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Callback assigned. + * \retval VX_ERROR_INVALID_REFERENCE The value passed as node was not a \ref vx_node. + * \ingroup group_node_callback + */ +VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeCallback(vx_node node, vx_nodecomplete_f callback); + +/*! \brief Retrieves the current node callback function pointer set on the node. + * \param [in] node The reference to the \ref vx_node object. + * \ingroup group_node_callback + * \return vx_nodecomplete_f The pointer to the callback function. + * \retval NULL No callback is set. + * \retval * The node callback function. + */ +VX_API_ENTRY vx_nodecomplete_f VX_API_CALL vxRetrieveNodeCallback(vx_node node); + +/*============================================================================== + PARAMETER + =============================================================================*/ + +/*! \brief Retrieves a \ref vx_parameter from a \ref vx_node. + * \param [in] node The node from which to extract the parameter. + * \param [in] index The index of the parameter to which to get a reference. + * \return \ref vx_parameter + * \ingroup group_parameter + */ +VX_API_ENTRY vx_parameter VX_API_CALL vxGetParameterByIndex(vx_node node, vx_uint32 index); + +/*! \brief Releases a reference to a parameter object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] param The pointer to the parameter to release. + * \ingroup group_parameter + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If param is not a \ref vx_parameter. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseParameter(vx_parameter *param); + +/*! \brief Sets the specified parameter data for a kernel on the node. + * \param [in] node The node that contains the kernel. + * \param [in] index The index of the parameter desired. + * \param [in] value The reference to the parameter. + * \return A \ref vx_status_e enumeration. + * \ingroup group_parameter + * \see vxSetParameterByReference + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetParameterByIndex(vx_node node, vx_uint32 index, vx_reference value); + +/*! \brief Associates a parameter reference and a data reference with a kernel + * on a node. + * \param [in] parameter The reference to the kernel parameter. + * \param [in] value The value to associate with the kernel parameter. + * \return A \ref vx_status_e enumeration. + * \ingroup group_parameter + * \see vxGetParameterByIndex + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetParameterByReference(vx_parameter parameter, vx_reference value); + +/*! 
\brief Allows the client to query a parameter to determine its meta-information. + * \param [in] param The reference to the parameter. + * \param [in] attribute The attribute to query. Use a \ref vx_parameter_attribute_e. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_parameter + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryParameter(vx_parameter param, vx_enum attribute, void *ptr, vx_size size); + +/*============================================================================== + SCALAR + =============================================================================*/ + +/*! \brief Creates a reference to a scalar object. Also see \ref sub_node_parameters. + * \param [in] context The reference to the system context. + * \param [in] data_type The \ref vx_type_e of the scalar. Must be greater than + * \ref VX_TYPE_INVALID and less than \ref VX_TYPE_SCALAR_MAX. + * \param [in] ptr The pointer to the initial value of the scalar. + * \ingroup group_scalar + * \returns A scalar reference \ref vx_scalar. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_scalar VX_API_CALL vxCreateScalar(vx_context context, vx_enum data_type, const void *ptr); + +/*! \brief Releases a reference to a scalar object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] scalar The pointer to the scalar to release. + * \ingroup group_scalar + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If scalar is not a \ref vx_scalar. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseScalar(vx_scalar *scalar); + +/*! \brief Queries attributes from a scalar. + * \param [in] scalar The scalar object. + * \param [in] attribute The enumeration to query. Use a \ref vx_scalar_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_scalar + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryScalar(vx_scalar scalar, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Gets the scalar value out of a reference. + * \note Use this in conjunction with Query APIs that return references which + * should be converted into values. + * \ingroup group_scalar + * \param [in] ref The reference from which to get the scalar value. + * \param [out] ptr An appropriately typed pointer that points to a location to which to copy + * the scalar value. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_INVALID_REFERENCE If the ref is not a valid + * reference. + * \retval VX_ERROR_INVALID_PARAMETERS If \a ptr is NULL. + * \retval VX_ERROR_INVALID_TYPE If the type does not match the type in the reference or is a bad value. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReadScalarValue(vx_scalar ref, void *ptr); + +/*! \brief Sets the scalar value in a reference. + * \note Use this in conjunction with Parameter APIs that return references + * to parameters that need to be altered. + * \ingroup group_scalar + * \param [in] ref The reference in which to set the scalar value.
+ * \param [in] ptr An appropriately typed pointer that points to a location to which to copy + * the scalar value. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_INVALID_REFERENCE If the ref is not a valid + * reference. + * \retval VX_ERROR_INVALID_PARAMETERS If \a ptr is NULL. + * \retval VX_ERROR_INVALID_TYPE If the type does not match the type in the reference or is a bad value. + */ +VX_API_ENTRY vx_status VX_API_CALL vxWriteScalarValue(vx_scalar ref, const void *ptr); + +/*============================================================================== + REFERENCE + =============================================================================*/ + +/*! \brief Queries any reference type for some basic information (count, type). + * \param [in] ref The reference to query. + * \param [in] attribute The value for which to query. Use \ref vx_reference_attribute_e. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_reference + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryReference(vx_reference ref, vx_enum attribute, void *ptr, vx_size size); + +/*============================================================================== + DELAY + =============================================================================*/ + +/*! \brief Queries a \ref vx_delay object attribute. + * \param [in] delay A pointer to a delay object. + * \param [in] attribute The attribute to query. Use a \ref vx_delay_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_delay + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryDelay(vx_delay delay, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Releases a reference to a delay object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] delay The pointer to the delay to release. + * \post After returning from this function the reference is zeroed. + * \ingroup group_delay + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If delay is not a \ref vx_delay. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseDelay(vx_delay *delay); + +/*! \brief Creates a Delay object. + * \details This function uses a subset of the attributes defining the metadata of + * the exemplar, ignoring the object. It does not alter the exemplar or keep or release + * the reference to the exemplar. For the definition of supported attributes + * see \ref vxSetMetaFormatAttribute. + * + * \param [in] context The reference to the system context. + * \param [in] exemplar The exemplar object. + * \param [in] slots The number of reference in the delay. + * \returns A delay reference \ref vx_delay. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_delay + */ +VX_API_ENTRY vx_delay VX_API_CALL vxCreateDelay(vx_context context, + vx_reference exemplar, + vx_size slots); + +/*! \brief Retrieves a reference from a delay object. + * \param [in] delay The reference to the delay object. + * \param [in] index An index into the delay from which to extract the + * reference. + * \return \ref vx_reference + * \note The delay index is in the range \f$ [-count+1,0] \f$. 
0 is always the + * \e current object. + * \ingroup group_delay + * \note A reference from a delay object must not be given to its associated + * release API (e.g. \ref vxReleaseImage). Use the \ref vxReleaseDelay only. + */ +VX_API_ENTRY vx_reference VX_API_CALL vxGetReferenceFromDelay(vx_delay delay, vx_int32 index); + +/*! \brief Ages the internal delay ring by one. This means that once this API is + * called the reference from index 0 will go to index -1 and so forth until + * \f$ -count+1 \f$ is reached. This last object will become 0. Once the delay has + * been aged, it updates the reference in any associated nodes. + * \param [in] delay + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Delay was aged. + * \retval VX_ERROR_INVALID_REFERENCE The value passed as delay was not a \ref vx_delay. + * \ingroup group_delay + */ +VX_API_ENTRY vx_status VX_API_CALL vxAgeDelay(vx_delay delay); + + +/*============================================================================== + LOGGING + =============================================================================*/ + +/*! \brief Adds a line to the log. + * \param [in] ref The reference to add the log entry against. Some valid value must be provided. + * \param [in] status The status code. \ref VX_SUCCESS status entries are ignored and not added. + * \param [in] message The human readable message to add to the log. + * \param [in] ... a list of variable arguments to the message. + * \note Messages may not exceed \ref VX_MAX_LOG_MESSAGE_LEN bytes and will be truncated in the log if they exceed this limit. + * \ingroup group_log + */ +VX_API_ENTRY void VX_API_CALL vxAddLogEntry(vx_reference ref, vx_status status, const char *message, ...); + +/*! \brief Registers a callback facility to the OpenVX implementation to receive error logs. + * \param [in] context The overall context to OpenVX. + * \param [in] callback The callback function. If NULL, the previous callback is removed. + * \param [in] reentrant If reentrancy flag is \ref vx_true_e, then the callback may be entered from multiple + * simultaneous tasks or threads (if the host OS supports this). + * \ingroup group_log + */ +VX_API_ENTRY void VX_API_CALL vxRegisterLogCallback(vx_context context, vx_log_callback_f callback, vx_bool reentrant); + +/*============================================================================== + LUT + =============================================================================*/ + +/*! \brief Creates LUT object of a given type. + * \param [in] context The reference to the context. + * \param [in] data_type The type of data stored in the LUT. + * \param [in] count The number of entries desired. + * \if OPENVX_STRICT_1_0 + * \note For OpenVX 1.0, count must be equal to 256 and data_type can only be \ref VX_TYPE_UINT8. + * \endif + * \returns An LUT reference \ref vx_lut. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_lut + */ +VX_API_ENTRY vx_lut VX_API_CALL vxCreateLUT(vx_context context, vx_enum data_type, vx_size count); + +/*! \brief Releases a reference to a LUT object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] lut The pointer to the LUT to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If lut is not a \ref vx_lut. 
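+ * A non-normative lifecycle sketch, assuming \c context is a valid \ref vx_context, showing creation, population via \ref vxAccessLUT / \ref vxCommitLUT, and the final release: + * \code + * vx_lut lut = vxCreateLUT(context, VX_TYPE_UINT8, 256); + * vx_uint8 *data = NULL; + * if (vxAccessLUT(lut, (void **)&data, VX_WRITE_ONLY) == VX_SUCCESS) + * { + *     for (vx_uint32 i = 0; i < 256; i++) + *         data[i] = (vx_uint8)(255 - i); // e.g. an inverting table + *     vxCommitLUT(lut, data); + * } + * vxReleaseLUT(&lut); + * \endcode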
+ * \ingroup group_lut + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseLUT(vx_lut *lut); + +/*! \brief Queries attributes from a LUT. + * \param [in] lut The LUT to query. + * \param [in] attribute The attribute to query. Use a \ref vx_lut_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_lut + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryLUT(vx_lut lut, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Grants access to a LUT table and increments the object reference count in case of success. + * \details There are several variations of call methodology: + * \arg If \a ptr is NULL (which means the current data of the LUT is not desired), + * the LUT reference count is incremented. + * \arg If \a ptr is not NULL but (*ptr) is NULL, (*ptr) will contain the address of the LUT data when the function returns and + * the reference count will be incremented. Whether the (*ptr) address is mapped + * or allocated is undefined. (*ptr) must be returned to \ref vxCommitLUT. + * \arg If \a ptr is not NULL and (*ptr) is not NULL, the user is signalling the implementation to copy the LUT data into the location specified + * by (*ptr). Users must use \ref vxQueryLUT with \ref VX_LUT_ATTRIBUTE_SIZE to + * determine how much memory to allocate for the LUT data. + * + * In any case, \ref vxCommitLUT must be called after LUT access is complete. + * \param [in] lut The LUT from which to get the data. + * \param [in,out] ptr The user-supplied address to a pointer, via which the requested contents + * are returned. + * \arg If ptr is NULL, an error occurs. + * \arg If ptr is not NULL and (*ptr) is NULL, (*ptr) will be set to the address of a memory area + * managed by the OpenVX framework containing the requested data. + * \arg If both ptr and (*ptr) are not NULL, requested data will be copied to (*ptr) (optionally in + * case of write-only access). + * \param [in] usage This declares the intended usage of the pointer using the \ref vx_accessor_e enumeration. + * \return A \ref vx_status_e enumeration. + * \post \ref vxCommitLUT + * \ingroup group_lut + */ +VX_API_ENTRY vx_status VX_API_CALL vxAccessLUT(vx_lut lut, void **ptr, vx_enum usage); + +/*! \brief Commits the Lookup Table and decrements the object reference count in case of success. + * \details Commits the data back to the LUT object and decrements the reference count. + * There are several variations of call methodology: + * \arg If a user allocated their own memory for the LUT data copy, the user is + * obligated to free this memory. + * \arg If \a ptr is not NULL and the (*ptr) for \ref vxAccessLUT was NULL, + * it is undefined whether the implementation will unmap or copy and free the memory. + * \param [in] lut The LUT to modify. + * \param [in] ptr The pointer provided or returned by \ref vxAccessLUT. This cannot be NULL. + * \return A \ref vx_status_e enumeration. + * \pre \ref vxAccessLUT. + * \ingroup group_lut + */ +VX_API_ENTRY vx_status VX_API_CALL vxCommitLUT(vx_lut lut, const void *ptr); + +/*============================================================================== + DISTRIBUTION + =============================================================================*/ + +/*!
\brief Creates a reference to a 1D Distribution of a consecutive interval [offset, offset + range - 1] + * defined by a start offset and valid range, divided equally into numBins parts. + * \param [in] context The reference to the overall context. + * \param [in] numBins The number of bins in the distribution. + * \param [in] offset The start offset into the range value that marks the beginning of the 1D Distribution. + * \param [in] range The total number of the values. + * \returns A distribution reference \ref vx_distribution. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_distribution + */ +VX_API_ENTRY vx_distribution VX_API_CALL vxCreateDistribution(vx_context context, vx_size numBins, vx_int32 offset, vx_uint32 range); + +/*! \brief Releases a reference to a distribution object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] distribution The reference to the distribution to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If distribution is not a \ref vx_distribution. + * \ingroup group_distribution + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseDistribution(vx_distribution *distribution); + +/*! \brief Queries a Distribution object. + * \param [in] distribution The reference to the distribution to query. + * \param [in] attribute The attribute to query. Use a \ref vx_distribution_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_distribution + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryDistribution(vx_distribution distribution, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Grants access to a distribution object and increments the object reference count in + * case of success. + * \param [in] distribution The reference to the distribution to access. + * \param [in, out] ptr The user-supplied address to a pointer, via which the requested contents + * are returned. + * \arg If ptr is NULL, an error occurs. + * \arg If ptr is not NULL and (*ptr) is NULL, (*ptr) will be set to the address of a memory area + * managed by the OpenVX framework containing the requested data. + * \arg If both ptr and (*ptr) are not NULL, requested data will be copied to (*ptr) (optionally in + * case of write-only access). + * \param [in] usage The \ref vx_accessor_e value to describe the access of the object. + * \return A \ref vx_status_e enumeration. + * \post \ref vxCommitDistribution + * \ingroup group_distribution + */ +VX_API_ENTRY vx_status VX_API_CALL vxAccessDistribution(vx_distribution distribution, void **ptr, vx_enum usage); + +/*! \brief Commits the distribution object and decrements the object reference count in case of success. + * The memory must be a vx_uint32 array of a size at least as big as the value returned via + * \ref VX_DISTRIBUTION_ATTRIBUTE_BINS. + * \param [in] distribution The Distribution to modify. + * \param [in] ptr The pointer provided or returned by \ref vxAccessDistribution. The ptr cannot + * be NULL. + * \return A \ref vx_status_e enumeration. + * \pre \ref vxAccessDistribution.
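+ * A minimal, non-normative access/commit sketch, assuming \c dist is a valid \ref vx_distribution and \c numBins is its bin count: + * \code + * vx_uint32 *bins = NULL; + * if (vxAccessDistribution(dist, (void **)&bins, VX_READ_ONLY) == VX_SUCCESS) + * { + *     // inspect bins[0] .. bins[numBins - 1] here + *     vxCommitDistribution(dist, bins); + * } + * \endcode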
+ * \ingroup group_distribution + */ +VX_API_ENTRY vx_status VX_API_CALL vxCommitDistribution(vx_distribution distribution, const void * ptr); + +/*============================================================================== + THRESHOLD + =============================================================================*/ + +/*! \brief Creates a reference to a threshold object of a given type. + * \param [in] c The reference to the overall context. + * \param [in] thresh_type The type of threshold to create. + * \param [in] data_type The data type of the threshold's value(s). + * \if OPENVX_STRICT_1_0 + * \note For OpenVX 1.0, data_type can only be \ref VX_TYPE_UINT8. + * \endif + * \returns A threshold reference \ref vx_threshold. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_threshold VX_API_CALL vxCreateThreshold(vx_context c, vx_enum thresh_type, vx_enum data_type); + +/*! \brief Releases a reference to a threshold object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] thresh The pointer to the threshold to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If thresh is not a \ref vx_threshold. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseThreshold(vx_threshold *thresh); + +/*! \brief Sets attributes on the threshold object. + * \param [in] thresh The threshold object to set. + * \param [in] attribute The attribute to modify. Use a \ref vx_threshold_attribute_e enumeration. + * \param [in] ptr The pointer to the value to which to set the attribute. + * \param [in] size The size of the data pointed to by \a ptr. + * \return A \ref vx_status_e enumeration. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetThresholdAttribute(vx_threshold thresh, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Queries an attribute on the threshold object. + * \param [in] thresh The threshold object to query. + * \param [in] attribute The attribute to query. Use a \ref vx_threshold_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryThreshold(vx_threshold thresh, vx_enum attribute, void *ptr, vx_size size); + +/*============================================================================== + MATRIX + =============================================================================*/ + +/*! \brief Creates a reference to a matrix object. + * \param [in] c The reference to the overall context. + * \param [in] data_type The unit format of the matrix. \ref VX_TYPE_INT32 or \ref VX_TYPE_FLOAT32. + * \param [in] columns The first dimensionality. + * \param [in] rows The second dimensionality. + * \returns A matrix reference \ref vx_matrix. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_matrix VX_API_CALL vxCreateMatrix(vx_context c, vx_enum data_type, vx_size columns, vx_size rows); + +/*! \brief Releases a reference to a matrix object.
+ * The object may not be garbage collected until its total reference count is zero. + * \param [in] mat The matrix reference to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If mat is not a \ref vx_matrix. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseMatrix(vx_matrix *mat); + +/*! \brief Queries an attribute on the matrix object. + * \param [in] mat The matrix object to set. + * \param [in] attribute The attribute to query. Use a \ref vx_matrix_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryMatrix(vx_matrix mat, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Gets the matrix data (copy). + * \param [in] mat The reference to the matrix. + * \param [out] array The array in which to place the matrix. + * \see vxQueryMatrix and \ref VX_MATRIX_ATTRIBUTE_COLUMNS and \ref VX_MATRIX_ATTRIBUTE_ROWS + * to get the needed number of elements of the array. + * \return A \ref vx_status_e enumeration. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_status VX_API_CALL vxReadMatrix(vx_matrix mat, void *array); + +/*! \brief Sets the matrix data (copy) + * \param [in] mat The reference to the matrix. + * \param [in] array The array containing the matrix to be written. + * \see vxQueryMatrix and \ref VX_MATRIX_ATTRIBUTE_COLUMNS and \ref VX_MATRIX_ATTRIBUTE_ROWS + * to get the needed number of elements of the array.' + * \return A \ref vx_status_e enumeration. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_status VX_API_CALL vxWriteMatrix(vx_matrix mat, const void *array); + +/*============================================================================== + CONVOLUTION + =============================================================================*/ + +/*! \brief Creates a reference to a convolution matrix object. + * \param [in] context The reference to the overall context. + * \param [in] columns The columns dimension of the convolution. + * Must be odd and greater than or equal to 3 and less than the value returned + * from \ref VX_CONTEXT_ATTRIBUTE_CONVOLUTION_MAXIMUM_DIMENSION. + * \param [in] rows The rows dimension of the convolution. + * Must be odd and greater than or equal to 3 and less than the value returned + * from \ref VX_CONTEXT_ATTRIBUTE_CONVOLUTION_MAXIMUM_DIMENSION. + * \returns A convolution reference \ref vx_convolution. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_convolution + */ +VX_API_ENTRY vx_convolution VX_API_CALL vxCreateConvolution(vx_context context, vx_size columns, vx_size rows); + +/*! \brief Releases the reference to a convolution matrix. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] conv The pointer to the convolution matrix to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If conv is not a \ref vx_convolution. + * \ingroup group_convolution + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseConvolution(vx_convolution *conv); + +/*! 
 \brief Queries an attribute on the convolution matrix object.
+ * \param [in] conv The convolution matrix object to query.
+ * \param [in] attribute The attribute to query. Use a \ref vx_convolution_attribute_e enumeration.
+ * \param [out] ptr The location at which to store the resulting value.
+ * \param [in] size The size in bytes of the container to which \a ptr points.
+ * \return A \ref vx_status_e enumeration.
+ * \ingroup group_convolution
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxQueryConvolution(vx_convolution conv, vx_enum attribute, void *ptr, vx_size size);
+
+/*! \brief Sets attributes on the convolution object.
+ * \param [in] conv The convolution object to set.
+ * \param [in] attribute The attribute to modify. Use a \ref vx_convolution_attribute_e enumeration.
+ * \param [in] ptr The pointer to the value to which to set the attribute.
+ * \param [in] size The size in bytes of the data pointed to by \a ptr.
+ * \return A \ref vx_status_e enumeration.
+ * \ingroup group_convolution
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxSetConvolutionAttribute(vx_convolution conv, vx_enum attribute, const void *ptr, vx_size size);
+
+/*! \brief Gets the convolution data (copy).
+ * \param [in] conv The reference to the convolution.
+ * \param [out] array The array to place the convolution.
+ * \see vxQueryConvolution and \ref VX_CONVOLUTION_ATTRIBUTE_SIZE to get the
+ * needed number of bytes of the array.
+ * \return A \ref vx_status_e enumeration.
+ * \ingroup group_convolution
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxReadConvolutionCoefficients(vx_convolution conv, vx_int16 *array);
+
+/*! \brief Sets the convolution data (copy).
+ * \param [in] conv The reference to the convolution.
+ * \param [in] array The array containing the convolution to be written.
+ * \see \ref vxQueryConvolution and \ref VX_CONVOLUTION_ATTRIBUTE_SIZE to get the
+ * needed number of bytes of the array.
+ * \return A \ref vx_status_e enumeration.
+ * \ingroup group_convolution
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxWriteConvolutionCoefficients(vx_convolution conv, const vx_int16 *array);
+
+/*==============================================================================
+ PYRAMID
+ =============================================================================*/
+
+/*! \brief Creates a reference to a pyramid object of the supplied number of levels.
+ * \param [in] context The reference to the overall context.
+ * \param [in] levels The number of levels desired. This is required to be a non-zero value.
+ * \param [in] scale Used to indicate the scale between pyramid levels. This is required to be a non-zero positive value.
+ * \if OPENVX_STRICT_1_0
+ * In OpenVX 1.0, the only permissible values are \ref VX_SCALE_PYRAMID_HALF or \ref VX_SCALE_PYRAMID_ORB.
+ * \endif
+ * \param [in] width The width of the 0th level image in pixels.
+ * \param [in] height The height of the 0th level image in pixels.
+ * \param [in] format The format of all images in the pyramid. NV12, NV21, IYUV, UYVY and YUYV formats are not supported.
+ * \returns A pyramid reference \ref vx_pyramid. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_pyramid
+ */
+VX_API_ENTRY vx_pyramid VX_API_CALL vxCreatePyramid(vx_context context, vx_size levels, vx_float32 scale, vx_uint32 width, vx_uint32 height, vx_df_image format);
+
+/*! \brief Creates a reference to a virtual pyramid object of the supplied number of levels.
+ * \details Virtual Pyramids can be used to connect Nodes together when the contents of the pyramids will + * not be accessed by the user of the API. + * All of the following constructions are valid: + * \code + * vx_context context = vxCreateContext(); + * vx_graph graph = vxCreateGraph(context); + * vx_pyramid virt[] = { + * vxCreateVirtualPyramid(graph, 4, VX_SCALE_PYRAMID_HALF, 0, 0, VX_DF_IMAGE_VIRT), // no dimension and format specified for level 0 + * vxCreateVirtualPyramid(graph, 4, VX_SCALE_PYRAMID_HALF, 640, 480, VX_DF_IMAGE_VIRT), // no format specified. + * vxCreateVirtualPyramid(graph, 4, VX_SCALE_PYRAMID_HALF, 640, 480, VX_DF_IMAGE_U8), // no access + * }; + * \endcode + * \param [in] graph The reference to the parent graph. + * \param [in] levels The number of levels desired. This is required to be a non-zero value. + * \param [in] scale Used to indicate the scale between pyramid levels. This is required to be a non-zero positive value. + * \if OPENVX_STRICT_1_0 + * In OpenVX 1.0, the only permissible values are \ref VX_SCALE_PYRAMID_HALF or \ref VX_SCALE_PYRAMID_ORB. + * \endif + * \param [in] width The width of the 0th level image in pixels. This may be set to zero to indicate to the interface that the value is unspecified. + * \param [in] height The height of the 0th level image in pixels. This may be set to zero to indicate to the interface that the value is unspecified. + * \param [in] format The format of all images in the pyramid. This may be set to \ref VX_DF_IMAGE_VIRT to indicate that the format is unspecified. + * \returns A pyramid reference \ref vx_pyramid. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \note Images extracted with \ref vxGetPyramidLevel behave as Virtual Images and + * cause \ref vxAccessImagePatch to return errors. + * \ingroup group_pyramid + */ +VX_API_ENTRY vx_pyramid VX_API_CALL vxCreateVirtualPyramid(vx_graph graph, vx_size levels, vx_float32 scale, vx_uint32 width, vx_uint32 height, vx_df_image format); + + +/*! \brief Releases a reference to a pyramid object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] pyr The pointer to the pyramid to release. + * \ingroup group_pyramid + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If pyr is not a \ref vx_pyramid. + * \post After returning from this function the reference is zeroed. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleasePyramid(vx_pyramid *pyr); + +/*! \brief Queries an attribute from an image pyramid. + * \param [in] pyr The pyramid to query. + * \param [in] attribute The attribute for which to query. Use a \ref vx_pyramid_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \ingroup group_pyramid + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryPyramid(vx_pyramid pyr, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Retrieves a level of the pyramid as a \ref vx_image, which can be used + * elsewhere in OpenVX. A call to vxReleaseImage is necessary to release an image for each + * call of vxGetPyramidLevel. + * \param [in] pyr The pyramid object. + * \param [in] index The index of the level, such that index is less than levels. + * \return A \ref vx_image reference. + * \retval 0 Indicates that the index or the object is invalid. 
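+ * A typical iteration over the levels might look like the sketch below (illustrative only;
+ * it assumes a valid vx_context named context and a non-virtual pyramid):
+ * \code
+ * vx_pyramid pyr = vxCreatePyramid(context, 4, VX_SCALE_PYRAMID_HALF, 640, 480, VX_DF_IMAGE_U8);
+ * for (vx_uint32 i = 0; i < 4; i++)
+ * {
+ *     vx_image level = vxGetPyramidLevel(pyr, i);  // each level must be released separately
+ *     // ... use the level image ...
+ *     vxReleaseImage(&level);
+ * }
+ * vxReleasePyramid(&pyr);
+ * \endcode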
+ * \ingroup group_pyramid + */ +VX_API_ENTRY vx_image VX_API_CALL vxGetPyramidLevel(vx_pyramid pyr, vx_uint32 index); + +/*============================================================================== + REMAP + =============================================================================*/ + +/*! \brief Creates a remap table object. + * \param [in] context The reference to the overall context. + * \param [in] src_width Width of the source image in pixel. + * \param [in] src_height Height of the source image in pixels. + * \param [in] dst_width Width of the destination image in pixels. + * \param [in] dst_height Height of the destination image in pixels. + * \ingroup group_remap + * \returns A remap reference \ref vx_remap. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_remap VX_API_CALL vxCreateRemap(vx_context context, + vx_uint32 src_width, + vx_uint32 src_height, + vx_uint32 dst_width, + vx_uint32 dst_height); + +/*! \brief Releases a reference to a remap table object. The object may not be + * garbage collected until its total reference count is zero. + * \param [in] table The pointer to the remap table to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If table is not a \ref vx_remap. + * \ingroup group_remap + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseRemap(vx_remap *table); + +/*! \brief Assigns a destination pixel mapping to the source pixel. + * \param [in] table The remap table reference. + * \param [in] dst_x The destination x coordinate. + * \param [in] dst_y The destination y coordinate. + * \param [in] src_x The source x coordinate in float representation to allow interpolation. + * \param [in] src_y The source y coordinate in float representation to allow interpolation. + * \ingroup group_remap + * \return A \ref vx_status_e enumeration. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetRemapPoint(vx_remap table, + vx_uint32 dst_x, vx_uint32 dst_y, + vx_float32 src_x, vx_float32 src_y); + +/*! \brief Retrieves the source pixel point from a destination pixel. + * \param [in] table The remap table reference. + * \param [in] dst_x The destination x coordinate. + * \param [in] dst_y The destination y coordinate. + * \param [out] src_x The pointer to the location to store the source x coordinate in float representation to allow interpolation. + * \param [out] src_y The pointer to the location to store the source y coordinate in float representation to allow interpolation. + * \ingroup group_remap + * \return A \ref vx_status_e enumeration. + */ +VX_API_ENTRY vx_status VX_API_CALL vxGetRemapPoint(vx_remap table, + vx_uint32 dst_x, vx_uint32 dst_y, + vx_float32 *src_x, vx_float32 *src_y); + +/*! \brief Queries attributes from a Remap table. + * \param [in] r The remap to query. + * \param [in] attribute The attribute to query. Use a \ref vx_remap_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. 
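+ * For reference, a remap table is typically populated point by point before it is used, as in
+ * the sketch below (illustrative only; it assumes a valid vx_context named context and performs
+ * a simple horizontal flip of a 640x480 image):
+ * \code
+ * vx_remap table = vxCreateRemap(context, 640, 480, 640, 480);
+ * for (vx_uint32 y = 0; y < 480; y++)
+ *     for (vx_uint32 x = 0; x < 640; x++)
+ *         vxSetRemapPoint(table, x, y, (vx_float32)(639 - x), (vx_float32)y);
+ * // ... use the table with a remap node, then release it ...
+ * vxReleaseRemap(&table);
+ * \endcode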
+ * \ingroup group_remap
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxQueryRemap(vx_remap r, vx_enum attribute, void *ptr, vx_size size);
+
+/*==============================================================================
+ ARRAY
+ =============================================================================*/
+
+/*!
+ * \brief Creates a reference to an Array object.
+ *
+ * The user must specify the Array capacity (i.e., the maximal number of items that the array can hold).
+ *
+ * \param [in] context The reference to the overall Context.
+ * \param [in] item_type The type of objects to hold. Use:
+ * \arg \ref VX_TYPE_RECTANGLE for \ref vx_rectangle_t.
+ * \arg \ref VX_TYPE_KEYPOINT for \ref vx_keypoint_t.
+ * \arg \ref VX_TYPE_COORDINATES2D for \ref vx_coordinates2d_t.
+ * \arg \ref VX_TYPE_COORDINATES3D for \ref vx_coordinates3d_t.
+ * \arg \ref vx_enum Returned from \ref vxRegisterUserStruct.
+ * \param [in] capacity The maximal number of items that the array can hold.
+ *
+ * \returns An array reference \ref vx_array. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ *
+ * \ingroup group_array
+ */
+VX_API_ENTRY vx_array VX_API_CALL vxCreateArray(vx_context context, vx_enum item_type, vx_size capacity);
+
+/*!
+ * \brief Creates an opaque reference to a virtual Array with no direct user access.
+ *
+ * Virtual Arrays are useful when item type or capacity are unknown ahead of time
+ * and the Array is used as an internal graph edge. Virtual arrays are scoped within the parent graph only.
+ *
+ * All of the following constructions are allowed.
+ * \code
+ * vx_context context = vxCreateContext();
+ * vx_graph graph = vxCreateGraph(context);
+ * vx_array virt[] = {
+ *     vxCreateVirtualArray(graph, 0, 0), // totally unspecified
+ *     vxCreateVirtualArray(graph, VX_TYPE_KEYPOINT, 0), // unspecified capacity
+ *     vxCreateVirtualArray(graph, VX_TYPE_KEYPOINT, 1000), // no access
+ * };
+ * \endcode
+ *
+ * \param [in] graph The reference to the parent graph.
+ * \param [in] item_type The type of objects to hold.
+ * This may be set to zero to indicate an unspecified item type.
+ * \param [in] capacity The maximal number of items that the array can hold.
+ * This may be set to zero to indicate an unspecified capacity.
+ * \see vxCreateArray for a type list.
+ * \returns An array reference \ref vx_array. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ *
+ * \ingroup group_array
+ */
+VX_API_ENTRY vx_array VX_API_CALL vxCreateVirtualArray(vx_graph graph, vx_enum item_type, vx_size capacity);
+
+/*!
+ * \brief Releases a reference to an Array object.
+ * The object may not be garbage collected until its total reference count is zero.
+ * After returning from this function the reference is zeroed.
+ * \param [in] arr The pointer to the Array to release.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors.
+ * \retval VX_ERROR_INVALID_REFERENCE If arr is not a \ref vx_array.
+ * \ingroup group_array
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxReleaseArray(vx_array *arr);
+
+/*!
+ * \brief Queries the Array for some specific information.
+ *
+ * \param [in] arr The reference to the Array.
+ * \param [in] attribute The attribute to query. Use a \ref vx_array_attribute_e.
+ * \param [out] ptr The location at which to store the resulting value.
+ * \param [in] size The size in bytes of the container to which \a ptr points.
+ *
+ * \return A \ref vx_status_e enumeration.
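+ * A common pattern is to query \ref VX_ARRAY_ATTRIBUTE_NUMITEMS before accessing a range, as in
+ * the sketch below (illustrative only; it assumes arr already holds \ref VX_TYPE_KEYPOINT items
+ * and uses the map-by-implementation form of \ref vxAccessArrayRange):
+ * \code
+ * vx_size num = 0, stride = 0;
+ * void *base = NULL;
+ * vxQueryArray(arr, VX_ARRAY_ATTRIBUTE_NUMITEMS, &num, sizeof(num));
+ * if (num > 0 && vxAccessArrayRange(arr, 0, num, &stride, &base, VX_READ_ONLY) == VX_SUCCESS)
+ * {
+ *     vx_keypoint_t kp = vxArrayItem(vx_keypoint_t, base, 0, stride);  // first element
+ *     vxCommitArrayRange(arr, 0, num, base);                           // every access needs a commit
+ * }
+ * \endcode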
+ * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array. + * \retval VX_ERROR_NOT_SUPPORTED If the \a attribute is not a value supported on this implementation. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryArray(vx_array arr, vx_enum attribute, void *ptr, vx_size size); + +/*! + * \brief Adds items to the Array. + * + * This function increases the container size. + * + * By default, the function does not reallocate memory, + * so if the container is already full (number of elements is equal to capacity) + * or it doesn't have enough space, + * the function returns \ref VX_FAILURE error code. + * + * \param [in] arr The reference to the Array. + * \param [in] count The total number of elements to insert. + * \param [in] ptr The location at which to store the input values. + * \param [in] stride The number of bytes between the beginning of two consecutive elements. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array. + * \retval VX_FAILURE If the Array is full. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxAddArrayItems(vx_array arr, vx_size count, const void *ptr, vx_size stride); + +/*! + * \brief Truncates an Array (remove items from the end). + * + * \param [in,out] arr The reference to the Array. + * \param [in] new_num_items The new number of items for the Array. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array. + * \retval VX_ERROR_INVALID_PARAMETERS The \a new_size is greater than the current size. + * + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxTruncateArray(vx_array arr, vx_size new_num_items); + +/*! + * \brief Grants access to a sub-range of an Array. The number of elements in the sub-range is given by (end - start). + * + * \param [in] arr The reference to the Array. + * \param [in] start The start index. + * \param [in] end The end index. (end - start) elements are accessed from start. + * \param [in, out] stride A pointer to 'number of bytes' between the beginning of two consequent + * elements. + * \arg Input case: ptr is a pointer to a non-NULL pointer. The stride parameter must be the address + * of a vx_size scalar that describes how the user will access the requested array data at address + * (*ptr). + * \arg Output Case: ptr is a pointer to a NULL pointer. The function fills the vx_size scalar + * pointed by stride with the element stride information that the user must consult to access the + * array elements at address (*ptr). + * \param [out] ptr A pointer to a pointer to a location to store the requested data. + * \arg Input Case: ptr is a pointer to a non-NULL pointer to a valid buffer. This buffer will be + * used in one of two ways, depending on the value of the usage parameter. If usage is + * VX_WRITE_ONLY, then the buffer must contain element data that the user wants to replace the + * array's element data with. Otherwise (i.e., usage is not VX_WRITE_ONLY), the array's current + * element data will be written to the memory starting at address (*ptr) as storage memory for the + * access request. 
The caller must ensure enough memory has been allocated for the requested array + * range with the requested stride. + * \arg Output Case: ptr is a pointer to a NULL pointer. This NULL pointer will be overwritten with + * a pointer to the address where the requested data can be accessed. (*ptr) must eventually be provided + * as the ptr parameter of a call to vxCommitArrayRange. + * \param [in] usage This declares the intended usage of the pointer using the \ref vx_accessor_e enumeration. + * + * \note The stride and ptr parameters must both be input, or both be output, otherwise the behavior + * is undefined. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_OPTIMIZED_AWAY If the reference is a virtual array and cannot be accessed or committed. + * \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \post \ref vxCommitArrayRange + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxAccessArrayRange(vx_array arr, vx_size start, vx_size end, vx_size *stride, void **ptr, vx_enum usage); + +/*! + * \brief Commits data back to the Array object. + * + * \details This allows a user to commit data to a sub-range of an Array. The number of elements in the sub-range is given by (end - start). + * + * \param [in] arr The reference to the Array. + * \param [in] start The start index. + * \param [in] end The end index. (end - start) elements are accessed from start. + * \param [in] ptr The user supplied pointer. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_OPTIMIZED_AWAY If the reference is a virtual array and cannot be accessed or committed. + * \retval VX_ERROR_INVALID_REFERENCE If the \a arr is not a \ref vx_array. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxCommitArrayRange(vx_array arr, vx_size start, vx_size end, const void *ptr); + +/*! + * \brief Accesses a specific indexed element in an array. + * \param [in] ptr The base pointer for the array range. + * \param [in] index The index of the element, not byte, to access. + * \param [in] stride The 'number of bytes' between the beginning of two consecutive elements. + * \ingroup group_array + */ +#define vxFormatArrayPointer(ptr, index, stride) \ + (&(((vx_uint8*)(ptr))[(index) * (stride)])) + +/*! + * \brief Allows access to an array item as a typecast pointer deference. + * \param [in] type The type of the item to access. + * \param [in] ptr The base pointer for the array range. + * \param [in] index The index of the element, not byte, to access. + * \param [in] stride The 'number of bytes' between the beginning of two consecutive elements. + * \ingroup group_array + */ +#define vxArrayItem(type, ptr, index, stride) \ + (*(type *)(vxFormatArrayPointer((ptr), (index), (stride)))) + +/*============================================================================== + META FORMAT + =============================================================================*/ + +/*! \brief This function allows a user to set the attributes of a \ref vx_meta_format object in a kernel output validator. 
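+ *
+ * A minimal output validator for a kernel that produces a VGA \ref VX_DF_IMAGE_U8 image might
+ * look like the sketch below (illustrative only; the callback name and the fixed sizes are
+ * placeholders, and the signature is assumed to follow the \ref vx_kernel_output_validate_f prototype):
+ * \code
+ * static vx_status VX_CALLBACK my_output_validator(vx_node node, vx_uint32 index, vx_meta_format meta)
+ * {
+ *     vx_df_image format = VX_DF_IMAGE_U8;
+ *     vx_uint32 width = 640, height = 480;
+ *     vxSetMetaFormatAttribute(meta, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format));
+ *     vxSetMetaFormatAttribute(meta, VX_IMAGE_ATTRIBUTE_WIDTH,  &width,  sizeof(width));
+ *     vxSetMetaFormatAttribute(meta, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height));
+ *     return VX_SUCCESS;
+ * }
+ * \endcode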
+ * + * The \ref vx_meta_format object contains two types of information : data object meta data and + * some specific information that defines how the valid region of an image changes + * + * The meta data attributes that can be set are identified by this list: + * - \ref vx_image : \ref VX_IMAGE_ATTRIBUTE_FORMAT, \ref VX_IMAGE_ATTRIBUTE_HEIGHT, \ref VX_IMAGE_ATTRIBUTE_WIDTH + * - \ref vx_array : \ref VX_ARRAY_ATTRIBUTE_CAPACITY, \ref VX_ARRAY_ATTRIBUTE_ITEMTYPE + * - \ref vx_pyramid : \ref VX_PYRAMID_ATTRIBUTE_FORMAT, \ref VX_PYRAMID_ATTRIBUTE_HEIGHT, \ref VX_PYRAMID_ATTRIBUTE_WIDTH, \ref VX_PYRAMID_ATTRIBUTE_LEVELS, \ref VX_PYRAMID_ATTRIBUTE_SCALE + * - \ref vx_scalar : \ref VX_SCALAR_ATTRIBUTE_TYPE + * - \ref vx_matrix : \ref VX_MATRIX_ATTRIBUTE_TYPE, \ref VX_MATRIX_ATTRIBUTE_ROWS, \ref VX_MATRIX_ATTRIBUTE_COLUMNS + * - \ref vx_distribution : \ref VX_DISTRIBUTION_ATTRIBUTE_BINS, \ref VX_DISTRIBUTION_ATTRIBUTE_OFFSET, \ref VX_DISTRIBUTION_ATTRIBUTE_RANGE + * - \ref vx_remap : \ref VX_REMAP_ATTRIBUTE_SOURCE_WIDTH, \ref VX_REMAP_ATTRIBUTE_SOURCE_HEIGHT, \ref VX_REMAP_ATTRIBUTE_DESTINATION_WIDTH, \ref VX_REMAP_ATTRIBUTE_DESTINATION_HEIGHT + * - \ref vx_lut : \ref VX_LUT_ATTRIBUTE_TYPE, \ref VX_LUT_ATTRIBUTE_COUNT + * - \ref vx_threshold : \ref VX_THRESHOLD_ATTRIBUTE_TYPE + * - \ref VX_META_FORMAT_ATTRIBUTE_DELTA_RECTANGLE + * \note For vx_image, a specific attribute can be used to specify the valid region evolution. This information is not a meta data. + * + * \param [in] meta The reference to the \ref vx_meta_format struct to set + * \param [in] attribute Use the subset of data object attributes that define the meta data of this object or attributes from \ref vx_meta_format_attribute_e. + * \param [in] ptr The input pointer of the value to set on the meta format object. + * \param [in] size The size in bytes of the object to which \a ptr points. + * \ingroup group_user_kernels + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS The attribute was set. + * \retval VX_ERROR_INVALID_REFERENCE meta was not a \ref vx_meta_format. + * \retval VX_ERROR_INVALID_PARAMETER size was not correct for the type needed. + * \retval VX_ERROR_NOT_SUPPORTED the object attribute was not supported on the meta format object. + * \retval VX_ERROR_INVALID_TYPE attribute type did not match known meta format type. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetMetaFormatAttribute(vx_meta_format meta, vx_enum attribute, const void *ptr, vx_size size); + + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/openvx/include/VX/vx_kernels.h b/openvx/include/VX/vx_kernels.h new file mode 100644 index 0000000..707485a --- /dev/null +++ b/openvx/include/VX/vx_kernels.h @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2012-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. 
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ */
+
+#ifndef _OPENVX_KERNELS_H_
+#define _OPENVX_KERNELS_H_
+
+/*!
+ * \file
+ * \brief The list of supported kernels in the OpenVX standard.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief The standard list of available libraries */
+enum vx_library_e {
+ /*! \brief The base set of kernels as defined by Khronos. */
+ VX_LIBRARY_KHR_BASE = 0x0,
+};
+
+/*!
+ * \brief The standard list of available vision kernels.
+ *
+ * Each kernel listed here can be used with the \ref vxGetKernelByEnum call.
+ * When programming the parameters, use
+ * \arg \ref VX_INPUT for [in]
+ * \arg \ref VX_OUTPUT for [out]
+ * \arg \ref VX_BIDIRECTIONAL for [in,out]
+ *
+ * When programming the parameters, use
+ * \arg \ref VX_TYPE_IMAGE for a \ref vx_image in the size field of \ref vxGetParameterByIndex or \ref vxSetParameterByIndex
+ * \arg \ref VX_TYPE_ARRAY for a \ref vx_array in the size field of \ref vxGetParameterByIndex or \ref vxSetParameterByIndex
+ * \arg or other appropriate types in \ref vx_type_e.
+ * \ingroup group_kernel
+ */
+enum vx_kernel_e {
+
+ /*!
+ * \brief The invalid kernel is used for conformance failure in relation to
+ * some kernel operation (Get/Release).
+ * \details If the kernel is executed it shall always return an error.
+ * The kernel has no parameters. To address by name use "org.khronos.openvx.invalid".
+ */
+ VX_KERNEL_INVALID = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x0,
+
+ /*!
+ * \brief The Color Space conversion kernel.
+ * \details The conversions are based on the \ref vx_df_image_e code in the images.
+ * \see group_vision_function_colorconvert
+ */
+ VX_KERNEL_COLOR_CONVERT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1,
+
+ /*!
+ * \brief The Generic Channel Extraction Kernel.
+ * \details This kernel can remove individual color channels from an interleaved
+ * or semi-planar, planar, sub-sampled planar image. A client could extract
+ * a red channel from an interleaved RGB image or do a Luma extract from a
+ * YUV format.
+ * \see group_vision_function_channelextract
+ */
+ VX_KERNEL_CHANNEL_EXTRACT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x2,
+
+ /*!
+ * \brief The Generic Channel Combine Kernel.
+ * \details This kernel combines multiple individual planes into a single
+ * multiplanar image of the type specified in the output image.
+ * \see group_vision_function_channelcombine
+ */
+ VX_KERNEL_CHANNEL_COMBINE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3,
+
+ /*! \brief The Sobel 3x3 Filter Kernel.
+ * \see group_vision_function_sobel3x3
+ */
+ VX_KERNEL_SOBEL_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x4,
+
+ /*!
+ * \brief The Magnitude Kernel.
+ * \details This kernel produces a magnitude plane from two input gradients.
+ * \see group_vision_function_magnitude
+ */
+ VX_KERNEL_MAGNITUDE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x5,
+
+ /*!
+ * \brief The Phase Kernel.
+ * \details This kernel produces a phase plane from two input gradients.
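+ * As with any kernel enum in this list, a node can also be instantiated generically, for
+ * example (illustrative sketch; it assumes a valid context, graph and appropriately formatted
+ * images named grad_x, grad_y and orientation):
+ * \code
+ * vx_kernel kernel = vxGetKernelByEnum(context, VX_KERNEL_PHASE);
+ * vx_node node = vxCreateGenericNode(graph, kernel);
+ * vxSetParameterByIndex(node, 0, (vx_reference)grad_x);       // [in]  x gradient
+ * vxSetParameterByIndex(node, 1, (vx_reference)grad_y);       // [in]  y gradient
+ * vxSetParameterByIndex(node, 2, (vx_reference)orientation);  // [out] phase image
+ * \endcode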
+ * \see group_vision_function_phase
+ */
+ VX_KERNEL_PHASE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x6,
+
+ /*!
+ * \brief The Scale Image Kernel.
+ * \details This kernel provides resizing of an input image to an output image.
+ * The scaling factor is determined by the relative sizes of the input and
+ * output.
+ * \see group_vision_function_scale_image
+ */
+ VX_KERNEL_SCALE_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x7,
+
+ /*! \brief The Table Lookup kernel
+ * \see group_vision_function_lut
+ */
+ VX_KERNEL_TABLE_LOOKUP = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x8,
+
+ /*! \brief The Histogram Kernel.
+ * \see group_vision_function_histogram
+ */
+ VX_KERNEL_HISTOGRAM = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x9,
+
+ /*! \brief The Histogram Equalization Kernel.
+ * \see group_vision_function_equalize_hist
+ */
+ VX_KERNEL_EQUALIZE_HISTOGRAM = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xA,
+
+ /*! \brief The Absolute Difference Kernel.
+ * \see group_vision_function_absdiff
+ */
+ VX_KERNEL_ABSDIFF = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xB,
+
+ /*! \brief The Mean and Standard Deviation Kernel.
+ * \see group_vision_function_meanstddev
+ */
+ VX_KERNEL_MEAN_STDDEV = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xC,
+
+ /*! \brief The Threshold Kernel.
+ * \see group_vision_function_threshold
+ */
+ VX_KERNEL_THRESHOLD = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xD,
+
+ /*! \brief The Integral Image Kernel.
+ * \see group_vision_function_integral_image
+ */
+ VX_KERNEL_INTEGRAL_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xE,
+
+ /*! \brief The dilate kernel.
+ * \see group_vision_function_dilate_image
+ */
+ VX_KERNEL_DILATE_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xF,
+
+ /*! \brief The erode kernel.
+ * \see group_vision_function_erode_image
+ */
+ VX_KERNEL_ERODE_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x10,
+
+ /*! \brief The median image filter.
+ * \see group_vision_function_median_image
+ */
+ VX_KERNEL_MEDIAN_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x11,
+
+ /*! \brief The box filter kernel.
+ * \see group_vision_function_box_image
+ */
+ VX_KERNEL_BOX_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x12,
+
+ /*! \brief The gaussian filter kernel.
+ * \see group_vision_function_gaussian_image
+ */
+ VX_KERNEL_GAUSSIAN_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x13,
+
+ /*! \brief The custom convolution kernel.
+ * \see group_vision_function_custom_convolution
+ */
+ VX_KERNEL_CUSTOM_CONVOLUTION = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x14,
+
+ /*! \brief The gaussian image pyramid kernel.
+ * \see group_vision_function_gaussian_pyramid
+ */
+ VX_KERNEL_GAUSSIAN_PYRAMID = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x15,
+
+ /*! \brief The accumulation kernel.
+ * \see group_vision_function_accumulate
+ */
+ VX_KERNEL_ACCUMULATE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x16,
+
+ /*! \brief The weighted accumulation kernel.
+ * \see group_vision_function_accumulate_weighted
+ */
+ VX_KERNEL_ACCUMULATE_WEIGHTED = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x17,
+
+ /*! \brief The squared accumulation kernel.
+ * \see group_vision_function_accumulate_square
+ */
+ VX_KERNEL_ACCUMULATE_SQUARE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x18,
+
+ /*! \brief The min and max location kernel.
+ * \see group_vision_function_minmaxloc + */ + VX_KERNEL_MINMAXLOC = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x19, + + /*! \brief The bit-depth conversion kernel. + * \see group_vision_function_convertdepth + */ + VX_KERNEL_CONVERTDEPTH = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1A, + + /*! \brief The Canny Edge Detector. + * \see group_vision_function_canny + */ + VX_KERNEL_CANNY_EDGE_DETECTOR = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1B, + + /*! \brief The Bitwise And Kernel. + * \see group_vision_function_and + */ + VX_KERNEL_AND = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1C, + + /*! \brief The Bitwise Inclusive Or Kernel. + * \see group_vision_function_or + */ + VX_KERNEL_OR = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1D, + + /*! \brief The Bitwise Exclusive Or Kernel. + * \see group_vision_function_xor + */ + VX_KERNEL_XOR = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1E, + + /*! \brief The Bitwise Not Kernel. + * \see group_vision_function_not + */ + VX_KERNEL_NOT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1F, + + /*! \brief The Pixelwise Multiplication Kernel. + * \see group_vision_function_mult + */ + VX_KERNEL_MULTIPLY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x20, + + /*! \brief The Addition Kernel. + * \see group_vision_function_add + */ + VX_KERNEL_ADD = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x21, + + /*! \brief The Subtraction Kernel. + * \see group_vision_function_sub + */ + VX_KERNEL_SUBTRACT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x22, + + /*! \brief The Warp Affine Kernel. + * \see group_vision_function_warp_affine + */ + VX_KERNEL_WARP_AFFINE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x23, + + /*! \brief The Warp Perspective Kernel. + * \see group_vision_function_warp_perspective + */ + VX_KERNEL_WARP_PERSPECTIVE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x24, + + /*! \brief The Harris Corners Kernel. + * \see group_vision_function_harris + */ + VX_KERNEL_HARRIS_CORNERS = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x25, + + /*! \brief The FAST Corners Kernel. + * \see group_vision_function_fast + */ + VX_KERNEL_FAST_CORNERS = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x26, + + /*! \brief The Optical Flow Pyramid (LK) Kernel. + * \see group_vision_function_opticalflowpyrlk + */ + VX_KERNEL_OPTICAL_FLOW_PYR_LK = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x27, + + /*! \brief The Remap Kernel. + * \see group_vision_function_remap + */ + VX_KERNEL_REMAP = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x28, + + + /*! \brief The Half Scale Gaussian Kernel. + * \see group_vision_function_scale_image + */ + VX_KERNEL_HALFSCALE_GAUSSIAN = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x29, + + /* insert new kernels here */ + VX_KERNEL_MAX_1_0, /*!< \internal Used for bounds checking in the conformance test. */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _OPEN_VISION_LIBRARY_KERNELS_H_ */ diff --git a/openvx/include/VX/vx_nodes.h b/openvx/include/VX/vx_nodes.h new file mode 100644 index 0000000..d0d7718 --- /dev/null +++ b/openvx/include/VX/vx_nodes.h @@ -0,0 +1,592 @@ +/******************************************************************************* +* Copyright (c) 2012-2015 The Khronos Group Inc. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and/or associated documentation files (the +* "Materials"), to deal in the Materials without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Materials, and to +* permit persons to whom the Materials are furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Materials. +* +* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. +******************************************************************************/ + +#ifndef _OPENVX_NODES_H_ +#define _OPENVX_NODES_H_ + +/*! + * \file vx_nodes.h + * \brief The "Simple" API interface for OpenVX. These APIs are just + * wrappers around the more verbose functions defined in \ref vx_api.h. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief [Graph] Creates a color conversion node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image from which to convert. + * \param [out] output The output image to which to convert. + * \see VX_KERNEL_COLOR_CONVERT + * \ingroup group_vision_function_colorconvert + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxColorConvertNode(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a channel extract node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image. Must be one of the defined \ref vx_df_image_e multi-planar formats. + * \param [in] channel The \ref vx_channel_e channel to extract. + * \param [out] output The output image. Must be \ref VX_DF_IMAGE_U8. + * \see VX_KERNEL_CHANNEL_EXTRACT + * \ingroup group_vision_function_channelextract + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxChannelExtractNode(vx_graph graph, + vx_image input, + vx_enum channel, + vx_image output); + +/*! \brief [Graph] Creates a channel combine node. + * \param [in] graph The graph reference. + * \param [in] plane0 The plane that forms channel 0. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane1 The plane that forms channel 1. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane2 [optional] The plane that forms channel 2. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane3 [optional] The plane that forms channel 3. Must be \ref VX_DF_IMAGE_U8. + * \param [out] output The output image. The format of the image must be defined, even if the image is virtual. + * \see VX_KERNEL_CHANNEL_COMBINE + * \ingroup group_vision_function_channelcombine + * \return \ref vx_node. + * \retval vx_node A node reference. 
Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxChannelCombineNode(vx_graph graph, + vx_image plane0, + vx_image plane1, + vx_image plane2, + vx_image plane3, + vx_image output); + +/*! \brief [Graph] Creates a Phase node. + * \param [in] graph The reference to the graph. + * \param [in] grad_x The input x image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [in] grad_y The input y image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [out] orientation The phase image. This is in \ref VX_DF_IMAGE_U8 format. + * \see VX_KERNEL_PHASE + * \ingroup group_vision_function_phase + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxPhaseNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image orientation); + +/*! \brief [Graph] Creates a Sobel3x3 node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output_x [optional] The output gradient in the x direction in \ref VX_DF_IMAGE_S16. + * \param [out] output_y [optional] The output gradient in the y direction in \ref VX_DF_IMAGE_S16. + * \see VX_KERNEL_SOBEL_3x3 + * \ingroup group_vision_function_sobel3x3 + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxSobel3x3Node(vx_graph graph, vx_image input, vx_image output_x, vx_image output_y); + + +/*! \brief [Graph] Create a Magnitude node. + * \param [in] graph The reference to the graph. + * \param [in] grad_x The input x image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [in] grad_y The input y image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [out] mag The magnitude image. This is in \ref VX_DF_IMAGE_S16 format. + * \see VX_KERNEL_MAGNITUDE + * \ingroup group_vision_function_magnitude + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMagnitudeNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image mag); + +/*! \brief [Graph] Creates a Scale Image Node. + * \param [in] graph The reference to the graph. + * \param [in] src The source image of type \ref VX_DF_IMAGE_U8. + * \param [out] dst The destination image of type \ref VX_DF_IMAGE_U8. + * \param [in] type The interpolation type to use. \see vx_interpolation_type_e. + * \ingroup group_vision_function_scale_image + * \note The destination image must have a defined size and format. Only + * \ref VX_NODE_ATTRIBUTE_BORDER_MODE value \ref VX_BORDER_MODE_UNDEFINED, + * \ref VX_BORDER_MODE_REPLICATE or \ref VX_BORDER_MODE_CONSTANT is supported. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxScaleImageNode(vx_graph graph, vx_image src, vx_image dst, vx_enum type); + +/*! \brief [Graph] Creates a Table Lookup node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8. + * \param [in] lut The LUT which is of type \ref VX_TYPE_UINT8. 
+ * \param [out] output The output image of type \ref VX_DF_IMAGE_U8. + * \ingroup group_vision_function_lut + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxTableLookupNode(vx_graph graph, vx_image input, vx_lut lut, vx_image output); + +/*! \brief [Graph] Creates a Histogram node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8. + * \param [out] distribution The output distribution. + * \ingroup group_vision_function_histogram + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxHistogramNode(vx_graph graph, vx_image input, vx_distribution distribution); + +/*! \brief [Graph] Creates a Histogram Equalization node. + * \param [in] graph The reference to the graph. + * \param [in] input The grayscale input image in \ref VX_DF_IMAGE_U8. + * \param [out] output The grayscale output image of type \ref VX_DF_IMAGE_U8 with equalized brightness and contrast. + * \ingroup group_vision_function_equalize_hist + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxEqualizeHistNode(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates an AbsDiff node. + * \param [in] graph The reference to the graph. + * \param [in] in1 An input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [in] in2 An input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_absdiff + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxAbsDiffNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a mean value and standard deviation node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image. \ref VX_DF_IMAGE_U8 is supported. + * \param [out] mean The \ref VX_TYPE_FLOAT32 average pixel value. + * \param [out] stddev The \ref VX_TYPE_FLOAT32 standard deviation of the pixel values. + * \ingroup group_vision_function_meanstddev + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMeanStdDevNode(vx_graph graph, vx_image input, vx_scalar mean, vx_scalar stddev); + +/*! \brief [Graph] Creates a Threshold node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image. \ref VX_DF_IMAGE_U8 is supported. + * \param [in] thresh The thresholding object that defines the parameters of + * the operation. + * \param [out] output The output Boolean image. Values are either 0 or 255. + * \ingroup group_vision_function_threshold + * \return \ref vx_node. + * \retval vx_node A node reference. 
Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxThresholdNode(vx_graph graph, vx_image input, vx_threshold thresh, vx_image output); + +/*! \brief [Graph] Creates an Integral Image Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U32 format. + * \ingroup group_vision_function_integral_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxIntegralImageNode(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates an Erosion Image Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_erode_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxErode3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Dilation Image Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_dilate_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxDilate3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Median Image Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_median_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMedian3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Box Filter Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_box_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxBox3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Gaussian Filter Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_gaussian_image + * \return \ref vx_node. + * \retval vx_node A node reference. 
Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxGaussian3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a custom convolution node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [in] conv The vx_int16 convolution matrix. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_custom_convolution + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvolveNode(vx_graph graph, vx_image input, vx_convolution conv, vx_image output); + +/*! \brief [Graph] Creates a node for a Gaussian Image Pyramid. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] gaussian The Gaussian pyramid with \ref VX_DF_IMAGE_U8 to construct. + * \ingroup group_vision_function_gaussian_pyramid + * \see group_pyramid + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxGaussianPyramidNode(vx_graph graph, vx_image input, vx_pyramid gaussian); + +/*! \brief [Graph] Creates an accumulate node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in,out] accum The accumulation image in \ref VX_DF_IMAGE_S16. + * \ingroup group_vision_function_accumulate + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxAccumulateImageNode(vx_graph graph, vx_image input, vx_image accum); + +/*! \brief [Graph] Creates a weighted accumulate node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] alpha The input \ref VX_TYPE_FLOAT32 scalar value with a value in the range of \f$ 0.0 \le \alpha \le 1.0 \f$. + * \param [in,out] accum The \ref VX_DF_IMAGE_U8 accumulation image. + * \ingroup group_vision_function_accumulate_weighted + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxAccumulateWeightedImageNode(vx_graph graph, vx_image input, vx_scalar alpha, vx_image accum); + +/*! \brief [Graph] Creates an accumulate square node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] shift The input \ref VX_TYPE_UINT32 with a value in the range of \f$ 0 \le shift \le 15 \f$. + * \param [in,out] accum The accumulation image in \ref VX_DF_IMAGE_S16. + * \ingroup group_vision_function_accumulate_square + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxAccumulateSquareImageNode(vx_graph graph, vx_image input, vx_scalar shift, vx_image accum); + +/*! \brief [Graph] Creates a min,max,loc node. + * \param [in] graph The reference to create the graph. 
+ * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] minVal The minimum value in the image, which corresponds to the type of the input. + * \param [out] maxVal The maximum value in the image, which corresponds to the type of the input. + * \param [out] minLoc The minimum \ref VX_TYPE_COORDINATES2D locations (optional). If the input image has several minimums, the kernel will return up to the capacity of the array. + * \param [out] maxLoc The maximum \ref VX_TYPE_COORDINATES2D locations (optional). If the input image has several maximums, the kernel will return up to the capacity of the array. + * \param [out] minCount The total number of detected minimums in image (optional). Use a \ref VX_TYPE_UINT32 scalar. + * \param [out] maxCount The total number of detected maximums in image (optional). Use a \ref VX_TYPE_UINT32 scalar. + * \ingroup group_vision_function_minmaxloc + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMinMaxLocNode(vx_graph graph, + vx_image input, + vx_scalar minVal, vx_scalar maxVal, + vx_array minLoc, vx_array maxLoc, + vx_scalar minCount, vx_scalar maxCount); + +/*! \brief [Graph] Creates a bitwise AND node. + * \param [in] graph The reference to the graph. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image. + * \param [out] out The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_and + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxAndNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a bitwise INCLUSIVE OR node. + * \param [in] graph The reference to the graph. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image. + * \param [out] out The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_or + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxOrNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a bitwise EXCLUSIVE OR node. + * \param [in] graph The reference to the graph. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image. + * \param [out] out The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_xor + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxXorNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a bitwise NOT node. + * \param [in] graph The reference to the graph. + * \param [in] input A \ref VX_DF_IMAGE_U8 input image. + * \param [out] output The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_not + * \return \ref vx_node. + * \retval vx_node A node reference. 
Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxNotNode(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates an pixelwise-multiplication node. + * \param [in] graph The reference to the graph. + * \param [in] in1 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] in2 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] scale A non-negative \ref VX_TYPE_FLOAT32 multiplied to each product before overflow handling. + * \param [in] overflow_policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [in] rounding_policy A \ref VX_TYPE_ENUM of the \ref vx_round_policy_e enumeration. + * \param [out] out The output image, a \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 image. + * \ingroup group_vision_function_mult + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMultiplyNode(vx_graph graph, + vx_image in1, vx_image in2, + vx_scalar scale, + vx_enum overflow_policy, + vx_enum rounding_policy, + vx_image out); + +/*! \brief [Graph] Creates an arithmetic addition node. + * \param [in] graph The reference to the graph. + * \param [in] in1 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] in2 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [out] out The output image, a \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 image. + * \ingroup group_vision_function_add + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxAddNode(vx_graph graph, + vx_image in1, vx_image in2, + vx_enum policy, + vx_image out); + +/*! \brief [Graph] Creates an arithmetic subtraction node. + * \param [in] graph The reference to the graph. + * \param [in] in1 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16, the minuend. + * \param [in] in2 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16, the subtrahend. + * \param [in] policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [out] out The output image, a \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 image. + * \ingroup group_vision_function_sub + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxSubtractNode(vx_graph graph, + vx_image in1, vx_image in2, + vx_enum policy, + vx_image out); + +/*! \brief [Graph] Creates a bit-depth conversion node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image. + * \param [out] output The output image. + * \param [in] policy A scalar containing a \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [in] shift A scalar containing a \ref VX_TYPE_INT32 of the shift value. + * \ingroup group_vision_function_convertdepth + * \return \ref vx_node. + * \retval vx_node A node reference. 
Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvertDepthNode(vx_graph graph, vx_image input, vx_image output, vx_enum policy, vx_scalar shift); + +/*! \brief [Graph] Creates a Canny Edge Detection Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] hyst The double threshold for hysteresis. + * \param [in] gradient_size The size of the Sobel filter window, must support at least 3, 5, and 7. + * \param [in] norm_type A flag indicating the norm used to compute the gradient, \ref VX_NORM_L1 or VX_NORM_L2. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format with values either 0 or 255. + * \ingroup group_vision_function_canny + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxCannyEdgeDetectorNode(vx_graph graph, vx_image input, vx_threshold hyst, + vx_int32 gradient_size, vx_enum norm_type, + vx_image output); + +/*! \brief [Graph] Creates an Affine Warp Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] matrix The affine matrix. Must be 2x3 of type \ref VX_TYPE_FLOAT32. + * \param [in] type The interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_TYPE_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \ingroup group_vision_function_warp_affine + * \note Only \ref VX_NODE_ATTRIBUTE_BORDER_MODE value \ref VX_BORDER_MODE_UNDEFINED or + * \ref VX_BORDER_MODE_CONSTANT is supported. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxWarpAffineNode(vx_graph graph, vx_image input, vx_matrix matrix, vx_enum type, vx_image output); + +/*! \brief [Graph] Creates a Perspective Warp Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] matrix The perspective matrix. Must be 3x3 of type \ref VX_TYPE_FLOAT32. + * \param [in] type The interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_TYPE_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \ingroup group_vision_function_warp_perspective + * \note Only \ref VX_NODE_ATTRIBUTE_BORDER_MODE value \ref VX_BORDER_MODE_UNDEFINED or + * \ref VX_BORDER_MODE_CONSTANT is supported. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxWarpPerspectiveNode(vx_graph graph, vx_image input, vx_matrix matrix, vx_enum type, vx_image output); + +/*! \brief [Graph] Creates a Harris Corners Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] strength_thresh The \ref VX_TYPE_FLOAT32 minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel). + * \param [in] min_distance The \ref VX_TYPE_FLOAT32 radial Euclidean distance for non-maximum suppression. 
+ * \param [in] sensitivity The \ref VX_TYPE_FLOAT32 scalar sensitivity threshold \f$ k \f$ from the Harris-Stephens equation. + * \param [in] gradient_size The gradient window size to use on the input. The + * implementation must support at least 3, 5, and 7. + * \param [in] block_size The block window size used to compute the Harris Corner score. + * The implementation must support at least 3, 5, and 7. + * \param [out] corners The array of \ref VX_TYPE_KEYPOINT objects. + * \param [out] num_corners The total number of detected corners in image (optional). Use a \ref VX_TYPE_SIZE scalar. + * \ingroup group_vision_function_harris + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxHarrisCornersNode(vx_graph graph, + vx_image input, + vx_scalar strength_thresh, + vx_scalar min_distance, + vx_scalar sensitivity, + vx_int32 gradient_size, + vx_int32 block_size, + vx_array corners, + vx_scalar num_corners); + +/*! \brief [Graph] Creates a FAST Corners Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] strength_thresh Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3 (\ref VX_TYPE_FLOAT32 scalar). + * \param [in] nonmax_suppression If true, non-maximum suppression is applied to + * detected corners before being placed in the \ref vx_array of \ref VX_TYPE_KEYPOINT objects. + * \param [out] corners Output corner \ref vx_array of \ref VX_TYPE_KEYPOINT. + * \param [out] num_corners The total number of detected corners in image (optional). Use a \ref VX_TYPE_SIZE scalar. + * \ingroup group_vision_function_fast + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxFastCornersNode(vx_graph graph, vx_image input, vx_scalar strength_thresh, vx_bool nonmax_suppression, vx_array corners, vx_scalar num_corners); + +/*! \brief [Graph] Creates a Lucas Kanade Tracking Node. + * \param [in] graph The reference to the graph. + * \param [in] old_images Input of first (old) image pyramid in \ref VX_DF_IMAGE_U8. + * \param [in] new_images Input of destination (new) image pyramid \ref VX_DF_IMAGE_U8. + * \param [in] old_points An array of key points in a \ref vx_array of \ref VX_TYPE_KEYPOINT; those key points are defined at + * the \a old_images high resolution pyramid. + * \param [in] new_points_estimates An array of estimation on what is the output key points in a \ref vx_array of + * \ref VX_TYPE_KEYPOINT; those keypoints are defined at the \a new_images high resolution pyramid. + * \param [out] new_points An output array of key points in a \ref vx_array of \ref VX_TYPE_KEYPOINT; those key points are + * defined at the \a new_images high resolution pyramid. + * \param [in] termination The termination can be \ref VX_TERM_CRITERIA_ITERATIONS or \ref VX_TERM_CRITERIA_EPSILON or + * \ref VX_TERM_CRITERIA_BOTH. + * \param [in] epsilon The \ref vx_float32 error for terminating the algorithm. + * \param [in] num_iterations The number of iterations. Use a \ref VX_TYPE_UINT32 scalar. + * \param [in] use_initial_estimate Use a \ref VX_TYPE_BOOL scalar. + * \param [in] window_dimension The size of the window on which to perform the algorithm. 
See + * \ref VX_CONTEXT_ATTRIBUTE_OPTICAL_FLOW_WINDOW_MAXIMUM_DIMENSION + * \ingroup group_vision_function_opticalflowpyrlk + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxOpticalFlowPyrLKNode(vx_graph graph, + vx_pyramid old_images, + vx_pyramid new_images, + vx_array old_points, + vx_array new_points_estimates, + vx_array new_points, + vx_enum termination, + vx_scalar epsilon, + vx_scalar num_iterations, + vx_scalar use_initial_estimate, + vx_size window_dimension); + +/*! \brief [Graph] Creates a Remap Node. + * \param [in] graph The reference to the graph that will contain the node. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] table The remap table object. + * \param [in] policy An interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_TYPE_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \note Only \ref VX_NODE_ATTRIBUTE_BORDER_MODE value \ref VX_BORDER_MODE_UNDEFINED or + * \ref VX_BORDER_MODE_CONSTANT is supported. + * \return A \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + * \ingroup group_vision_function_remap + */ +VX_API_ENTRY vx_node VX_API_CALL vxRemapNode(vx_graph graph, + vx_image input, + vx_remap table, + vx_enum policy, + vx_image output); + +/*! \brief [Graph] Performs a Gaussian Blur on an image then half-scales it. + * \details The output image size is determined by: + * \f[ + * W_{output} = \frac{W_{input} + 1}{2} \\ + * , + * H_{output} = \frac{H_{input} + 1}{2} + * \f] + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \param [in] kernel_size The input size of the Gaussian filter. Supported values are 3 and 5. + * \ingroup group_vision_function_scale_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxHalfScaleGaussianNode(vx_graph graph, vx_image input, vx_image output, vx_int32 kernel_size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/openvx/include/VX/vx_types.h b/openvx/include/VX/vx_types.h new file mode 100644 index 0000000..9704e54 --- /dev/null +++ b/openvx/include/VX/vx_types.h @@ -0,0 +1,1449 @@ +/* + * Copyright (c) 2012-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ */
+
+#ifndef _OPENVX_TYPES_H_
+#define _OPENVX_TYPES_H_
+
+/*!
+ * \file vx_types.h
+ * \brief The type definitions required by OpenVX Library.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdarg.h>
+
+/*!
+ * \internal
+ * \def VX_API_ENTRY
+ * \brief This is a tag used to identify exported, public API functions as
+ * distinct from internal functions, helpers, and other non-public interfaces.
+ * It can optionally be defined in the make system according to the compiler and intent.
+ * \ingroup group_basic_features
+ */
+#ifndef VX_API_ENTRY
+#define VX_API_ENTRY
+#endif
+#ifndef VX_API_CALL
+#if defined(_WIN32)
+#define VX_API_CALL __stdcall
+#else
+#define VX_API_CALL
+#endif
+#endif
+#ifndef VX_CALLBACK
+#if defined(_WIN32)
+#define VX_CALLBACK __stdcall
+#else
+#define VX_CALLBACK
+#endif
+#endif
+
+/*! \brief An 8-bit ASCII character.
+ * \ingroup group_basic_features
+ */
+typedef char vx_char;
+
+/*! \brief An 8-bit unsigned value.
+ * \ingroup group_basic_features
+ */
+typedef uint8_t vx_uint8;
+
+/*! \brief A 16-bit unsigned value.
+ * \ingroup group_basic_features
+ */
+typedef uint16_t vx_uint16;
+
+/*! \brief A 32-bit unsigned value.
+ * \ingroup group_basic_features
+ */
+typedef uint32_t vx_uint32;
+
+/*! \brief A 64-bit unsigned value.
+ * \ingroup group_basic_features
+ */
+typedef uint64_t vx_uint64;
+
+/*! \brief An 8-bit signed value.
+ * \ingroup group_basic_features
+ */
+typedef int8_t vx_int8;
+
+/*! \brief A 16-bit signed value.
+ * \ingroup group_basic_features
+ */
+typedef int16_t vx_int16;
+
+/*! \brief A 32-bit signed value.
+ * \ingroup group_basic_features
+ */
+typedef int32_t vx_int32;
+
+/*! \brief A 64-bit signed value.
+ * \ingroup group_basic_features
+ */
+typedef int64_t vx_int64;
+
+#if defined(EXPERIMENTAL_PLATFORM_SUPPORTS_16_FLOAT)
+
+/*! \brief A 16-bit float value.
+ * \ingroup group_basic_features
+ */
+typedef hfloat vx_float16;
+#endif
+
+/*! \brief A 32-bit float value.
+ * \ingroup group_basic_features
+ */
+typedef float vx_float32;
+
+/*! \brief A 64-bit float value (aka double).
+ * \ingroup group_basic_features
+ */
+typedef double vx_float64;
+
+/*! \brief A generic opaque reference to any object within OpenVX.
+ * \details A user of OpenVX should not assume that this can be cast directly to anything;
+ * however, any object in OpenVX can be cast back to this for the purposes of
+ * querying attributes of the object or for passing the object as a parameter to
+ * functions that take a \ref vx_reference type.
+ * If the API does not take that specific type but may take others, an
+ * error may be returned from the API.
+ * \ingroup group_reference
+ */
+typedef struct _vx_reference *vx_reference;
+
+/*! \brief Sets the standard enumeration type size to be a fixed quantity.
+ * \details All enumerable fields must use this type as the container to
+ * enforce enumeration ranges and sizeof() operations.
+ * \ingroup group_basic_features
+ */
+typedef int32_t vx_enum;
+
+/*! \brief A wrapper of size_t to keep the naming convention uniform.
+ * \ingroup group_basic_features
+ */
+typedef size_t vx_size;
+
+/*! \brief Used to hold a VX_DF_IMAGE code to describe the pixel format and color space.
+ * \ingroup group_basic_features
+ */
+typedef uint32_t vx_df_image;
+
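+/*
+ * Editorial note: an illustrative usage sketch only, not part of the Khronos header text.
+ * It shows the basic types above in use; vxCreateContext, vxGetStatus, vxCreateImage and
+ * the release calls are declared in VX/vx_api.h, and VX_DF_IMAGE_U8 / VX_SUCCESS are
+ * defined later in this header.
+ *
+ *     vx_context context = vxCreateContext();
+ *     if (vxGetStatus((vx_reference)context) == VX_SUCCESS)
+ *     {
+ *         vx_uint32 width = 640, height = 480;
+ *         vx_image image = vxCreateImage(context, width, height, VX_DF_IMAGE_U8);
+ *         // ... build and process graphs using the image ...
+ *         vxReleaseImage(&image);
+ *         vxReleaseContext(&context);
+ *     }
+ */
+
+/*!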
\brief An opaque reference to a scalar. + * \details A scalar can be up to 64 bits wide. + * \see vxCreateScalar + * \ingroup group_scalar + * \extends vx_reference + */ +typedef struct _vx_scalar *vx_scalar; + +/*! \brief An opaque reference to an image. + * \see vxCreateImage + * \ingroup group_image + * \extends vx_reference + */ +typedef struct _vx_image *vx_image; + +/*! \brief An opaque reference to the descriptor of a kernel. + * \see vxGetKernelByName + * \see vxGetKernelByEnum + * \ingroup group_kernel + * \extends vx_reference + */ +typedef struct _vx_kernel *vx_kernel; + +/*! \brief An opaque reference to a single parameter. + * \see vxGetParameterByIndex + * \ingroup group_parameter + * \extends vx_reference + */ +typedef struct _vx_parameter *vx_parameter; + +/*! \brief An opaque reference to a kernel node. + * \see vxCreateGenericNode + * \ingroup group_node + * \extends vx_reference + */ +typedef struct _vx_node *vx_node; + +/*! \brief An opaque reference to a graph + * \see vxCreateGraph + * \ingroup group_graph + * \extends vx_reference + */ +typedef struct _vx_graph *vx_graph; + +/*! \brief An opaque reference to the implementation context. + * \see vxCreateContext + * \ingroup group_context + * \extends vx_reference + */ +typedef struct _vx_context *vx_context; + +/*! \brief The delay object. This is like a ring buffer of objects that is + * maintained by the OpenVX implementation. + * \see vxCreateDelay + * \extends vx_reference + * \ingroup group_delay + */ +typedef struct _vx_delay *vx_delay; + +/*! \brief The Look-Up Table (LUT) Object. + * \extends vx_reference + * \ingroup group_lut + */ +typedef struct _vx_lut *vx_lut; + +/*! \brief The Distribution object. This has a user-defined number of bins over + * a user-defined range (within a uint32_t range). + * \extends vx_reference + * \ingroup group_distribution + */ +typedef struct _vx_distribution *vx_distribution; + +/*! \brief The Matrix Object. An MxN matrix of some unit type. + * \extends vx_reference + * \ingroup group_matrix + */ +typedef struct _vx_matrix *vx_matrix; + +/*! \brief The Image Pyramid object. A set of scaled images. + * \extends vx_reference + * \ingroup group_pyramid + */ +typedef struct _vx_pyramid *vx_pyramid; + +/*! \brief The Threshold Object. A thresholding object contains the types and + * limit values of the thresholding required. + * \extends vx_reference + * \ingroup group_threshold + */ +typedef struct _vx_threshold *vx_threshold; + +/*! \brief The Convolution Object. A user-defined convolution kernel of MxM elements. + * \extends vx_reference + * \ingroup group_convolution + */ +typedef struct _vx_convolution *vx_convolution; + +/*! \brief The remap table Object. A remap table contains per-pixel mapping of + * output pixels to input pixels. + * \ingroup group_remap + */ +typedef struct _vx_remap *vx_remap; + +/*! \brief The Array Object. Array is a strongly-typed container for other data structures. + * \ingroup group_array + */ +typedef struct _vx_array *vx_array; + +/*! \brief A Boolean value. + * This allows 0 to be FALSE, as it is in C, and any non-zero to be TRUE. + * \code + * vx_bool ret = vx_true_e; + * if (ret) printf("true!\n"); + * ret = vx_false_e; + * if (!ret) printf("false!\n"); + * \endcode + * This would print both strings. + * \ingroup group_basic_features + */ +typedef enum _vx_bool_e { + /*! \brief The "false" value. */ + vx_false_e = 0, + /*! \brief The "true" value. */ + vx_true_e, +} vx_bool; + +/*! 
+ * \brief This object is used by output validation functions to specify the meta data + * of the expected output data object. If the output object is an image, + * the vx_meta_format object can additionally store the valid region delta rectangle. + * \note when the actual output object of the user node is virtual, the information + * given through the vx_meta_format object allows the OpenVX framework to automatically + * create the data object when meta data were not specified by the application at object + * creation time. + * \ingroup group_user_kernels + */ +typedef struct _vx_meta_format* vx_meta_format; + +/*! \brief The type enumeration lists all the known types in OpenVX. + * \ingroup group_basic_features + */ +enum vx_type_e { + VX_TYPE_INVALID = 0x000,/*!< \brief An invalid type value. When passed an error must be returned. */ + VX_TYPE_CHAR = 0x001,/*!< \brief A \ref vx_char. */ + VX_TYPE_INT8 = 0x002,/*!< \brief A \ref vx_int8. */ + VX_TYPE_UINT8 = 0x003,/*!< \brief A \ref vx_uint8. */ + VX_TYPE_INT16 = 0x004,/*!< \brief A \ref vx_int16. */ + VX_TYPE_UINT16 = 0x005,/*!< \brief A \ref vx_uint16. */ + VX_TYPE_INT32 = 0x006,/*!< \brief A \ref vx_int32. */ + VX_TYPE_UINT32 = 0x007,/*!< \brief A \ref vx_uint32. */ + VX_TYPE_INT64 = 0x008,/*!< \brief A \ref vx_int64. */ + VX_TYPE_UINT64 = 0x009,/*!< \brief A \ref vx_uint64. */ + VX_TYPE_FLOAT32 = 0x00A,/*!< \brief A \ref vx_float32. */ + VX_TYPE_FLOAT64 = 0x00B,/*!< \brief A \ref vx_float64. */ + VX_TYPE_ENUM = 0x00C,/*!< \brief A \ref vx_enum. Equivalent in size to a \ref vx_int32. */ + VX_TYPE_SIZE = 0x00D,/*!< \brief A \ref vx_size. */ + VX_TYPE_DF_IMAGE = 0x00E,/*!< \brief A \ref vx_df_image. */ +#if defined(EXPERIMENTAL_PLATFORM_SUPPORTS_16_FLOAT) + VX_TYPE_FLOAT16 = 0x00F,/*!< \brief A \ref vx_float16. */ +#endif + VX_TYPE_BOOL = 0x010,/*!< \brief A \ref vx_bool. */ + + /* add new scalar types here */ + + VX_TYPE_SCALAR_MAX, /*!< \brief A floating value for comparison between OpenVX scalars and OpenVX structs. */ + + VX_TYPE_RECTANGLE = 0x020,/*!< \brief A \ref vx_rectangle_t. */ + VX_TYPE_KEYPOINT = 0x021,/*!< \brief A \ref vx_keypoint_t. */ + VX_TYPE_COORDINATES2D = 0x022,/*!< \brief A \ref vx_coordinates2d_t. */ + VX_TYPE_COORDINATES3D = 0x023,/*!< \brief A \ref vx_coordinates3d_t. */ + VX_TYPE_USER_STRUCT_START = 0x100, + /*!< \brief A floating value for user-defined struct base index.*/ + VX_TYPE_STRUCT_MAX = VX_TYPE_USER_STRUCT_START - 1, + /*!< \brief A floating value for comparison between OpenVX + structs and user structs. */ + VX_TYPE_VENDOR_STRUCT_START = 0x400, + /*!< \brief A floating value for vendor-defined struct base index.*/ + VX_TYPE_USER_STRUCT_END = VX_TYPE_VENDOR_STRUCT_START - 1, + /*!< \brief A floating value for comparison between user structs and + vendor structs. */ + VX_TYPE_VENDOR_STRUCT_END = 0x7FF, + /*!< \brief A floating value for comparison between vendor + structs and OpenVX objects. */ + VX_TYPE_REFERENCE = 0x800,/*!< \brief A \ref vx_reference. */ + VX_TYPE_CONTEXT = 0x801,/*!< \brief A \ref vx_context. */ + VX_TYPE_GRAPH = 0x802,/*!< \brief A \ref vx_graph. */ + VX_TYPE_NODE = 0x803,/*!< \brief A \ref vx_node. */ + VX_TYPE_KERNEL = 0x804,/*!< \brief A \ref vx_kernel. */ + VX_TYPE_PARAMETER = 0x805,/*!< \brief A \ref vx_parameter. */ + VX_TYPE_DELAY = 0x806,/*!< \brief A \ref vx_delay. */ + VX_TYPE_LUT = 0x807,/*!< \brief A \ref vx_lut. */ + VX_TYPE_DISTRIBUTION = 0x808,/*!< \brief A \ref vx_distribution. */ + VX_TYPE_PYRAMID = 0x809,/*!< \brief A \ref vx_pyramid. 
*/ + VX_TYPE_THRESHOLD = 0x80A,/*!< \brief A \ref vx_threshold. */ + VX_TYPE_MATRIX = 0x80B,/*!< \brief A \ref vx_matrix. */ + VX_TYPE_CONVOLUTION = 0x80C,/*!< \brief A \ref vx_convolution. */ + VX_TYPE_SCALAR = 0x80D,/*!< \brief A \ref vx_scalar. when needed to be completely generic for kernel validation. */ + VX_TYPE_ARRAY = 0x80E,/*!< \brief A \ref vx_array. */ + VX_TYPE_IMAGE = 0x80F,/*!< \brief A \ref vx_image. */ + VX_TYPE_REMAP = 0x810,/*!< \brief A \ref vx_remap. */ + VX_TYPE_ERROR = 0x811,/*!< \brief An error object which has no type. */ + VX_TYPE_META_FORMAT = 0x812,/*!< \brief A \ref vx_meta_format. */ + + /* \todo add new object types here */ + + VX_TYPE_VENDOR_OBJECT_START = 0xC00,/*!< \brief A floating value for vendor defined object base index. */ + VX_TYPE_OBJECT_MAX = VX_TYPE_VENDOR_OBJECT_START - 1,/*!< \brief A value used for bound checking the OpenVX object types. */ + VX_TYPE_VENDOR_OBJECT_END = 0xFFF,/*!< \brief A value used for bound checking of vendor objects */ +}; + +/*! \brief The enumeration of all status codes. + * \see vx_status. + * \ingroup group_basic_features + */ +enum vx_status_e { + VX_STATUS_MIN = -25,/*!< \brief Indicates the lower bound of status codes in VX. Used for bounds checks only. */ + /* add new codes here */ + VX_ERROR_REFERENCE_NONZERO = -24,/*!< \brief Indicates that an operation did not complete due to a reference count being non-zero. */ + VX_ERROR_MULTIPLE_WRITERS = -23,/*!< \brief Indicates that the graph has more than one node outputting to the same data object. This is an invalid graph structure. */ + VX_ERROR_GRAPH_ABANDONED = -22,/*!< \brief Indicates that the graph is stopped due to an error or a callback that abandoned execution. */ + VX_ERROR_GRAPH_SCHEDULED = -21,/*!< \brief Indicates that the supplied graph already has been scheduled and may be currently executing. */ + VX_ERROR_INVALID_SCOPE = -20,/*!< \brief Indicates that the supplied parameter is from another scope and cannot be used in the current scope. */ + VX_ERROR_INVALID_NODE = -19,/*!< \brief Indicates that the supplied node could not be created.*/ + VX_ERROR_INVALID_GRAPH = -18,/*!< \brief Indicates that the supplied graph has invalid connections (cycles). */ + VX_ERROR_INVALID_TYPE = -17,/*!< \brief Indicates that the supplied type parameter is incorrect. */ + VX_ERROR_INVALID_VALUE = -16,/*!< \brief Indicates that the supplied parameter has an incorrect value. */ + VX_ERROR_INVALID_DIMENSION = -15,/*!< \brief Indicates that the supplied parameter is too big or too small in dimension. */ + VX_ERROR_INVALID_FORMAT = -14,/*!< \brief Indicates that the supplied parameter is in an invalid format. */ + VX_ERROR_INVALID_LINK = -13,/*!< \brief Indicates that the link is not possible as specified. The parameters are incompatible. */ + VX_ERROR_INVALID_REFERENCE = -12,/*!< \brief Indicates that the reference provided is not valid. */ + VX_ERROR_INVALID_MODULE = -11,/*!< \brief This is returned from \ref vxLoadKernels when the module does not contain the entry point. */ + VX_ERROR_INVALID_PARAMETERS = -10,/*!< \brief Indicates that the supplied parameter information does not match the kernel contract. */ + VX_ERROR_OPTIMIZED_AWAY = -9,/*!< \brief Indicates that the object refered to has been optimized out of existence. */ + VX_ERROR_NO_MEMORY = -8,/*!< \brief Indicates that an internal or implicit allocation failed. Typically catastrophic. After detection, deconstruct the context. \see vxVerifyGraph. 
*/
+ VX_ERROR_NO_RESOURCES = -7,/*!< \brief Indicates that an internal or implicit resource cannot be acquired (not memory). This is typically catastrophic. After detection, deconstruct the context. \see vxVerifyGraph. */
+ VX_ERROR_NOT_COMPATIBLE = -6,/*!< \brief Indicates that the attempt to link two parameters together failed due to type incompatibility. */
+ VX_ERROR_NOT_ALLOCATED = -5,/*!< \brief Indicates to the system that the parameter must be allocated by the system. */
+ VX_ERROR_NOT_SUFFICIENT = -4,/*!< \brief Indicates that the given graph has failed verification due to an insufficient number of required parameters, which cannot be automatically created. Typically this indicates required atomic parameters. \see vxVerifyGraph. */
+ VX_ERROR_NOT_SUPPORTED = -3,/*!< \brief Indicates that the requested set of parameters produce a configuration that cannot be supported. Refer to the supplied documentation on the configured kernels. \see vx_kernel_e. */
+ VX_ERROR_NOT_IMPLEMENTED = -2,/*!< \brief Indicates that the requested kernel is missing. \see vx_kernel_e vxGetKernelByName. */
+ VX_FAILURE = -1,/*!< \brief Indicates a generic error code, used when no other code describes the error. */
+ VX_SUCCESS = 0,/*!< \brief No error. */
+};
+
+/*! \brief A formal status type with known fixed size.
+ * \see vx_status_e
+ * \ingroup group_basic_features
+ */
+typedef vx_enum vx_status;
+
+/*! \brief The formal typedef of the response from the callback.
+ * \see vx_action_e
+ * \ingroup group_node_callback
+ */
+typedef vx_enum vx_action;
+
+/*! \brief A callback to the client after a particular node has completed.
+ * \see vx_action
+ * \see vxAssignNodeCallback
+ * \param [in] node The node to which the callback was attached.
+ * \return An action code from \ref vx_action_e.
+ * \ingroup group_node_callback
+ */
+typedef vx_action (VX_CALLBACK *vx_nodecomplete_f)(vx_node node);
+
+/*! \brief Vendor IDs are 2 nibbles in size and are located in the upper byte of
+ * the 4 bytes of an enumeration.
+ * \ingroup group_basic_features
+ */
+#define VX_VENDOR_MASK (0xFFF00000)
+
+/*! \brief A type mask removes the scalar/object type from the attribute.
+ * It is 3 nibbles in size and is contained between the third and second byte.
+ * \see vx_type_e
+ * \ingroup group_basic_features
+ */
+#define VX_TYPE_MASK (0x000FFF00)
+
+/*! \brief A library is a set of vision kernels with its own ID supplied by a vendor.
+ * The vendor defines the library ID. The range is \f$ [0,2^{8}-1] \f$ inclusive.
+ * \ingroup group_basic_features
+ */
+#define VX_LIBRARY_MASK (0x000FF000)
+
+/*! \brief An individual kernel in a library has its own unique ID within \f$ [0,2^{12}-1] \f$ (inclusive).
+ * \ingroup group_basic_features
+ */
+#define VX_KERNEL_MASK (0x00000FFF)
+
+/*! \brief An object's attribute ID is within the range of \f$ [0,2^{8}-1] \f$ (inclusive).
+ * \ingroup group_basic_features
+ */
+#define VX_ATTRIBUTE_ID_MASK (0x000000FF)
+
+/*! \brief A type of enumeration. The valid range is between \f$ [0,2^{8}-1] \f$ (inclusive).
+ * \ingroup group_basic_features
+ */
+#define VX_ENUM_TYPE_MASK (0x000FF000)
+
+/*! \brief A generic enumeration list can have values between \f$ [0,2^{12}-1] \f$ (inclusive).
+ * \ingroup group_basic_features
+ */
+#define VX_ENUM_MASK (0x00000FFF)
+
+/*! \brief A macro to extract the vendor ID from the enumerated value.
+ * \ingroup group_basic_features
+ */
+#define VX_VENDOR(e) (((vx_uint32)e & VX_VENDOR_MASK) >> 20)
+
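+/*
+ * Editorial note: an illustrative sketch only, not part of the Khronos header text. It shows
+ * how the masks above (together with the VX_TYPE macro documented just below) decompose an
+ * attribute enumeration such as VX_IMAGE_ATTRIBUTE_WIDTH, which is built later in this header
+ * with VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE).
+ *
+ *     vx_enum attribute = VX_IMAGE_ATTRIBUTE_WIDTH;
+ *     vx_uint32 vendor  = VX_VENDOR(attribute);                         // yields VX_ID_KHRONOS
+ *     vx_uint32 object  = VX_TYPE(attribute);                           // yields VX_TYPE_IMAGE
+ *     vx_uint32 id      = (vx_uint32)attribute & VX_ATTRIBUTE_ID_MASK;  // yields 0x0
+ */
+
+/*!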
\brief A macro to extract the type from an enumerated attribute value. + * \ingroup group_basic_features + */ +#define VX_TYPE(e) (((vx_uint32)e & VX_TYPE_MASK) >> 8) + +/*! \brief A macro to extract the enum type from an enumerated value. + * \ingroup group_basic_features + */ +#define VX_ENUM_TYPE(e) (((vx_uint32)e & VX_ENUM_TYPE_MASK) >> 12) + +/*! \brief A macro to extract the kernel library enumeration from a enumerated kernel value. + * \ingroup group_basic_features + */ +#define VX_LIBRARY(e) (((vx_uint32)e & VX_LIBRARY_MASK) >> 12) + +#if defined(_LITTLE_ENDIAN_) || (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(_WIN32) +#define VX_DF_IMAGE(a,b,c,d) ((a) | (b << 8) | (c << 16) | (d << 24)) +#define VX_ATTRIBUTE_BASE(vendor, object) (((vendor) << 20) | (object << 8)) +#define VX_KERNEL_BASE(vendor, lib) (((vendor) << 20) | (lib << 12)) +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) +#elif defined(_BIG_ENDIAN_) || (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define VX_DF_IMAGE(a,b,c,d) ((d) | (c << 8) | (b << 16) | (a << 24)) +#define VX_ATTRIBUTE_BASE(vendor, object) ((vendor) | (object << 12)) +#define VX_KERNEL_BASE(vendor, lib) ((vendor) | (lib << 12)) +#define VX_ENUM_BASE(vendor, id) ((vendor) | (id << 12)) +#else +#error "Endian-ness must be defined!" +#endif + +/*! \def VX_DF_IMAGE + * \brief Converts a set of four chars into a \c uint32_t container of a VX_DF_IMAGE code. + * \note Use a \ref vx_df_image variable to hold the value. + * \ingroup group_basic_features + */ + +/*! \def VX_ATTRIBUTE_BASE + * \brief Defines the manner in which to combine the Vendor and Object IDs to get + * the base value of the enumeration. + * \ingroup group_basic_features + */ + +/*! \def VX_KERNEL_BASE + * \brief Defines the manner in which to combine the Vendor and Library IDs to get + * the base value of the enumeration. + * \ingroup group_basic_features + */ + +/*! \def VX_ENUM_BASE + * \brief Defines the manner in which to combine the Vendor and Object IDs to get + * the base value of the enumeration. + * \details From any enumerated value (with exceptions), the vendor, and enumeration + * type should be extractable. Those types that are exceptions are + * \ref vx_vendor_id_e, \ref vx_type_e, \ref vx_enum_e, \ref vx_df_image_e, and \c vx_bool. + * \ingroup group_basic_features + */ + +/*! \brief The set of supported enumerations in OpenVX. + * \details These can be extracted from enumerated values using \ref VX_ENUM_TYPE. + * \ingroup group_basic_features + */ +enum vx_enum_e { + VX_ENUM_DIRECTION = 0x00, /*!< \brief Parameter Direction. */ + VX_ENUM_ACTION = 0x01, /*!< \brief Action Codes. */ + VX_ENUM_HINT = 0x02, /*!< \brief Hint Values. */ + VX_ENUM_DIRECTIVE = 0x03, /*!< \brief Directive Values. */ + VX_ENUM_INTERPOLATION = 0x04, /*!< \brief Interpolation Types. */ + VX_ENUM_OVERFLOW = 0x05, /*!< \brief Overflow Policies. */ + VX_ENUM_COLOR_SPACE = 0x06, /*!< \brief Color Space. */ + VX_ENUM_COLOR_RANGE = 0x07, /*!< \brief Color Space Range. */ + VX_ENUM_PARAMETER_STATE = 0x08, /*!< \brief Parameter State. */ + VX_ENUM_CHANNEL = 0x09, /*!< \brief Channel Name. */ + VX_ENUM_CONVERT_POLICY = 0x0A, /*!< \brief Convert Policy. */ + VX_ENUM_THRESHOLD_TYPE = 0x0B, /*!< \brief Threshold Type List. */ + VX_ENUM_BORDER_MODE = 0x0C, /*!< \brief Border Mode List. */ + VX_ENUM_COMPARISON = 0x0D, /*!< \brief Comparison Values. */ + VX_ENUM_IMPORT_MEM = 0x0E, /*!< \brief The memory import enumeration. */ + VX_ENUM_TERM_CRITERIA = 0x0F, /*!< \brief A termination criteria. 
*/ + VX_ENUM_NORM_TYPE = 0x10, /*!< \brief A norm type. */ + VX_ENUM_ACCESSOR = 0x11, /*!< \brief An accessor flag type. */ + VX_ENUM_ROUND_POLICY = 0x12, /*!< \brief Rounding Policy. */ +}; + +/*! \brief A return code enumeration from a \ref vx_nodecomplete_f during execution. + * \see vxAssignNodeCallback + * \ingroup group_node_callback + */ +enum vx_action_e { + /*! \brief Continue executing the graph with no changes. */ + VX_ACTION_CONTINUE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACTION) + 0x0, + /*! \brief Stop executing the graph. */ + VX_ACTION_ABANDON = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACTION) + 0x1, +}; + +/*! \brief An indication of how a kernel will treat the given parameter. + * \ingroup group_parameter + */ +enum vx_direction_e { + /*! \brief The parameter is an input only. */ + VX_INPUT = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTION) + 0x0, + /*! \brief The parameter is an output only. */ + VX_OUTPUT = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTION) + 0x1, + /*! \brief The parameter is both an input and output. */ + VX_BIDIRECTIONAL = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTION) + 0x2, +}; + +/*! \brief These enumerations are given to the \c vxHint API to enable/disable platform + * optimizations and/or features. Hints are optional and usually are vendor-specific. + * \see vxHint + * \ingroup group_hint + */ +enum vx_hint_e { + /*! \brief Indicates to the implementation that the user wants to disable + * any parallelization techniques. Implementations may not be parallelized, + * so this is a hint only. + */ + VX_HINT_SERIALIZE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_HINT) + 0x0, +}; + +/*! \brief These enumerations are given to the \c vxDirective API to enable/disable + * platform optimizations and/or features. Directives are not optional and + * usually are vendor-specific, by defining a vendor range of directives and + * starting their enumeration from there. + * \see vxDirective + * \ingroup group_directive + */ +enum vx_directive_e { + /*! \brief Disables recording information for graph debugging. */ + VX_DIRECTIVE_DISABLE_LOGGING = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTIVE) + 0x0, + /*! \brief Enables recording information for graph debugging. */ + VX_DIRECTIVE_ENABLE_LOGGING = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTIVE) + 0x1, +}; + +/*! \brief The Conversion Policy Enumeration. + * \ingroup group_basic_features + */ +enum vx_convert_policy_e { + /*! \brief Results are the least significant bits of the output operand, as if + * stored in two's complement binary format in the size of its bit-depth. + */ + VX_CONVERT_POLICY_WRAP = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x0, + /*! \brief Results are saturated to the bit depth of the output operand. */ + VX_CONVERT_POLICY_SATURATE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x1, +}; + +/*! \brief Based on the VX_DF_IMAGE definition. + * \note Use \ref vx_df_image to contain these values. + * \ingroup group_basic_features + */ +enum vx_df_image_e { + /*! \brief A virtual image of no defined type. */ + VX_DF_IMAGE_VIRT = VX_DF_IMAGE('V','I','R','T'), + /*! \brief A single plane of 24-bit pixel as 3 interleaved 8-bit units of + * R then G then B data. This uses the BT709 full range by default. + */ + VX_DF_IMAGE_RGB = VX_DF_IMAGE('R','G','B','2'), + /*! \brief A single plane of 32-bit pixel as 4 interleaved 8-bit units of + * R then G then B data, then a don't care byte. + * This uses the BT709 full range by default. + */ + VX_DF_IMAGE_RGBX = VX_DF_IMAGE('R','G','B','A'), + /*! 
\brief A 2-plane YUV format of Luma (Y) and interleaved UV data at + * 4:2:0 sampling. This uses the BT709 full range by default. + */ + VX_DF_IMAGE_NV12 = VX_DF_IMAGE('N','V','1','2'), + /*! \brief A 2-lane YUV format of Luma (Y) and interleaved VU data at + * 4:2:0 sampling. This uses the BT709 full range by default. + */ + VX_DF_IMAGE_NV21 = VX_DF_IMAGE('N','V','2','1'), + /*! \brief A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 bytes. + * This uses the BT709 full range by default. + */ + VX_DF_IMAGE_UYVY = VX_DF_IMAGE('U','Y','V','Y'), + /*! \brief A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes. + * This uses the BT709 full range by default. + */ + VX_DF_IMAGE_YUYV = VX_DF_IMAGE('Y','U','Y','V'), + /*! \brief A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes. + * This uses the BT709 full range by default. + */ + VX_DF_IMAGE_IYUV = VX_DF_IMAGE('I','Y','U','V'), + /*! \brief A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes. + * This uses the BT709 full range by default. + */ + VX_DF_IMAGE_YUV4 = VX_DF_IMAGE('Y','U','V','4'), + /*! \brief A single plane of unsigned 8-bit data. + * The range of data is not specified, as it may be extracted from a YUV or + * generated. + */ + VX_DF_IMAGE_U8 = VX_DF_IMAGE('U','0','0','8'), + /*! \brief A single plane of unsigned 16-bit data. + * The range of data is not specified, as it may be extracted from a YUV or + * generated. + */ + VX_DF_IMAGE_U16 = VX_DF_IMAGE('U','0','1','6'), + /*! \brief A single plane of signed 16-bit data. + * The range of data is not specified, as it may be extracted from a YUV or + * generated. + */ + VX_DF_IMAGE_S16 = VX_DF_IMAGE('S','0','1','6'), + /*! \brief A single plane of unsigned 32-bit data. + * The range of data is not specified, as it may be extracted from a YUV or + * generated. + */ + VX_DF_IMAGE_U32 = VX_DF_IMAGE('U','0','3','2'), + /*! \brief A single plane of unsigned 32-bit data. + * The range of data is not specified, as it may be extracted from a YUV or + * generated. + */ + VX_DF_IMAGE_S32 = VX_DF_IMAGE('S','0','3','2'), +}; + +/*! \brief The reference attributes list. + * \ingroup group_reference + */ +enum vx_reference_attribute_e { + /*! \brief Returns the reference count of the object. Use a \ref vx_uint32 parameter. */ + VX_REF_ATTRIBUTE_COUNT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REFERENCE) + 0x0, + /*! \brief Returns the \ref vx_type_e of the reference. Use a \ref vx_enum parameter. */ + VX_REF_ATTRIBUTE_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REFERENCE) + 0x1, +}; + +/*! \brief A list of context attributes. + * \ingroup group_context + */ +enum vx_context_attribute_e { + /*! \brief Queries the unique vendor ID. Use a \ref vx_uint16. */ + VX_CONTEXT_ATTRIBUTE_VENDOR_ID = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x0, + /*! \brief Queries the OpenVX Version Number. Use a \ref vx_uint16 */ + VX_CONTEXT_ATTRIBUTE_VERSION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x1, + /*! \brief Queries the context for the number of \e unique kernels. Use a \ref vx_uint32 parameter. */ + VX_CONTEXT_ATTRIBUTE_UNIQUE_KERNELS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x2, + /*! \brief Queries the context for the number of active modules. Use a \ref vx_uint32 parameter. */ + VX_CONTEXT_ATTRIBUTE_MODULES = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x3, + /*! \brief Queries the context for the number of active references. Use a \ref vx_uint32 parameter. 
*/ + VX_CONTEXT_ATTRIBUTE_REFERENCES = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x4, + /*! \brief Queries the context for it's implementation name. Use a \ref vx_char[\ref VX_MAX_IMPLEMENTATION_NAME] array */ + VX_CONTEXT_ATTRIBUTE_IMPLEMENTATION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x5, + /*! \brief Queries the number of bytes in the extensions string. Use a \ref vx_size parameter. */ + VX_CONTEXT_ATTRIBUTE_EXTENSIONS_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x6, + /*! \brief Retrieves the extensions string. This is a space-separated string of extension names. Use a \ref vx_char pointer allocated to the size returned from \ref VX_CONTEXT_ATTRIBUTE_EXTENSIONS_SIZE. */ + VX_CONTEXT_ATTRIBUTE_EXTENSIONS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x7, + /*! \brief The maximum width or height of a convolution matrix. + * Use a \ref vx_size parameter. + * Each vendor must support centered kernels of size w X h, where both w + * and h are odd numbers, 3 <= w <= n and 3 <= h <= n, where n is the value of the + * \ref VX_CONTEXT_ATTRIBUTE_CONVOLUTION_MAXIMUM_DIMENSION attribute. n is an odd + * number that should not be smaller than 9. w and h may or may not be equal to + * each other. All combinations of w and h meeting the conditions above must be + * supported. The behavior of \ref vxCreateConvolution is undefined for values + * larger than the value returned by this attribute. + */ + VX_CONTEXT_ATTRIBUTE_CONVOLUTION_MAXIMUM_DIMENSION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x8, + /*! \brief The maximum window dimension of the OpticalFlowPyrLK kernel. + * \see \ref VX_KERNEL_OPTICAL_FLOW_PYR_LK. Use a \ref vx_size parameter. + */ + VX_CONTEXT_ATTRIBUTE_OPTICAL_FLOW_WINDOW_MAXIMUM_DIMENSION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x9, + /*! \brief The border mode for immediate mode functions. + * \details Graph mode functions are unaffected by this attribute. Use a pointer to a \ref vx_border_mode_t structure as parameter. + * \note The assumed default value for immediate mode functions is \ref VX_BORDER_MODE_UNDEFINED. + */ + VX_CONTEXT_ATTRIBUTE_IMMEDIATE_BORDER_MODE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0xA, + /*! \brief Returns the table of all unique the kernels that exist in the context. + * Use a \ref vx_kernel_info_t array. + * \pre You must call \ref vxQueryContext with \ref VX_CONTEXT_ATTRIBUTE_UNIQUE_KERNELS + * to compute the necessary size of the array. + */ + VX_CONTEXT_ATTRIBUTE_UNIQUE_KERNEL_TABLE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0xB, +}; + +/*! \brief The kernel attributes list + * \ingroup group_kernel + */ +enum vx_kernel_attribute_e { + /*! \brief Queries a kernel for the number of parameters the kernel + * supports. Use a \ref vx_uint32 parameter. + */ + VX_KERNEL_ATTRIBUTE_PARAMETERS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x0, + /*! \brief Queries the name of the kernel. Not settable. + * Use a \ref vx_char[\ref VX_MAX_KERNEL_NAME] array (not a \ref vx_array). + */ + VX_KERNEL_ATTRIBUTE_NAME = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x1, + /*! \brief Queries the enum of the kernel. Not settable. + * Use a \ref vx_enum parameter. + */ + VX_KERNEL_ATTRIBUTE_ENUM = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x2, + /*! \brief The local data area allocated with each kernel when it becomes a + * node. Use a \ref vx_size parameter. + * \note If not set it will default to zero. 
+ */
+ VX_KERNEL_ATTRIBUTE_LOCAL_DATA_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x3,
+ /*! \brief The local data pointer allocated with each kernel when it becomes
+ * a node. Use a void pointer parameter.
+ */
+ VX_KERNEL_ATTRIBUTE_LOCAL_DATA_PTR = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x4,
+};
+
+/*! \brief The node attributes list.
+ * \ingroup group_node
+ */
+enum vx_node_attribute_e {
+ /*! \brief Queries the status of node execution. Use a \ref vx_status parameter. */
+ VX_NODE_ATTRIBUTE_STATUS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x0,
+ /*! \brief Queries the performance of the node execution. Use a \ref vx_perf_t parameter. */
+ VX_NODE_ATTRIBUTE_PERFORMANCE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x1,
+ /*! \brief Gets or sets the border mode of the node.
+ * Use a \ref vx_border_mode_t structure.
+ */
+ VX_NODE_ATTRIBUTE_BORDER_MODE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x2,
+ /*! \brief Indicates the size of the kernel local memory area.
+ * Use a \ref vx_size parameter.
+ */
+ VX_NODE_ATTRIBUTE_LOCAL_DATA_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x3,
+ /*! \brief Indicates the pointer to the kernel local memory area.
+ * Use a void * parameter.
+ */
+ VX_NODE_ATTRIBUTE_LOCAL_DATA_PTR = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x4,
+};
+
+/*! \brief The parameter attributes list
+ * \ingroup group_parameter
+ */
+enum vx_parameter_attribute_e {
+ /*! \brief Queries a parameter for its index value on the kernel with which it is associated. Use a \ref vx_uint32 parameter. */
+ VX_PARAMETER_ATTRIBUTE_INDEX = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x0,
+ /*! \brief Queries a parameter for its direction value on the kernel with which it is associated. Use a \ref vx_enum parameter. */
+ VX_PARAMETER_ATTRIBUTE_DIRECTION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x1,
+ /*! \brief Queries a parameter for its type, \ref vx_type_e is returned. The size of the parameter is implied for plain data objects. For opaque data objects like images and arrays a query to their attributes has to be called to determine the size. */
+ VX_PARAMETER_ATTRIBUTE_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x2,
+ /*! \brief Queries a parameter for its state. A value in \ref vx_parameter_state_e is returned. Use a \ref vx_enum parameter. */
+ VX_PARAMETER_ATTRIBUTE_STATE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x3,
+ /*! \brief Use to extract the reference contained in the parameter. Use a \ref vx_reference parameter. */
+ VX_PARAMETER_ATTRIBUTE_REF = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x4,
+};
+
+/*! \brief The image attributes list.
+ * \ingroup group_image
+ */
+enum vx_image_attribute_e {
+ /*! \brief Queries an image for its width. Use a \ref vx_uint32 parameter. */
+ VX_IMAGE_ATTRIBUTE_WIDTH = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x0,
+ /*! \brief Queries an image for its height. Use a \ref vx_uint32 parameter. */
+ VX_IMAGE_ATTRIBUTE_HEIGHT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x1,
+ /*! \brief Queries an image for its format. Use a \ref vx_df_image parameter. */
+ VX_IMAGE_ATTRIBUTE_FORMAT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x2,
+ /*! \brief Queries an image for its number of planes. Use a \ref vx_size parameter. */
+ VX_IMAGE_ATTRIBUTE_PLANES = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x3,
+ /*!
\brief Queries an image for its color space (see \ref vx_color_space_e). Use a \ref vx_enum parameter. */ + VX_IMAGE_ATTRIBUTE_SPACE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x4, + /*! \brief Queries an image for its channel range (see \ref vx_channel_range_e). Use a \ref vx_enum parameter. */ + VX_IMAGE_ATTRIBUTE_RANGE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x5, + /*! \brief Queries an image for its total number of bytes. Use a \ref vx_size parameter. */ + VX_IMAGE_ATTRIBUTE_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x6, +}; + +/*! \brief The scalar attributes list. + * \ingroup group_scalar + */ +enum vx_scalar_attribute_e { + /*! \brief Queries the type of atomic that is contained in the scalar. Use a \ref vx_enum parameter.*/ + VX_SCALAR_ATTRIBUTE_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_SCALAR) + 0x0, +}; + +/*! \brief The graph attributes list. + * \ingroup group_graph + */ +enum vx_graph_attribute_e { + /*! \brief Returns the number of nodes in a graph. Use a \ref vx_uint32 parameter.*/ + VX_GRAPH_ATTRIBUTE_NUMNODES = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_GRAPH) + 0x0, + /*! \brief Returns the overall status of the graph. Use a \ref vx_status parameter.*/ + VX_GRAPH_ATTRIBUTE_STATUS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_GRAPH) + 0x1, + /*! \brief Returns the overall performance of the graph. Use a \ref vx_perf_t parameter. */ + VX_GRAPH_ATTRIBUTE_PERFORMANCE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_GRAPH) + 0x2, + /*! \brief Returns the number of explicitly declared parameters on the graph. Use a \ref vx_uint32 parameter. */ + VX_GRAPH_ATTRIBUTE_NUMPARAMETERS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_GRAPH) + 0x3, +}; + +/*! \brief The Look-Up Table (LUT) attribute list. + * \ingroup group_lut + */ +enum vx_lut_attribute_e { + /*! \brief Indicates the value type of the LUT. Use a \ref vx_enum. */ + VX_LUT_ATTRIBUTE_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS,VX_TYPE_LUT) + 0x0, + /*! \brief Indicates the number of elements in the LUT. Use a \ref vx_size. */ + VX_LUT_ATTRIBUTE_COUNT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS,VX_TYPE_LUT) + 0x1, + /*! \brief Indicates the total size of the LUT in bytes. Uses a \ref vx_size. */ + VX_LUT_ATTRIBUTE_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS,VX_TYPE_LUT) + 0x2, +}; + +/*! \brief The distribution attribute list. + * \ingroup group_distribution + */ +enum vx_distribution_attribute_e { + /*! \brief Indicates the number of dimensions in the distribution. Use a \ref vx_size parameter. */ + VX_DISTRIBUTION_ATTRIBUTE_DIMENSIONS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x0, + /*! \brief Indicates the start of the values to use (inclusive). Use a \ref vx_int32 parameter. */ + VX_DISTRIBUTION_ATTRIBUTE_OFFSET = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x1, + /*! \brief Indicates end value to use as the range. Use a \ref vx_uint32 parameter. */ + VX_DISTRIBUTION_ATTRIBUTE_RANGE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x2, + /*! \brief Indicates the number of bins. Use a \ref vx_size parameter. */ + VX_DISTRIBUTION_ATTRIBUTE_BINS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x3, + /*! \brief Indicates the range of a bin. Use a \ref vx_uint32 parameter. */ + VX_DISTRIBUTION_ATTRIBUTE_WINDOW = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x4, + /*! \brief Indicates the total size of the distribution in bytes. Use a \ref vx_size parameter. 
*/ + VX_DISTRIBUTION_ATTRIBUTE_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x5, +}; + +/*! \brief The Threshold types. + * \ingroup group_threshold + */ +enum vx_threshold_type_e { + /*! \brief A threshold with only 1 value. */ + VX_THRESHOLD_TYPE_BINARY = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_THRESHOLD_TYPE) + 0x0, + /*! \brief A threshold with 2 values (upper/lower). Use with Canny Edge Detection. */ + VX_THRESHOLD_TYPE_RANGE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_THRESHOLD_TYPE) + 0x1, +}; + +/*! \brief The threshold attributes. + * \ingroup group_threshold + */ +enum vx_threshold_attribute_e { + /*! \brief The value type of the threshold. Use a \ref vx_enum parameter. Will contain a \ref vx_threshold_type_e. */ + VX_THRESHOLD_ATTRIBUTE_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x0, + /*! \brief The value of the single threshold. Use a \ref vx_int32 parameter. */ + VX_THRESHOLD_ATTRIBUTE_THRESHOLD_VALUE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x1, + /*! \brief The value of the lower threshold. Use a \ref vx_int32 parameter. */ + VX_THRESHOLD_ATTRIBUTE_THRESHOLD_LOWER = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x2, + /*! \brief The value of the higher threshold. Use a \ref vx_int32 parameter. */ + VX_THRESHOLD_ATTRIBUTE_THRESHOLD_UPPER = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x3, + /*! \brief The value of the TRUE threshold. Use a \ref vx_int32 parameter. */ + VX_THRESHOLD_ATTRIBUTE_TRUE_VALUE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x4, + /*! \brief The value of the FALSE threshold. Use a \ref vx_int32 parameter. */ + VX_THRESHOLD_ATTRIBUTE_FALSE_VALUE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x5, + /*! \brief The data type of the threshold's value. Use a \ref vx_enum parameter. Will contain a \ref vx_type_e.*/ + VX_THRESHOLD_ATTRIBUTE_DATA_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x6, +}; + +/*! \brief The matrix attributes. + * \ingroup group_matrix + */ +enum vx_matrix_attribute_e { + /*! \brief The value type of the matrix. Use a \ref vx_enum parameter. */ + VX_MATRIX_ATTRIBUTE_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x0, + /*! \brief The M dimension of the matrix. Use a \ref vx_size parameter. */ + VX_MATRIX_ATTRIBUTE_ROWS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x1, + /*! \brief The N dimension of the matrix. Use a \ref vx_size parameter. */ + VX_MATRIX_ATTRIBUTE_COLUMNS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x2, + /*! \brief The total size of the matrix in bytes. Use a \ref vx_size parameter. */ + VX_MATRIX_ATTRIBUTE_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x3, +}; + +/*! \brief The convolution attributes. + * \ingroup group_convolution + */ +enum vx_convolution_attribute_e { + /*! \brief The number of rows of the convolution matrix. Use a \ref vx_size parameter. */ + VX_CONVOLUTION_ATTRIBUTE_ROWS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONVOLUTION) + 0x0, + /*! \brief The number of columns of the convolution matrix. Use a \ref vx_size parameter. */ + VX_CONVOLUTION_ATTRIBUTE_COLUMNS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONVOLUTION) + 0x1, + /*! \brief The scale of the convolution matrix. Use a \ref vx_uint32 parameter. + * \if OPENVX_STRICT_1_0 + * \note For 1.0, only powers of 2 are supported up to 2^31. + * \endif + */ + VX_CONVOLUTION_ATTRIBUTE_SCALE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONVOLUTION) + 0x2, + /*! \brief The total size of the convolution matrix in bytes. 
Use a \ref vx_size parameter. */ + VX_CONVOLUTION_ATTRIBUTE_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONVOLUTION) + 0x3, +}; + +/*! \brief The pyramid object attributes. + * \ingroup group_pyramid + */ +enum vx_pyramid_attribute_e { + /*! \brief The number of levels of the pyramid. Use a \ref vx_size parameter. */ + VX_PYRAMID_ATTRIBUTE_LEVELS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x0, + /*! \brief The scale factor between each level of the pyramid. Use a \ref vx_float32 parameter. */ + VX_PYRAMID_ATTRIBUTE_SCALE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x1, + /*! \brief The width of the 0th image in pixels. Use a \ref vx_uint32 parameter. */ + VX_PYRAMID_ATTRIBUTE_WIDTH = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x2, + /*! \brief The height of the 0th image in pixels. Use a \ref vx_uint32 parameter. */ + VX_PYRAMID_ATTRIBUTE_HEIGHT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x3, + /*! \brief The \ref vx_df_image_e format of the image. Use a \ref vx_df_image parameter. */ + VX_PYRAMID_ATTRIBUTE_FORMAT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x4, +}; + +/*! \brief The remap object attributes. + * \ingroup group_remap + */ +enum vx_remap_attribute_e { + /*! \brief The source width. Use a \ref vx_uint32 parameter. */ + VX_REMAP_ATTRIBUTE_SOURCE_WIDTH = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REMAP) + 0x0, + /*! \brief The source height. Use a \ref vx_uint32 parameter. */ + VX_REMAP_ATTRIBUTE_SOURCE_HEIGHT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REMAP) + 0x1, + /*! \brief The destination width. Use a \ref vx_uint32 parameter. */ + VX_REMAP_ATTRIBUTE_DESTINATION_WIDTH = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REMAP) + 0x2, + /*! \brief The destination height. Use a \ref vx_uint32 parameter. */ + VX_REMAP_ATTRIBUTE_DESTINATION_HEIGHT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REMAP) + 0x3, +}; + +/*! \brief The array object attributes. + * \ingroup group_array + */ +enum vx_array_attribute_e { + /*! \brief The type of the Array items. Use a \ref vx_enum parameter. */ + VX_ARRAY_ATTRIBUTE_ITEMTYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_ARRAY) + 0x0, + /*! \brief The number of items in the Array. Use a \ref vx_size parameter. */ + VX_ARRAY_ATTRIBUTE_NUMITEMS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_ARRAY) + 0x1, + /*! \brief The maximal number of items that the Array can hold. Use a \ref vx_size parameter. */ + VX_ARRAY_ATTRIBUTE_CAPACITY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_ARRAY) + 0x2, + /*! \brief Queries an array item size. Use a \ref vx_size parameter. */ + VX_ARRAY_ATTRIBUTE_ITEMSIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_ARRAY) + 0x3, +}; + +/*! \brief The meta format object attributes. + * \ingroup group_user_kernels + */ +enum vx_meta_format_attribute_e { + /*! \brief Configures a delta rectangle during kernel output parameter validation. Use a \ref vx_delta_rectangle_t. */ + VX_META_FORMAT_ATTRIBUTE_DELTA_RECTANGLE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_META_FORMAT) + 0x0, +}; + +/*! \brief The channel enumerations for channel extractions. + * \see vxChannelExtractNode + * \see vxuChannelExtract + * \see VX_KERNEL_CHANNEL_EXTRACT + * \ingroup group_basic_features + */ +enum vx_channel_e { + /*! \brief Used by formats with unknown channel types. */ + VX_CHANNEL_0 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x0, + /*! \brief Used by formats with unknown channel types. */ + VX_CHANNEL_1 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x1, + /*! 
\brief Used by formats with unknown channel types. */ + VX_CHANNEL_2 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x2, + /*! \brief Used by formats with unknown channel types. */ + VX_CHANNEL_3 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x3, + + /*! \brief Use to extract the RED channel, no matter the byte or packing order. */ + VX_CHANNEL_R = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x10, + /*! \brief Use to extract the GREEN channel, no matter the byte or packing order. */ + VX_CHANNEL_G = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x11, + /*! \brief Use to extract the BLUE channel, no matter the byte or packing order. */ + VX_CHANNEL_B = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x12, + /*! \brief Use to extract the ALPHA channel, no matter the byte or packing order. */ + VX_CHANNEL_A = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x13, + /*! \brief Use to extract the LUMA channel, no matter the byte or packing order. */ + VX_CHANNEL_Y = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x14, + /*! \brief Use to extract the Cb/U channel, no matter the byte or packing order. */ + VX_CHANNEL_U = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x15, + /*! \brief Use to extract the Cr/V/Value channel, no matter the byte or packing order. */ + VX_CHANNEL_V = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x16, +}; + +/*! \brief An enumeration of memory import types. + * \ingroup group_context + */ +enum vx_import_type_e { + /*! \brief For memory allocated through OpenVX, this is the import type. */ + VX_IMPORT_TYPE_NONE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_IMPORT_MEM) + 0x0, + + /*! \brief The default memory type to import from the Host. */ + VX_IMPORT_TYPE_HOST = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_IMPORT_MEM) + 0x1, +}; + +/*! \brief The image reconstruction filters supported by image resampling operations. + * + * The edge of a pixel is interpreted as being aligned to the edge of the image. + * The value for an output pixel is evaluated at the center of that pixel. + * + * This means, for example, that an even enlargement of a factor of two in nearest-neighbor + * interpolation will replicate every source pixel into a 2x2 quad in the destination, and that + * an even shrink by a factor of two in bilinear interpolation will create each destination pixel + * by average a 2x2 quad of source pixels. + * + * Samples that cross the boundary of the source image have values determined by the border + * mode - see \ref vx_border_mode_e and \ref VX_NODE_ATTRIBUTE_BORDER_MODE. + * \see vxuScaleImage + * \see vxScaleImageNode + * \see VX_KERNEL_SCALE_IMAGE + * \see vxuWarpAffine + * \see vxWarpAffineNode + * \see VX_KERNEL_WARP_AFFINE + * \see vxuWarpPerspective + * \see vxWarpPerspectiveNode + * \see VX_KERNEL_WARP_PERSPECTIVE + * \ingroup group_basic_features + */ +enum vx_interpolation_type_e { + /*! \brief Output values are defined to match the source pixel whose center is nearest to the sample position. */ + VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x0, + /*! \brief Output values are defined by bilinear interpolation between the pixels whose centers are closest + * to the sample position, weighted linearly by the distance of the sample from the pixel centers. */ + VX_INTERPOLATION_TYPE_BILINEAR = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x1, + /*! \brief Output values are determined by averaging the source pixels whose areas fall under the + * area of the destination pixel, projected onto the source image. 
*/ + VX_INTERPOLATION_TYPE_AREA = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x2, +}; + +/*! \brief The image color space list used by the \ref VX_IMAGE_ATTRIBUTE_SPACE attribute of a \ref vx_image. + * \ingroup group_image + */ +enum vx_color_space_e { + /*! \brief Use to indicate that no color space is used. */ + VX_COLOR_SPACE_NONE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_SPACE) + 0x0, + /*! \brief Use to indicate that the BT.601 coefficients and SMPTE C primaries are used for conversions. */ + VX_COLOR_SPACE_BT601_525 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_SPACE) + 0x1, + /*! \brief Use to indicate that the BT.601 coefficients and BTU primaries are used for conversions. */ + VX_COLOR_SPACE_BT601_625 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_SPACE) + 0x2, + /*! \brief Use to indicate that the BT.709 coefficients are used for conversions. */ + VX_COLOR_SPACE_BT709 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_SPACE) + 0x3, + + /*! \brief All images in VX are by default BT.709 */ + VX_COLOR_SPACE_DEFAULT = VX_COLOR_SPACE_BT709, +}; + +/*! \brief The image channel range list used by the \ref VX_IMAGE_ATTRIBUTE_RANGE attribute of a \ref vx_image. + * \ingroup group_image + */ +enum vx_channel_range_e { + /*! \brief Full range of the unit of the channel */ + VX_CHANNEL_RANGE_FULL = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_RANGE) + 0x0, + /*! \brief Restricted range of the unit of the channel based on the space given */ + VX_CHANNEL_RANGE_RESTRICTED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_RANGE) + 0x1, +}; + +/*! \brief The parameter state type. + * \ingroup group_parameter + */ +enum vx_parameter_state_e { + /*! \brief Default. The parameter must be supplied. If not set, during + * Verify, an error is returned. + */ + VX_PARAMETER_STATE_REQUIRED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_PARAMETER_STATE) + 0x0, + /*! \brief The parameter may be unspecified. The kernel takes care not + * to deference optional parameters until it is certain they are valid. + */ + VX_PARAMETER_STATE_OPTIONAL = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_PARAMETER_STATE) + 0x1, +}; + +/*! \brief The border mode list. + * \ingroup group_borders + */ +enum vx_border_mode_e { + /*! \brief No defined border mode behavior is given. */ + VX_BORDER_MODE_UNDEFINED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER_MODE) + 0x0, + /*! \brief For nodes that support this behavior, a constant value is + * \e filled-in when accessing out-of-bounds pixels. + */ + VX_BORDER_MODE_CONSTANT = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER_MODE) + 0x1, + /*! \brief For nodes that support this behavior, a replication of the nearest + * edge pixels value is given for out-of-bounds pixels. + */ + VX_BORDER_MODE_REPLICATE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER_MODE) + 0x2, +}; + +/*! \brief The termination criteria list. + * \see group_vision_function_opticalflowpyrlk + * \ingroup group_context + */ +enum vx_termination_criteria_e { + /*! \brief Indicates a termination after a set number of iterations. */ + VX_TERM_CRITERIA_ITERATIONS = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_TERM_CRITERIA) + 0x0, + /*! \brief Indicates a termination after matching against the value of eplison provided to the function. */ + VX_TERM_CRITERIA_EPSILON = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_TERM_CRITERIA) + 0x1, + /*! \brief Indicates that both an iterations and eplison method are employed. Whichever one matches first + * causes the termination. + */ + VX_TERM_CRITERIA_BOTH = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_TERM_CRITERIA) + 0x2, +}; + +/*! 
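// Illustrative sketch only: selecting one of the vx_border_mode_e values on a
// node, assuming the core vxGaussian3x3Node and vxSetNodeAttribute entry
// points and the vx_border_mode_t structure declared later in this header.
static vx_status example_set_border(vx_graph graph, vx_image in, vx_image out)
{
    vx_node node = vxGaussian3x3Node(graph, in, out);
    vx_border_mode_t border;
    border.mode = VX_BORDER_MODE_CONSTANT;   // fill out-of-bounds pixels...
    border.constant_value = 0;               // ...with zero
    return vxSetNodeAttribute(node, VX_NODE_ATTRIBUTE_BORDER_MODE, &border, sizeof(border));
}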
\brief A normalization type. + * \see group_vision_function_canny + * \ingroup group_vision_function_canny + */ +enum vx_norm_type_e { + /*! \brief The L1 normalization. */ + VX_NORM_L1 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NORM_TYPE) + 0x0, + /*! \brief The L2 normalization. */ + VX_NORM_L2 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NORM_TYPE) + 0x1, +}; + +/*! \brief The delay attribute list. + * \ingroup group_delay + */ +enum vx_delay_attribute_e { + /*! \brief The type of reference contained in the delay. Use a \ref vx_enum parameter. */ + VX_DELAY_ATTRIBUTE_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DELAY) + 0x0, + /*! \brief The number of items in the delay. Use a \ref vx_size parameter.*/ + VX_DELAY_ATTRIBUTE_SLOTS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DELAY) + 0x1, +}; + +/*! \brief The memory accessor hint flags. + * These enumeration values are used to indicate desired \e system behavior, + * not the \b User intent. For example: these can be interpretted as hints to the + * system about cache operations or marshalling operations. + * \ingroup group_context + */ +enum vx_accessor_e { + /*! \brief The memory shall be treated by the system as if it were read-only. + * If the User writes to this memory, the results are implementation defined. + */ + VX_READ_ONLY = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACCESSOR) + 0x1, + /*! \brief The memory shall be treated by the system as if it were write-only. + * If the User reads from this memory, the results are implementation defined. + */ + VX_WRITE_ONLY = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACCESSOR) + 0x2, + /*! \brief The memory shall be treated by the system as if it were readable and writeable. + */ + VX_READ_AND_WRITE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACCESSOR) + 0x3, +}; + +/*! \brief The Round Policy Enumeration. + * \ingroup group_context + */ +enum vx_round_policy_e { + /*! \brief When scaling, this truncates the least significant values that are lost in operations. */ + VX_ROUND_POLICY_TO_ZERO = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ROUND_POLICY) + 0x1, + /*! \brief When scaling, this rounds to nearest even output value. */ + VX_ROUND_POLICY_TO_NEAREST_EVEN = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ROUND_POLICY) + 0x2, +}; + +/*! + * \brief The entry point into modules loaded by \ref vxLoadKernels. + * \param [in] context The handle to the implementation context. + * \note The symbol exported from the user module must be vxPublishKernels in extern C format. + * \ingroup group_user_kernels + */ +typedef vx_status (VX_API_CALL *vx_publish_kernels_f)(vx_context context); + +/*! + * \brief The pointer to the Host side kernel. + * \param [in] node The handle to the node that contains this kernel. + * \param [in] parameters The array of parameter references. + * \param [in] num The number of parameters. + * \ingroup group_user_kernels + */ +typedef vx_status (VX_CALLBACK *vx_kernel_f)(vx_node node, const vx_reference *parameters, vx_uint32 num); + +/*! + * \brief The pointer to the kernel initializer. If the host code requires a call + * to initialize data once all the parameters have been validated, this function is called + * if not NULL. + * \param [in] node The handle to the node that contains this kernel. + * \param [in] parameters The array of parameter references. + * \param [in] num The number of parameters. + * \ingroup group_user_kernels + */ +typedef vx_status (VX_CALLBACK *vx_kernel_initialize_f)(vx_node node, const vx_reference *parameters, vx_uint32 num); + +/*! + * \brief The pointer to the kernel deinitializer. 
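// Illustrative sketch only: a host-side kernel function matching the
// vx_kernel_f signature above. The function name and parameter layout are
// hypothetical; a real user kernel would be registered through the core
// vxAddKernel call together with the validator callbacks described below.
static vx_status VX_CALLBACK example_copy_kernel(vx_node node, const vx_reference *parameters, vx_uint32 num)
{
    if (num != 2) return VX_ERROR_INVALID_PARAMETERS;
    vx_image src = (vx_image)parameters[0];   // index 0: input image
    vx_image dst = (vx_image)parameters[1];   // index 1: output image
    (void)node; (void)src; (void)dst;         // actual processing omitted in this sketch
    return VX_SUCCESS;
}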
If the host code requires a call + * to deinitialize data during a node garbage collection, this function is called + * if not NULL. + * \param [in] node The handle to the node that contains this kernel. + * \param [in] parameters The array of parameter references. + * \param [in] num The number of parameters. + * \ingroup group_user_kernels + */ +typedef vx_status (VX_CALLBACK *vx_kernel_deinitialize_f)(vx_node node, const vx_reference *parameters, vx_uint32 num); + +/*! + * \brief The user-defined kernel node input parameter validation function. + * \note This function is called once for each VX_INPUT or VX_BIDIRECTIONAL + * parameter index. + * \param [in] node The handle to the node that is being validated. + * \param [in] index The index of the parameter being validated. + * \return An error code describing the validation status on this + * parameter. + * \retval VX_ERROR_INVALID_FORMAT The parameter format was incorrect. + * \retval VX_ERROR_INVALID_VALUE The value of the parameter was incorrect. + * \retval VX_ERROR_INVALID_DIMENSION The dimensionality of the parameter was incorrect. + * \retval VX_ERROR_INVALID_PARAMETERS The index was out of bounds. + * \ingroup group_user_kernels + */ +typedef vx_status (VX_CALLBACK *vx_kernel_input_validate_f)(vx_node node, vx_uint32 index); + +/*! + * \brief The user-defined kernel node output parameter validation function. The function only + * needs to fill in the meta data structure. + * \note This function is called once for each VX_OUTPUT parameter index. + * \param [in] node The handle to the node that is being validated. + * \param [in] index The index of the parameter being validated. + * \param [in] ptr A pointer to a pre-allocated structure that the system holds. + * The validation function fills in the correct type, format, and dimensionality for + * the system to use either to create memory or to check against existing memory. + * \return An error code describing the validation status on this + * parameter. + * \retval VX_ERROR_INVALID_PARAMETERS The index is out of bounds. + * \ingroup group_user_kernels + */ +typedef vx_status (VX_CALLBACK *vx_kernel_output_validate_f)(vx_node node, vx_uint32 index, vx_meta_format meta); + +#if defined(_WIN32) || defined(UNDER_CE) +#if defined(_WIN64) +/*! Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_REF "%I64u" +/*! Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_SIZE "%I64u" +#else +/*! Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_REF "%lu" +/*! Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_SIZE "%lu" +#endif +#else +/*! Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_REF "%p" +/*! Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_SIZE "%zu" +#endif +/*! Use to indicate the 1:1 ratio in Q22.10 format. + * \ingroup group_basic_features + */ +#define VX_SCALE_UNITY (1024u) + +/*! + * \brief The addressing image patch structure is used by the Host only + * to address pixels in an image patch. The fields of the structure are defined as: + * \arg dim - The dimensions of the image in logical pixel units in the x & y direction. + * \arg stride - The physical byte distance from a logical pixel to the next + * logically adjacent pixel in the positive x or y direction. 
+ * \arg scale - The relationship of scaling from the primary plane (typically + * the zero indexed plane) to this plane. An integer down-scaling factor of \f$ f \f$ shall be + * set to a value equal to \f$ scale = \frac{unity}{f} \f$ and an integer up-scaling factor of \f$ f \f$ + * shall be set to a value of \f$ scale = unity * f \f$. \f$ unity \f$ is defined as \ref VX_SCALE_UNITY. + * \arg step - The step is the number of logical pixel units to skip to + * arrive at the next physically unique pixel. For example, on a plane that is + * half-scaled in a dimension, the step in that dimension is 2 to indicate that + * every other pixel in that dimension is an alias. This is useful in situations + * where iteration over unique pixels is required, such as in serializing + * or de-serializing the image patch information. + * \see \ref vxAccessImagePatch + * \ingroup group_image + * \include vx_imagepatch.c + */ +typedef struct _vx_imagepatch_addressing_t { + vx_uint32 dim_x; /*!< \brief Width of patch in X dimension in pixels. */ + vx_uint32 dim_y; /*!< \brief Height of patch in Y dimension in pixels. */ + vx_int32 stride_x; /*!< \brief Stride in X dimension in bytes. */ + vx_int32 stride_y; /*!< \brief Stride in Y dimension in bytes. */ + vx_uint32 scale_x; /*!< \brief Scale of X dimension. For sub-sampled planes this is the scaling factor of the dimension of the plane in relation to the zero plane. Use \ref VX_SCALE_UNITY in the numerator. */ + vx_uint32 scale_y; /*!< \brief Scale of Y dimension. For sub-sampled planes this is the scaling factor of the dimension of the plane in relation to the zero plane. Use \ref VX_SCALE_UNITY in the numerator. */ + vx_uint32 step_x; /*!< \brief Step of X dimension in pixels. */ + vx_uint32 step_y; /*!< \brief Step of Y dimension in pixels. */ +} vx_imagepatch_addressing_t; + +/*! \brief Use to initialize a \ref vx_imagepatch_addressing_t structure on the stack. + * \ingroup group_image + */ +#define VX_IMAGEPATCH_ADDR_INIT {0u, 0u, 0, 0, 0u, 0u, 0u, 0u} + +/*! \brief The performance measurement structure. + * \ingroup group_performance + */ +typedef struct _vx_perf_t { + vx_uint64 tmp; /*!< \brief Holds the last measurement. */ + vx_uint64 beg; /*!< \brief Holds the first measurement in a set. */ + vx_uint64 end; /*!< \brief Holds the last measurement in a set. */ + vx_uint64 sum; /*!< \brief Holds the summation of durations. */ + vx_uint64 avg; /*!< \brief Holds the average of the durations. */ + vx_uint64 min; /*!< \brief Holds the minimum of the durations. */ + vx_uint64 num; /*!< \brief Holds the number of measurements. */ + vx_uint64 max; /*!< \brief Holds the maximum of the durations. */ +} vx_perf_t; + +/*! \brief Initializes a \ref vx_perf_t on the stack. + * \ingroup group performance + */ +#define VX_PERF_INIT {0ul, 0ul, 0ul, 0ul, 0ul, 0ul} + +/*! \brief The Kernel Information Structure. This is returned by the Context + * to indicate which kernels are available in the OpenVX implementation. + * \ingroup group_kernel + */ +typedef struct _vx_kernel_info_t { + /*! \brief The kernel enumeration value from \ref vx_kernel_e (or an + * extension thereof). + * \see vxGetKernelByEnum + */ + vx_enum enumeration; + + /*! \brief The kernel name in dotted hierarchical format. + * e.g. "org.khronos.openvx.sobel3x3" + * \see vxGetKernelByName + */ + vx_char name[VX_MAX_KERNEL_NAME]; +} vx_kernel_info_t; + +/*! \brief Use to indicate a half-scale pyramid. + * \ingroup group_pyramid + */ +#define VX_SCALE_PYRAMID_HALF (0.5f) + +/*! 
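// Illustrative sketch only: walking the pixels of a single-plane U8 image
// patch through vx_imagepatch_addressing_t, assuming the vxAccessImagePatch,
// vxFormatImagePatchAddress2d and vxCommitImagePatch entry points from the
// core specification.
static vx_status example_patch_walk(vx_image image, vx_rectangle_t *rect)
{
    vx_imagepatch_addressing_t addr = VX_IMAGEPATCH_ADDR_INIT;
    void *base = NULL;
    vx_status status = vxAccessImagePatch(image, rect, 0, &addr, &base, VX_READ_AND_WRITE);
    if (status == VX_SUCCESS) {
        for (vx_uint32 y = 0; y < addr.dim_y; y += addr.step_y) {
            for (vx_uint32 x = 0; x < addr.dim_x; x += addr.step_x) {
                vx_uint8 *pixel = (vx_uint8 *)vxFormatImagePatchAddress2d(base, x, y, &addr);
                *pixel = (vx_uint8)(255 - *pixel);   // e.g. invert each unique pixel in place
            }
        }
        status = vxCommitImagePatch(image, rect, 0, &addr, base);
    }
    return status;
}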
\brief Use to indicate a ORB scaled pyramid whose scaling factor is \f$ \frac{1}{\root 4 \of {2}} \f$. + * \ingroup group_pyramid + */ +#define VX_SCALE_PYRAMID_ORB ((vx_float32)0.8408964f) + +/*! \brief Use with the enumeration \ref VX_NODE_ATTRIBUTE_BORDER_MODE to set the + * border mode behavior of a node that supports borders. + * \ingroup group_borders + */ +typedef struct _vx_border_mode_t { + /*! \brief See \ref vx_border_mode_e. */ + vx_enum mode; + /*! \brief For the mode \ref VX_BORDER_MODE_CONSTANT, this value is + * filled into each pixel. If there are sub-channels in the pixel then this + * value is divided up accordingly. + */ + vx_uint32 constant_value; +} vx_border_mode_t; + +/*! \brief The keypoint data structure. + * \ingroup group_basic_features + */ +typedef struct _vx_keypoint_t { + vx_int32 x; /*!< \brief The x coordinate. */ + vx_int32 y; /*!< \brief The y coordinate. */ + vx_float32 strength; /*!< \brief The strength of the keypoint. Its definition is specific to the corner detector. */ + vx_float32 scale; /*!< \brief Initialized to 0 by corner detectors. */ + vx_float32 orientation; /*!< \brief Initialized to 0 by corner detectors. */ + vx_int32 tracking_status; /*!< \brief A zero indicates a lost point. Initialized to 1 by corner detectors. */ + vx_float32 error; /*!< \brief A tracking method specific error. Initialized to 0 by corner detectors. */ +} vx_keypoint_t; + +/*! \brief The rectangle data structure that is shared with the users. + * \ingroup group_basic_features + */ +typedef struct _vx_rectangle_t { + vx_uint32 start_x; /*!< \brief The Start X coordinate. */ + vx_uint32 start_y; /*!< \brief The Start Y coordinate. */ + vx_uint32 end_x; /*!< \brief The End X coordinate. */ + vx_uint32 end_y; /*!< \brief The End Y coordinate. */ +} vx_rectangle_t; + +/*! \brief The changes in dimensions of the rectangle between input and output + * images in an output parameter validator. Used in conjunction with + * \ref VX_META_FORMAT_ATTRIBUTE_DELTA_RECTANGLE and + * \ref vxSetMetaFormatAttribute. + * \see vx_kernel_output_validate_f + * \see vx_meta_format + * \ingroup group_basic_features + */ +typedef struct _vx_delta_rectangle_t { + vx_int32 delta_start_x; /*!< \brief The change in the start x. */ + vx_int32 delta_start_y; /*!< \brief The change in the start y. */ + vx_int32 delta_end_x; /*!< \brief The change in the end x. */ + vx_int32 delta_end_y; /*!< \brief The change in the end y. */ +} vx_delta_rectangle_t; + +/*! \brief The 2D Coordinates structure. + * \ingroup group_basic_features + */ +typedef struct _vx_coordinates2d_t { + vx_uint32 x; /*!< \brief The X coordinate. */ + vx_uint32 y; /*!< \brief The Y coordinate. */ +} vx_coordinates2d_t; + +/*! \brief The 3D Coordinates structure. + * \ingroup group_basic_features + */ +typedef struct _vx_coordinates3d_t { + vx_uint32 x; /*!< \brief The X coordinate. */ + vx_uint32 y; /*!< \brief The Y coordinate. */ + vx_uint32 z; /*!< \brief The Z coordinate. */ +} vx_coordinates3d_t; + +/*! \brief The log callback function. + * \ingroup group_log + */ +typedef void (VX_CALLBACK *vx_log_callback_f)(vx_context context, + vx_reference ref, + vx_status status, + const vx_char string[]); + +#endif diff --git a/openvx/include/VX/vx_vendors.h b/openvx/include/VX/vx_vendors.h new file mode 100644 index 0000000..cb535d1 --- /dev/null +++ b/openvx/include/VX/vx_vendors.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2012-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY,\todo FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + */ + +#ifndef _OPENVX_VENDORS_H_ +#define _OPENVX_VENDORS_H_ + +/*! + * \file + * \brief The Vendor ID list for OpenVX. + */ + +/*! \brief The Vendor ID of the Implementation. As new vendors submit their + * implementations, this enumeration will grow. + * \ingroup group_basic_features + */ +enum vx_vendor_id_e { + VX_ID_KHRONOS = 0x000, /*!< \brief The Khronos Group */ + VX_ID_TI = 0x001, /*!< \brief Texas Instruments, Inc. */ + VX_ID_QUALCOMM = 0x002, /*!< \brief Qualcomm, Inc. */ + VX_ID_NVIDIA = 0x003, /*!< \brief NVIDIA Corporation */ + VX_ID_ARM = 0x004, /*!< \brief ARM Ltd. */ + VX_ID_BDTI = 0x005, /*!< \brief Berkley Design Technology, Inc. */ + VX_ID_RENESAS = 0x006, /*!< \brief Renasas Electronics */ + VX_ID_VIVANTE = 0x007, /*!< \brief Vivante Corporation */ + VX_ID_XILINX = 0x008, /*!< \brief Xilinx Inc. */ + VX_ID_AXIS = 0x009, /*!< \brief Axis Communications */ + VX_ID_MOVIDIUS = 0x00A, /*!< \brief Movidius Ltd. */ + VX_ID_SAMSUNG = 0x00B, /*!< \brief Samsung Electronics */ + VX_ID_FREESCALE = 0x00C, /*!< \brief Freescale Semiconductor */ + VX_ID_AMD = 0x00D, /*!< \brief Advanced Micro Devices */ + VX_ID_BROADCOM = 0x00E, /*!< \brief Broadcom Corporation */ + VX_ID_INTEL = 0x00F, /*!< \brief Intel Corporation */ + VX_ID_MARVELL = 0x010, /*!< \brief Marvell Technology Group Ltd. */ + VX_ID_MEDIATEK = 0x011, /*!< \brief MediaTek, Inc. */ + VX_ID_ST = 0x012, /*!< \brief STMicroelectronics */ + VX_ID_CEVA = 0x013, /*!< \brief CEVA DSP */ + VX_ID_ITSEEZ = 0x014, /*!< \brief Itseez, Inc. */ + VX_ID_IMAGINATION=0x015, /*!< \brief Imagination Technologies */ + VX_ID_COGNIVUE = 0x016, /*!< \brief CogniVue Corporation */ + VX_ID_VIDEANTIS = 0x017, /*!< \brief Videantis */ + VX_ID_SYNOPSYS = 0x018, /*!< \brief Synopsys */ + /* Add new vendor code above this line */ + + VX_ID_MAX = 0xFFF, + /*! \brief For use by all Kernel authors until they can obtain an assigned ID. */ + VX_ID_DEFAULT = VX_ID_MAX, +}; + +#endif + diff --git a/openvx/include/VX/vxu.h b/openvx/include/VX/vxu.h new file mode 100644 index 0000000..3edb0de --- /dev/null +++ b/openvx/include/VX/vxu.h @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2012-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + */ + +#ifndef _OPENVX_UTILITY_H_ +#define _OPENVX_UTILITY_H_ + +/*! + * \file + * \brief The OpenVX Utility Library. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief [Immediate] Invokes an immediate Color Conversion. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. + * \param [out] output The output image. + * \ingroup group_vision_function_colorconvert + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuColorConvert(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Invokes an immediate Channel Extract. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. Must be one of the defined \ref vx_df_image_e multiplanar formats. + * \param [in] channel The \ref vx_channel_e enumeration to extract. + * \param [out] output The output image. Must be \ref VX_DF_IMAGE_U8. + * \ingroup group_vision_function_channelextract + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuChannelExtract(vx_context context, vx_image input, vx_enum channel, vx_image output); + +/*! \brief [Immediate] Invokes an immediate Channel Combine. + * \param [in] context The reference to the overall context. + * \param [in] plane0 The plane that forms channel 0. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane1 The plane that forms channel 1. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane2 [optional] The plane that forms channel 2. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane3 [optional] The plane that forms channel 3. Must be \ref VX_DF_IMAGE_U8. + * \param [out] output The output image. + * \ingroup group_vision_function_channelcombine + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuChannelCombine(vx_context context, vx_image plane0, vx_image plane1, vx_image plane2, vx_image plane3, vx_image output); + +/*! \brief [Immediate] Invokes an immediate Sobel 3x3. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. 
+ * \param [out] output_x [optional] The output gradient in the x direction in \ref VX_DF_IMAGE_S16. + * \param [out] output_y [optional] The output gradient in the y direction in \ref VX_DF_IMAGE_S16. + * \ingroup group_vision_function_sobel3x3 + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuSobel3x3(vx_context context, vx_image input, vx_image output_x, vx_image output_y); + +/*! \brief [Immediate] Invokes an immediate Magnitude. + * \param [in] context The reference to the overall context. + * \param [in] grad_x The input x image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [in] grad_y The input y image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [out] output The magnitude image. This will be in \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_magnitude + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMagnitude(vx_context context, vx_image grad_x, vx_image grad_y, vx_image output); + +/*! \brief [Immediate] Invokes an immediate Phase. + * \param [in] context The reference to the overall context. + * \param [in] grad_x The input x image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [in] grad_y The input y image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [out] output The phase image. This will be in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_phase + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuPhase(vx_context context, vx_image grad_x, vx_image grad_y, vx_image output); + +/*! \brief [Immediate] Scales an input image to an output image. + * \param [in] context The reference to the overall context. + * \param [in] src The source image of type \ref VX_DF_IMAGE_U8. + * \param [out] dst The destintation image of type \ref VX_DF_IMAGE_U8. + * \param [in] type The interpolation type. \see vx_interpolation_type_e. + * \ingroup group_vision_function_scale_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuScaleImage(vx_context context, vx_image src, vx_image dst, vx_enum type); + +/*! \brief [Immediate] Processes the image through the LUT. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 + * \param [in] lut The LUT which is of type VX_TYPE_UINT8 + * \param [out] output The output image of type \ref VX_DF_IMAGE_U8 + * \ingroup group_vision_function_lut + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuTableLookup(vx_context context, vx_image input, vx_lut lut, vx_image output); + +/*! \brief [Immediate] Generates a distribution from an image. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 + * \param [out] distribution The output distribution. + * \ingroup group_vision_function_histogram + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. 
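// Illustrative sketch only: chaining the immediate-mode calls declared above
// into a gradient-magnitude computation; image creation and queries use the
// core vxCreateImage, vxQueryImage and vxReleaseImage entry points.
static vx_status example_gradient_magnitude(vx_context context, vx_image src_u8)
{
    vx_uint32 width = 0, height = 0;
    vxQueryImage(src_u8, VX_IMAGE_ATTRIBUTE_WIDTH, &width, sizeof(width));
    vxQueryImage(src_u8, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height));
    vx_image gx  = vxCreateImage(context, width, height, VX_DF_IMAGE_S16);
    vx_image gy  = vxCreateImage(context, width, height, VX_DF_IMAGE_S16);
    vx_image mag = vxCreateImage(context, width, height, VX_DF_IMAGE_S16);
    vx_status status = vxuSobel3x3(context, src_u8, gx, gy);
    if (status == VX_SUCCESS)
        status = vxuMagnitude(context, gx, gy, mag);
    vxReleaseImage(&gx); vxReleaseImage(&gy); vxReleaseImage(&mag);
    return status;
}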
+ */ +VX_API_ENTRY vx_status VX_API_CALL vxuHistogram(vx_context context, vx_image input, vx_distribution distribution); + +/*! \brief [Immediate] Equalizes the Histogram of a grayscale image. + * \param [in] context The reference to the overall context. + * \param [in] input The grayscale input image in \ref VX_DF_IMAGE_U8 + * \param [out] output The grayscale output image of type \ref VX_DF_IMAGE_U8 with equalized brightness and contrast. + * \ingroup group_vision_function_equalize_hist + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuEqualizeHist(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Computes the absolute difference between two images. + * \param [in] context The reference to the overall context. + * \param [in] in1 An input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [in] in2 An input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_absdiff + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuAbsDiff(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Computes the mean value and standard deviation. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. \ref VX_DF_IMAGE_U8 is supported. + * \param [out] mean The average pixel value. + * \param [out] stddev The standard deviation of the pixel values. + * \ingroup group_vision_function_meanstddev + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMeanStdDev(vx_context context, vx_image input, vx_float32 *mean, vx_float32 *stddev); + +/*! \brief [Immediate] Threshold's an input image and produces a \ref VX_DF_IMAGE_U8 * boolean image. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. \ref VX_DF_IMAGE_U8 is supported. + * \param [in] thresh The thresholding object that defines the parameters of + * the operation. + * \param [out] output The output Boolean image. Values are either 0 or 255. + * \ingroup group_vision_function_threshold + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuThreshold(vx_context context, vx_image input, vx_threshold thresh, vx_image output); + +/*! \brief [Immediate] Computes the integral image of the input. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U32 format. + * \ingroup group_vision_function_integral_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuIntegralImage(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Erodes an image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. 
+ * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_erode_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuErode3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Dilates an image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_dilate_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuDilate3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Computes a median filter on the image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_median_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMedian3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Computes a box filter on the image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_box_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuBox3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Computes a gaussian filter on the image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_gaussian_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuGaussian3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Computes a convolution on the input image with the supplied + * matrix. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [in] matrix The convolution matrix. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_custom_convolution + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuConvolve(vx_context context, vx_image input, vx_convolution matrix, vx_image output); + +/*! \brief [Immediate] Computes a Gaussian pyramid from an input image. + * \param [in] context The reference to the overall context. 
+ * \param [in] input The input image in \ref VX_DF_IMAGE_U8 + * \param [out] gaussian The Gaussian pyramid with \ref VX_DF_IMAGE_U8 to construct. + * \ingroup group_vision_function_gaussian_pyramid + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuGaussianPyramid(vx_context context, vx_image input, vx_pyramid gaussian); + +/*! \brief [Immediate] Computes an accumulation. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in,out] accum The accumulation image in \ref VX_DF_IMAGE_S16 + * \ingroup group_vision_function_accumulate + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuAccumulateImage(vx_context context, vx_image input, vx_image accum); + +/*! \brief [Immediate] Computes a weighted accumulation. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] scale A \ref VX_TYPE_FLOAT32 type, the input value with the range \f$ 0.0 \le \alpha \le 1.0 \f$. + * \param [in,out] accum The \ref VX_DF_IMAGE_U8 accumulation image. + * \ingroup group_vision_function_accumulate_weighted + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuAccumulateWeightedImage(vx_context context, vx_image input, vx_scalar scale, vx_image accum); + +/*! \brief [Immediate] Computes a squared accumulation. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] shift A \ref VX_TYPE_UINT32 type, the input value with the range \f$ 0 \le shift \le 15 \f$. + * \param [in,out] accum The accumulation image in \ref VX_DF_IMAGE_S16 + * \ingroup group_vision_function_accumulate_square + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuAccumulateSquareImage(vx_context context, vx_image input, vx_scalar shift, vx_image accum); + +/*! \brief [Immediate] Computes the minimum and maximum values of the image. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] minVal The minimum value in the image. + * \param [out] maxVal The maximum value in the image. + * \param [out] minLoc The minimum locations (optional). If the input image has several minimums, the kernel will return all of them). + * \param [out] maxLoc The maximum locations (optional). If the input image has several maximums, the kernel will return all of them). + * \param [out] minCount The total number of detected minimums in image (optional). + * \param [out] maxCount The total number of detected maximums in image (optional). + * \ingroup group_vision_function_minmaxloc + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMinMaxLoc(vx_context context, vx_image input, + vx_scalar minVal, vx_scalar maxVal, + vx_array minLoc, vx_array maxLoc, + vx_scalar minCount, vx_scalar maxCount); + +/*! 
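// Illustrative sketch only: running vxuConvolve (declared above) with a 3x3
// sharpening kernel, assuming the vxCreateConvolution,
// vxWriteConvolutionCoefficients, vxSetConvolutionAttribute and
// vxReleaseConvolution entry points from the core specification.
static vx_status example_custom_convolve(vx_context context, vx_image src_u8, vx_image dst_s16)
{
    vx_int16 coeffs[3][3] = {
        {  0, -1,  0 },
        { -1,  5, -1 },
        {  0, -1,  0 },
    };
    vx_uint32 scale = 1;   // divisor for the accumulated sum; must be a power of two
    vx_convolution conv = vxCreateConvolution(context, 3, 3);
    vxWriteConvolutionCoefficients(conv, (vx_int16 *)coeffs);
    vxSetConvolutionAttribute(conv, VX_CONVOLUTION_ATTRIBUTE_SCALE, &scale, sizeof(scale));
    vx_status status = vxuConvolve(context, src_u8, conv, dst_s16);
    vxReleaseConvolution(&conv);
    return status;
}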
\brief [Immediate] Converts the input images bit-depth into the output image. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. + * \param [out] output The output image. + * \param [in] policy A \ref vx_convert_policy_e enumeration. + * \param [in] shift The shift value. + * \ingroup group_vision_function_convertdepth + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e.. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuConvertDepth(vx_context context, vx_image input, vx_image output, vx_enum policy, vx_int32 shift); + +/*! \brief [Immediate] Computes Canny Edges on the input image into the output image. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] hyst The double threshold for hysteresis. + * \param [in] gradient_size The size of the Sobel filter window, must support at least 3, 5 and 7. + * \param [in] norm_type A flag indicating the norm used to compute the gradient, VX_NORM_L1 or VX_NORM_L2. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_canny + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuCannyEdgeDetector(vx_context context, vx_image input, vx_threshold hyst, + vx_int32 gradient_size, vx_enum norm_type, + vx_image output); + +/*! \brief [Immediate] Performs a Gaussian Blur on an image then half-scales it. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \param [in] kernel_size The input size of the Gaussian filter. Supported values are 3 and 5. + * \ingroup group_vision_function_scale_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuHalfScaleGaussian(vx_context context, vx_image input, vx_image output, vx_int32 kernel_size); + +/*! \brief [Immediate] Computes the bitwise and between two images. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image + * \param [out] out The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_and + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuAnd(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Computes the bitwise inclusive-or between two images. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image + * \param [out] out The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_or + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuOr(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Computes the bitwise exclusive-or between two images. + * \param [in] context The reference to the overall context. 
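// Illustrative sketch only: invoking vxuCannyEdgeDetector (declared above)
// with a hysteresis threshold, assuming the core vxCreateThreshold,
// vxSetThresholdAttribute and vxReleaseThreshold entry points; the 80/120
// limits are arbitrary example values.
static vx_status example_canny(vx_context context, vx_image src_u8, vx_image edges_u8)
{
    vx_threshold hyst = vxCreateThreshold(context, VX_THRESHOLD_TYPE_RANGE, VX_TYPE_UINT8);
    vx_int32 lower = 80, upper = 120;
    vxSetThresholdAttribute(hyst, VX_THRESHOLD_ATTRIBUTE_THRESHOLD_LOWER, &lower, sizeof(lower));
    vxSetThresholdAttribute(hyst, VX_THRESHOLD_ATTRIBUTE_THRESHOLD_UPPER, &upper, sizeof(upper));
    vx_status status = vxuCannyEdgeDetector(context, src_u8, hyst, 3, VX_NORM_L1, edges_u8);
    vxReleaseThreshold(&hyst);
    return status;
}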
+ * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image + * \param [out] out The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_xor + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuXor(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Computes the bitwise not of an image. + * \param [in] context The reference to the overall context. + * \param [in] input The \ref VX_DF_IMAGE_U8 input image + * \param [out] output The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_not + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuNot(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Performs elementwise multiplications on pixel values in the input images and a scale. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image. + * \param [in] scale The scale value. + * \param [in] overflow_policy A \ref vx_convert_policy_e enumeration. + * \param [in] rounding_policy A \ref vx_round_policy_e enumeration. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_mult + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMultiply(vx_context context, vx_image in1, vx_image in2, vx_float32 scale, vx_enum overflow_policy, vx_enum rounding_policy, vx_image out); + +/*! \brief [Immediate] Performs arithmetic addition on pixel values in the input images. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image. + * \param [in] policy A \ref vx_convert_policy_e enumeration. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_add + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuAdd(vx_context context, vx_image in1, vx_image in2, vx_enum policy, vx_image out); + +/*! \brief [Immediate] Performs arithmetic subtraction on pixel values in the input images. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image, the minuend. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image, the subtrahend. + * \param [in] policy A \ref vx_convert_policy_e enumeration. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_sub + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuSubtract(vx_context context, vx_image in1, vx_image in2, vx_enum policy, vx_image out); + +/*! \brief [Immediate] Performs an Affine warp on an image. 
+ * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] matrix The affine matrix. Must be 2x3 of type \ref VX_TYPE_FLOAT32. + * \param [in] type The interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_TYPE_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \ingroup group_vision_function_warp_affine + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuWarpAffine(vx_context context, vx_image input, vx_matrix matrix, vx_enum type, vx_image output); + +/*! \brief [Immediate] Performs an Perspective warp on an image. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] matrix The perspective matrix. Must be 3x3 of type \ref VX_TYPE_FLOAT32. + * \param [in] type The interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_TYPE_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \ingroup group_vision_function_warp_perspective + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuWarpPerspective(vx_context context, vx_image input, vx_matrix matrix, vx_enum type, vx_image output); + +/*! \brief [Immediate] Computes the Harris Corners over an image and produces the array of scored points. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] strength_thresh The \ref VX_TYPE_FLOAT32 minimum threshold which to eliminate Harris Corner scores (computed using the normalized Sobel kernel). + * \param [in] min_distance The \ref VX_TYPE_FLOAT32 radial Euclidean distance for non-maximum suppression. + * \param [in] sensitivity The \ref VX_TYPE_FLOAT32 scalar sensitivity threshold \f$ k \f$ from the Harris-Stephens equation. + * \param [in] gradient_size The gradient window size to use on the input. The + * implementation must support at least 3, 5, and 7. + * \param [in] block_size The block window size used to compute the harris corner score. + * The implementation must support at least 3, 5, and 7. + * \param [out] corners The array of \ref VX_TYPE_KEYPOINT structs. + * \param [out] num_corners The total number of detected corners in image (optional). Use a \ref VX_TYPE_SIZE scalar + * \ingroup group_vision_function_harris + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuHarrisCorners(vx_context context, + vx_image input, + vx_scalar strength_thresh, + vx_scalar min_distance, + vx_scalar sensitivity, + vx_int32 gradient_size, + vx_int32 block_size, + vx_array corners, + vx_scalar num_corners); + + +/*! \brief [Immediate] Computes corners on an image using FAST algorithm and produces the array of feature points. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. 
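// Illustrative sketch only: immediate-mode Harris corner detection using the
// declaration above; scalar handling relies on the core vxCreateScalar,
// vxReadScalarValue and vxReleaseScalar entry points, and 'corners' is
// assumed to be a VX_TYPE_KEYPOINT array created by the caller.
static vx_status example_harris(vx_context context, vx_image src_u8, vx_array corners)
{
    vx_float32 strength_thresh = 0.0005f, min_distance = 5.0f, sensitivity = 0.04f;
    vx_size num = 0;
    vx_scalar s_thresh = vxCreateScalar(context, VX_TYPE_FLOAT32, &strength_thresh);
    vx_scalar s_dist   = vxCreateScalar(context, VX_TYPE_FLOAT32, &min_distance);
    vx_scalar s_sens   = vxCreateScalar(context, VX_TYPE_FLOAT32, &sensitivity);
    vx_scalar s_num    = vxCreateScalar(context, VX_TYPE_SIZE, &num);
    vx_status status = vxuHarrisCorners(context, src_u8, s_thresh, s_dist, s_sens,
                                        3, 3, corners, s_num);
    if (status == VX_SUCCESS)
        status = vxReadScalarValue(s_num, &num);   // number of detected corners
    vxReleaseScalar(&s_thresh); vxReleaseScalar(&s_dist);
    vxReleaseScalar(&s_sens);   vxReleaseScalar(&s_num);
    return status;
}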
+ * \param [in] strength_thresh Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3 (\ref VX_TYPE_FLOAT32 scalar) + * \param [in] nonmax_suppression If true, non-maximum suppression is applied to + * detected corners before being places in the \ref vx_array of \ref VX_TYPE_KEYPOINT structs. + * \param [out] corners Output corner \ref vx_array of \ref VX_TYPE_KEYPOINT. + * \param [out] num_corners The total number of detected corners in image (optional). Use a \ref VX_TYPE_SIZE scalar. + * \ingroup group_vision_function_fast + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuFastCorners(vx_context context, vx_image input, vx_scalar strength_thresh, vx_bool nonmax_suppression, vx_array corners, vx_scalar num_corners); + +/*! \brief [Immediate] Computes an optical flow on two images. + * \param [in] context The reference to the overall context. + * \param [in] old_images Input of first (old) image pyramid + * \param [in] new_images Input of destination (new) image pyramid + * \param [in] old_points an array of key points in a vx_array of \ref VX_TYPE_KEYPOINT those key points are defined at + * the old_images high resolution pyramid + * \param [in] new_points_estimates an array of estimation on what is the output key points in a \ref vx_array of + * \ref VX_TYPE_KEYPOINT those keypoints are defined at the new_images high resolution pyramid + * \param [out] new_points an output array of key points in a \ref vx_array of \ref VX_TYPE_KEYPOINT those key points are + * defined at the new_images high resolution pyramid + * \param [in] termination termination can be \ref VX_TERM_CRITERIA_ITERATIONS or \ref VX_TERM_CRITERIA_EPSILON or + * \ref VX_TERM_CRITERIA_BOTH + * \param [in] epsilon is the \ref vx_float32 error for terminating the algorithm + * \param [in] num_iterations is the number of iterations. Use a \ref VX_TYPE_UINT32 scalar. + * \param [in] use_initial_estimate Can be set to either \ref vx_false_e or \ref vx_true_e. + * \param [in] window_dimension The size of the window on which to perform the algorithm. See + * \ref VX_CONTEXT_ATTRIBUTE_OPTICAL_FLOW_WINDOW_MAXIMUM_DIMENSION + * + * \ingroup group_vision_function_opticalflowpyrlk + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuOpticalFlowPyrLK(vx_context context, + vx_pyramid old_images, + vx_pyramid new_images, + vx_array old_points, + vx_array new_points_estimates, + vx_array new_points, + vx_enum termination, + vx_scalar epsilon, + vx_scalar num_iterations, + vx_scalar use_initial_estimate, + vx_size window_dimension); + +/*! \brief [Immediate] Remaps an output image from an input image. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] table The remap table object. + * \param [in] policy The interpolation policy from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_TYPE_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \return A \ref vx_status_e enumeration. 
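// Illustrative sketch only: building a remap table that mirrors an image
// horizontally and applying it with vxuRemap (described above), assuming the
// core vxCreateRemap, vxSetRemapPoint and vxReleaseRemap entry points.
static vx_status example_mirror_remap(vx_context context, vx_image src_u8, vx_image dst_u8,
                                      vx_uint32 width, vx_uint32 height)
{
    vx_remap table = vxCreateRemap(context, width, height, width, height);
    for (vx_uint32 y = 0; y < height; y++)
        for (vx_uint32 x = 0; x < width; x++)
            vxSetRemapPoint(table, x, y, (vx_float32)(width - 1 - x), (vx_float32)y);
    vx_status status = vxuRemap(context, src_u8, table,
                                VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR, dst_u8);
    vxReleaseRemap(&table);
    return status;
}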
+ * \ingroup group_vision_function_remap + */ +VX_API_ENTRY vx_status VX_API_CALL vxuRemap(vx_context context, + vx_image input, + vx_remap table, + vx_enum policy, + vx_image output); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/openvx/include/vx_ext_amd.h b/openvx/include/vx_ext_amd.h new file mode 100644 index 0000000..6b9e284 --- /dev/null +++ b/openvx/include/vx_ext_amd.h @@ -0,0 +1,305 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef _VX_EXT_AMD_H_ +#define _VX_EXT_AMD_H_ + +#include +#ifdef __cplusplus +#include +#endif + +/*! \brief AMD target affinity enumerations for AgoTargetAffinityInfo.device_type +*/ +#define AGO_TARGET_AFFINITY_CPU 0x0010 // CPU +#define AGO_TARGET_AFFINITY_GPU 0x0020 // GPU + +/*! \brief AMD internal parameters. [TODO: This needs to be moved to ago_internal.h] +*/ +#define AGO_MAX_PARAMS 32 +#define AGO_MERGE_RULE_MAX_FIND 4 +#define AGO_MERGE_RULE_MAX_REPLACE 4 +#define AGO_MERGE_RULE_SOLITARY_FLAG 0x20 +#define AGO_TARGET_AFFINITY_GPU_INFO_DEVICE_MASK 0x0F +#define AGO_TARGET_AFFINITY_GPU_INFO_SVM_MASK 0xF0 +#define AGO_TARGET_AFFINITY_GPU_INFO_SVM_ENABLE 0x10 +#define AGO_TARGET_AFFINITY_GPU_INFO_SVM_AS_CLMEM 0x20 +#define AGO_TARGET_AFFINITY_GPU_INFO_SVM_NO_FGS 0x40 + +/*! \brief Maximum size of scalar string buffer. The local buffers used for accessing scalar strings +* should be of size VX_MAX_STRING_BUFFER_SIZE_AMD and the maximum allowed string length is +* VX_MAX_STRING_BUFFER_SIZE_AMD-1. +* \ingroup group_scalar +*/ +#define VX_MAX_STRING_BUFFER_SIZE_AMD 256 + +/*! \brief The type enumeration lists all the AMD specific types in OpenVX. +*/ +enum ago_type_public_e { + /*! \brief AMD data types + */ + VX_TYPE_STRING_AMD = VX_TYPE_SCALAR_MAX, // scalar data type for string + + /*! \brief AMD data structs + */ + AGO_TYPE_KEYPOINT_XYS = VX_TYPE_VENDOR_STRUCT_START, // AGO struct data type for keypoint XYS + + /*! \brief AMD data object types + */ + AGO_TYPE_MEANSTDDEV_DATA = VX_TYPE_VENDOR_OBJECT_START, // AGO data structure for AGO MeanStdDev kernels + AGO_TYPE_MINMAXLOC_DATA, // AGO data structure for AGO MinMaxLoc kernels + AGO_TYPE_CANNY_STACK, // AGO data structure for AGO Canny kernels + AGO_TYPE_SCALE_MATRIX, // AGO data structure for AGO Scale kernels +}; + +/*! \brief The AMD context attributes list. +*/ +enum vx_context_attribute_amd_e { + /*! \brief OpenCL context. 
Use a \ref cl_context parameter.*/ + VX_CONTEXT_ATTRIBUTE_AMD_OPENCL_CONTEXT = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_CONTEXT) + 0x01, + /*! \brief context affinity. Use a \ref AgoTargetAffinityInfo parameter.*/ + VX_CONTEXT_ATTRIBUTE_AMD_AFFINITY = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_CONTEXT) + 0x02, + /*! \brief set a text macro definition. Use a \ref AgoContextMacroInfo parameter.*/ + VX_CONTEXT_ATTRIBUTE_AMD_SET_TEXT_MACRO = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_CONTEXT) + 0x03, + /*! \brief set a merge rule. Use a \ref AgoNodeMergeRule parameter.*/ + VX_CONTEXT_ATTRIBUTE_AMD_SET_MERGE_RULE = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_CONTEXT) + 0x04, +}; + +/*! \brief The AMD kernel attributes list. +*/ +enum vx_kernel_attribute_amd_e { + /*! \brief kernel callback for query target support. Use a \ref amd_kernel_query_target_support_f parameter.*/ + VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_KERNEL) + 0x01, + /*! \brief kernel callback for OpenCL code generation. Use a \ref amd_kernel_opencl_codegen_callback_f parameter.*/ + VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_KERNEL) + 0x02, + /*! \brief kernel callback for node regeneration. Use a \ref amd_kernel_node_regen_callback_f parameter.*/ + VX_KERNEL_ATTRIBUTE_AMD_NODE_REGEN_CALLBACK = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_KERNEL) + 0x03, +}; + +/*! \brief The AMD graph attributes list. +*/ +enum vx_graph_attribute_amd_e { + /*! \brief graph affinity. Use a \ref AgoNodeAffinityInfo parameter.*/ + VX_GRAPH_ATTRIBUTE_AMD_AFFINITY = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_GRAPH) + 0x01, + /*! \brief imports a graph from a text file. Use a \ref AgoGraphImportInfo parameter.*/ + VX_GRAPH_ATTRIBUTE_AMD_IMPORT_FROM_TEXT = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_GRAPH) + 0x02, + /*! \brief export a graph into a text file. Use a \ref AgoGraphExportInfo parameter.*/ + VX_GRAPH_ATTRIBUTE_AMD_EXPORT_TO_TEXT = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_GRAPH) + 0x03, + /*! \brief graph optimizer flags. Use a \ref vx_uint32 parameter.*/ + VX_GRAPH_ATTRIBUTE_AMD_OPTIMIZER_FLAGS = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_GRAPH) + 0x04, + /*! \brief graph last performance (internal). Use a \ref AgoGraphPerfInternalInfo parameter.*/ + VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_LAST = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_GRAPH) + 0x05, + /*! \brief graph avg performance (internal). Use a \ref AgoGraphPerfInternalInfo parameter.*/ + VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_AVG = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_GRAPH) + 0x06, + /*! \brief graph internal performance profile. Use a char * fileName parameter.*/ + VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_PROFILE = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_GRAPH) + 0x07, + /*! \brief OpenCL command queue. Use a \ref cl_command_queue parameter.*/ + VX_GRAPH_ATTRIBUTE_AMD_OPENCL_COMMAND_QUEUE = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_GRAPH) + 0x08, +}; + +/*! \brief The AMD node attributes list. +*/ +enum vx_node_attribute_amd_e { + /*! \brief node affinity. Use a \ref AgoTargetAffinityInfo parameter.*/ + VX_NODE_ATTRIBUTE_AMD_AFFINITY = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_NODE) + 0x01, +}; + +/*! \brief The AMD image attributes list. +*/ +enum vx_image_attribute_amd_e { + /*! \brief sync with user specified OpenCL buffer. Use a \ref cl_mem parameter.*/ + VX_IMAGE_ATTRIBUTE_AMD_OPENCL_BUFFER = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_IMAGE) + 0x01, + /*! \brief sync with user specified OpenCL buffer offset. 
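// Illustrative sketch only: pinning a context to the GPU through
// VX_CONTEXT_ATTRIBUTE_AMD_AFFINITY, using the AgoTargetAffinityInfo
// structure declared further below and the core vxSetContextAttribute call.
static vx_status example_set_gpu_affinity(vx_context context)
{
    AgoTargetAffinityInfo affinity = { 0 };
    affinity.device_type = AGO_TARGET_AFFINITY_GPU;   // or AGO_TARGET_AFFINITY_CPU
    return vxSetContextAttribute(context, VX_CONTEXT_ATTRIBUTE_AMD_AFFINITY,
                                 &affinity, sizeof(affinity));
}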
Use a \ref cl_uint parameter.*/ + VX_IMAGE_ATTRIBUTE_AMD_OPENCL_BUFFER_OFFSET = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_IMAGE) + 0x02, + /*! \brief Enable user kernel's own OpenCL buffer for virtual images. Supports only images with + * single color plane and stride should match framework's internal alignment. image ROI not supported. + * Use a \ref vx_bool parameter.*/ + VX_IMAGE_ATTRIBUTE_AMD_ENABLE_USER_BUFFER_OPENCL = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_IMAGE) + 0x03, +}; + +/*! \brief These enumerations are given to the \c vxDirective API to enable/disable +* platform optimizations and/or features. Directives are not optional and +* usually are vendor-specific, by defining a vendor range of directives and +* starting their enumeration from there. +* \see vxDirective +* \ingroup group_directive +*/ +enum vx_directive_amd_e { + /*! \brief data object is readonly after this directive is given. */ + VX_DIRECTIVE_AMD_READ_ONLY = VX_ENUM_BASE(VX_ID_AMD, VX_ENUM_DIRECTIVE) + 0x01, + /*! \brief data object copy to OpenCL. */ + VX_DIRECTIVE_AMD_COPY_TO_OPENCL = VX_ENUM_BASE(VX_ID_AMD, VX_ENUM_DIRECTIVE) + 0x02, +}; + +/*! \brief Based on the VX_DF_IMAGE definition. +* \note Use \ref vx_df_image to contain these values. +*/ +enum vx_df_image_amd_e { + VX_DF_IMAGE_U1_AMD = VX_DF_IMAGE('U', '0', '0', '1'), // AGO image with 1-bit data + VX_DF_IMAGE_F32_AMD = VX_DF_IMAGE('F', '0', '3', '2'), // AGO image with 32-bit floating-point (float) + VX_DF_IMAGE_F64_AMD = VX_DF_IMAGE('F', '0', '6', '4'), // AGO image with 64-bit floating-point (double) + VX_DF_IMAGE_F32x3_AMD = VX_DF_IMAGE('F', '3', '3', '2'), // AGO image with THREE 32-bit floating-point channels in one buffer +}; + +/*! \brief AMD data structure to specify target affinity. +*/ +typedef struct { + vx_uint32 device_type; // shall be AGO_TARGET_AFFINITY_CPU or AGO_TARGET_AFFINITY_GPU + vx_uint32 device_info; // reserved -- shall be initialized to ZERO and shall not be modified + vx_uint32 reserved[2]; // reserved -- shall be initialized to ZERO and shall not be modified +} AgoTargetAffinityInfo; + +/*! \brief AMD data structure to set a text macro. +*/ +typedef struct { + vx_char macroName[256]; + vx_char * text; +} AgoContextTextMacroInfo; + +/*! \brief AMD data structure to import a graph from a text. +** text: +** "macro " to use a pre-defined macro +** "file " to load from a file +** otherwise use the text as is +*/ +typedef struct { + vx_char * text; + vx_uint32 num_ref; + vx_reference * ref; + vx_int32 dumpToConsole; + void (VX_CALLBACK * data_registry_callback_f) (void * obj, vx_reference ref, const char * name, const char * app_params); + void * data_registry_callback_obj; +} AgoGraphImportInfo; + +/*! \brief AMD data structure to export a graph to a text. +*/ +typedef struct { + vx_char fileName[256]; + vx_uint32 num_ref; + vx_reference * ref; + vx_char comment[64]; +} AgoGraphExportInfo; + +/*! \brief AMD data structure to get internal performance data. +*/ +typedef struct { + vx_uint64 kernel_enqueue; + vx_uint64 kernel_wait; + vx_uint64 buffer_read; + vx_uint64 buffer_write; +} AgoGraphPerfInternalInfo; + +/*! \brief AMD data structure to specify node merge rule. +*/ +typedef struct AgoNodeMergeRule_t { + struct { + vx_enum kernel_id; + vx_uint32 arg_spec[AGO_MAX_PARAMS]; + } find[AGO_MERGE_RULE_MAX_FIND]; + struct { + vx_enum kernel_id; + vx_uint32 arg_spec[AGO_MAX_PARAMS]; + } replace[AGO_MERGE_RULE_MAX_REPLACE]; +} AgoNodeMergeRule; + +#ifdef __cplusplus +/*! 
\brief AMD usernode callback for target support check - supported_target_affinity shall contain bitfields AGO_TARGET_AFFINITY_CPU and AGO_TARGET_AFFINITY_GPU. +* When this callback is not available, the framework assumes that supported_target_affinity = AGO_TARGET_AFFINITY_CPU. +*/ +typedef vx_status(VX_CALLBACK * amd_kernel_query_target_support_f) (vx_graph graph, vx_node node, + vx_bool use_opencl_1_2, // [input] false: OpenCL driver is 2.0+; true: OpenCL driver is 1.2 + vx_uint32& supported_target_affinity // [output] must be set to AGO_TARGET_AFFINITY_CPU or AGO_TARGET_AFFINITY_GPU or (AGO_TARGET_AFFINITY_CPU | AGO_TARGET_AFFINITY_GPU) + ); + +/*! \brief AMD usernode callback for OpenCL source code generation. The framework will pass +* OpenVX objects as parameters to OpenCL kernels in othe order they appear to OpenVX node. +* The mapping of OpenVX object to OpenCL kernel argument as shown below: +* vx_image: uint width, uint height, __global * buf, uint stride_in_bytes, uint offset +* vx_array: __global * buf, uint offset_in_bytes, uint numitems +* vx_scalar: float value or uint value or int value +* vx_matrix: float matrix[*] +* vx_convolution: float convolution[*] +* vx_threshold: int value or int2 value +* vx_remap: __global short2 * buf, uint stride_in_bytes +* vx_lut: __read_only image1d_t lut +*/ +typedef vx_status(VX_CALLBACK * amd_kernel_opencl_codegen_callback_f) (vx_node node, + bool opencl_load_function, // [input] false: normal OpenCL kernel; true: reserved + char opencl_kernel_function_name[64], // [output] kernel_name for clCreateKernel() + std::string& opencl_kernel_code, // [output] string for clCreateProgramWithSource() + std::string& opencl_build_options, // [output] options for clBuildProgram() + vx_uint32& opencl_work_dim, // [output] work_dim for clEnqueueNDRangeKernel() + vx_size opencl_global_work[], // [output] global_work[] for clEnqueueNDRangeKernel() + vx_size opencl_local_work[], // [output] local_work[] for clEnqueueNDRangeKernel() + vx_uint32& opencl_local_buffer_usage_mask, // [output] reserved: must be ZERO + vx_uint32& opencl_local_buffer_size_in_bytes // [output] reserved: must be ZERO + ); + +/*! \brief AMD usernode callback for regenerating a node. +*/ +typedef vx_status(VX_CALLBACK * amd_kernel_node_regen_callback_f) (vx_graph graph, vx_node node, vx_bool& regen_not_needed); +#endif + +/*============================================================================== +MISCELLANEOUS +=============================================================================*/ + +/*! \brief Name a reference +* \ingroup vx_framework_reference +* +* This function is used to associate a name to a reference. This name +* can be used by the OpenVX implementation in log messages and any +* other reporting mechanisms. +* +* The OpenVX implementation will not check if the name is unique in +* the reference scope (context or graph). Several references can then +* have the same name. +* +* \param [in] ref The reference to name. +* \param [in] name Pointer to the '\0' terminated string that identifies +* the reference. +* The string is copied by the function so that it +* stays the property of the caller. +* NULL means that the reference is not named. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE if reference is not valid. 
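+*
+* A minimal illustrative usage (the "context" handle and the image are hypothetical objects created by the caller):
+* \code
+* vx_image input = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
+* vxSetReferenceName((vx_reference)input, "input");
+* \endcode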
+*/ +VX_API_ENTRY vx_status VX_API_CALL vxSetReferenceName(vx_reference ref, const vx_char *name); + +/** +* \brief Retrieve the name of a reference +* \ingroup vx_framework_reference +* +* This function is used to retrieve the name of a reference. +* +* \param [in] ref The reference. +* \param [out] name Pointer to copy the name of the reference. +* \param [in] size Size of the name buffer. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE if reference is not valid. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxGetReferenceName(vx_reference ref, vx_char name[], vx_size size); + +#endif diff --git a/openvx/openvx.vcxproj b/openvx/openvx.vcxproj new file mode 100644 index 0000000..0466cd3 --- /dev/null +++ b/openvx/openvx.vcxproj @@ -0,0 +1,146 @@ + + + + + Debug + x64 + + + Release + x64 + + + + {973F2004-2215-431F-8A2C-93ABAAFB6A24} + Win32Proj + agodll + + + + DynamicLibrary + true + v120 + MultiByte + + + DynamicLibrary + false + v120 + true + MultiByte + + + + + + + + + + + + + true + $(VC_IncludePath);$(WindowsSDK_IncludePath) + OpenVX + + + false + $(VC_IncludePath);$(WindowsSDK_IncludePath) + OpenVX + + + + + + Level3 + Disabled + ENABLE_OPENCL=1;VX_API_ENTRY=__declspec(dllexport);_DEBUG;_WINDOWS;_USRDLL;AGODLL_EXPORTS;%(PreprocessorDefinitions) + true + include;$(AMDAPPSDKROOT)\include;ago;api + true + + + Windows + true + kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(AMDAPPSDKROOT)lib\x86_64 + + + + + Level3 + + + MaxSpeed + true + true + ENABLE_OPENCL=1;VX_API_ENTRY=__declspec(dllexport);NDEBUG;_WINDOWS;_USRDLL;AGODLL_EXPORTS;%(PreprocessorDefinitions) + true + include;$(AMDAPPSDKROOT)\include;ago;api + true + + + Windows + true + true + true + kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(AMDAPPSDKROOT)lib\x86_64 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/openvx/openvx.vcxproj.filters b/openvx/openvx.vcxproj.filters new file mode 100644 index 0000000..b4808bb --- /dev/null +++ b/openvx/openvx.vcxproj.filters @@ -0,0 +1,174 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + {86e09a40-2462-4cda-b926-dcf406735f76} + + + {faccb16a-5a94-4cc5-8ad6-8ef797c2a218} + + + {02732ce8-1c1f-4990-8bb3-259058b2f6ed} + + + {8ea93c93-a932-4579-9fed-6abdd5c359be} + + + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + Source Files\ago + + + 
Source Files\api + + + Source Files\api + + + Source Files\api + + + Source Files\ago + + + + + Header Files\VX + + + Header Files\VX + + + Header Files\VX + + + Header Files\VX + + + Header Files\VX + + + Header Files\VX + + + Header Files\VX + + + Header Files + + + Header Files\ago + + + Header Files\ago + + + Header Files\ago + + + Header Files\ago + + + Header Files\ago + + + Header Files\ago + + + \ No newline at end of file diff --git a/runvx/CMakeLists.txt b/runvx/CMakeLists.txt new file mode 100644 index 0000000..5bfc479 --- /dev/null +++ b/runvx/CMakeLists.txt @@ -0,0 +1,75 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +cmake_minimum_required (VERSION 3.1) +project (runvx) + +set (CMAKE_CXX_STANDARD 11) + +find_package(OpenCL) +find_package(OpenCV) + +include_directories(../openvx/include) + +list(APPEND SOURCES + runvx.cpp + vxArray.cpp + vxConvolution.cpp + vxDistribution.cpp + vxEngine.cpp + vxEngineUtil.cpp + vxImage.cpp + vxLUT.cpp + vxMatrix.cpp + vxParameter.cpp + vxParamHelper.cpp + vxPyramid.cpp + vxRemap.cpp + vxScalar.cpp + vxThreshold.cpp + vxUtils.cpp +) + +add_executable(runvx ${SOURCES}) + +target_link_libraries(runvx openvx) + +if (OpenCL_FOUND) + target_compile_definitions(runvx PUBLIC USE_OPENCL=1) + include_directories(${OpenCL_INCLUDE_DIRS}) + target_link_libraries(runvx ${OpenCL_LIBRARIES}) +else(OpenCL_FOUND) + target_compile_definitions(runvx PUBLIC USE_OPENCL=0) +endif(OpenCL_FOUND) + +if (OpenCV_FOUND) + target_compile_definitions(runvx PUBLIC USE_OPENCV=1) + include_directories(${OpenCV_INCLUDE_DIRS}) + target_link_libraries(runvx ${OpenCV_LIBRARIES}) +else(OpenCV_FOUND) + target_compile_definitions(runvx PUBLIC USE_OPENCV=0) +endif(OpenCV_FOUND) + +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + target_link_libraries(runvx crypto) +endif() diff --git a/runvx/README.md b/runvx/README.md new file mode 100644 index 0000000..6022669 --- /dev/null +++ b/runvx/README.md @@ -0,0 +1,253 @@ +# AMD RunVX USER GUIDE + +## DESCRIPTION +This guide provides an overview of the content and usage of the RunVX tool. This tool is used to run OpenVX graphs, with a simple, easy-to-use interface. It encapsulates most of the routine OpenVX calls, thus speeding up development and enabling rapid prototyping. 
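+
+For example, the two-node luma extraction used by the Canny example later in this guide is written in GDF as two `node` lines. A rough sketch of the host-side OpenVX C code that those two lines stand in for (illustrative only: hard-coded sizes, no error checking and no data I/O) looks like this:
+
+    #include <VX/vx.h>
+
+    int main()
+    {
+        vx_context context = vxCreateContext();
+        vx_graph   graph   = vxCreateGraph(context);
+
+        vx_image rgb  = vxCreateImage(context, 640, 480, VX_DF_IMAGE_RGB);
+        vx_image yuv  = vxCreateVirtualImage(graph, 0, 0, VX_DF_IMAGE_IYUV);
+        vx_image luma = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
+
+        /* same two steps as the GDF lines:
+           node org.khronos.openvx.color_convert $1 yuv
+           node org.khronos.openvx.channel_extract yuv !CHANNEL_Y luma */
+        vxColorConvertNode(graph, rgb, yuv);
+        vxChannelExtractNode(graph, yuv, VX_CHANNEL_Y, luma);
+
+        if (vxVerifyGraph(graph) == VX_SUCCESS)
+            vxProcessGraph(graph);
+
+        vxReleaseContext(&context);
+        return 0;
+    }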
+ +## GETTING STARTED +The RunVX tool is simple to setup and use. The graph is provided as a GDF format file (graph description file), a simple and intuitive syntax to describe the various nodes and the dependencies. The tool has other useful features like comparing outputs, visualizing output keypoints on the picture window, using the OpenCV library. + +### Supported Development Environments +The following are the minimum requirements for running the AMD RunVX tool. +* OS: Microsoft Windows 7/8.1/10 (64-bit only), Ubuntu 15.10 64-bit +* IDE/Compiler: Microsoft Visual Studio Professional 2013 (for example code) +* CPU: SSE4.1 or above CPU, 64-bit. +* GPU: Radeon R7 Series or above (Kaveri+ APU), Radeon 3xx Series or above +* DRIVER: AMD Catalyst 15.7 (version 15.20) with OpenCL2.0 runtimes +* SUPPORT TOOLS + * AMD APP SDK v3.0 Beta or higher (64-bit) from [here](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/) + * OpenCV 3.0 compliant cameras and webcams are required for graphs using camera inputs. The following cameras have been tested with this version + * Microsoft LifeCam HD3000 + * Logitech c270 HD Webcam + * Creative Senz3D VF0780 + * HP laptop built-in camera + +## RUNVX TOOL +RunVX is a command line tool used to execute OpenVX graphs. As input, RUNVX takes a GDF (Graph Description Format) file and outputs images, pyramids, arrays, distributions, etc. This guide describes the elements of RunVX tool and the syntax used to run OpenVX graphs. + +### RunVX Syntax + runvx.exe [options] file argument(s) + runvx.exe [options] node argument(s) + + options: + -h -- show full help + -v -- verbose + -root: -- replace ~ in filenames with directory + in the command-line and GDF file. + The default value of '~' is '.' + when this option is not specified. + -frames:[[,]]/[live] -- run the graph/node for specified frames + -frames-or-eof::[[,]] -- run the graph/node for specified frames + or until eof + -affinity: -- set default target affinity to type + (shall be CPU or GPU) + -ago-profile -- print node-level performance data for + profiling + -no-abort-on-mismatch -- continue graph processing even if a + mismatch occurs + -no-run -- abort after vxVerifyGraph + -disable-virtual -- replace all virtual data types in GDF + with non-virtual data types + + kernelName: + The supported kernel name list is given below. 
+ org.khronos.openvx.color_convert + org.khronos.openvx.channel_extract + org.khronos.openvx.channel_combine + org.khronos.openvx.sobel_3x3 + org.khronos.openvx.magnitude + org.khronos.openvx.phase + org.khronos.openvx.scale_image + org.khronos.openvx.table_lookup + org.khronos.openvx.histogram + org.khronos.openvx.equalize_histogram + org.khronos.openvx.absdiff + org.khronos.openvx.mean_stddev + org.khronos.openvx.threshold + org.khronos.openvx.integral_image + org.khronos.openvx.dilate_3x3 + org.khronos.openvx.erode_3x3 + org.khronos.openvx.median_3x3 + org.khronos.openvx.box_3x3 + org.khronos.openvx.gaussian_3x3 + org.khronos.openvx.custom_convolution + org.khronos.openvx.gaussian_pyramid + org.khronos.openvx.accumulate + org.khronos.openvx.accumulate_weighted + org.khronos.openvx.accumulate_square + org.khronos.openvx.minmaxloc + org.khronos.openvx.convertdepth + org.khronos.openvx.canny_edge_detector + org.khronos.openvx.and + org.khronos.openvx.or + org.khronos.openvx.xor + org.khronos.openvx.not + org.khronos.openvx.multiply + org.khronos.openvx.add + org.khronos.openvx.subtract + org.khronos.openvx.warp_affine + org.khronos.openvx.warp_perspective + org.khronos.openvx.harris_corners + org.khronos.openvx.fast_corners + org.khronos.openvx.optical_flow_pyr_lk + org.khronos.openvx.remap + org.khronos.openvx.halfscale_gaussian + + + argument(s): + GDF objects can be created on command-line with I/O capability and passed into a GDF file. A GDF file can access these GDF objects using $1, $2, $3, etc. corresponding to its position command-line. In addition to read/write/compare/initialize, image objects can be connected to a camera, image and keypoint-array objects can be displayed in a window. See below for argument specification, which is an extension to GDF object syntax for limited set of GDF objects. + + image:,,[[:]...] + image-virtual:,, + image-uniform:,,,[[:]...] + - four letter string with values of VX_DF_IMAGE (see vx_df_image_e in vx_types.h) + some useful formats are: RGB2,RGBX,IYUV,NV12,U008,S016,U032,U001,F032,... + - pixel value for uniform image + - READ,[,frames{[;;repeat]}] or camera,deviceNumber + to read input from file, URL, camera + VIEW, to display in a window + WRITE, to write + COMPARE,[,rect{;;;}][,err{;}] + [,checksum|checksum-save-instead-of-test] + to compare output for exact-match or md5 checksum match or match + with range of pixel values, or generate md5 checksum without comparing. + + array:,[[:]...] + array-virtual:,[[:]...] + - KEYPOINT/COORDINATES2D/COORDINATES3D/RECTANGLE + - READ,[,ascii|binary] to read + VIEW, to display points overlaid in a window + WRITE,[,ascii|binary] to write + COMPARE,[,ascii|binary][,err{;[;]}] + + pyramid:,half|orb|,,,[:] + pyramid-virtual:,half|orb|,,, + - number of levels of the pyramid + - four letter string with values of VX_DF_IMAGE (see vx_df_image_e in vx_types.h) + some useful formats are: U008,S016,... + - READ, read image into pyramid at level n + WRITE, write level n to file + COMPARE,[,rect{;;;}][,err{;}] + [,checksum|checksum-save-instead-of-test] + to compare output for exact-match or md5 checksum match or match + with range of pixel values, or generate md5 checksum without comparing + + distribution:,,[[:]...] + - num of bins + - start offset value + - end value to use as range + - READ,[,ascii|binary] to read + VIEW, to overlay distribution visualization on a window + WRITE,[,ascii|binary] to write + COMPARE,[,ascii|binary] exact match compare + + convolution:,[[:]...] 
+ - READ,[,ascii|binary|scale] to read + WRITE,[,ascii|binary|scale] to write + COMPARE,[,ascii|binary|scale] exact match compare + SCALE, scale value of the convolution + INIT,{;;...} initialize matrix to immediate value + + lut:,[[:]...] + - useful data types are: UINT8 + - READ,[,ascii|binary] to read + WRITE,[,ascii|binary] to write + COMPARE,[,ascii|binary] exact match compare + VIEW, to view lookup table on a window + + matrix:,,[:,] + - useful data types are: INT32, FLOAT32 + - READ,[,ascii|binary] to read + WRITE,[,ascii|binary] to write + COMPARE,[,ascii|binary] exact match compare + INIT,{;;...} initialize matrix to immediate value + + remap:,,,[:] + - READ,[,ascii|binary] to read + WRITE,[,ascii|binary] to write + COMPARE,[,ascii|binary][,err{;}] compare within range + INIT,|rotate-90|rotate-180|rotate-270|scale|hflip|vflip + initialize remap table with file or in-built remaps + + scalar:,[:] + - any scalar datatype supported by OpenVX some useful data types are: INT32, UINT32, FLOAT32, ENUM, BOOL, SIZE, ... + - READ, to read from file + WRITE, to write to file + COMPARE,[,range] compare within range in file,separated by space + VIEW, to view scalar value on a window + + threshold:,[:] + - BINARY/RANGE + - useful data types are: UINT8, INT16 + - READ, to read from file + INIT,[,] threshold value or range + +## Examples +Here are few examples that demonstrate use of RUNVX prototyping tool. + +### Canny Edge Detector +This example demonstrates building OpenVX graph for Canny edge detector. Use [raja-koduri-640x480.jpg](http://cdn5.applesencia.com/wp-content/blogs.dir/17/files/2013/04/raja-koduri-640x480.jpg) for this example. + + % runvx[.exe] file canny.gdf image:640,480,RGB2:READ,raja-koduri-640x480.jpg:VIEW,RGB image:640,480,U008:VIEW,SKIN + +File **canny.gdf**: + + # compute luma image channel from input RGB image + data yuv = image-virtual:0,0,IYUV + data luma = image-virtual:0,0,U008 + node org.khronos.openvx.color_convert $1 yuv + node org.khronos.openvx.channel_extract yuv !CHANNEL_Y luma + + # compute edges in luma image using Canny edge detector + data hyst = threshold:RANGE,UINT8:INIT,80,100 + data gradient_size = scalar:INT32,3 + node org.khronos.openvx.canny_edge_detector luma hyst gradient_size !NORM_L1 $2 + +### Skintone Pixel Detector +This example demonstrates building OpenVX graph for pixel-based skin tone detector [Peer et al. 2003]. Use [raja-koduri-640x480.jpg](http://cdn5.applesencia.com/wp-content/blogs.dir/17/files/2013/04/raja-koduri-640x480.jpg) for this example. 
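+
+The graph below marks a pixel as skin when all of the following conditions hold (this is the rule encoded by the threshold and AND nodes in the GDF):
+
+    R > 95  and  G > 40  and  B > 20  and  (R - G) > 15  and  (R - B) > 0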
+ + % runvx[.exe] file skintonedetect.gdf image:640,480,RGB2:READ,raja-koduri-640x480.jpg:VIEW,RGB image:640,480,U008:VIEW,SKIN + +File **skintonedetect.gdf**: + + # threshold objects + data thr95 = threshold:BINARY,UINT8:INIT,95 # threshold for computing R > 95 + data thr40 = threshold:BINARY,UINT8:INIT,40 # threshold for computing G > 40 + data thr20 = threshold:BINARY,UINT8:INIT,20 # threshold for computing B > 20 + data thr15 = threshold:BINARY,UINT8:INIT,15 # threshold for computing R-G > 15 + data thr0 = threshold:BINARY,UINT8:INIT,0 # threshold for computing R-B > 0 + + # virtual image objects for intermediate results + data R = image-virtual:0,0,U008 + data G = image-virtual:0,0,U008 + data B = image-virtual:0,0,U008 + data RmG = image-virtual:0,0,U008 + data RmB = image-virtual:0,0,U008 + data R95 = image-virtual:0,0,U008 + data G40 = image-virtual:0,0,U008 + data B20 = image-virtual:0,0,U008 + data RmG15 = image-virtual:0,0,U008 + data RmB0 = image-virtual:0,0,U008 + data and1 = image-virtual:0,0,U008 + data and2 = image-virtual:0,0,U008 + data and3 = image-virtual:0,0,U008 + + # extract R,G,B channels and compute R-G and R-B + node org.khronos.openvx.channel_extract $1 !CHANNEL_B R # extract R channel from input (argument 1) + node org.khronos.openvx.channel_extract $1 !CHANNEL_G G # extract G channel from input (argument 1) + node org.khronos.openvx.channel_extract $1 !CHANNEL_R B # extract B channel from input (argument 1) + node org.khronos.openvx.subtract R G !SATURATE RmG # compute R-G + node org.khronos.openvx.subtract R B !SATURATE RmB # compute R-B + + # compute threshold + node org.khronos.openvx.threshold R thr95 R95 # compute R > 95 + node org.khronos.openvx.threshold G thr40 G40 # compute G > 40 + node org.khronos.openvx.threshold B thr20 B20 # compute B > 20 + node org.khronos.openvx.threshold RmG thr15 RmG15 # compute RmG > 15 + node org.khronos.openvx.threshold RmB thr0 RmB0 # compute RmB > 0 + + # aggregate all thresholded values to produce SKIN pixels + node org.khronos.openvx.and R95 G40 and1 # compute R95 & G40 + node org.khronos.openvx.and and1 B20 and2 # compute B20 & and1 + node org.khronos.openvx.and RmG15 RmB0 and3 # compute RmG15 & RmB0 + node org.khronos.openvx.and and2 and3 $2 # compute and2 & and3 as output (argument 2) + diff --git a/runvx/runvx.cpp b/runvx/runvx.cpp new file mode 100644 index 0000000..b1b3bbe --- /dev/null +++ b/runvx/runvx.cpp @@ -0,0 +1,446 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#if _DEBUG +#define _CRTDBG_MAP_ALLOC +#include +#include +#endif + +#include "vxParamHelper.h" +#include "vxEngineUtil.h" +#include "vxEngine.h" + +// version +#define RUNVX_VERSION "0.9.0" + +class CFileBuffer { +public: + CFileBuffer(const char * fileName) { + size_in_bytes = 0; buffer_allocated = buffer_aligned = 0; + FILE * fp = fopen(fileName, "rb"); + if (!fp) { + printf("ERROR: unable to open '%s'\n", fileName); + } + else { + fseek(fp, 0L, SEEK_END); size_in_bytes = ftell(fp); fseek(fp, 0L, SEEK_SET); + buffer_allocated = new unsigned char[size_in_bytes + 32]; + buffer_aligned = (unsigned char *)((((intptr_t)buffer_allocated) + 31) & ~31); + (void)fread(buffer_aligned, 1, size_in_bytes, fp); + buffer_aligned[size_in_bytes] = 0; + //printf("OK: read %d bytes from %s\n", size_in_bytes, fileName); + fclose(fp); + } + } + CFileBuffer(size_t _size_in_bytes, size_t _prefix_bytes = 0, size_t _postfix_bytes = 0) { + size_in_bytes = _size_in_bytes; + prefix_bytes = _prefix_bytes; + postfix_bytes = _postfix_bytes; + buffer_allocated = new unsigned char[size_in_bytes + prefix_bytes + postfix_bytes + 32]; + buffer_aligned = (unsigned char *)((((intptr_t)(buffer_allocated + prefix_bytes)) + 31) & ~31); + memset(buffer_aligned, 0, size_in_bytes); + } + ~CFileBuffer() { if (buffer_allocated) delete[] buffer_allocated; } + void * GetBuffer() { return buffer_aligned; } + size_t GetSizeInBytes() { return size_in_bytes; } + int WriteFile(const char * fileName) { + if (!buffer_aligned) return -1; + FILE * fp = fopen(fileName, "wb"); if (!fp) { printf("ERROR: unable to open '%s'\n", fileName); return -1; } + fwrite(buffer_aligned, 1, size_in_bytes, fp); fclose(fp); + printf("OK: wrote %d bytes into %s\n", (int)size_in_bytes, fileName); + return 0; + } +private: + unsigned char * buffer_allocated, *buffer_aligned; + size_t size_in_bytes, prefix_bytes, postfix_bytes; +}; + + +void show_usage(const char * program, bool detail) +{ + printf("RUNVX.EXE %s\n", RUNVX_VERSION); + printf("\n"); + printf("Usage:\n"); + printf(" runvx.exe [options] file argument(s)\n"); + printf(" runvx.exe [options] node argument(s)\n"); + printf("\n"); + printf("[options]:\n"); + printf(" -h -- show full help\n"); + printf(" -v -- verbose\n"); + printf(" -root: -- replace ~ in filenames with in the command-line and\n"); + printf(" GDF file.The default value of '~' is '.' 
when\n"); + printf(" this option is not specified.\n"); + printf(" -frames:[[,]]/[live] -- run the graph/node for specified frames\n"); + printf(" -frames-or-eof::[[,]] -- run the graph/node for specified frames or until eof\n"); + printf(" -affinity: -- set default target affinity to (shall be CPU or GPU)\n"); + printf(" -ago-profile -- print node-level performance data for profiling\n"); + printf(" -no-abort-on-mismatch -- continue graph processing even if a mismatch occurs\n"); + printf(" -no-run -- abort after vxVerifyGraph\n"); + printf(" -disable-virtual -- replace all virtual data types in GDF with non-virtual data types\n"); + + if (!detail) return; + printf("\n"); + printf("argument(s):\n"); + printf(" GDF objects can be created on command-line with I/O capability and passed into a GDF file.\n"); + printf(" A GDF file can access these GDF objects using $1, $2, $3, etc. corresponding to its position\n"); + printf(" command-line. In addition to read/write/compare/initialize, image objects can be connected to\n"); + printf(" a camera, image and keypoint-array objects can be displayed in a window. See below for argument\n"); + printf(" specification, which is an extension to GDF object syntax for limited set of GDF objects.\n"); + printf("\n"); + printf(" image:,,[[:]...]\n"); + printf(" image-virtual:,,\n"); + printf(" image-uniform:,,,[[:]...] \n"); + printf(" - four letter string with values of VX_DF_IMAGE (see vx_df_image_e in vx_types.h)\n"); + printf(" some useful formats are: RGB2,RGBX,IYUV,NV12,U008,S016,U032,U001,F032,...\n"); + printf(" - pixel value for uniform image\n"); + printf(" - READ,[,frames{[;;repeat]}] or camera,deviceNumber\n"); + printf(" to read input from file, URL, camera \n"); + printf(" VIEW, to display in a window\n"); + printf(" WRITE, to write\n"); + printf(" COMPARE,[,rect{;;;}][,err{;}]\n"); + printf(" [,checksum|checksum-save-instead-of-test]\n"); + printf(" to compare output for exact-match or md5 checksum match or match\n"); + printf(" with range of pixel values, or generate md5 checksum without comparing\n"); + printf("\n"); + printf(" array:,[[:]...]\n"); + printf(" array-virtual:,[[:]...]\n"); + printf(" - KEYPOINT/COORDINATES2D/COORDINATES3D/RECTANGLE\n"); + printf(" - READ,[,ascii|binary] to read \n"); + printf(" VIEW, to display points overlaid in a window\n"); + printf(" WRITE,[,ascii|binary] to write\n"); + printf(" COMPARE,[,ascii|binary][,err{;[;]}]\n"); + printf("\n"); + printf(" pyramid:,half|orb|,,,[:]\n"); + printf(" pyramid-virtual:,half|orb|,,,\n"); + printf(" - number of levels of the pyramid\n"); + printf(" - four letter string with values of VX_DF_IMAGE (see vx_df_image_e in vx_types.h)\n"); + printf(" some useful formats are: U008,S016,...\n"); + printf(" - READ, read image into pyramid at level n\n"); + printf(" WRITE, write level n to file\n"); + printf(" COMPARE,[,rect{;;;}][,err{;}]\n"); + printf(" [,checksum|checksum-save-instead-of-test]\n"); + printf(" to compare output for exact-match or md5 checksum match or match\n"); + printf(" with range of pixel values, or generate md5 checksum without comparing\n"); + printf("\n"); + printf(" distribution:,,[[:]...]\n"); + printf(" - num of bins\n"); + printf(" - start offset value\n"); + printf(" - end value to use as range\n"); + printf(" - READ,[,ascii|binary] to read \n"); + printf(" VIEW, to overlay distribution visualization on a window\n"); + printf(" WRITE,[,ascii|binary] to write\n"); + printf(" COMPARE,[,ascii|binary] exact match compare\n"); + printf("\n"); + printf(" 
convolution:,[[:]...]\n"); + printf(" - READ,[,ascii|binary|scale] to read \n"); + printf(" WRITE,[,ascii|binary|scale] to write\n"); + printf(" COMPARE,[,ascii|binary|scale] exact match compare\n"); + printf(" SCALE, scale value of the convolution\n"); + printf(" INIT,{;;...} initialize matrix to immediate value\n"); + printf("\n"); + printf(" lut:,[[:]...]\n"); + printf(" - useful data types are: UINT8\n"); + printf(" - READ,[,ascii|binary] to read \n"); + printf(" WRITE,[,ascii|binary] to write\n"); + printf(" COMPARE,[,ascii|binary] exact match compare\n"); + printf(" VIEW, to view lookup table on a window\n"); + printf("\n"); + printf(" matrix:,,[:,]\n"); + printf(" - useful data types are: INT32, FLOAT32\n"); + printf(" - READ,[,ascii|binary] to read \n"); + printf(" WRITE,[,ascii|binary] to write\n"); + printf(" COMPARE,[,ascii|binary] exact match compare\n"); + printf(" INIT,{;;...} initialize matrix to immediate value\n"); + printf("\n"); + printf(" remap:,,,[:]\n"); + printf(" - READ,[,ascii|binary] to read \n"); + printf(" WRITE,[,ascii|binary] to write\n"); + printf(" COMPARE,[,ascii|binary][,err{;}] compare within range\n"); + printf(" INIT,|rotate-90|rotate-180|rotate-270|scale|hflip|vflip\n"); + printf(" initialize remap table with file or in-built remaps\n"); + printf("\n"); + printf(" scalar:,[:]\n"); + printf(" - any scalar datatype supported by OpenVX\n"); + printf(" some useful data types are: INT32, UINT32, FLOAT32, ENUM, BOOL, SIZE, ...\n"); + printf(" - READ, to read from file\n"); + printf(" WRITE, to write to file\n"); + printf(" COMPARE,[,range] compare within range in file,separated by space\n"); + printf(" VIEW, to view scalar value on a window\n"); + printf("\n"); + printf(" threshold:,[:]\n"); + printf(" - BINARY/RANGE\n"); + printf(" - useful data types are: UINT8, INT16\n"); + printf(" - READ, to read from file\n"); + printf(" INIT,[,] threshold value or range\n"); +} + +int main(int argc, char * argv[]) +{ + // process command-line options + const char * program = "runvx.exe"; + bool verbose = false; + bool enableMultiFrameProcessing = false; + bool framesEofRequested = true; + bool useAgoImport = false, useAgoDump = false, doRun = true, doUseProcessGraph = false; + bool doPause = false; + bool doGetInternalProfile = false; + bool disableVirtual = false; + bool abortOnMismatch = true; + vx_uint32 defaultTargetAffinity = 0; + vx_uint32 defaultTargetInfo = 0; + bool doSetGraphOptimizerFlags = false; + vx_uint32 graphOptimizerFlags = 0; + int arg, frameStart = 0, frameEnd = 1; + bool frameCountSpecified = false; + for (arg = 1; arg < argc; arg++){ + if (argv[arg][0] == '-'){ + if (!strcmp(argv[arg], "-h")) { + show_usage(program, true); + exit(0); + } + else if (!strcmp(argv[arg], "-v")) { + verbose ^= true; + } + else if (!strncmp(argv[arg], "--", 2)) { // skip specified number of arguments: --[#] (default just skip --) + arg += atoi(&argv[arg][2]); + } + else if (!strncmp(argv[arg], "-root:", 6)) { + SetRootDir(argv[arg] + 6); + } + else if (!strncmp(argv[arg], "-frames:", 8) || !strncmp(argv[arg], "-frames-or-eof:", 15)) { + int spos = 8; + framesEofRequested = false; + if (!strncmp(argv[arg], "-frames-or-eof:", 15)) { + spos = 15; + framesEofRequested = true; + } + if (!strcmp(&argv[arg][spos], "live")) { + enableMultiFrameProcessing = true; + } + else { + int k = sscanf(&argv[arg][spos], "%d,%d", &frameStart, &frameEnd); + if (k == 1) { frameEnd = frameStart, frameStart = 0; } + else if (k != 2) { printf("ERROR: invalid -frames option\n"); return -1; } + 
} + frameCountSpecified = true; + } + else if (!_strnicmp(argv[arg], "-affinity:", 10)) { + if (!_strnicmp(&argv[arg][10], "cpu", 3)) defaultTargetAffinity = AGO_TARGET_AFFINITY_CPU; + else if (!_strnicmp(&argv[arg][10], "gpu", 3)) defaultTargetAffinity = AGO_TARGET_AFFINITY_GPU; + else { printf("ERROR: unsupported affinity target: %s\n", &argv[arg][10]); return -1; } + if (argv[arg][13] >= '0' && argv[arg][13] <= '9') + defaultTargetInfo = atoi(&argv[arg][13]); + } + else if (!strcmp(argv[arg], "-pause")) { + doPause = true; + } + else if (!strcmp(argv[arg], "-ago-profile")) { + doGetInternalProfile = true; + } + else if (!strcmp(argv[arg], "-ago-import")) { + useAgoImport = true; + } + else if (!strcmp(argv[arg], "-no-ago-import")) { + useAgoImport = false; + } + else if (!strcmp(argv[arg], "-ago-dump")) { + useAgoDump = true; + } + else if (!strcmp(argv[arg], "-no-run")) { + doRun = false; + } + else if (!strcmp(argv[arg], "-no-abort-on-mismatch")) { + abortOnMismatch = false; + } + else if (!strcmp(argv[arg], "-use-process-graph")) { + doUseProcessGraph = true; + } + else if (!_stricmp(argv[arg], "-disable-virtual")) { + disableVirtual = true; + } + else if (!_strnicmp(argv[arg], "-graph-optimizer-flags:", 23)) { + if (sscanf(&argv[arg][23], "%i", &graphOptimizerFlags) == 1) { + doSetGraphOptimizerFlags = true; + } + else { printf("ERROR: invalid graph optimizer flags: %s\n", argv[arg]); return -1; } + } + else { printf("ERROR: invalid option: %s\n", argv[arg]); return -1; } + } + else break; + } + if (arg == argc) { show_usage(program, false); return -1; } + int argCount = argc - arg - 2; + + // get optional arguments + int optionCount = 0; + char optionString[1024] = { 0 }; + for (int i = 0; i < argCount; i++) { + char * option = argv[arg + 2 + i]; + if (*option == '/') { + optionCount++; + char * p = strstr(option, "="); + if (!strncmp(option, "/def-var:", 9) && p != NULL) { + char * s = optionString + strlen(optionString); + sprintf(s, "def-var %s\n", option + 9); + *strstr(s, "=") = ' '; + } + } + if (strstr(option, ":R,CAMERA-")) enableMultiFrameProcessing = true; + } + fflush(stdout); + + CVxEngine engine; + int errorCode = 0; + try { + // initialize engine + if (engine.Initialize(argCount - optionCount, defaultTargetAffinity, defaultTargetInfo, doUseProcessGraph, disableVirtual) < 0) throw -1; + if (doSetGraphOptimizerFlags) { + engine.SetGraphOptimizerFlags(graphOptimizerFlags); + } + engine.SetCaptureFrameStart(frameStart); + fflush(stdout); + for (int i = 0, j = 0; i < argCount; i++) { + char * param = argv[arg + 2 + i]; + if (param[0] != '/') { + // pass non-options as parameters to the engine + if (engine.SetParameter(j++, param) < 0) + throw -1; + } + } + fflush(stdout); + + if (!strncmp(argv[arg], "file", 4)) { + if ((arg+1) == argc){ + printf("ERROR: Need to specify a .gdf file to run\n"); + throw -1; + } + arg++; + const char * fileName = RootDirUpdated(argv[arg]); + CFileBuffer txt(fileName); + char * txtBuffer = (char *)txt.GetBuffer(); + if (!txtBuffer) { + printf("ERROR: unable to open: %s\n", fileName); + throw -1; + } + char * fullText = new char[strlen(optionString) + strlen(txtBuffer) + 1]; + strcpy(fullText, optionString); + strcat(fullText, txtBuffer); + if (engine.BuildGraph(fullText, useAgoImport, useAgoDump) < 0) + throw -1; + delete[] fullText; + } + else if (!strncmp(argv[arg], "node", 4)) { + char txt[1024]; + int paramCount = argc - arg - 2; + arg++; + sprintf(txt, "node %s ", argv[arg]); + for (int i = 0, j = 0; i < paramCount; i++) { + if (argv[arg + 
1 + i][0] != '/'){ + sprintf(txt + strlen(txt), "$%d ", j++ + 1); + } + } + if (engine.BuildGraph(txt, useAgoImport, useAgoDump) < 0) throw -1; + } + else { printf("ERROR: invalid command: %s\n", argv[arg]); throw -1; } + fflush(stdout); + if (doRun) { + engine.SetVerbose(verbose); + engine.SetAbortOnMismatch(abortOnMismatch); + if (engine.IsUsingMultiFrameCapture()) { + enableMultiFrameProcessing = true; + } + if (frameCountSpecified) { + enableMultiFrameProcessing = false; + } + // execute the graph for all requested frames + int count = 0, status = 0; + printf("csv,HEADER ,STATUS, COUNT,cur-ms,avg-ms,min-ms,clenqueue-ms,clwait-ms,clwrite-ms,clread-ms\n"); + fflush(stdout); + + int64_t start_time = utilGetClockCounter(); + + for (int frameNumber = frameStart; enableMultiFrameProcessing || frameNumber < frameEnd; frameNumber++, count++){ + //read input data, when specified + if ((status = engine.ReadFrame(frameNumber)) < 0) throw -1; + if (framesEofRequested && status > 0) { + // data is not available + if (frameNumber == frameStart) { + ReportError("ERROR: insufficient input data -- check input files\n"); + } + else break; + } + // execute graph for current frame + status = engine.ExecuteFrame(frameNumber); + if (status == VX_ERROR_GRAPH_ABANDONED) + break; // don't report graph abandoned as an error + if (status < 0) throw -1; + // compare output data, when requested + status = engine.CompareFrame(frameNumber); + // write output data, when requested + if (engine.WriteFrame(frameNumber) < 0) throw -1; + + if (verbose) { + printf("csv,FRAME , %s,%s\n", (status == 0 ? "PASS" : "FAIL"), engine.MeasureFrame(frameNumber)); + fflush(stdout); + } + if (status < 0) throw - 1; + else if (status) break; + // display refresh + if (ProcessCvWindowKeyRefresh() > 0) + break; + } + + int64_t end_time = utilGetClockCounter(); + int64_t frequency = utilGetClockFrequency(); + float elapsed_time = (float)(end_time - start_time) / frequency; + printf("csv,OVERALL, %s,%s\n", (status >= 0 ? 
"PASS" : "FAIL"), engine.GetPerformanceStatistics()); + printf("Elapsed Time: %6.2f sec\n", (float)elapsed_time); + + engine.GetMedianRunTime(); +#if _DEBUG + _CrtDumpMemoryLeaks(); +#endif + if (doGetInternalProfile) { + char fileName[] = "stdout"; + engine.DumpInternalProfile(fileName); + } + fflush(stdout); + } + if (engine.Shutdown() < 0) throw -1; + fflush(stdout); + } + catch (int errorCode_) { + fflush(stdout); + engine.DisableWaitForKeyPress(); + errorCode = errorCode_; + } + if (doPause) { + fflush(stdout); + printf("Press ENTER to exit ...\n"); + while (getchar() != '\n') + ; + } + return errorCode; +} diff --git a/runvx/runvx.vcxproj b/runvx/runvx.vcxproj new file mode 100644 index 0000000..f5bd0e7 --- /dev/null +++ b/runvx/runvx.vcxproj @@ -0,0 +1,130 @@ + + + + + Debug + x64 + + + Release + x64 + + + + {E14F83E9-2295-466C-9647-7BD0D03ECE4B} + Win32Proj + runvx + + + + Application + true + v120 + MultiByte + + + Application + false + v120 + true + MultiByte + + + + + + + + + + + + + true + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(IncludePath) + + + false + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(IncludePath) + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) + true + ..\openvx\include;$(OpenCV_DIR)\include + true + MultiThreadedDebug + + + Console + true + OpenVX.lib;IlmImfd.lib;ippicvmt.lib;zlibd.lib;libwebpd.lib;libjpegd.lib;libtiffd.lib;libpngd.lib;libjasperd.lib;vfw32.lib;opencv_imgcodecs300d.lib;opencv_hal300d.lib;opencv_core300d.lib;opencv_highgui300d.lib;opencv_imgproc300d.lib;opencv_videoio300d.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib; + $(OutDir);$(OpenCV_DIR)\x64\vc12\staticlib;$(AMDAPPSDKROOT)lib\x86_64 + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) + true + ..\openvx\include;$(OpenCV_DIR)\include + true + MultiThreaded + + + Console + true + true + true + OpenVX.lib;IlmImf.lib;ippicvmt.lib;zlib.lib;libwebp.lib;libjpeg.lib;libtiff.lib;libpng.lib;libjasper.lib;vfw32.lib;opencv_imgcodecs300.lib;opencv_hal300.lib;opencv_core300.lib;opencv_highgui300.lib;opencv_imgproc300.lib;opencv_videoio300.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib; + $(OutDir);$(OpenCV_DIR)\x64\vc12\staticlib;$(AMDAPPSDKROOT)lib\x86_64 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/runvx/runvx.vcxproj.filters b/runvx/runvx.vcxproj.filters new file mode 100644 index 0000000..999c3a9 --- /dev/null +++ b/runvx/runvx.vcxproj.filters @@ -0,0 +1,114 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + 
Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/runvx/vxArray.cpp b/runvx/vxArray.cpp new file mode 100644 index 0000000..cd351b9 --- /dev/null +++ b/runvx/vxArray.cpp @@ -0,0 +1,751 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxArray.h" + +/////////////////////////////////////////////////////////////////////// +// class CVxParamArray +// + +CVxParamArray::CVxParamArray() +{ + m_vxObjType = VX_TYPE_ARRAY; + m_format = VX_TYPE_KEYPOINT; + m_capacity = 0; + m_itemSize = 0; + m_readFileIsBinary = false; + m_writeFileIsBinary = false; + m_compareFileIsBinary = false; + m_array = nullptr; + m_bufForRead = nullptr; + m_compareCountMatches = 0; + m_compareCountMismatches = 0; + m_useSyncOpenCLWriteDirective = false; + m_errX = 0; + m_errY = 0; + m_errStrength = 1e-10f; + m_errMismatchPercent = 0.0f; +} + +CVxParamArray::~CVxParamArray() +{ + Shutdown(); +} + +int CVxParamArray::Initialize(vx_context context, vx_graph graph, const char * desc) +{ + // get object parameters and create object + char objType[64]; + const char * ioParams = ScanParameters(desc, "array|array-virtual:", "s:", objType); + if (!_stricmp(objType, "array") || !_stricmp(objType, "array-virtual")) { + // syntax: array[-virtual]:,[:] + char itemType[64]; + ioParams = ScanParameters(ioParams, ",", "s,D", &itemType, &m_capacity); + bool found_userStruct = false; + for (auto it = m_userStructMap->begin(); it != m_userStructMap->end(); ++it){ + if (strcmp(itemType, it->first.c_str()) == 0){ + found_userStruct = true; + m_format = it->second; + } + } + if (found_userStruct == false){ + m_format = ovxName2Enum(itemType); + if (m_format == 0) { + ReportError("ERROR: invalid array item type specified: %s\n", itemType); + } + } + // create array object + if (!_stricmp(objType, "array-virtual")) { + m_array = vxCreateVirtualArray(graph, m_format, m_capacity); + } + else { + m_array = vxCreateArray(context, m_format, m_capacity); + } + } + else ReportError("ERROR: unsupported array type: %s\n", desc); + vx_status ovxStatus = vxGetStatus((vx_reference)m_array); + if (ovxStatus != VX_SUCCESS){ + printf("ERROR: array creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + if (m_array) vxReleaseArray(&m_array); + throw - 1; + } + m_vxObjRef = (vx_reference)m_array; + + // io initialize + return InitializeIO(context, 
graph, m_vxObjRef, ioParams); +} + +int CVxParamArray::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) +{ + // save reference object and get object attributes + m_vxObjRef = ref; + m_array = (vx_array)m_vxObjRef; + ERROR_CHECK(vxQueryArray(m_array, VX_ARRAY_ATTRIBUTE_ITEMTYPE, &m_format, sizeof(m_format))); + ERROR_CHECK(vxQueryArray(m_array, VX_ARRAY_ATTRIBUTE_CAPACITY, &m_capacity, sizeof(m_capacity))); + ERROR_CHECK(vxQueryArray(m_array, VX_ARRAY_ATTRIBUTE_ITEMSIZE, &m_itemSize, sizeof(m_itemSize))); + + // process I/O parameters + if (*io_params == ':') io_params++; + while (*io_params) { + char ioType[64], fileName[256]; + io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName); + if (!_stricmp(ioType, "read")) + { // read request syntax: read,[,ascii|binary] + m_fileNameRead.assign(RootDirUpdated(fileName)); + m_fileNameForReadHasIndex = (m_fileNameRead.find("%") != m_fileNameRead.npos) ? true : false; + m_readFileIsBinary = (m_fileNameRead.find(".txt") != m_fileNameRead.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_readFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_readFileIsBinary = true; + } + else ReportError("ERROR: invalid array read option: %s\n", option); + } + } + else if (!_stricmp(ioType, "write")) + { // write request syntax: write,[,ascii|binary] + m_fileNameWrite.assign(RootDirUpdated(fileName)); + m_writeFileIsBinary = (m_fileNameWrite.find(".txt") != m_fileNameWrite.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_writeFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_writeFileIsBinary = true; + } + else ReportError("ERROR: invalid array write option: %s\n", option); + } + } + else if (!_stricmp(ioType, "compare")) + { // compare syntax: compare,fileName[,ascii|binary][,err{;[;][;<%mismatch>]}][,log{}] + m_fileNameCompareLog = ""; + m_fileNameCompare.assign(RootDirUpdated(fileName)); + m_compareFileIsBinary = (m_fileNameCompare.find(".txt") != m_fileNameCompare.npos) ? 
false : true; + while (*io_params == ',') { + char option[256]; + io_params = ScanParameters(io_params, ",ascii|binary|err{;[;][;<%mismatch>]}|log{}", ",S", option); + if (!_stricmp(option, "ascii")) { + m_compareFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_compareFileIsBinary = true; + } + else if (!_strnicmp(option, "err{", 4)) { + if (m_format == VX_TYPE_KEYPOINT) { + const char * p = ScanParameters(&option[3], "{;;[;<%mismatch>]}", "{d;d;f", &m_errX, &m_errY, &m_errStrength); + if (*p == ';') { + ScanParameters(p, ";<%mismatch>}", ";f}", &m_errMismatchPercent); + } + } + else if (m_format == VX_TYPE_COORDINATES2D) { + const char * p = ScanParameters(&option[3], "{;[;<%mismatch>]}", "{d;d", &m_errX, &m_errY); + if (*p == ';') { + ScanParameters(p, ";<%mismatch>}", ";f}", &m_errMismatchPercent); + } + } + else ReportError("ERROR: array compare option not supported for this array: %s\n", option); + } + else if (!_strnicmp(option, "log{", 4)) { + option[strlen(option) - 1] = 0; + m_fileNameCompareLog.assign(RootDirUpdated(&option[4])); + } + else ReportError("ERROR: invalid array compare option: %s\n", option); + } + } + else if (!_stricmp(ioType, "view")) { + m_displayName.assign(fileName); + m_paramList.push_back(this); + } + else if (!_stricmp(ioType, "directive") && !_stricmp(fileName, "sync-cl-write")) { + m_useSyncOpenCLWriteDirective = true; + } + else ReportError("ERROR: invalid array operation: %s\n", ioType); + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character sequence in parameter specification: %s\n", io_params); + } + + return 0; +} + +int CVxParamArray::Finalize() +{ + // get attributes + ERROR_CHECK(vxQueryArray(m_array, VX_ARRAY_ATTRIBUTE_ITEMSIZE, &m_itemSize, sizeof(m_itemSize))); + ERROR_CHECK(vxQueryArray(m_array, VX_ARRAY_ATTRIBUTE_CAPACITY, &m_capacity, sizeof(m_capacity))); + + return 0; +} + +int CVxParamArray::Shutdown(void) +{ + if (m_compareCountMatches > 0 && m_compareCountMismatches == 0) { + printf("OK: array COMPARE MATCHED for %d frame(s) of %s\n", m_compareCountMatches, GetVxObjectName()); + } + if (m_array) { + vxReleaseArray(&m_array); + m_array = nullptr; + } + if (m_bufForRead) { + delete[] m_bufForRead; + m_bufForRead = nullptr; + } + + return 0; +} + +// read file into m_bufForRead: returns numItems +size_t CVxParamArray::ReadFileIntoBuffer(FILE * fp, bool readFileIsBinary) +{ + // make sure m_bufForRead is allocated + if (!m_bufForRead) NULLPTR_CHECK(m_bufForRead = new vx_uint8[m_capacity * m_itemSize]); + + // read file into m_bufForRead + size_t numItems = 0; + if (readFileIsBinary) + { // data in file is in BINARY format + numItems = fread(m_bufForRead, m_itemSize, m_capacity, fp); + } + else + { // data in file is in ASCII format + if (m_format == VX_TYPE_KEYPOINT) { + // input syntax of each item: + vx_keypoint_t * item = (vx_keypoint_t *)m_bufForRead; + for (numItems = 0; numItems < m_capacity; numItems++, item++) { + if (7 != fscanf(fp, "%d%d%g%g%g%d%g", &item->x, &item->y, &item->strength, &item->scale, &item->orientation, &item->tracking_status, &item->error)) + break; + } + } + else if (m_format == VX_TYPE_RECTANGLE) { + // input syntax of each item: + vx_rectangle_t * item = (vx_rectangle_t *)m_bufForRead; + for (numItems = 0; numItems < m_capacity; numItems++, item++) { + if (4 != fscanf(fp, "%d%d%d%d", &item->start_x, &item->start_y, &item->end_x, &item->end_y)) + break; + } + } + else if (m_format == VX_TYPE_COORDINATES2D) { + // input syntax of each item: + 
vx_coordinates2d_t * item = (vx_coordinates2d_t *)m_bufForRead; + for (numItems = 0; numItems < m_capacity; numItems++, item++) { + if (2 != fscanf(fp, "%d%d", &item->x, &item->y)) + break; + } + } + else if (m_format == VX_TYPE_COORDINATES3D) { + // input syntax of each item: + vx_coordinates3d_t * item = (vx_coordinates3d_t *)m_bufForRead; + for (numItems = 0; numItems < m_capacity; numItems++, item++) { + if (3 != fscanf(fp, "%d%d%d", &item->x, &item->y, &item->y)) + break; + } + } + else if (m_format == VX_TYPE_INT32 || m_format == VX_TYPE_UINT32 || m_format == VX_TYPE_BOOL) { + // input syntax of each item: + vx_uint32 * item = (vx_uint32 *)m_bufForRead; + for (numItems = 0; numItems < m_capacity; numItems++, item++) { + if (1 != fscanf(fp, "%i", item)) + break; + } + } + else if (m_format == VX_TYPE_FLOAT32) { + // input syntax of each item: + vx_float32 * item = (vx_float32 *)m_bufForRead; + for (numItems = 0; numItems < m_capacity; numItems++, item++) { + if (1 != fscanf(fp, "%g", item)) + break; + } + } + else if (m_format == VX_TYPE_FLOAT64) { + // input syntax of each item: + vx_float64 * item = (vx_float64 *)m_bufForRead; + for (numItems = 0; numItems < m_capacity; numItems++, item++) { + if (1 != fscanf(fp, "%lg", item)) + break; + } + } + else { + // read input as hex value of each byte + vx_size numBytes = 0; + while (numBytes < (m_itemSize * m_capacity)) { + int value; + if (1 != fscanf(fp, "%x", &value)) + break; + m_bufForRead[numBytes++] = (vx_uint8)value; + } + numItems = numBytes / m_itemSize; + } + } + + return numItems; +} + +int CVxParamArray::ReadFrame(int frameNumber) +{ + // check if user specified input file to read from + if (m_fileNameRead.length() < 1) return 0; + + // for single frame reads, there is no need to read the array again + // as it is already read into the object + if (!m_fileNameForReadHasIndex && frameNumber != m_captureFrameStart) { + return 0; + } + + // reading data from input file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameRead.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_readFileIsBinary ? "rb" : "r"); + if(!fp) { + if (frameNumber >= (int)m_captureFrameStart) { + // end of sequence detected for multiframe sequences + return 1; + } + else ReportError("ERROR: Unable to open: %s\n", fileName); + } + size_t numItems = ReadFileIntoBuffer(fp, m_readFileIsBinary); + fclose(fp); + + // set array size to numItems and write the data into array object + ERROR_CHECK(vxTruncateArray(m_array, 0)); + if (numItems > 0) { + ERROR_CHECK(vxAddArrayItems(m_array, numItems, m_bufForRead, m_itemSize)); + } + + // process user requested directives + if (m_useSyncOpenCLWriteDirective) { + ERROR_CHECK(vxDirective((vx_reference)m_array, VX_DIRECTIVE_AMD_COPY_TO_OPENCL)); + } + + return 0; +} + +int CVxParamArray::WriteFrame(int frameNumber) +{ + // check if user specified file to write + if (m_fileNameWrite.length() < 1) return 0; + + // create the output file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameWrite.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_writeFileIsBinary ? 
"wb" : "w"); + if(!fp) ReportError("ERROR: Unable to create: %s\n", fileName); + + // get numItems and write if any items exist + vx_size numItems; + ERROR_CHECK(vxQueryArray(m_array, VX_ARRAY_ATTRIBUTE_NUMITEMS, &numItems, sizeof(numItems))); + if (numItems > 0) { + vx_uint8 * base = nullptr; + vx_size stride; + ERROR_CHECK(vxAccessArrayRange(m_array, 0, numItems, &stride, (void **)&base, VX_READ_ONLY)); + if (m_writeFileIsBinary) + { // write in binary + for (size_t i = 0; i < numItems; i++) { + vx_uint8 * item = vxFormatArrayPointer(base, i, stride); + fwrite(item, 1, m_itemSize, fp); + } + } + else + { // write in ASCII mode + if (m_format == VX_TYPE_KEYPOINT) { + for (size_t i = 0; i < numItems; i++) { + vx_keypoint_t * item = (vx_keypoint_t *)vxFormatArrayPointer(base, i, stride); + fprintf(fp, "%4d %4d %20.12e %20.12e %20.12e %d %20.12e\n", item->x, item->y, item->strength, item->scale, item->orientation, item->tracking_status, item->error); + } + } + else if (m_format == VX_TYPE_COORDINATES2D) { + for (size_t i = 0; i < numItems; i++) { + vx_coordinates2d_t * item = (vx_coordinates2d_t *)vxFormatArrayPointer(base, i, stride); + fprintf(fp, "%4d %4d\n", item->x, item->y); + } + } + else if (m_format == VX_TYPE_COORDINATES3D) { + for (size_t i = 0; i < numItems; i++) { + vx_coordinates3d_t * item = (vx_coordinates3d_t *)vxFormatArrayPointer(base, i, stride); + fprintf(fp, "%4d %4d %4d\n", item->x, item->y, item->z); + } + } + else if (m_format == VX_TYPE_RECTANGLE) { + for (size_t i = 0; i < numItems; i++) { + vx_rectangle_t * item = (vx_rectangle_t *)vxFormatArrayPointer(base, i, stride); + fprintf(fp, "%4d %4d %4d %4d\n", item->start_x, item->start_y, item->end_x, item->end_y); + } + } + else if (m_format == VX_TYPE_INT32 || m_format == VX_TYPE_BOOL) { + for (size_t i = 0; i < numItems; i++) { + vx_int32 * item = (vx_int32 *)vxFormatArrayPointer(base, i, stride); + fprintf(fp, "%d\n", *item); + } + } + else if (m_format == VX_TYPE_UINT32) { + for (size_t i = 0; i < numItems; i++) { + vx_uint32 * item = (vx_uint32 *)vxFormatArrayPointer(base, i, stride); + fprintf(fp, "%u\n", *item); + } + } + else if (m_format == VX_TYPE_FLOAT32) { + for (size_t i = 0; i < numItems; i++) { + vx_float32 * item = (vx_float32 *)vxFormatArrayPointer(base, i, stride); + fprintf(fp, "%.12g\n", *item); + } + } + else if (m_format == VX_TYPE_FLOAT64) { + for (size_t i = 0; i < numItems; i++) { + vx_float64 * item = (vx_float64 *)vxFormatArrayPointer(base, i, stride); + fprintf(fp, "%.12lg\n", *item); + } + } + else { + // write output as hex values + for (size_t i = 0; i < numItems; i++) { + vx_uint8 * item = vxFormatArrayPointer(base, i, stride); + for (size_t j = 0; j < m_itemSize; j++) + fprintf(fp, " %02X", item[j]); + fprintf(fp, "\n"); + } + } + } + ERROR_CHECK(vxCommitArrayRange(m_array, 0, numItems, base)); + } + fclose(fp); + + return 0; +} + +int CVxParamArray::CompareFrame(int frameNumber) +{ + // check if user specified file to write + if (m_fileNameCompare.length() < 1) return 0; + + // clear items from m_arrayListForView + m_arrayListForView.clear(); + + // reading data from reference file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameCompare.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_compareFileIsBinary ? 
"rb" : "r"); + if (!fp) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + size_t numItemsRef = ReadFileIntoBuffer(fp, m_compareFileIsBinary); + fclose(fp); + + // get numItems of the array + vx_size numItems; + ERROR_CHECK(vxQueryArray(m_array, VX_ARRAY_ATTRIBUTE_NUMITEMS, &numItems, sizeof(numItems))); + + // compare array items + bool mismatchDetected = false; + if (m_format == VX_TYPE_KEYPOINT && numItems > 0) + { // keypoint compare with user specified tolerance limits + mismatchDetected = CompareFrameKeypoints(numItems, numItemsRef, m_bufForRead, frameNumber, fileName); + } + else if (m_format == VX_TYPE_COORDINATES2D && numItems > 0) + { // coordinates2d compare with user specified tolerance limits + mismatchDetected = CompareFrameCoord2d(numItems, numItemsRef, m_bufForRead, frameNumber, fileName); + } + else + { // fallback to bitwise exact compare + mismatchDetected = CompareFrameBitwiseExact(numItems, numItemsRef, m_bufForRead, frameNumber, fileName); + } + + // report error if mismatched + if (mismatchDetected) { + m_compareCountMismatches++; + if (m_abortOnCompareMismatch) return -1; + } + else { + m_compareCountMatches++; + } + + return 0; +} + +bool CVxParamArray::CompareFrameBitwiseExact(size_t numItems, size_t numItemsRef, vx_uint8 * bufItems, int frameNumber, const char * fileName) +{ + // bitwise exact compare + size_t numItemsMin = min(numItems, numItemsRef); + size_t numMismatches = 0; + if (numItemsMin > 0) { + void * ptr = nullptr; + vx_size stride = 0; + ERROR_CHECK(vxAccessArrayRange(m_array, 0, numItems, &stride, &ptr, VX_READ_ONLY)); + for (size_t i = 0; i < numItems; i++) { + vx_uint8 * item = vxFormatArrayPointer(ptr, i, stride); + if (memcmp(item, bufItems + i * m_itemSize, m_itemSize) != 0) { + numMismatches++; + } + } + ERROR_CHECK(vxCommitArrayRange(m_array, 0, numItems, ptr)); + } + numMismatches += max(numItems, numItemsRef) - numItemsMin; + bool mismatchDetected = false; + if (numMismatches > 0) { + printf("ERROR: array COMPARE MISMATCHED %d/%d for %s with frame#%d of %s\n", (int)numMismatches, (int)numItems, GetVxObjectName(), frameNumber, fileName); + mismatchDetected = true; + } + else { + if (m_verbose) printf("OK: array COMPARE MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + } + return mismatchDetected; +} + +bool CVxParamArray::CompareFrameKeypoints(size_t numItems, size_t numItemsRef, vx_uint8 * bufItems, int frameNumber, const char * fileName) +{ + FILE * fpLog = NULL; + if (m_fileNameCompareLog.length() > 0) { + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameCompareLog.c_str(), frameNumber); + fpLog = fopen(fileName, "w"); + if (!fpLog) ReportError("ERROR: Unable to create: %s\n", fileName); + printf("OK: creating array compare output log for %s in %s\n", GetVxObjectName(), fileName); + } + + enum { // color indices of each list for viewing + colorIndex_match_XYexact_S = 0, + colorIndex_match_XYexact_notS = 1, + colorIndex_match_XYS = 2, + colorIndex_missing_in_ref = 3, + colorIndex_missing_in_cur = 4, + }; + // number of keypoint counts + size_t count_match_XYexact_S = 0; + size_t count_match_XYexact_notS = 0; + size_t count_match_XYS = 0; + size_t count_missing_in_ref = 0; + size_t count_missing_in_cur = 0; + size_t count_non_trackable_in_ref = 0; + size_t count_non_trackable_in_cur = 0; + + // reset array list for viewing + ResetArrayListForView(); + + // get reference and actual keypoint buffers + vx_keypoint_t * kpRefBase = (vx_keypoint_t *)m_bufForRead, * kpActualBase = 
nullptr; + vx_size stride; + ERROR_CHECK(vxAccessArrayRange(m_array, 0, numItems, &stride, (void **)&kpActualBase, VX_READ_ONLY)); + + // try matching reference keypoints with actual + for (size_t j = 0; j < numItemsRef; j++) { + vx_keypoint_t * kpRef = &kpRefBase[j]; + if (!kpRef->tracking_status) { + count_non_trackable_in_ref++; + } + else { + bool matched = false; + for (size_t i = 0; i < numItems; i++) { + vx_keypoint_t * kpCur = &vxArrayItem(vx_keypoint_t, kpActualBase, i, stride); + if (kpCur->tracking_status) { + if ((kpCur->x == kpRef->x) && (kpCur->y == kpRef->y)) { + if (abs(kpCur->strength - kpRef->strength) <= m_errStrength) { + AddToArrayListForView(colorIndex_match_XYexact_S, kpCur->x, kpCur->y, kpCur->strength); + if (fpLog) fprintf(fpLog, "MATCH-XY-EXACT-S -- %5d %5d %20.12e (ref:%06d) %5d %5d %20.12e (cur:%06d)\n", kpRef->x, kpRef->y, kpRef->strength, (int)j, kpCur->x, kpCur->y, kpCur->strength, (int)i); + count_match_XYexact_S++; + } + else { + AddToArrayListForView(colorIndex_match_XYexact_notS, kpCur->x, kpCur->y, kpCur->strength); + if (fpLog) fprintf(fpLog, "MATCH-XY-EXACT-S-MISMATCH -- %5d %5d %20.12e (ref:%06d) %5d %5d %20.12e (cur:%06d)\n", kpRef->x, kpRef->y, kpRef->strength, (int)j, kpCur->x, kpCur->y, kpCur->strength, (int)i); + count_match_XYexact_notS++; + } + matched = true; + } + else if ((abs(kpCur->x - kpRef->x) <= m_errX) && (abs(kpCur->y - kpRef->y) <= m_errY) && + (abs(kpCur->strength - kpRef->strength) <= m_errStrength)) + { + AddToArrayListForView(colorIndex_match_XYS, kpCur->x, kpCur->y, kpCur->strength); + if (fpLog) fprintf(fpLog, "MATCH-XYS -- %5d %5d %20.12e (ref:%06d) %5d %5d %20.12e (cur:%06d)\n", kpRef->x, kpRef->y, kpRef->strength, (int)j, kpCur->x, kpCur->y, kpCur->strength, (int)i); + count_match_XYS++; + matched = true; + } + if (matched) + break; + } + } + if (!matched) { + AddToArrayListForView(colorIndex_missing_in_cur, kpRef->x, kpRef->y, kpRef->strength); + if (fpLog) fprintf(fpLog, "MISMATCH-WITH-CUR -- %5d %5d %20.12e (ref:%06d)\n", kpRef->x, kpRef->y, kpRef->strength, (int)j); + count_missing_in_cur++; + } + } + } + + // try matching actual keypoints with reference + for (size_t i = 0; i < numItems; i++) { + vx_keypoint_t * kpCur = &vxArrayItem(vx_keypoint_t, kpActualBase, i, stride); + if (!kpCur->tracking_status) { + count_non_trackable_in_cur++; + } + else { + bool matched = false; + for (size_t j = 0; j < numItemsRef; j++) { + vx_keypoint_t * kpRef = &kpRefBase[j]; + if (kpRef->tracking_status) { + if ((abs(kpCur->x - kpRef->x) <= m_errX) && (abs(kpCur->y - kpRef->y) <= m_errY) && + (abs(kpCur->strength - kpRef->strength) <= m_errStrength)) + { + matched = true; + } + if (matched) + break; + } + } + if (!matched) { + AddToArrayListForView(colorIndex_missing_in_ref, kpCur->x, kpCur->y, kpCur->strength); + if (fpLog) fprintf(fpLog, "MISMATCH-WITH-REF -- %5d %5d %20.12e (cur:%06d)\n", kpCur->x, kpCur->y, kpCur->strength, (int)i); + count_missing_in_ref++; + } + } + } + + ERROR_CHECK(vxCommitArrayRange(m_array, 0, numItems, kpActualBase)); + + // check for overall mismatch criteria + size_t totalMatched = count_match_XYexact_S + count_match_XYS; + size_t totalMismatchesOrMissing = max(count_match_XYexact_notS + count_missing_in_ref, count_missing_in_cur); + size_t total = totalMatched + totalMismatchesOrMissing; + float percentMismatches = (total > 0) ? 
(100.0f * (float)totalMismatchesOrMissing / (float)total) : 0.0f; + bool mismatched = false; + if (percentMismatches > m_errMismatchPercent) { + mismatched = true; + char line[512]; + sprintf(line, "ERROR: array COMPARE MISMATCHED [matched %d; mismatched/missing %d (%.3f%%)] [untracked %d/%d(ref) vs %d/%d] for %s with frame#%d of %s\n", + (int)totalMatched, (int)totalMismatchesOrMissing, percentMismatches, + (int)count_non_trackable_in_ref, (int)numItemsRef, (int)count_non_trackable_in_cur, (int)numItems, + GetVxObjectName(), frameNumber, fileName); + printf("%s", line); + if (fpLog) fprintf(fpLog, "%s", line); + } + else { + char line[512]; + sprintf(line, "OK: array COMPARE MATCHED %.3f%% [untracked %d/%d(ref) vs %d/%d] for %s with frame#%d of %s\n", 100.0f - percentMismatches, (int)count_non_trackable_in_ref, (int)numItemsRef, (int)count_non_trackable_in_cur, (int)numItems, GetVxObjectName(), frameNumber, fileName); + if (m_verbose) printf("%s", line); + if (fpLog) fprintf(fpLog, "%s", line); + } + + if (fpLog) fclose(fpLog); + return mismatched; +} + +bool CVxParamArray::CompareFrameCoord2d(size_t numItems, size_t numItemsRef, vx_uint8 * bufItems, int frameNumber, const char * fileName) +{ + FILE * fpLog = NULL; + if (m_fileNameCompareLog.length() > 0) { + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameCompareLog.c_str(), frameNumber); + fpLog = fopen(fileName, "w"); + if (!fpLog) ReportError("ERROR: Unable to create: %s\n", fileName); + printf("OK: creating array compare output log for %s in %s\n", GetVxObjectName(), fileName); + } + + enum { // color indices of each list for viewing + colorIndex_match_XYexact = 0, + colorIndex_match_XY = 1, + colorIndex_missing_in_ref = 2, + colorIndex_missing_in_cur = 3, + }; + // number of keypoint counts + size_t count_match_XYexact = 0; + size_t count_match_XY = 0; + size_t count_missing_in_ref = 0; + size_t count_missing_in_cur = 0; + + // reset array list for viewing + ResetArrayListForView(); + + // get reference and actual keypoint buffers + vx_coordinates2d_t * kpRefBase = (vx_coordinates2d_t *)m_bufForRead, *kpActualBase = nullptr; + vx_size stride; + if (numItems > 0) { + ERROR_CHECK(vxAccessArrayRange(m_array, 0, numItems, &stride, (void **)&kpActualBase, VX_READ_ONLY)); + } + + // try matching reference keypoints with actual + for (size_t j = 0; j < numItemsRef; j++) { + vx_coordinates2d_t * kpRef = &kpRefBase[j]; + bool matched = false; + for (size_t i = 0; i < numItems; i++) { + vx_coordinates2d_t * kpCur = &vxArrayItem(vx_coordinates2d_t, kpActualBase, i, stride); + if ((kpCur->x == kpRef->x) && (kpCur->y == kpRef->y)) { + AddToArrayListForView(colorIndex_match_XYexact, kpCur->x, kpCur->y, 0.0f); + if (fpLog) fprintf(fpLog, "MATCH-XY-EXACT -- %5d %5d (ref:%06d) %5d %5d (cur:%06d)\n", kpRef->x, kpRef->y, (int)j, kpCur->x, kpCur->y, (int)i); + count_match_XYexact++; + matched = true; + } + else if ((abs((vx_int32)kpCur->x - (vx_int32)kpRef->x) <= m_errX) && (abs((vx_int32)kpCur->y - (vx_int32)kpRef->y) <= m_errY)) { + AddToArrayListForView(colorIndex_match_XY, kpCur->x, kpCur->y, 0.0f); + if (fpLog) fprintf(fpLog, "MATCH-XY -- %5d %5d (ref:%06d) %5d %5d (cur:%06d)\n", kpRef->x, kpRef->y, (int)j, kpCur->x, kpCur->y, (int)i); + count_match_XY++; + matched = true; + } + if (matched) + break; + } + if (!matched) { + AddToArrayListForView(colorIndex_missing_in_cur, kpRef->x, kpRef->y, 0.0f); + if (fpLog) fprintf(fpLog, "MISMATCH-WITH-CUR -- %5d %5d (ref:%06d)\n", kpRef->x, kpRef->y, (int)j); + count_missing_in_cur++; + } + 
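+ // at this point reference item j has either been matched within the X/Y
+ // tolerances or counted above as missing from the current output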
} + + // try matching actual keypoints with reference + for (size_t i = 0; i < numItems; i++) { + vx_coordinates2d_t * kpCur = &vxArrayItem(vx_coordinates2d_t, kpActualBase, i, stride); + bool matched = false; + for (size_t j = 0; j < numItemsRef; j++) { + vx_coordinates2d_t * kpRef = &kpRefBase[j]; + if ((abs((vx_int32)kpCur->x - (vx_int32)kpRef->x) <= m_errX) && (abs((vx_int32)kpCur->y - (vx_int32)kpRef->y) <= m_errY)) { + matched = true; + break; + } + } + if (!matched) { + AddToArrayListForView(colorIndex_missing_in_ref, kpCur->x, kpCur->y, 0.0f); + if (fpLog) fprintf(fpLog, "MISMATCH-WITH-REF -- %5d %5d (cur:%06d)\n", kpCur->x, kpCur->y, (int)i); + count_missing_in_ref++; + } + } + + if (numItems > 0) { + ERROR_CHECK(vxCommitArrayRange(m_array, 0, numItems, kpActualBase)); + } + + // check for overall mismatch criteria + size_t totalMatched = count_match_XYexact + count_match_XY; + size_t totalMismatchesOrMissing = max(count_missing_in_ref, count_missing_in_cur); + size_t total = totalMatched + totalMismatchesOrMissing; + float percentMismatches = (total > 0) ? (100.0f * (float)totalMismatchesOrMissing / (float)total) : 0.0f; + bool mismatched = false; + if (percentMismatches > m_errMismatchPercent) { + mismatched = true; + printf("ERROR: array COMPARE MISMATCHED [matched %d; mismatched/missing %d (%.3f%%)] for %s with frame#%d of %s\n", (int)totalMatched, (int)totalMismatchesOrMissing, percentMismatches, GetVxObjectName(), frameNumber, fileName); + if (fpLog) fprintf(fpLog, "ERROR: array COMPARE MISMATCHED [matched %d; mismatched/missing %d (%.3f%%)] for %s with frame#%d of %s\n", (int)totalMatched, (int)totalMismatchesOrMissing, percentMismatches, GetVxObjectName(), frameNumber, fileName); + } + else { + if (m_verbose) printf("OK: array COMPARE MATCHED %.3f%% for %s with frame#%d of %s\n", 100.0f - percentMismatches, GetVxObjectName(), frameNumber, fileName); + if (fpLog) fprintf(fpLog, "OK: array COMPARE MATCHED %.3f%% for %s with frame#%d of %s\n", 100.0f - percentMismatches, GetVxObjectName(), frameNumber, fileName); + } + + if (fpLog) fclose(fpLog); + return mismatched; +} diff --git a/runvx/vxArray.h b/runvx/vxArray.h new file mode 100644 index 0000000..ad33b33 --- /dev/null +++ b/runvx/vxArray.h @@ -0,0 +1,75 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#ifndef __VX_ARRAY_H__ +#define __VX_ARRAY_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +class CVxParamArray : public CVxParameter +{ +public: + CVxParamArray(); + virtual ~CVxParamArray(); + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + virtual int Finalize(); + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + virtual int Shutdown(); + +protected: + // read file into m_bufForRead: returns numItems + size_t ReadFileIntoBuffer(FILE * fp, bool readFileIsBinary); + // compare routines: return true if mismatch is detected, other returns false + bool CompareFrameBitwiseExact(size_t numItems, size_t numItemsRef, vx_uint8 * bufItems, int frameNumber, const char * fileName); + bool CompareFrameKeypoints(size_t numItems, size_t numItemsRef, vx_uint8 * bufItems, int frameNumber, const char * fileName); + bool CompareFrameCoord2d(size_t numItems, size_t numItemsRef, vx_uint8 * bufItems, int frameNumber, const char * fileName); + +private: + // vx configuration + vx_enum m_format; + vx_size m_capacity; + vx_size m_itemSize; + // I/O configuration + bool m_readFileIsBinary; + bool m_writeFileIsBinary; + bool m_compareFileIsBinary; + int m_compareCountMatches; + int m_compareCountMismatches; + bool m_useSyncOpenCLWriteDirective; + std::string m_fileNameCompareLog; + vx_int32 m_errX; + vx_int32 m_errY; + vx_float32 m_errStrength; + vx_float32 m_errMismatchPercent; + // vx object + vx_array m_array; + vx_uint8 * m_bufForRead; +}; + + +#endif /* __VX_ARRAY_H__ */ \ No newline at end of file diff --git a/runvx/vxConvolution.cpp b/runvx/vxConvolution.cpp new file mode 100644 index 0000000..8c2ee68 --- /dev/null +++ b/runvx/vxConvolution.cpp @@ -0,0 +1,367 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxConvolution.h" + +/////////////////////////////////////////////////////////////////////// +// class CVxParamConvolution +// +CVxParamConvolution::CVxParamConvolution() +{ + // vx configuration + m_vxObjType = VX_TYPE_CONVOLUTION; + m_columns = 0; + m_rows = 0; + m_scale = 0; + // I/O configuration + m_readFileIsBinary = false; + m_writeFileIsBinary = false; + m_compareFileIsBinary = false; + m_readFileWithScale = false; + m_writeFileWithScale = false; + m_compareFileWithScale = false; + m_compareCountMatches = 0; + m_compareCountMismatches = 0; + m_bufForAccess = nullptr; + // vx object + m_convolution = nullptr; +} + +CVxParamConvolution::~CVxParamConvolution() +{ + Shutdown(); +} + +int CVxParamConvolution::Shutdown(void) +{ + if (m_compareCountMatches > 0 && m_compareCountMismatches == 0) { + printf("OK: convolution COMPARE MATCHED for %d frame(s) of %s\n", m_compareCountMatches, GetVxObjectName()); + } + if (m_convolution) { + vxReleaseConvolution(&m_convolution); + m_convolution = nullptr; + } + if (m_bufForAccess) { + delete[] m_bufForAccess; + m_bufForAccess = nullptr; + } + return 0; +} + +int CVxParamConvolution::Initialize(vx_context context, vx_graph graph, const char * desc) +{ + // get object parameters and create object + char objType[64]; + const char * ioParams = ScanParameters(desc, "convolution:,", "s:D,D", objType, &m_columns, &m_rows); + if (!_stricmp(objType, "convolution")) { + m_convolution = vxCreateConvolution(context, m_columns, m_rows); + } + else ReportError("ERROR: unsupported convolution type: %s\n", desc); + vx_status ovxStatus = vxGetStatus((vx_reference)m_convolution); + if (ovxStatus != VX_SUCCESS){ + printf("ERROR: convolution creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + if (m_convolution) vxReleaseConvolution(&m_convolution); + throw - 1; + } + m_vxObjRef = (vx_reference)m_convolution; + + // io initialize + return InitializeIO(context, graph, m_vxObjRef, ioParams); +} + +int CVxParamConvolution::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) +{ + // save reference object and get object attributes + m_vxObjRef = ref; + m_convolution = (vx_convolution)m_vxObjRef; + ERROR_CHECK(vxQueryConvolution(m_convolution, VX_CONVOLUTION_ATTRIBUTE_COLUMNS, &m_columns, sizeof(m_columns))); + ERROR_CHECK(vxQueryConvolution(m_convolution, VX_CONVOLUTION_ATTRIBUTE_ROWS, &m_rows, sizeof(m_rows))); + ERROR_CHECK(vxQueryConvolution(m_convolution, VX_CONVOLUTION_ATTRIBUTE_SCALE, &m_scale, sizeof(m_scale))); + + // process I/O parameters + if (*io_params == ':') io_params++; + while (*io_params) { + char ioType[64], fileName[256]; + io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName); + if (!_stricmp(ioType, "read")) + { // read request syntax: read,[,ascii|binary|scale] + m_fileNameRead.assign(RootDirUpdated(fileName)); + m_fileNameForReadHasIndex = (m_fileNameRead.find("%") != m_fileNameRead.npos) ? true : false; + m_readFileIsBinary = (m_fileNameRead.find(".txt") != m_fileNameRead.npos) ? 
false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_readFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_readFileIsBinary = true; + } + else if (!_stricmp(option, "scale")) { + m_readFileWithScale = true; + } + else ReportError("ERROR: invalid convolution read option: %s\n", option); + } + } + else if (!_stricmp(ioType, "write")) + { // write request syntax: write,[,ascii|binary|scale] + m_fileNameWrite.assign(RootDirUpdated(fileName)); + m_writeFileIsBinary = (m_fileNameWrite.find(".txt") != m_fileNameWrite.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_writeFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_writeFileIsBinary = true; + } + else if (!_stricmp(option, "scale")) { + m_writeFileWithScale = true; + } + else ReportError("ERROR: invalid convolution write option: %s\n", option); + } + } + else if (!_stricmp(ioType, "compare")) + { // compare request syntax: compare,[,ascii|binary|scale] + m_fileNameCompare.assign(RootDirUpdated(fileName)); + m_compareFileIsBinary = (m_fileNameCompare.find(".txt") != m_fileNameCompare.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_compareFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_compareFileIsBinary = true; + } + else if (!_stricmp(option, "scale")) { + m_compareFileWithScale = true; + } + else ReportError("ERROR: invalid convolution compare option: %s\n", option); + } + } + else if (!_stricmp(ioType, "scale")) + { // write request syntax: scale, + ScanParameters(fileName, "", "d", &m_scale); + ERROR_CHECK(vxSetConvolutionAttribute(m_convolution, VX_CONVOLUTION_ATTRIBUTE_SCALE, &m_scale, sizeof(m_scale))); + } + else if (!_stricmp(ioType, "init")) + { // write request syntax: init,{;;...} + NULLPTR_CHECK(m_bufForAccess = new vx_int16[m_columns * m_rows]); + vx_size index = 0; char fmt[3] = "{d"; + for (const char * s = fileName; *s && index < (m_columns * m_rows); fmt[0] = ';', index++) { + vx_uint32 value; + s = ScanParameters(s, "", fmt, &value); + m_bufForAccess[index] = value; + } + if (index < (m_columns * m_rows)) ReportError("ERROR: convolution init have too few values: %s\n", fileName); + ERROR_CHECK(vxWriteConvolutionCoefficients(m_convolution, m_bufForAccess)); + } + else if (!_stricmp(ioType, "directive") && !_stricmp(fileName, "readonly")) { + ERROR_CHECK(vxDirective((vx_reference)m_convolution, VX_DIRECTIVE_AMD_READ_ONLY)); + } + else ReportError("ERROR: invalid convolution operation: %s\n", ioType); + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character sequence in parameter specification: %s\n", io_params); + } + + return 0; +} + +int CVxParamConvolution::Finalize() +{ + // get object attributes + ERROR_CHECK(vxQueryConvolution(m_convolution, VX_CONVOLUTION_ATTRIBUTE_COLUMNS, &m_columns, sizeof(m_columns))); + ERROR_CHECK(vxQueryConvolution(m_convolution, VX_CONVOLUTION_ATTRIBUTE_ROWS, &m_rows, sizeof(m_rows))); + ERROR_CHECK(vxQueryConvolution(m_convolution, VX_CONVOLUTION_ATTRIBUTE_SCALE, &m_scale, sizeof(m_scale))); + return 0; +} + +int CVxParamConvolution::ReadFrame(int frameNumber) +{ + // check if there is no user 
request to read + if (m_fileNameRead.length() < 1) return 0; + + // make sure buffer has been allocated + if (!m_bufForAccess) NULLPTR_CHECK(m_bufForAccess = new vx_int16[m_columns * m_rows]); + + // for single frame reads, there is no need to read it again + // as it is already read into the object + if (!m_fileNameForReadHasIndex && frameNumber != m_captureFrameStart) { + return 0; + } + + // reading data from input file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameRead.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_readFileIsBinary ? "rb" : "r"); + if (!fp) { + if (frameNumber == m_captureFrameStart) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + else { + return 1; // end of sequence detected for multiframe sequences + } + } + int status = 0; + if (m_readFileIsBinary) { + if (m_readFileWithScale) { + if (fread(&m_scale, sizeof(vx_uint32), 1, fp) == 1) { + ERROR_CHECK(vxSetConvolutionAttribute(m_convolution, VX_CONVOLUTION_ATTRIBUTE_SCALE, &m_scale, sizeof(m_scale))); + } + else status = -1; + } + if (fread(m_bufForAccess, sizeof(vx_int16), m_columns * m_rows, fp) != (m_columns * m_rows)) + status = -1; + } + else { + if (m_readFileWithScale) { + if (fscanf(fp, "%i", &m_scale) == 1) { + ERROR_CHECK(vxSetConvolutionAttribute(m_convolution, VX_CONVOLUTION_ATTRIBUTE_SCALE, &m_scale, sizeof(m_scale))); + } + else status = -1; + } + for (vx_size index = 0; index < (m_columns * m_rows); index++) { + vx_uint32 value; + if (fscanf(fp, "%i", &value) != 1) { + status = -1; + break; + } + m_bufForAccess[index] = (vx_int16)value; + } + } + ERROR_CHECK(vxWriteConvolutionCoefficients(m_convolution, m_bufForAccess)); + fclose(fp); + if (status < 0) + ReportError("ERROR: detected EOF on convolution input file: %s\n", fileName); + + return status; +} + +int CVxParamConvolution::WriteFrame(int frameNumber) +{ + // check if there is no user request to write + if (m_fileNameWrite.length() < 1) return 0; + + // make sure buffer has been allocated and read the convolution data + if (!m_bufForAccess) NULLPTR_CHECK(m_bufForAccess = new vx_int16[m_columns * m_rows]); + ERROR_CHECK(vxQueryConvolution(m_convolution, VX_CONVOLUTION_ATTRIBUTE_SCALE, &m_scale, sizeof(m_scale))); + ERROR_CHECK(vxReadConvolutionCoefficients(m_convolution, m_bufForAccess)); + + // write data to output file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameWrite.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_writeFileIsBinary ? 
"wb" : "w"); + if (!fp) ReportError("ERROR: Unable to create: %s\n", fileName); + if (m_writeFileIsBinary) { + if (m_writeFileWithScale) { + fwrite(&m_scale, sizeof(vx_uint32), 1, fp); + } + fwrite(m_bufForAccess, sizeof(vx_int16), m_columns * m_rows, fp); + } + else { + if (m_writeFileWithScale) { + fprintf(fp, "%d\n", m_scale); + } + for (vx_size row = 0; row < m_rows; row++) { + fprintf(fp, "\n"); + for (vx_size col = 0; col < m_columns; col++) { + fprintf(fp, " %6d", m_bufForAccess[row * m_columns + col]); + } + fprintf(fp, "\n"); + } + } + fclose(fp); + + return 0; +} + +int CVxParamConvolution::CompareFrame(int frameNumber) +{ + // check if there is no user request to compare + if (m_fileNameCompare.length() < 1) return 0; + + // make sure buffer has been allocated and read the convolution data + if (!m_bufForAccess) NULLPTR_CHECK(m_bufForAccess = new vx_int16[m_columns * m_rows]); + ERROR_CHECK(vxQueryConvolution(m_convolution, VX_CONVOLUTION_ATTRIBUTE_SCALE, &m_scale, sizeof(m_scale))); + ERROR_CHECK(vxReadConvolutionCoefficients(m_convolution, m_bufForAccess)); + + // reading data from reference file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameCompare.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_compareFileIsBinary ? "rb" : "r"); + if (!fp) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + bool mismatchDetected = false; + int status = 0; + if (m_compareFileWithScale) { + vx_uint32 scaleRef; + if (m_compareFileIsBinary) { + if (fread(&scaleRef, sizeof(vx_uint32), 1, fp) != 1) + status = -1; + } + else { + if (fscanf(fp, "%i", &scaleRef) != 1) + status = -1; + } + if (m_scale != scaleRef) + mismatchDetected = true; + } + for (vx_size index = 0; index < (m_columns * m_rows); index++) { + vx_int16 coeffValue = 0; + if (m_compareFileIsBinary) { + if (fread(&coeffValue, sizeof(coeffValue), 1, fp) != 1) { + status = -1; + break; + } + } + else { + vx_int32 value; + if (fscanf(fp, "%i", &value) != 1) { + status = -1; + break; + } + coeffValue = (vx_int16)value; + } + if (m_bufForAccess[index] != coeffValue) { + mismatchDetected = true; + break; + } + } + fclose(fp); + if (status < 0) + ReportError("ERROR: detected EOF on convolution comapre reference file: %s\n", fileName); + + if (mismatchDetected) { + m_compareCountMismatches++; + printf("ERROR: convolution COMPARE MISMATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + if (m_abortOnCompareMismatch) return -1; + } + else { + m_compareCountMatches++; + if (m_verbose) printf("OK: convolution COMPARE MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + } + + return 0; +} diff --git a/runvx/vxConvolution.h b/runvx/vxConvolution.h new file mode 100644 index 0000000..a952bb5 --- /dev/null +++ b/runvx/vxConvolution.h @@ -0,0 +1,64 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __VX_CONVOLUTION_H__ +#define __VX_CONVOLUTION_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +class CVxParamConvolution : public CVxParameter +{ +public: + CVxParamConvolution(); + virtual ~CVxParamConvolution(); + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + virtual int Finalize(); + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + virtual int Shutdown(); + +private: + // vx configuration + vx_size m_columns; + vx_size m_rows; + vx_uint32 m_scale; + // I/O configuration + bool m_readFileIsBinary; + bool m_writeFileIsBinary; + bool m_compareFileIsBinary; + bool m_readFileWithScale; + bool m_writeFileWithScale; + bool m_compareFileWithScale; + int m_compareCountMatches; + int m_compareCountMismatches; + vx_int16 * m_bufForAccess; + // vx object + vx_convolution m_convolution; +}; + + +#endif /* __VX_CONVOLUTION_H__ */ \ No newline at end of file diff --git a/runvx/vxDistribution.cpp b/runvx/vxDistribution.cpp new file mode 100644 index 0000000..195af8c --- /dev/null +++ b/runvx/vxDistribution.cpp @@ -0,0 +1,284 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxDistribution.h" + +/////////////////////////////////////////////////////////////////////// +// class CVxParamDistribution +// +CVxParamDistribution::CVxParamDistribution() +{ + // vx configuration + m_vxObjType = VX_TYPE_DISTRIBUTION; + m_numBins = 0; + m_offset = 0; + m_range = 0; + // I/O configuration + m_readFileIsBinary = false; + m_writeFileIsBinary = false; + m_compareFileIsBinary = false; + m_compareCountMatches = 0; + m_compareCountMismatches = 0; + // vx object + m_distribution = nullptr; + m_bufForCompare = nullptr; +} + +CVxParamDistribution::~CVxParamDistribution() +{ + Shutdown(); +} + +int CVxParamDistribution::Shutdown(void) +{ + if (m_compareCountMatches > 0 && m_compareCountMismatches == 0) { + printf("OK: distribution COMPARE MATCHED for %d frame(s) of %s\n", m_compareCountMatches, GetVxObjectName()); + } + if (m_distribution){ + vxReleaseDistribution(&m_distribution); + m_distribution = nullptr; + } + if (m_bufForCompare) { + delete[] m_bufForCompare; + m_bufForCompare = nullptr; + } + return 0; +} + +int CVxParamDistribution::Initialize(vx_context context, vx_graph graph, const char * desc) +{ + // get object parameters and create object + char objType[64]; + const char * ioParams = ScanParameters(desc, "distribution:,,", "s:D,d,d", objType, &m_numBins, &m_offset, &m_range); + if (!_stricmp(objType, "distribution")) { + m_distribution = vxCreateDistribution(context, m_numBins, m_offset, m_range); + } + else ReportError("ERROR: unsupported distribution type: %s\n", desc); + vx_status ovxStatus = vxGetStatus((vx_reference)m_distribution); + if (ovxStatus != VX_SUCCESS){ + printf("ERROR: distribution creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + if (m_distribution) vxReleaseDistribution(&m_distribution); + throw - 1; + } + m_vxObjRef = (vx_reference)m_distribution; + + // io initialize + return InitializeIO(context, graph, m_vxObjRef, ioParams); +} + +int CVxParamDistribution::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) +{ + // save reference object and get object attributes + m_vxObjRef = ref; + m_distribution = (vx_distribution)m_vxObjRef; + ERROR_CHECK(vxQueryDistribution(m_distribution, VX_DISTRIBUTION_ATTRIBUTE_BINS, &m_numBins, sizeof(m_numBins))); + ERROR_CHECK(vxQueryDistribution(m_distribution, VX_DISTRIBUTION_ATTRIBUTE_OFFSET, &m_offset, sizeof(m_offset))); + ERROR_CHECK(vxQueryDistribution(m_distribution, VX_DISTRIBUTION_ATTRIBUTE_RANGE, &m_range, sizeof(m_range))); + + // process I/O parameters + if (*io_params == ':') io_params++; + while (*io_params) { + char ioType[64], fileName[256]; + io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName); + if (!_stricmp(ioType, "read")) + { // read request syntax: read,[,ascii|binary] + m_fileNameRead.assign(RootDirUpdated(fileName)); + m_fileNameForReadHasIndex = (m_fileNameRead.find("%") != m_fileNameRead.npos) ? true : false; + m_readFileIsBinary = (m_fileNameRead.find(".txt") != m_fileNameRead.npos) ? 
false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_readFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_readFileIsBinary = true; + } + else ReportError("ERROR: invalid distribution read option: %s\n", option); + } + } + else if (!_stricmp(ioType, "write")) + { // write request syntax: write,[,ascii|binary] + m_fileNameWrite.assign(RootDirUpdated(fileName)); + m_writeFileIsBinary = (m_fileNameWrite.find(".txt") != m_fileNameWrite.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_writeFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_writeFileIsBinary = true; + } + else ReportError("ERROR: invalid distribution write option: %s\n", option); + } + } + else if (!_stricmp(ioType, "compare")) + { // compare syntax: compare,fileName[,ascii|binary] + m_fileNameCompare.assign(RootDirUpdated(fileName)); + m_compareFileIsBinary = (m_fileNameCompare.find(".txt") != m_fileNameCompare.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_compareFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_compareFileIsBinary = true; + } + else ReportError("ERROR: invalid distribution compare option: %s\n", option); + } + } + else if (!_stricmp(ioType, "view")) { + m_displayName.assign(fileName); + m_paramList.push_back(this); + } + else ReportError("ERROR: invalid distribution operation: %s\n", ioType); + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character sequence in parameter specification: %s\n", io_params); + } + + return 0; +} + +int CVxParamDistribution::Finalize() +{ + return 0; +} + +// read file into m_bufForRead: returns 0 if successful, 1 on EOF +int CVxParamDistribution::ReadFileIntoBuffer(FILE * fp, vx_uint32 * buf) +{ + // read file into m_bufForRead + int status = 0; + if (m_readFileIsBinary) + { // read in BINARY mode + vx_size count = fread(buf, sizeof(vx_uint32), m_numBins, fp); + if (count != m_numBins) + status = 1; + } + else + { // read in ASCII mode + for (size_t i = 0; i < m_numBins; i++){ + if (fscanf(fp, "%i", &buf[i]) != 1) { + status = 1; + break; + } + } + } + return status; +} + +int CVxParamDistribution::ReadFrame(int frameNumber) +{ + // check if there is no user request to read + if (m_fileNameRead.length() < 1) return 0; + + // for single frame reads, there is no need to read it again + // as it is already read into the object + if (!m_fileNameForReadHasIndex && frameNumber != m_captureFrameStart) { + return 0; + } + + // reading data from input file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameRead.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_readFileIsBinary ? 
"rb" : "r"); + if (!fp) { + if (frameNumber == m_captureFrameStart) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + else { + return 1; // end of sequence detected for multiframe sequences + } + } + vx_uint32 * data = nullptr; + ERROR_CHECK(vxAccessDistribution(m_distribution, (void **)&data, VX_WRITE_ONLY)); + int status = ReadFileIntoBuffer(fp, data); + ERROR_CHECK(vxCommitDistribution(m_distribution, data)); + fclose(fp); + + return status; +} + +int CVxParamDistribution::WriteFrame(int frameNumber) +{ + // check if there is no user request to write + if (m_fileNameWrite.length() < 1) return 0; + + // reading data from input file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameWrite.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_writeFileIsBinary ? "wb" : "w"); + if (!fp) ReportError("ERROR: Unable to create: %s\n", fileName); + vx_uint32 * data = nullptr; + ERROR_CHECK(vxAccessDistribution(m_distribution, (void **)&data, VX_READ_ONLY)); + if (m_writeFileIsBinary) + { // write in BINARY mode + fwrite(data, sizeof(data[0]), m_numBins, fp); + } + else + { // write in ASCII mode + for (size_t i = 0; i < m_numBins; i++) + fprintf(fp, "%8d\n", data[i]); + } + ERROR_CHECK(vxCommitDistribution(m_distribution, data)); + fclose(fp); + + return 0; +} + +int CVxParamDistribution::CompareFrame(int frameNumber) +{ + // check if there is no user request to compare + if (m_fileNameCompare.length() < 1) return 0; + + // make sure m_bufForRead is allocated + if (!m_bufForCompare) NULLPTR_CHECK(m_bufForCompare = new vx_uint32[m_numBins]); + + // reading data from reference file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameCompare.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_compareFileIsBinary ? "rb" : "r"); + if (!fp) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + int status = ReadFileIntoBuffer(fp, m_bufForCompare); + fclose(fp); + if (status) ReportError("ERROR: distribution compare reference doesn't have enough data: %s\n", fileName); + + // compare and report error if mismatched + vx_uint32 * bufRef = nullptr; + ERROR_CHECK(vxAccessDistribution(m_distribution, (void **)&bufRef, VX_READ_ONLY)); + status = memcmp(bufRef, m_bufForCompare, m_numBins * sizeof(vx_uint32)) ? -1 : 0; + ERROR_CHECK(vxCommitDistribution(m_distribution, bufRef)); + if (status) { + m_compareCountMismatches++; + printf("ERROR: distribution COMPARE MISMATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + if (m_abortOnCompareMismatch) return -1; + } + else { + m_compareCountMatches++; + if (m_verbose) printf("OK: distribution COMPARE MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + } + return 0; +} diff --git a/runvx/vxDistribution.h b/runvx/vxDistribution.h new file mode 100644 index 0000000..d6290b6 --- /dev/null +++ b/runvx/vxDistribution.h @@ -0,0 +1,85 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __VX_DISTRIBUTION_H__ +#define __VX_DISTRIBUTION_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +// CVxParamDistribution: wrapper for vx_distribution object +class CVxParamDistribution : public CVxParameter +{ +public: + // constructor and destructor + CVxParamDistribution(); + virtual ~CVxParamDistribution(); + virtual int Shutdown(); + + // Initialize: create OpenVX object and further uses InitializeIO to input/output initialization + // desc: object description as specified on command-line or in script + // returns 0 on SUCCESS, else error code + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + + // InitializeIO: performs I/O initialization using the OpenVX object already created + // ref: OpenVX object already created + // io_params: I/O description as specified on command-line or in script + // returns 0 on SUCCESS, else error code + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + + // Finalize: for final initialization after vxVerifyGraph + // meant for querying object parameters which are not available until vxVerifyGraph + virtual int Finalize(); + + // get OpenVX object type (e.g., VX_TYPE_IMAGE, VX_TYPE_SCALAR, ...) + // TBD: change getObjectType to GetObjectType + + // frame-level read, write, and compare + // returns 0 on SUCCESS, else error code + // ReadFrame() returns +ve value to indicate data unavailability + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + +protected: + // read file into m_bufForRead: returns 0 if successful, 1 on EOF + int ReadFileIntoBuffer(FILE * fp, vx_uint32 * buf); + +private: + // vx_distribution configuration + vx_size m_numBins; + vx_int32 m_offset; + vx_uint32 m_range; + // I/O configuration + bool m_readFileIsBinary; + bool m_writeFileIsBinary; + bool m_compareFileIsBinary; + int m_compareCountMatches; + int m_compareCountMismatches; + // vx object + vx_distribution m_distribution; + vx_uint32 * m_bufForCompare; +}; + +#endif /* __VX_DISTRIBUTION_H__ */ \ No newline at end of file diff --git a/runvx/vxEngine.cpp b/runvx/vxEngine.cpp new file mode 100644 index 0000000..a066722 --- /dev/null +++ b/runvx/vxEngine.cpp @@ -0,0 +1,700 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#define _CRT_SECURE_NO_WARNINGS +#include "vxEngine.h" + +#define DEBUG_INFO 0 +#define DEBUG_GRAPH 0 + +vector &splittwo(const string &s, char delim, vector &elems){ + if (delim == ' ') { + const char * p = s.c_str(); + while (*p) { + while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') + p++; + const char * q = p; + while (*p && !(*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')) + p++; + if (*q){ + char item[1024]; + strncpy(item, q, p - q); item[p - q] = 0; + elems.push_back(item); + } + } + } + else { + stringstream ss(s); + string item; + while (getline(ss, item, delim)){ + elems.push_back(item); + } + } + return elems; +} + +void RemoveVirtualKeywordFromParamDescription(std::string& paramDesc) +{ + size_t pos = paramDesc.find("-virtual:"); + if (pos != std::string::npos) { + paramDesc.erase(pos, 8); + } +} + +static void VX_CALLBACK log_callback(vx_context context, vx_reference ref, vx_status status, const vx_char string[]) +{ + printf("%s", string); fflush(stdout); +} + +CVxEngine::CVxEngine() +{ + m_paramCount = 0; + int64_t freq = utilGetClockFrequency(); + m_perfMultiplicationFactor = 1000.0f / freq; // to convert clock counter to ms + m_usingMultiFrameCapture = false; + m_captureFrameStart = 0; + m_disableVirtual = false; + m_verbose = false; +} + +CVxEngine::~CVxEngine() +{ + Shutdown(); +} + +int CVxEngine::Initialize(int argCount, int defaultTargetAffinity, int defaultTargetInfo, bool useProcessGraph, bool disableVirtual) +{ + // save configuration + m_paramCount = argCount; + m_useProcessGraph = useProcessGraph; + vx_status ovxStatus = VX_SUCCESS; + m_disableVirtual = disableVirtual; + + // create OpenVX context, register log_callback, and show implementation + m_context = vxCreateContext(); + if (vxGetStatus((vx_reference)m_context)) { printf("ERROR: vxCreateContext failed\n"); throw - 1; } + vxRegisterLogCallback(m_context, log_callback, vx_false_e); + char name[VX_MAX_IMPLEMENTATION_NAME]; + if (ovxStatus = vxQueryContext(m_context, VX_CONTEXT_ATTRIBUTE_IMPLEMENTATION, name, VX_MAX_IMPLEMENTATION_NAME)) { + printf("ERROR: vxQueryContext)VX_CONTEXT_ATTRIBUTE_IMPLEMENTATION) failed (%d:%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + return -1; + } + printf("OK: using %s\n", name); + + // set default target affinity, if request + if (defaultTargetAffinity) { + AgoTargetAffinityInfo attr_affinity = { 0 }; + attr_affinity.device_type = defaultTargetAffinity; + attr_affinity.device_info = defaultTargetInfo; + vx_status status = 
vxSetContextAttribute(m_context, VX_CONTEXT_ATTRIBUTE_AMD_AFFINITY, &attr_affinity, sizeof(attr_affinity)); + if (status) { + printf("ERROR: vxSetContextAttribute(VX_CONTEXT_ATTRIBUTE_AMD_AFFINITY,%d) failed (%d:%s)\n", defaultTargetAffinity, status, ovxEnum2Name(status)); + throw - 1; + } + } + + // create graph + m_graph = vxCreateGraph(m_context); + if (vxGetStatus((vx_reference)m_graph)){ printf("ERROR: vxCreateGraph failed\n"); throw - 1; } + + return 0; +} + +int CVxEngine::SetGraphOptimizerFlags(vx_uint32 graph_optimizer_flags) +{ + // set optimizer flags + vx_status status = vxSetGraphAttribute(m_graph, VX_GRAPH_ATTRIBUTE_AMD_OPTIMIZER_FLAGS, &graph_optimizer_flags, sizeof(graph_optimizer_flags)); + if (status) { + printf("ERROR: vxSetGraphAttribute(*,VX_GRAPH_ATTRIBUTE_AMD_OPTIMIZER_FLAGS,%d) failed (%d:%s)\n", graph_optimizer_flags, status, ovxEnum2Name(status)); + throw - 1; + } + return 0; +} + +vx_context CVxEngine::getContext() +{ + return m_context; +} + +int CVxEngine::SetParameter(int index, const char * param) +{ + std::string paramDesc; + if (m_disableVirtual) { + paramDesc = param; + RemoveVirtualKeywordFromParamDescription(paramDesc); + param = paramDesc.c_str(); + } + CVxParameter * parameter = CreateDataObject(m_context, m_graph, &m_paramMap, &m_userStructMap, param, m_captureFrameStart); + if (!parameter) { + printf("ERROR: unable to create parameter# %d\nCheck syntax: %s\n", index, param); + return -1; + } + + vx_reference ref; + char name[16]; + sprintf(name, "$%d", index+1); + m_paramMap.insert(pair(name, parameter)); + ref = m_paramMap[name]->GetVxObject(); + vxSetReferenceName(ref, name); + return 0; +} + +int CVxEngine::SetImportedData(vx_reference ref, const char * name, const char * params) +{ + if (params) { + CVxParameter * obj = CreateDataObject(m_context, m_graph, ref, params, m_captureFrameStart); + if (!obj) { + printf("ERROR: CreateDataObject(*,*,*,%s) failed\n", params); + return -1; + } + m_paramMap.insert(pair(name, obj)); + vxSetReferenceName(ref, name); + } + return 0; +} + +void VX_CALLBACK CVxEngine_data_registry_callback_f(void * obj, vx_reference ref, const char * name, const char * params) +{ + int status = ((CVxEngine *)obj)->SetImportedData(ref, name, params); + if (status) { + printf("ERROR: SetImportedData(*,%s,%s) failed (%d)\n", name, params, status); + throw -1; + } +} + +int CVxEngine::BuildGraph(char * graphScript, bool useAgoImport, bool useAgoDump) +{ + if (useAgoImport) { + vx_reference ref[64] = { 0 }; + int num_ref = 0; + for (int i = 0; i < 64; i++) { + char name[16]; sprintf(name, "$%d", i + 1); + if (m_paramMap.find(name) == m_paramMap.end()) + break; + ref[num_ref++] = m_paramMap[name]->GetVxObject(); + } + // parse the graph using import + AgoGraphImportInfo info = { 0 }; + info.text = graphScript; + info.ref = ref; + info.num_ref = num_ref; + info.data_registry_callback_obj = this; + info.data_registry_callback_f = CVxEngine_data_registry_callback_f; + if (useAgoDump) info.dumpToConsole = 2; + vx_status status = vxSetGraphAttribute(m_graph, VX_GRAPH_ATTRIBUTE_AMD_IMPORT_FROM_TEXT, &info, sizeof(info)); + if (status != VX_SUCCESS) { + printf("ERROR: vxSetGraphAttribute(...,VX_GRAPH_ATTRIBUTE_AMD_IMPORT_FROM_TEXT,...) 
failed (%d)\n", status); + return -1; + } + } + else { + // parse the graph text + char currentLine[2048]; + memset(currentLine, 0, 2048); + string currentLine_string = ""; + bool is_comment = false; + + vector graphScript_vector; + int j = 0; + for (size_t i = 0; i < strlen(graphScript); i++){ + if (graphScript[i] != '\r' && graphScript[i] != '\n'){ + if (graphScript[i] == '#'){ + is_comment = true; + } + if (is_comment == false){ + currentLine[j] = graphScript[i]; + j++; + } + } + else if (graphScript[i] == '\n'){ + currentLine[j] = '\0'; + graphScript_vector.push_back(currentLine); + j = 0; + is_comment = false; + } + } + currentLine[j] = '\0'; + graphScript_vector.push_back(currentLine); + memset(currentLine, 0, 2048); +#if DEBUG_GRAPH + printf("\n\nDEBUG Graph script here:\n"); + for (vector::size_type i = 0; i != graphScript_vector.size(); i++){ + printf("%d: %s\n", i, graphScript_vector[i].c_str()); + } +#endif + vector graphLine_vector; + vx_status status = 0; + + //user defined structs + vx_size currentStructSize = 0; + vx_enum currentStructEnum = 0; + + for (vector::size_type i = 0; i != graphScript_vector.size(); ++i){ + splittwo(graphScript_vector[i].c_str(), ' ', graphLine_vector); + if (graphLine_vector.size() > 0){ //crashes if graphLine_vector.size() == 0 + if (strcmp(graphLine_vector[0].c_str(), "import") == 0){ + if (graphLine_vector.size() != 2){ + printf("ERROR: SYNTAX, check line with \'import %s\'\n", graphLine_vector[1].c_str()); + return -1; + } + status |= vxLoadKernels(m_context, graphLine_vector[1].c_str()); + if (status) { + printf("ERROR: vxLoadKernels(context, %s) failed(%d)\n", graphLine_vector[1].c_str(), status); + return -1; + } + } + else if (strcmp(graphLine_vector[0].c_str(), "type") == 0){ + if (graphLine_vector.size() != 3){ + printf("ERROR: SYNTAX, check line with \'type %s\'\n", graphLine_vector[1].c_str()); + printf(" SYNTAX for register-user-struct creation is \'type [OBJECT NAME] [OBJECT TYPE]:[SIZE IN BYTES]'\n"); + return -1; + } + vector typeObject; + splittwo(graphLine_vector[2].c_str(), ':', typeObject); + if (strcmp(typeObject[0].c_str(), "userstruct") == 0){ + istringstream(typeObject[1]) >> currentStructSize; + currentStructEnum = vxRegisterUserStruct(m_context, currentStructSize); + if (currentStructEnum == VX_TYPE_INVALID){ + printf("ERROR: Could not create register-user-struct %s, returned VX_TYPE_INVALID. Exiting\n", graphLine_vector[1].c_str()); + } + m_userStructMap.insert(pair(graphLine_vector[1].c_str(), currentStructEnum)); + } + else{ + printf("ERROR: OBJECT TYPE of %s is not supported. 
Exiting.\n", typeObject[0].c_str()); + return -1; + } + } + else if (strcmp(graphLine_vector[0].c_str(), "data") == 0){ + if (graphLine_vector.size() != 4){ + printf("ERROR: SYNTAX, check line with \'data %s\'\n", graphLine_vector[1].c_str()); + return -1; + } + if ((strcmp(graphLine_vector[2].c_str(), "=") == 0)){ + std::string paramDesc = graphLine_vector[3]; + if (m_disableVirtual) { + RemoveVirtualKeywordFromParamDescription(paramDesc); + } + CVxParameter * obj = CreateDataObject(m_context, m_graph, &m_paramMap, &m_userStructMap, paramDesc.c_str(), m_captureFrameStart); + if (!obj) { + printf("ERROR: SYNTAX, \'%s\' is an INVALID object\n", graphLine_vector[3].c_str()); + return -1; + } + m_paramMap.insert(pair(graphLine_vector[1].c_str(), obj)); + vx_reference ref = m_paramMap[graphLine_vector[1].c_str()]->GetVxObject(); + vx_status status = VX_SUCCESS; + status = vxSetReferenceName(ref, graphLine_vector[1].c_str()); + if (status != VX_SUCCESS){ + printf("ERROR: vxSetReferenceName returned %d\n", status); + return -1; + } + } + else { + printf("ERROR: SYNTAX for data object creation is \'data [DATA OBJECT NAME] = [DATA OBJECT PARAMETERS]\'\n"); + return -1; + } + } + else if (strcmp(graphLine_vector[0].c_str(), "node") == 0){ + if (graphLine_vector.size() >= 3){ + //initialize kernel + char thisNode[2048]; + sprintf(thisNode, "%s", graphLine_vector[1].c_str()); + vx_kernel current_kernel = vxGetKernelByName(m_context, thisNode); + if (current_kernel == 0) { printf("ERROR: vxGetKernelByName(%s) failed\n", thisNode); throw -1; } + vx_node current_node = vxCreateGenericNode(m_graph, current_kernel); + if (current_node == 0) { printf("ERROR: vxCreateGenericNode(%s) failed\n", thisNode); throw -1; } + int paramNum = 0; + int skipParam = 0; + for (vector::size_type j = 2; j != graphLine_vector.size(); ++j){ + skipParam = 0; + if (!strncmp(graphLine_vector[j].c_str(), "!", 1)){ + //printf("DEBUG: ENUM, %s\n", graphLine_vector[j]); + string enum_name_s; + size_t length_of_enum = graphLine_vector[j].size() - 1; + char description[2048]; + char desc2[2048]; + + enum_name_s = graphLine_vector[j].substr(1, length_of_enum); +#if DEBUG_INFO + printf("DEBUG: enum_name = %s\n", enum_name_s.c_str()); +#endif + strcpy(description, "scalar:enum,%s"); + sprintf(desc2, description, enum_name_s.c_str()); + if (m_disableVirtual) { + std::string paramDesc = desc2; + RemoveVirtualKeywordFromParamDescription(paramDesc); + strcpy(desc2, paramDesc.c_str()); + } + CVxParameter * obj = CreateDataObject(m_context, m_graph, &m_paramMap, &m_userStructMap, desc2, m_captureFrameStart); + if (!obj) { + printf("ERROR: CreateDataObject(*,*,%s) failed\n", desc2); + return -1; + } + m_paramMap.insert(pair(graphLine_vector[j].c_str(), obj)); + } + else if (!strncmp(graphLine_vector[j].c_str(), "attr:BORDER_MODE:CONSTANT,", 26)){ + vx_border_mode_t this_border_mode = { 0 }; + vector attributeValue_vector; + splittwo(graphLine_vector[j].c_str(), ',', attributeValue_vector); + istringstream(attributeValue_vector[1].c_str()) >> this_border_mode.constant_value; + this_border_mode.mode = VX_BORDER_MODE_CONSTANT; + vxSetNodeAttribute(current_node, VX_NODE_ATTRIBUTE_BORDER_MODE, &this_border_mode, sizeof(this_border_mode)); + skipParam = 1; + } + else if (!strncmp(graphLine_vector[j].c_str(), "attr:BORDER_MODE:UNDEFINED", 26)){ + vx_border_mode_t this_border_mode = { 0 }; + vector attributeValue_vector; + splittwo(graphLine_vector[j].c_str(), ',', attributeValue_vector); + this_border_mode.mode = VX_BORDER_MODE_UNDEFINED; + 
vxSetNodeAttribute(current_node, VX_NODE_ATTRIBUTE_BORDER_MODE, &this_border_mode, sizeof(this_border_mode)); + skipParam = 1; + } + else if (!strncmp(graphLine_vector[j].c_str(), "attr:BORDER_MODE:REPLICATE", 26)){ + vx_border_mode_t this_border_mode = { 0 }; + vector attributeValue_vector; + splittwo(graphLine_vector[j].c_str(), ',', attributeValue_vector); + this_border_mode.mode = VX_BORDER_MODE_REPLICATE; + vxSetNodeAttribute(current_node, VX_NODE_ATTRIBUTE_BORDER_MODE, &this_border_mode, sizeof(this_border_mode)); + skipParam = 1; + } + else if (!strncmp(graphLine_vector[j].c_str(), "attr:affinity:CPU", 26)) { + } + else if (!strncmp(graphLine_vector[j].c_str(), "attr:affinity:GPU", 26)) { + } + if (skipParam == 0){ +#if DEBUG_INFO + printf("Node: %s, graphLine_vector[%d] = %s\n", thisNode, j, graphLine_vector[j].c_str()); +#endif + if (strcmp(graphLine_vector[j].c_str(), "null") != 0) { + if (m_paramMap.find(graphLine_vector[j].c_str()) == m_paramMap.end()) { + printf("ERROR: BuildGraph: parameter named %s is not found\n", graphLine_vector[j].c_str()); + return -1; + } + else { + status = vxSetParameterByIndex(current_node, paramNum, m_paramMap[graphLine_vector[j].c_str()]->GetVxObject()); + if (status) { + printf("ERROR: BuildGraph: vxSetParameterByIndex(...,%d,%s,...) failed (%d)\n", paramNum, graphLine_vector[j].c_str(), status); + return -1; + } + } + } + paramNum++; + } + } + memset(thisNode, 0, 2048); + } + else{ + printf("ERROR: SYNTAX for any node is \'node [NAME OF NODE] [NODE ARGUMENT(S)]\n"); + return -1; + } + } + else if (graphLine_vector[0] == "affinity") { + if (graphLine_vector.size() != 2 || (graphLine_vector[1] != "CPU" && graphLine_vector[1] != "GPU")) { + printf("ERROR: SYNTAX, check line with \'affinity %s\' (use CPU/GPU)\n", graphLine_vector[1].c_str()); + return -1; + } + vx_uint32 defaultTargetAffinity = (graphLine_vector[1] == "GPU") ? AGO_TARGET_AFFINITY_GPU : AGO_TARGET_AFFINITY_CPU; + AgoTargetAffinityInfo attr_affinity = { 0 }; + attr_affinity.device_type = defaultTargetAffinity; + status = vxSetGraphAttribute(m_graph, VX_GRAPH_ATTRIBUTE_AMD_AFFINITY, &attr_affinity, sizeof(attr_affinity)); + if (status) { + printf("ERROR: vxSetContextAttribute(%d) failed (%d)\n", defaultTargetAffinity, status); + throw - 1; + } + } + else { + printf("ERROR: SYNTAX, \'%s\' is not recognized as a keyword\n", graphLine_vector[0].c_str()); + return -1; + } + } + graphLine_vector.clear(); + } + graphScript_vector.clear(); + } + + // dump original graph (if requested) + if (useAgoDump) { + vx_reference ref[64] = { 0 }; + int num_ref = 0; + for (int i = 0; i < 64; i++) { + char name[16]; sprintf(name, "$%d", i + 1); + if (m_paramMap.find(name) == m_paramMap.end()) + break; + ref[num_ref++] = m_paramMap[name]->GetVxObject(); + } + AgoGraphExportInfo info = { 0 }; + strcpy(info.fileName, "stdout"); + info.ref = ref; + info.num_ref = num_ref; + strcpy(info.comment, "original"); + vx_status status = vxSetGraphAttribute(m_graph, VX_GRAPH_ATTRIBUTE_AMD_EXPORT_TO_TEXT, &info, sizeof(info)); + if (status != VX_SUCCESS) { + printf("ERROR: vxSetGraphAttribute(...,VX_GRAPH_ATTRIBUTE_AMD_EXPORT_TO_TEXT,...) failed (%d)\n", status); + return -1; + } + } + + // verify the graph + vx_status status; + if ((status = vxVerifyGraph(m_graph)) != VX_SUCCESS){ + printf("ERROR: vxVerifyGraph failed. 
status = %d\n", status); + return -1; + } + + // dump optimized graph (if requested) + if (useAgoDump) { + vx_reference ref[64] = { 0 }; + int num_ref = 0; + for (int i = 0; i < 64; i++) { + char name[16]; sprintf(name, "$%d", i + 1); + if (m_paramMap.find(name) == m_paramMap.end()) + break; + ref[num_ref++] = m_paramMap[name]->GetVxObject(); + } + AgoGraphExportInfo info = { 0 }; + strcpy(info.fileName, "stdout"); + info.ref = ref; + info.num_ref = num_ref; + strcpy(info.comment, "drama"); + vx_status status = vxSetGraphAttribute(m_graph, VX_GRAPH_ATTRIBUTE_AMD_EXPORT_TO_TEXT, &info, sizeof(info)); + if (status != VX_SUCCESS) { + printf("ERROR: vxSetGraphAttribute(...,VX_GRAPH_ATTRIBUTE_AMD_EXPORT_TO_TEXT,...) failed (%d)\n", status); + return -1; + } + fflush(stdout); + } + + // Finalize() on all objects in graph + for (auto it = m_paramMap.begin(); it != m_paramMap.end(); ++it){ + if (m_usingMultiFrameCapture == false){ + m_usingMultiFrameCapture = it->second->IsUsingMultiFrameCapture(); + } + it->second->Finalize(); + } + +#if DEBUG_INFO + for (auto it = m_paramMap.begin(); it != m_paramMap.end(); ++it){ + printf("Parameter name: %s\n", it->first); + } + fflush(stdout); +#endif + + return 0; +} + +int CVxEngine::ExecuteFrame(int frameNumber) +{ + if (!m_useProcessGraph) { + vx_status graph_status; + + if ((graph_status = vxScheduleGraph(m_graph)) != VX_SUCCESS){ + if (graph_status == VX_ERROR_GRAPH_ABANDONED) { + printf("WARNING: graph execution abandoned by a kernel or application\n"); + } + else printf("ERROR: vxScheduleGraph failed, status = %d\n", graph_status); + return graph_status; + } + + if ((graph_status = vxWaitGraph(m_graph)) != VX_SUCCESS) { + if (graph_status == VX_ERROR_GRAPH_ABANDONED) { + printf("WARNING: graph execution abandoned by a kernel or application\n"); + } + else printf("ERROR: vxWaitGraph failed.\n"); + return graph_status; + } + } + else { + vx_status status = vxProcessGraph(m_graph); + if (status) { + if (status == VX_ERROR_GRAPH_ABANDONED) { + printf("WARNING: graph execution abandoned by a kernel or application\n"); + } + else printf("ERROR: vxProcessGraph() => %d\n", status); + return status; + } + } + + return 0; +} + +int CVxEngine::ReadFrame(int frameNumber) +{ + for (auto it = m_paramMap.begin(); it != m_paramMap.end(); ++it){ + int status = it->second->ReadFrame(frameNumber); + if (status) + return status; + } + return 0; +} + +int CVxEngine::WriteFrame(int frameNumber) +{ + + for (auto it = m_paramMap.begin(); it != m_paramMap.end(); ++it){ + int status = it->second->WriteFrame(frameNumber); + if (status) + return status; + } + return 0; +} + +int CVxEngine::CompareFrame(int frameNumber) +{ + for (auto it = m_paramMap.begin(); it != m_paramMap.end(); ++it){ + int status = it->second->CompareFrame(frameNumber); + if (status) + return status; + } + return 0; +} + +void CVxEngine::SetVerbose(bool verbose) +{ + m_verbose = verbose; + if (verbose) { + for (auto it = m_paramMap.begin(); it != m_paramMap.end(); ++it){ + it->second->SetVerbose(verbose); + } + } +} + +void CVxEngine::SetAbortOnMismatch(bool abortOnMismatch) +{ + for (auto it = m_paramMap.begin(); it != m_paramMap.end(); ++it){ + it->second->SetAbortOnMismatch(abortOnMismatch); + } +} + +const char * CVxEngine::MeasureFrame(int frameNumber) +{ + vx_perf_t perf = { 0 }; + float current_time = 0.0; + vx_status status = vxQueryGraph(m_graph, VX_GRAPH_ATTRIBUTE_PERFORMANCE, &perf, sizeof(perf)); + if (status) { + printf("ERROR: vxQueryGraph(*,VX_GRAPH_ATTRIBUTE_PERFORMANCE,...) 
failed (%d)\n", status); + throw -1; + } + static char text[256]; + sprintf(text, "%6d,%6.2f,%6.2f,%6.2f", (int)perf.num, + (float)perf.tmp * m_perfMultiplicationFactor, + (float)perf.avg * m_perfMultiplicationFactor, + (float)perf.min * m_perfMultiplicationFactor); + + current_time = (float)perf.tmp * m_perfMultiplicationFactor; + m_times_vector.push_back(current_time); + + AgoGraphPerfInternalInfo iperf = { 0 }; + status = vxQueryGraph(m_graph, VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_LAST, &iperf, sizeof(iperf)); + if (status) { printf("ERROR: vxQueryGraph(*,VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_LAST,...) failed (%d)\n", status); throw - 1; } + sprintf(text + strlen(text), ",%6.2f,%6.2f,%6.2f,%6.2f", + (float)iperf.kernel_enqueue * m_perfMultiplicationFactor, + (float)iperf.kernel_wait * m_perfMultiplicationFactor, + (float)iperf.buffer_write * m_perfMultiplicationFactor, + (float)iperf.buffer_read * m_perfMultiplicationFactor); + + return text; +} + + +void CVxEngine::GetMedianRunTime() +{ + if (m_times_vector.size() > 2){ + sort(m_times_vector.begin(), m_times_vector.end()); + int mod_2 = m_times_vector.size() % 2; + float median = 0.0; + double middle_index = floor(m_times_vector.size() / 2); + int middle_index_i = (int)middle_index; + if (mod_2 == 0){ + + median = (m_times_vector[middle_index_i] + m_times_vector[middle_index_i - 1]) / 2; + } + else { + median = m_times_vector[(int)middle_index_i]; + } + printf("Median = %.3f\n", median); + } +} + + +const char * CVxEngine::GetPerformanceStatistics() +{ + vx_perf_t perf = { 0 }; + vx_status status = vxQueryGraph(m_graph, VX_GRAPH_ATTRIBUTE_PERFORMANCE, &perf, sizeof(perf)); + if (status) { + printf("ERROR: vxQueryGraph(*,VX_GRAPH_ATTRIBUTE_PERFORMANCE,...) failed (%d)\n", status); + throw -1; + } + static char text[256]; + sprintf(text, "%6d, ,%6.2f,%6.2f", (int)perf.num, + (float)perf.avg * m_perfMultiplicationFactor, + (float)perf.min * m_perfMultiplicationFactor); + + AgoGraphPerfInternalInfo iperf = { 0 }; + status = vxQueryGraph(m_graph, VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_AVG, &iperf, sizeof(iperf)); + if (status) { printf("ERROR: vxQueryGraph(*,VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_AVG,...) failed (%d)\n", status); throw - 1; } + sprintf(text + strlen(text), ",%6.2f,%6.2f,%6.2f,%6.2f", + (float)iperf.kernel_enqueue * m_perfMultiplicationFactor, + (float)iperf.kernel_wait * m_perfMultiplicationFactor, + (float)iperf.buffer_write * m_perfMultiplicationFactor, + (float)iperf.buffer_read * m_perfMultiplicationFactor); + + return text; +} + +vx_status CVxEngine::DumpInternalProfile(char * fileName) +{ + return vxQueryGraph(m_graph, VX_GRAPH_ATTRIBUTE_AMD_PERFORMANCE_INTERNAL_PROFILE, fileName, 0); +} + +int CVxEngine::Shutdown() +{ + for (auto it = m_paramMap.begin(); it != m_paramMap.end(); ++it){ + if (it->second){ + delete it->second; + } + } + m_paramMap.clear(); + if (m_graph){ + vxReleaseGraph(&m_graph); + m_graph = nullptr; + } + + + if (m_context) { + vxReleaseContext(&m_context); + m_context = nullptr; + } + return 0; +} + +void CVxEngine::DisableWaitForKeyPress() +{ + for (auto it = m_paramMap.begin(); it != m_paramMap.end(); ++it){ + if (it->second){ + it->second->DisableWaitForKeyPress(); + } + } +} + +bool CVxEngine::IsUsingMultiFrameCapture(){ + return m_usingMultiFrameCapture; +} diff --git a/runvx/vxEngine.h b/runvx/vxEngine.h new file mode 100644 index 0000000..22addc9 --- /dev/null +++ b/runvx/vxEngine.h @@ -0,0 +1,91 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. 
All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef CVX_ENGINE_H +#define CVX_ENGINE_H + +#include "vxParameter.h" + +class CVxEngine { + +public: + CVxEngine(); + virtual ~CVxEngine(); + // functions to setup + // returns 0 on SUCCESS, else error code + int Initialize(int paramCount, int defaultTargetAffinity, int defaultTargetInfo, bool useProcessGraph, bool disableVirtual); + int SetGraphOptimizerFlags(vx_uint32 graph_optimizer_flags); + vx_context getContext(); + int SetParameter(int index, const char * param); + void viewParameters(); + void GetMedianRunTime(); + int BuildGraph(char * graphScript, bool useAgoImport, bool useAgoDump); + // functions for execution + // returns 0 on SUCCESS, else error code + // ReadFrame() returns +ve value to indicate data unavailability + int ReadFrame(int frameNumber); + int ExecuteFrame(int frameNumber); + int WriteFrame(int frameNumber); + int CompareFrame(int frameNumber); + const char * MeasureFrame(int frameNumber); + // functions for profiling + vx_status DumpInternalProfile(char * fileName); + const char * GetPerformanceStatistics(); + // functions for termination + int Shutdown(); + void DisableWaitForKeyPress(); + // save data object and parameters + int SetImportedData(vx_reference ref, const char * name, const char * params); + bool IsUsingMultiFrameCapture(); + void SetCaptureFrameStart(vx_uint32 frameStart) { m_captureFrameStart = frameStart; } + void SetVerbose(bool verbose); + void SetAbortOnMismatch(bool abortOnMismatch); + +private: + // implementation specific functions + // ExecuteVXU - run utility that correspond to m_vxuName - called by ExecuteFrame + int ExecuteVXU(); + +private: + // implementation specific data + // m_paramMap - holds names and pointers to all data objects + // m_paramCount - number of data objects on command-line + // m_vxuName - contains vxuName when BuildUtility() is called, otherwise nullptr + map m_paramMap; + map m_userStructMap; + int m_paramCount; + const char * m_vxuName; + vx_context m_context; + vx_graph m_graph; + float m_perfMultiplicationFactor; // to convert vx_perf_t to milli-seconds + bool m_useProcessGraph; + vector m_times_vector; + // to support multi-frame capture + bool m_usingMultiFrameCapture; + vx_uint32 m_captureFrameStart; + // disable virtual objects + bool m_disableVirtual; + // verbose flag + bool m_verbose; +}; +#endif /* CVX_ENGINE_H*/ \ No newline at end of file diff --git a/runvx/vxEngineUtil.cpp b/runvx/vxEngineUtil.cpp new file mode 100644 index 0000000..10f30ed --- 
/dev/null +++ b/runvx/vxEngineUtil.cpp @@ -0,0 +1,30 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include +#include "vxEngineUtil.h" + +int mainVXU(int argc, char * argv[]){ + printf("mainVXU is not yet implemented\n"); + return 0; +} diff --git a/runvx/vxEngineUtil.h b/runvx/vxEngineUtil.h new file mode 100644 index 0000000..846dbe5 --- /dev/null +++ b/runvx/vxEngineUtil.h @@ -0,0 +1,30 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef MAIN_VXU_H +#define MAIN_VXU_H + +int mainVXU(int argc, char * argv[]); + + +#endif /* MAIN_VXU_H */ \ No newline at end of file diff --git a/runvx/vxImage.cpp b/runvx/vxImage.cpp new file mode 100644 index 0000000..eda9268 --- /dev/null +++ b/runvx/vxImage.cpp @@ -0,0 +1,908 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxImage.h" + +/////////////////////////////////////////////////////////////////////// +// class CVxParamImage +// +CVxParamImage::CVxParamImage() +{ + // vx configuration + m_vxObjType = VX_TYPE_IMAGE; + m_format = VX_DF_IMAGE_U8; + m_width = 0; + m_height = 0; + m_planes = 0; + + // I/O configuration + m_frameSize = 0; + m_bufForCompare = nullptr; + m_displayName = ""; + m_repeatFrames = 0; + m_countFrames = 0; + m_useCheckSumForCompare = false; + m_generateCheckSumForCompare = false; + m_useSyncOpenCLWriteDirective = false; + m_usingDisplay = false; + m_usingWriter = false; + +#if USE_OPENCV + m_cvCapDev = NULL; + m_cvCapMat = NULL; + m_cvDispMat = NULL; + m_cvImage = NULL; + m_cvWriter = NULL; +#endif + m_cameraName[0] = 0; + m_comparePixelErrorMin = 0; + m_comparePixelErrorMax = 0; + m_compareCountMatches = 0; + m_compareCountMismatches = 0; + + // vx object + m_image = nullptr; + m_vxObjRef = nullptr; + m_disableWaitForKeyPress = false; + + // reset video capture + m_gotCaptureVideoSize = false; + m_doNotResizeCapturedImages = false; + m_captureWidth = 0; + m_captureHeight = 0; +} + +CVxParamImage::~CVxParamImage() +{ + Shutdown(); +} + +int CVxParamImage::Shutdown(void) +{ + if (m_compareCountMatches > 0 && m_compareCountMismatches == 0) { + printf("OK: image %s MATCHED for %d frame(s) of %s\n", m_useCheckSumForCompare ? 
"CHECKSUM" : "COMPARE", m_compareCountMatches, GetVxObjectName()); + } + if (m_image) { + vxReleaseImage(&m_image); + m_image = nullptr; + } + if (m_bufForCompare) { + delete[] m_bufForCompare; + m_bufForCompare = nullptr; + } +#if USE_OPENCV + bool changed_numCvUse = false; + if (m_cvDispMat) { + if (m_usingDisplay) { + g_numCvUse--; + changed_numCvUse = true; + } + delete (Mat *)m_cvDispMat; + m_cvDispMat = NULL; + } + if (m_cvCapMat) { + g_numCvUse--; + changed_numCvUse = true; + delete (Mat *)m_cvCapMat; + m_cvCapMat = NULL; + } + if (m_cvCapDev) { + delete (VideoCapture *)m_cvCapDev; + m_cvCapDev = NULL; + } + if (m_cvImage){ + g_numCvUse--; + changed_numCvUse = true; + delete (Mat *)m_cvImage; + m_cvImage = NULL; + } + if (m_cvWriter) { + delete (VideoWriter *)m_cvWriter; + m_cvWriter = NULL; + } + if (changed_numCvUse && g_numCvUse == 0) { + if (!m_disableWaitForKeyPress) { + printf("Abort: Press any key to exit...\n"); fflush(stdout); + waitKey(0); + } + } +#endif + + return 0; +} + +void CVxParamImage::DisableWaitForKeyPress() +{ + m_disableWaitForKeyPress = true; +} + +int CVxParamImage::Initialize(vx_context context, vx_graph graph, const char * desc) +{ + // get object parameters and create object + char objType[64]; + const char * ioParams = ScanParameters(desc, "image|image-virtual|image-uniform|image-roi:", "s:", objType); + if (!_stricmp(objType, "image") || !_stricmp(objType, "image-virtual") || !_stricmp(objType, "image-uniform")) { + // syntax: image[-virtual]:,,[:] + ioParams = ScanParameters(ioParams, ",,", "d,d,c", &m_width, &m_height, &m_format); + if (!_stricmp(objType, "image-uniform")) { + ioParams = ScanParameters(ioParams, "", ",D", &m_uniformValue); + m_image = vxCreateUniformImage(context, m_width, m_height, m_format, &m_uniformValue); + } + else if (!_stricmp(objType, "image-virtual")) { + m_image = vxCreateVirtualImage(graph, m_width, m_height, m_format); + } + else { + m_image = vxCreateImage(context, m_width, m_height, m_format); + } + } + else if (!_stricmp(objType, "image-roi")) { + // syntax: image-roi:,rect{;;;}[:] + char roi[64]; + ioParams = ScanParameters(ioParams, ",rect{;;;}", "s,s", m_roiMasterName, roi); + if (_strnicmp(roi, "rect{", 5) != 0) + ReportError("ERROR: invalid image-roi syntax: %s\n", desc); + ScanParameters(&roi[4], "{;;;}", "{d;d;d;d}", &m_roiRegion.start_x, &m_roiRegion.start_y, &m_roiRegion.end_x, &m_roiRegion.end_y); + auto it = m_paramMap->find(m_roiMasterName); + if (it == m_paramMap->end()) + ReportError("ERROR: image [%s] doesn't exist for %s\n", m_roiMasterName, desc); + vx_image masterImage = (vx_image)it->second->GetVxObject(); + m_image = vxCreateImageFromROI(masterImage, &m_roiRegion); + } + else ReportError("ERROR: unsupported image type: %s\n", desc); + vx_status ovxStatus = vxGetStatus((vx_reference)m_image); + if (ovxStatus != VX_SUCCESS){ + printf("ERROR: image creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + if (m_image) vxReleaseImage(&m_image); + throw - 1; + } + m_vxObjRef = (vx_reference)m_image; + + // io initialize + return InitializeIO(context, graph, m_vxObjRef, ioParams); +} + +int CVxParamImage::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) +{ + // save reference object and get object attributes + m_vxObjRef = ref; + m_image = (vx_image)m_vxObjRef; + ERROR_CHECK(vxQueryImage(m_image, VX_IMAGE_ATTRIBUTE_WIDTH, &m_width, sizeof(m_width))); + ERROR_CHECK(vxQueryImage(m_image, VX_IMAGE_ATTRIBUTE_HEIGHT, &m_height, sizeof(m_height))); + 
ERROR_CHECK(vxQueryImage(m_image, VX_IMAGE_ATTRIBUTE_FORMAT, &m_format, sizeof(m_format))); + ERROR_CHECK(vxQueryImage(m_image, VX_IMAGE_ATTRIBUTE_PLANES, &m_planes, sizeof(m_planes))); + + // initialize compare region to complete image + m_rectCompare.start_x = 0; + m_rectCompare.start_y = 0; + m_rectCompare.end_x = m_width; + m_rectCompare.end_y = m_height; + + // reset capture video size + m_gotCaptureVideoSize = false; + m_captureWidth = 0; + m_captureHeight = 0; + + // process I/O requests + m_doNotResizeCapturedImages = false; + m_repeatFrames = 0; + if (*io_params == ':') io_params++; + while (*io_params) { + char ioType[64], fileName[256]; + io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName); + // get file extension position in fileName + int extpos = (int)strlen(fileName) - 1; + while (extpos > 0 && fileName[extpos] != '.') + extpos--; + if (!_stricmp(ioType, "read") || !_stricmp(ioType, "camera")) + { // read request syntax: read,[,frames{[;;repeat]}|no-resize] or camera, + int cameraDevice = -1; + if (!_stricmp(ioType, "camera")) + cameraDevice = atoi(fileName); + // get optional repeat frame count and starting frame + m_repeatFrames = 0; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",frames{[;;repeat]}|no-resize", ",s", option); + if (!_strnicmp(option, "frames{", 7)) { + int startFrame = 0, count = 0; char repeat[64] = { 0 }; + if (sscanf(&option[7], "%d;%d;%s", &startFrame, &count, repeat) >= 1) { + repeat[6] = 0; // truncate since scanf will read all characters till the end of string into repeat + m_captureFrameStart = startFrame; + if (!_stricmp(repeat, "repeat") && (count > 0)) + m_repeatFrames = count; + } + else ReportError("ERROR: invalid image read/camera option: %s\n", option); + } + else if (!_stricmp(option, "no-resize")) { + m_doNotResizeCapturedImages = true; + } + else ReportError("ERROR: invalid image read/camera option: %s\n", option); + } + // check if openCV video capture need to be used + if (!_stricmp(&fileName[extpos], ".mp4") || !_stricmp(&fileName[extpos], ".avi") || + !_stricmp(&fileName[extpos], ".jpg") || !_stricmp(&fileName[extpos], ".jpeg") || + !_stricmp(&fileName[extpos], ".jpe") || !_stricmp(&fileName[extpos], ".png") || + !_stricmp(&fileName[extpos], ".bmp") || !_stricmp(&fileName[extpos], ".tif") || + !_stricmp(&fileName[extpos], ".ppm") || !_stricmp(&fileName[extpos], ".tiff") || + !_stricmp(&fileName[extpos], ".pgm") || !_stricmp(&fileName[extpos], ".pbm") || + !_strnicmp(fileName, "file://", 7) || !_strnicmp(fileName, "http://", 7) || !_strnicmp(fileName, "https://", 8) || + cameraDevice >= 0) + { // need OpenCV to process these read I/O requests //////////////////// +#if USE_OPENCV + if (m_format == VX_DF_IMAGE_RGB) { + // pen video capture device and mark multi-frame capture + m_usingMultiFrameCapture = true; + VideoCapture * pCap = nullptr; + if (cameraDevice >= 0) { + pCap = new VideoCapture(cameraDevice); + } + else { + pCap = new VideoCapture(fileName); + // if single .jpg are is specified, mark as single-frame capture + if (strstr(fileName, "%") == NULL && !_stricmp(&fileName[strlen(fileName) - 4], ".jpg")) { + m_usingMultiFrameCapture = false; + } + } + m_cvCapDev = pCap; + if (!pCap->isOpened()) { + printf("ERROR: OpenCV device capture(%s) failed\n", fileName); + throw - 1; + } +#if 0 // TBD: disabled the check to avoid errors with video files + if (pCap->get(CV_CAP_PROP_FRAME_WIDTH) != m_width || pCap->get(CV_CAP_PROP_FRAME_HEIGHT) != m_height) { + printf("ERROR: OpenCV 
capture(%s) device is %dx%d whereas requested image is %dx%d\n", fileName, pCap->get(CV_CAP_PROP_FRAME_WIDTH), pCap->get(CV_CAP_PROP_FRAME_HEIGHT), m_width, m_height); + throw - 1; + } +#endif + int cvMatType = CV_8UC3; + m_cvCapMat = new Mat(m_width, m_height, cvMatType); + strcpy(m_cameraName, fileName); + g_numCvUse++; + // skip frames if requested + if (m_captureFrameStart > 0) { + printf("OK: skipping %d frames from %s\n", m_captureFrameStart, fileName); fflush(stdout); + for (vx_uint32 i = 0; i < m_captureFrameStart; i++) { + *(VideoCapture *)m_cvCapDev >> *(Mat *)m_cvCapMat; + } + } + } +#else + printf("ERROR: This build doesn't support CAMERA option\n"); + throw - 1; +#endif + } + else + { // raw frames reading ///////////////////////// + m_fileNameRead.assign(RootDirUpdated(fileName)); + m_fileNameForReadHasIndex = (m_fileNameRead.find("%") != m_fileNameRead.npos) ? true : false; + // mark multi-frame capture enabled + m_usingMultiFrameCapture = true; + } + } + else if (!_stricmp(ioType, "view") || !_stricmp(ioType, "write")) + { // write or view request syntax: write, OR view, + bool needDisplay = false; + if (!_stricmp(ioType, "view") || !_stricmp(&fileName[extpos], ".mp4") || !_stricmp(&fileName[extpos], ".avi") || !_stricmp(&fileName[extpos], ".jpg")) + { // need OpenCV to process these write I/O requests //////////////////// +#if USE_OPENCV + if (!_stricmp(ioType, "view")) { + m_usingDisplay = true; + m_displayName.assign(fileName); + namedWindow(m_displayName, WINDOW_AUTOSIZE); + g_numCvUse++; + } + else { + m_fileNameWrite.assign(RootDirUpdated(fileName)); + VideoWriter * writer = new VideoWriter(m_fileNameWrite.c_str(), -1, 30, Size(m_width, m_height)); + m_cvWriter = (void *)writer; + m_usingWriter = true; + } + // create Mat object + int cvMatType = CV_8UC1; + if (m_format == VX_DF_IMAGE_U8 || m_format == VX_DF_IMAGE_U1_AMD) cvMatType = CV_8UC1; + else if (m_format == VX_DF_IMAGE_S16) cvMatType = CV_16UC1; // CV_16SC1 is not supported + else if (m_format == VX_DF_IMAGE_U16) cvMatType = CV_16UC1; + else if (m_format == VX_DF_IMAGE_RGB) cvMatType = CV_8UC3; + else if (m_format == VX_DF_IMAGE_RGBX) cvMatType = CV_8UC4; + else if (m_format == VX_DF_IMAGE_F32_AMD) cvMatType = CV_32FC1; + else { + printf("ERROR: display of image type (%4.4s) is not support. Exiting.\n", (const char *)&m_format); + throw - 1; + } + m_cvDispMat = new Mat(m_height, m_width, cvMatType); +#else + printf("ERROR: this feature requires OpenCV missing in this build\n"); + throw - 1; +#endif + } + else { + m_fileNameWrite.assign(RootDirUpdated(fileName)); + m_fileNameForWriteHasIndex = (m_fileNameWrite.find("%") != m_fileNameWrite.npos) ? true : false; + } + } + else if (!_stricmp(ioType, "compare")) + { // compare syntax: compare,fileName[,rect{;;;}][,err{;}][,checksum|checksum-save-instead-of-test] + // save the reference image fileName + m_fileNameCompare.assign(RootDirUpdated(fileName)); + m_fileNameForCompareHasIndex = (m_fileNameCompare.find("%") != m_fileNameCompare.npos) ? 
true : false; + // initialize pixel error range to exact match + m_comparePixelErrorMin = 0; + m_comparePixelErrorMax = 0; + // set the compare region + m_rectCompare.start_x = 0; + m_rectCompare.start_y = 0; + m_rectCompare.end_x = m_width; + m_rectCompare.end_y = m_height; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",rect{;;;}|err{;}|checksum|checksum-save-instead-of-test", ",s", option); + if (!_strnicmp(option, "rect", 4)) { + ScanParameters(option + 4, "{;;;}", "{d;d;d;d}", &m_rectCompare.start_x, &m_rectCompare.start_y, &m_rectCompare.end_x, &m_rectCompare.end_y); + } + else if (!_strnicmp(option, "err", 3)) { + ScanParameters(option + 3, "{;}", "{f;f}", &m_comparePixelErrorMin, &m_comparePixelErrorMax); + if (m_useCheckSumForCompare) ReportError("ERROR: can't support error range with checksum\n"); + } + else if (!_stricmp(option, "checksum")) { + m_useCheckSumForCompare = true; + if (m_comparePixelErrorMin != m_comparePixelErrorMax) ReportError("ERROR: can't support error range with checksum\n"); + } + else if (!_stricmp(option, "checksum-save-instead-of-test")) { + m_generateCheckSumForCompare = true; + } + else ReportError("ERROR: invalid image compare option: %s\n", option); + } + } + else if (!_stricmp(ioType, "directive") && !_stricmp(fileName, "sync-cl-write")) { + m_useSyncOpenCLWriteDirective = true; + } + else ReportError("ERROR: invalid image operation: %s\n", ioType); + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character sequence in parameter specification: %s\n", io_params); + } + + return 0; +} + +int CVxParamImage::Finalize() +{ + // get object attributes + ERROR_CHECK(vxQueryImage(m_image, VX_IMAGE_ATTRIBUTE_WIDTH, &m_width, sizeof(m_width))); + ERROR_CHECK(vxQueryImage(m_image, VX_IMAGE_ATTRIBUTE_HEIGHT, &m_height, sizeof(m_height))); + ERROR_CHECK(vxQueryImage(m_image, VX_IMAGE_ATTRIBUTE_FORMAT, &m_format, sizeof(m_format))); + ERROR_CHECK(vxQueryImage(m_image, VX_IMAGE_ATTRIBUTE_PLANES, &m_planes, sizeof(m_planes))); + + // set m_rectFull to full image region + m_rectFull.start_x = 0; + m_rectFull.start_y = 0; + m_rectFull.end_x = m_width; + m_rectFull.end_y = m_height; + + // initialize other parameters + m_compareCountMatches = 0; + m_compareCountMismatches = 0; + + // compute frame size in bytes + m_frameSize = 0; + for (vx_uint32 plane = 0; plane < (vx_uint32)m_planes; plane++) { + vx_rectangle_t rect = { 0, 0, m_width, m_height }; + vx_imagepatch_addressing_t addr = { 0 }; + vx_uint8 * dst = NULL; + if (vxAccessImagePatch(m_image, &m_rectFull, plane, &addr, (void **)&dst, VX_READ_ONLY) == VX_SUCCESS) { + vx_size width = (addr.dim_x * addr.scale_x) / VX_SCALE_UNITY; + vx_size height = (addr.dim_y * addr.scale_y) / VX_SCALE_UNITY; + vx_size width_in_bytes = (m_format == VX_DF_IMAGE_U1_AMD) ? 
((width + 7) >> 3) : (width * addr.stride_x); + m_frameSize += width_in_bytes * height; + ERROR_CHECK(vxCommitImagePatch(m_image, &m_rectFull, plane, &addr, (void *)dst)); + } + } + + return 0; +} + +int CVxParamImage::ReadFrame(int frameNumber) +{ +#if USE_OPENCV + if (m_cvCapMat && m_cvCapDev) { + // read image from camera + VideoCapture * pCap = (VideoCapture *)m_cvCapDev; + Mat * pMat = (Mat *)m_cvCapMat; + int timeout = 0; + *pCap >> *pMat; + if (!pMat->data){ + // no data available, report that no more frames available + return 1; + } + else if (!m_gotCaptureVideoSize) { + m_captureWidth = pMat->cols; + m_captureHeight = pMat->rows; + m_gotCaptureVideoSize = true; + bool doResize = !m_doNotResizeCapturedImages && (pMat->cols != m_width || pMat->rows != m_height); + printf("OK: capturing %dx%d image(s) into %dx%d RGB image buffer%s\n", m_captureWidth, m_captureHeight, m_width, m_height, doResize ? " with resize" : ""); + } + + // resize image using bicubic interpolation, if needed + bool doResize = !m_doNotResizeCapturedImages && (pMat->cols != m_width || pMat->rows != m_height); + if (doResize) { + // resize the captured video to specifed buffer size + resize(*pMat, *pMat, Size(m_width, m_height), 0, 0, INTER_CUBIC); + } + + // copy Mat into image + // NOTE: currently only supports U8, S16, RGB, RGBX image formats + if (m_format == VX_DF_IMAGE_U8 || m_format == VX_DF_IMAGE_S16 || m_format == VX_DF_IMAGE_RGB || m_format == VX_DF_IMAGE_RGBX) { + vx_rectangle_t rect = { 0, 0, min(m_width, (vx_uint32)pMat->cols), min(m_height, (vx_uint32)pMat->rows) }; + vx_imagepatch_addressing_t addr = { 0 }; + vx_uint8 * dst = NULL; + ERROR_CHECK(vxAccessImagePatch(m_image, &rect, 0, &addr, (void **)&dst, VX_WRITE_ONLY)); + vx_int32 rowSize = ((vx_int32)pMat->step < addr.stride_y) ? (vx_int32)pMat->step : addr.stride_y; + for (vx_uint32 y = 0; y < rect.end_y; y++) { + memcpy(dst + y * addr.stride_y, pMat->data + y * pMat->step, rowSize); + } + ERROR_CHECK(vxCommitImagePatch(m_image, &rect, 0, &addr, dst)); + } + } + else if (m_cvImage){ + // read image from camera + VideoCapture * pCap = (VideoCapture *)m_cvCapDev; + Mat * pMat = (Mat *)m_cvImage; + int timeout = 0; + *pCap >> *pMat; + if (!pMat->data){ + printf("ERROR: Can't read camera input. Camera is not supported.\n"); + return -1; + } + + vx_imagepatch_addressing_t addr = { 0 }; + vx_uint8 * dst = NULL; + ERROR_CHECK(vxAccessImagePatch(m_image, &m_rectFull, 0, &addr, (void **)&dst, VX_WRITE_ONLY)); + vx_int32 rowSize = ((vx_int32)pMat->step < addr.stride_y) ? 
(vx_int32)pMat->step : addr.stride_y; + for (vx_uint32 y = 0; y < m_height; y++) { + memcpy(dst + y * addr.stride_y, pMat->data + y * pMat->step, rowSize); + } + ERROR_CHECK(vxCommitImagePatch(m_image, &m_rectFull, 0, &addr, dst)); + } +#endif + + // make sure that input file is open when OpenCV camera is not active and input filename is specified +#if USE_OPENCV + if (!m_cvImage) +#endif + if (!m_fpRead) { + if (m_fileNameRead.length() > 0) { + char fileName[MAX_FILE_NAME_LENGTH]; + sprintf(fileName, m_fileNameRead.c_str(), frameNumber, m_width, m_height); + m_fpRead = fopen(fileName, "rb"); if (!m_fpRead) ReportError("ERROR: unable to open: %s\n", fileName); + if (!m_fileNameForReadHasIndex && m_captureFrameStart > 0) { + // skip to specified frame when starting frame is specified + fseek(m_fpRead, m_captureFrameStart*(long)m_frameSize, SEEK_SET); + } + } + } + + if (m_fpRead) { + // update m_countFrames to be able to repeat after every m_repeatFrames + if (m_repeatFrames != 0) { + if (m_countFrames == m_repeatFrames) { + // seek back to beginning after every m_repeatFrames frames + fseek(m_fpRead, m_captureFrameStart*(long)m_frameSize, SEEK_SET); + m_countFrames = 0; + } + else { + m_countFrames++; + } + } + + // read all image planes into vx_image and check if EOF has occured while reading + bool eofDetected = ReadImage(m_image, &m_rectFull, m_fpRead) ? true : false; + + // close file if file names has indices (i.e., only one frame per file requested) + if (m_fileNameForReadHasIndex) { + fclose(m_fpRead); + m_fpRead = nullptr; + } + + if (eofDetected) { + // report the caller that end of file has been detected -- no frames available in input + return 1; + } + } + + // process user requested directives + if (m_useSyncOpenCLWriteDirective) { + ERROR_CHECK(vxDirective((vx_reference)m_image, VX_DIRECTIVE_AMD_COPY_TO_OPENCL)); + } + + return 0; +} + +#if USE_OPENCV +int CVxParamImage::ViewFrame(int frameNumber) +{ + if (m_cvDispMat) { + // NOTE: supports only U8, S16, RGB, RGBX, F32 formats + if (m_format == VX_DF_IMAGE_U8 || m_format == VX_DF_IMAGE_S16 || m_format == VX_DF_IMAGE_RGB || m_format == VX_DF_IMAGE_RGBX || m_format == VX_DF_IMAGE_F32_AMD || m_format == VX_DF_IMAGE_U1_AMD) { + // copy image into Mat + Mat * pMat = (Mat *)m_cvDispMat; + vx_imagepatch_addressing_t addr = { 0 }; + vx_uint8 * src = NULL; + ERROR_CHECK(vxAccessImagePatch(m_image, &m_rectFull, 0, &addr, (void **)&src, VX_READ_ONLY)); + if (m_format == VX_DF_IMAGE_U1_AMD) { + for (vx_uint32 y = 0; y < m_height; y++) { + vx_uint8 * pDst = (vx_uint8 *)pMat->data + y * pMat->step; + vx_uint8 * pSrc = (vx_uint8 *)src + y * addr.stride_y; + for (vx_uint32 x = 0; x < m_width; x++) { + pDst[x] = (pSrc[x >> 3] & (1 << (x & 3))) ? 255u : 0; + } + } + } + else { + vx_int32 rowSize = ((vx_int32)pMat->step < addr.stride_y) ? 
(vx_int32)pMat->step : addr.stride_y; + for (vx_uint32 y = 0; y < m_height; y++) { + memcpy(pMat->data + y * pMat->step, src + y * addr.stride_y, rowSize); + } + } + ERROR_CHECK(vxCommitImagePatch(m_image, &m_rectFull, 0, &addr, src)); + // convert grayscale Mat pMat to RGB Mat convertedToRGB: + // this is done in order to be able to plot keypoints with different colors + Mat convertedToRGB(pMat->rows, pMat->cols, CV_8UC3, Scalar(0, 0, 255)); + Mat *pOutputImage = pMat; + if (pMat->type() == CV_8UC1) { // TBD: need to support S16 images here + cvtColor(*pMat, convertedToRGB, CV_GRAY2RGB); + pOutputImage = &convertedToRGB; + } + + // color table for key-points + static int colorTable[][3] = { { 0, 255, 0 }, { 255, 0, 0 }, { 0, 255, 255 }, { 51, 51, 255 }, { 0, 0, 102 }, { 255, 255, 255 } }; + int colorIndex = 0; + + // list of golbal list + std::vector kpList; + // process objects with same window name as the image + int overlayOffsetX = 10, overlayOffsetY = 10; + for (auto it = m_paramList.begin(); it != m_paramList.end(); it++) + { + if (!m_displayName.compare((*it)->getDisplayName())) + { // name of the window matched + if ((*it)->GetVxObjectType() == VX_TYPE_ARRAY) + { // view the array data (processed in two steps) //////////////////////////// + // get array and itemtype and numitems + vx_array arr = (vx_array)(*it)->GetVxObject(); + vx_enum itemtype = VX_TYPE_INVALID; + vx_size arrayNumItems = 0; + ERROR_CHECK(vxQueryArray(arr, VX_ARRAY_ATTRIBUTE_ITEMTYPE, &itemtype, sizeof(itemtype))); + ERROR_CHECK(vxQueryArray(arr, VX_ARRAY_ATTRIBUTE_NUMITEMS, &arrayNumItems, sizeof(arrayNumItems))); + if (itemtype != VX_TYPE_KEYPOINT && itemtype != VX_TYPE_RECTANGLE && itemtype != VX_TYPE_COORDINATES2D) + ReportError("ERROR: doesn't support viewing of specified array type\n"); + // add data items to the global kpList + CVxParameter * paramArray = *it; + if (paramArray->GetArrayListForViewCount() > 0) + { // use data items from the shared list for view, if available + size_t count = paramArray->GetArrayListForViewCount(); + int colorIndexMax = colorIndex; + for (auto index = 0; index < count; index++) { + ArrayItemForView kpItem = *paramArray->GetArrayListForViewItemAt(index); + // update kpItem.colorIndex and colorIndexMax + int id = colorIndex + kpItem.colorIndex; + if (id >= int(sizeof(colorTable) / sizeof(colorTable[0]))) + id = int(sizeof(colorTable) / sizeof(colorTable[0]) - 1); + colorIndexMax = max(id, colorIndexMax); + kpItem.colorIndex = id; + // add the item to global list + kpList.push_back(kpItem); + } + // update colorIndex for next item + colorIndex = colorIndexMax; + if (colorIndex < int(sizeof(colorTable) / sizeof(colorTable[0]) - 1)) + colorIndex++; + // reset the list + paramArray->ResetArrayListForView(); + } + else if (arrayNumItems > 0) + { // use the data items from the vx_array object + // initialize keypoint with colorIndex and update colorIndex for next keypoint set + ArrayItemForView kpItem = { itemtype, colorIndex, 0, 0, 0.0f, 0, 0 }; + if (colorIndex < int(sizeof(colorTable) / sizeof(colorTable[0]) - 1)) + colorIndex++; + // compute strength bounds and binSize for plotted point radius + vx_size stride = 0; + void *base = NULL; + ERROR_CHECK(vxAccessArrayRange(arr, 0, arrayNumItems, &stride, &base, VX_READ_ONLY)); + if (itemtype == VX_TYPE_KEYPOINT) { + for (size_t i = 0; i < arrayNumItems; i++) { + vx_keypoint_t * kp = &vxArrayItem(vx_keypoint_t, base, i, stride); + kpItem.strength = kp->strength; + kpItem.x = kp->x; + kpItem.y = kp->y; + kpList.push_back(kpItem); + } + 
} + else if (itemtype == VX_TYPE_RECTANGLE) { + for (size_t i = 0; i < arrayNumItems; i++) { + vx_rectangle_t * kp = &vxArrayItem(vx_rectangle_t, base, i, stride); + kpItem.x = kp->start_x; + kpItem.y = kp->start_y; + kpItem.w = kp->end_x - kp->start_x; + kpItem.h = kp->end_y - kp->start_y; + kpList.push_back(kpItem); + } + } + else if (itemtype == VX_TYPE_COORDINATES2D) { + for (size_t i = 0; i < arrayNumItems; i++) { + vx_coordinates2d_t * kp = &vxArrayItem(vx_coordinates2d_t, base, i, stride); + kpItem.x = kp->x; + kpItem.y = kp->y; + kpList.push_back(kpItem); + } + } + ERROR_CHECK(vxCommitArrayRange(arr, 0, arrayNumItems, base)); + } + } + else if ((*it)->GetVxObjectType() == VX_TYPE_DISTRIBUTION) + { // view the distribution data //////////////////////////// + vx_distribution dist = (vx_distribution)(*it)->GetVxObject(); + vx_size numBins = 0; + vx_uint32 * hist = nullptr; + ERROR_CHECK(vxQueryDistribution(dist, VX_DISTRIBUTION_ATTRIBUTE_BINS, &numBins, sizeof(numBins))); + ERROR_CHECK(vxAccessDistribution(dist, (void **)&hist, VX_READ_ONLY)); + vx_uint32 maxValue = 0; + for (size_t bin = 0; bin < numBins; bin++) { + maxValue = max(maxValue, hist[bin]); + } + Rect box(overlayOffsetX, overlayOffsetY, 256, 100); overlayOffsetY += (box.height + 8); + rectangle(*pOutputImage, Rect(box.x - 2, box.y - 2, box.width + 4, box.height + 4), Scalar(0, 0, 255), 1, 8); + rectangle(*pOutputImage, Rect(box.x - 1, box.y - 1, box.width + 2, box.height + 2), Scalar(255, 0, 0), 1, 8); + if (maxValue > 0) { + int barWidth = box.width / (int)numBins; + for (int bin = 0; bin < (int)numBins; bin++) { + int barHeight = box.height * hist[bin] / maxValue; + Rect bar(box.x + bin*barWidth, box.y + box.height - barHeight, barWidth, barHeight); + rectangle(*pOutputImage, bar, Scalar(0, 255, 255), CV_FILLED, 8); + } + } + ERROR_CHECK(vxCommitDistribution(dist, hist)); + // show the name of the object to the right + char message[128]; sprintf(message, "%s (distribution)", (*it)->GetVxObjectName()); + int H = 20; + cv::putText(*pOutputImage, message, Point(box.x + box.width + 10, box.y + H - 6), CV_FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 255, 0), 2, 8, false); + cv::putText(*pOutputImage, message, Point(box.x + box.width + 12, box.y + H - 8), CV_FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255, 0), 1, 8, false); + } + else if ((*it)->GetVxObjectType() == VX_TYPE_LUT) + { // view the lut data //////////////////////////// + vx_lut lut = (vx_lut)(*it)->GetVxObject(); + vx_enum data_type; + ERROR_CHECK(vxQueryLUT(lut, VX_LUT_ATTRIBUTE_TYPE, &data_type, sizeof(data_type))); + if (data_type == VX_TYPE_UINT8) + { // only supports 8-bit look-up tables + vx_size count; + vx_uint8 * data = nullptr; + ERROR_CHECK(vxQueryLUT(lut, VX_LUT_ATTRIBUTE_COUNT, &count, sizeof(count))); + ERROR_CHECK(vxAccessLUT(lut, (void **)&data, VX_READ_ONLY)); + vx_uint32 maxValue = 255; + Rect box(overlayOffsetX, overlayOffsetY, 256, 256); overlayOffsetY += (box.height + 8); + rectangle(*pOutputImage, Rect(box.x - 2, box.y - 2, box.width + 4, box.height + 4), Scalar(255, 0, 255), 1, 8); + rectangle(*pOutputImage, Rect(box.x - 1, box.y - 1, box.width + 2, box.height + 2), Scalar(255, 255, 0), 1, 8); + int barWidth = box.width / (int)count; + for (int bin = 0; bin < (int)count; bin++) { + int barHeight = box.height * data[bin] / maxValue; + Rect bar(box.x + bin*barWidth, box.y + box.height - barHeight, barWidth, barHeight); + rectangle(*pOutputImage, bar, Scalar(0, 255, 255), CV_FILLED, 8); + } + ERROR_CHECK(vxCommitLUT(lut, data)); + // show the 
name of the object to the right + char message[128]; sprintf(message, "%s (lut)", (*it)->GetVxObjectName()); + int H = 20; + cv::putText(*pOutputImage, message, Point(box.x + box.width + 10, box.y + H - 6), CV_FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 0, 0, 0), 2, 8, false); + cv::putText(*pOutputImage, message, Point(box.x + box.width + 12, box.y + H - 8), CV_FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255, 0), 1, 8, false); + } + } + else if ((*it)->GetVxObjectType() == VX_TYPE_SCALAR) + { // view the scalar data //////////////////////////// + char value[64]; + vx_scalar scalar = (vx_scalar)(*it)->GetVxObject(); + ReadScalarToString(scalar, value); + char message[128]; sprintf(message, "%s = %s", (*it)->GetVxObjectName(), value); + int H = 20; + cv::putText(*pOutputImage, message, Point(overlayOffsetX+0, overlayOffsetY+H-6), CV_FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 255, 0), 2, 8, false); + cv::putText(*pOutputImage, message, Point(overlayOffsetX+2, overlayOffsetY+H-8), CV_FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255, 0), 1, 8, false); + overlayOffsetY += H; + } + } + } + // add keypoints from user specified file(s) + for (auto it = m_viewKeypointFilenameList.begin(); it != m_viewKeypointFilenameList.end(); it++) + { + // initialize keypoint with colorIndex and update colorIndex for next keypoint set + ArrayItemForView kpItem = { VX_TYPE_KEYPOINT, colorIndex, 0, 0, 0.0f, 0, 0 }; + if (colorIndex < int(sizeof(colorTable) / sizeof(colorTable[0]) - 1)) + colorIndex++; + // get list of keypoints from the user specified file + char fileName[512]; + sprintf(fileName, it->c_str(), frameNumber); + FILE * fp = fopen(fileName, "r"); + if (!fp) ReportError("ERROR: unable to open '%s'\n", fileName); + char line[256]; + while (fgets(line, sizeof(line), fp) != NULL){ + if (sscanf(line, "%d%d%f", &kpItem.x, &kpItem.y, &kpItem.strength) == 3) { + kpList.push_back(kpItem); + } + } + fclose(fp); + } + // compute strength bounds and binSize for computing keypoint radius + float minStrength = FLT_MAX, maxStrength = FLT_MIN; + for (auto it = kpList.begin(); it != kpList.end(); it++) { + if (it->itemtype == VX_TYPE_KEYPOINT) { + float strength = it->strength; + minStrength = min(strength, minStrength); + maxStrength = max(strength, maxStrength); + } + } + float binSize = (maxStrength - minStrength) / 5; + // plot the points + for (auto it = kpList.begin(); it != kpList.end(); it++) { + Scalar color(colorTable[it->colorIndex][0], colorTable[it->colorIndex][1], colorTable[it->colorIndex][2]); + if (it->itemtype == VX_TYPE_KEYPOINT) { + // compute the radius of point using strength and binSize + float strength = it->strength; + double radius = 2.0; + if (strength > minStrength) { + radius += 2.0 * floor((strength - minStrength) / binSize); + } + // plot the points with key-point location as center of circle + Point center(it->x, it->y); + circle(*pOutputImage, center, (int)radius, Scalar(0, 0, 0), 1, 8); + circle(*pOutputImage, center, (int)radius + 1, color, 1, 8); + } + else if (it->itemtype == VX_TYPE_RECTANGLE) { + // plot the rectangle + Rect rec(it->x, it->y, it->w, it->h); + rectangle(*pOutputImage, rec, color, 1, 8); + } + else if (it->itemtype == VX_TYPE_COORDINATES2D) { + // plot the points with small circle + float radius = 1.0; + Point center(it->x, it->y); + circle(*pOutputImage, center, (int)radius, Scalar(0, 0, 0), 1, 8); + circle(*pOutputImage, center, (int)radius + 1, color, 1, 8); + } + } + // show the image and points (if requested) + if (m_usingDisplay) { + imshow(m_displayName, 
*pOutputImage); + } + if (m_usingWriter) { + ((VideoWriter *)m_cvWriter)->write(*pOutputImage); + } + } + } + return 0; +} +#endif + +int CVxParamImage::WriteFrame(int frameNumber) +{ +#if USE_OPENCV + if (ViewFrame(frameNumber) < 0) + return -1; +#endif + + if (!m_fpWrite) { + if (m_fileNameWrite.length() > 0 && !m_usingWriter) { + char fileName[MAX_FILE_NAME_LENGTH]; + sprintf(fileName, m_fileNameWrite.c_str(), frameNumber, m_width, m_height); + m_fpWrite = fopen(fileName, "wb+"); + if (!m_fpWrite) ReportError("ERROR: unable to create: %s\n", fileName); + } + } + + if (m_fpWrite) { + // write vx_image into file + WriteImage(m_image, &m_rectFull, m_fpWrite); + + // close the file if one frame gets written per file + if (m_fileNameForWriteHasIndex && m_fpWrite){ + fclose(m_fpWrite); + m_fpWrite = nullptr; + } + } + + return 0; +} + +int CVxParamImage::CompareFrame(int frameNumber) +{ + // make sure that compare reference data is opened + if (!m_fpCompare) { + if (m_fileNameCompare.length() > 0) { + sprintf(m_fileNameCompareCurrent, m_fileNameCompare.c_str(), frameNumber, m_width, m_height); + if (m_generateCheckSumForCompare) { + m_fpCompare = fopen(m_fileNameCompareCurrent, "w"); + if (!m_fpCompare) ReportError("ERROR: unable to create: %s\n", m_fileNameCompareCurrent); + } + else { + m_fpCompare = fopen(m_fileNameCompareCurrent, "rb"); + if (!m_fpCompare) ReportError("ERROR: unable to open: %s\n", m_fileNameCompareCurrent); + } + } + } + if (!m_fpCompare) return 0; + + if (m_generateCheckSumForCompare) + { // generate checksum ////////////////////////////////////////// + char checkSumString[64]; + ComputeChecksum(checkSumString, m_image, &m_rectCompare); + fprintf(m_fpCompare, "%s\n", checkSumString); + } + else if (m_useCheckSumForCompare) + { // compare checksum ////////////////////////////////////////// + char checkSumStringRef[64] = { 0 }; + if (fscanf(m_fpCompare, "%s", checkSumStringRef) != 1) { + printf("ERROR: image checksum missing for frame#%d in %s\n", frameNumber, m_fileNameCompareCurrent); + throw - 1; + } + char checkSumString[64]; + ComputeChecksum(checkSumString, m_image, &m_rectCompare); + if (!strcmp(checkSumString, checkSumStringRef)) { + m_compareCountMatches++; + if (m_verbose) printf("OK: image CHECKSUM MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, m_fileNameCompareCurrent); + } + else { + m_compareCountMismatches++; + printf("ERROR: image CHECKSUM MISMATCHED for %s with frame#%d of %s [%s instead of %s]\n", GetVxObjectName(), frameNumber, m_fileNameCompareCurrent, checkSumString, checkSumStringRef); + if (m_abortOnCompareMismatch) return -1; + } + } + else + { // compare raw frames ////////////////////////////////////////// + // make sure buffer has been allocated + if (!m_bufForCompare) { + NULLPTR_CHECK(m_bufForCompare = new vx_uint8[m_frameSize]); + } + // read data from frame + if (m_frameSize != fread(m_bufForCompare, 1, m_frameSize, m_fpCompare)) { + // no more data to compare + ReportError("ERROR: image data missing for frame#%d in %s\n", frameNumber, m_fileNameCompareCurrent); + } + // compare image to reference from file + size_t errorPixelCountTotal = CompareImage(m_image, &m_rectCompare, m_bufForCompare, m_comparePixelErrorMin, m_comparePixelErrorMax, frameNumber, m_fileNameCompareCurrent); + if (!errorPixelCountTotal) { + m_compareCountMatches++; + if (m_verbose) printf("OK: image COMPARE MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, m_fileNameCompareCurrent); + } + else { + m_compareCountMismatches++; + 
if(m_abortOnCompareMismatch) return -1; + } + } + + // close the file if user requested separate file for each compare data + if (m_fileNameForCompareHasIndex) { + fclose(m_fpCompare); + m_fpCompare = nullptr; + } + + return 0; +} diff --git a/runvx/vxImage.h b/runvx/vxImage.h new file mode 100644 index 0000000..481724e --- /dev/null +++ b/runvx/vxImage.h @@ -0,0 +1,97 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __VX_IMAGE_H__ +#define __VX_IMAGE_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +class CVxParamImage : public CVxParameter +{ +public: + CVxParamImage(); + virtual ~CVxParamImage(); + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + virtual int Finalize(); + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + virtual int Shutdown(); + virtual void DisableWaitForKeyPress(); + +protected: +#if USE_OPENCV + int ViewFrame(int frameNumber); +#endif + +private: + // vx configuration + vx_df_image m_format; + vx_uint32 m_width; + vx_uint32 m_height; + vx_size m_planes; + // I/O configuration + int m_repeatFrames; + int m_countFrames; + char m_cameraName[256]; +#if USE_OPENCV + void * m_cvCapDev; + void * m_cvCapMat; + void * m_cvWriter; + cv::Mat * m_cvDispMat; + cv::Mat * m_cvImage; +#endif + // vx object + vx_image m_image; + char m_roiMasterName[64]; // name of ROI image master + vx_rectangle_t m_roiRegion; // rectangle used to save ROI image dimensions + vx_rectangle_t m_rectFull; // rectangle with full image size for use by access/commit + vx_uint64 m_uniformValue; // uniform image value + + // image I/O + size_t m_frameSize; + std::list m_viewKeypointFilenameList; + bool m_useSyncOpenCLWriteDirective; + float m_comparePixelErrorMin; + float m_comparePixelErrorMax; + vx_rectangle_t m_rectCompare; // rectangle used to save rectangular region used for compare + vx_uint8 * m_bufForCompare; + bool m_useCheckSumForCompare; + bool m_generateCheckSumForCompare; + char m_fileNameCompareCurrent[256]; + int m_compareCountMatches; + int m_compareCountMismatches; + bool m_disableWaitForKeyPress; + bool m_usingDisplay; + bool m_usingWriter; + bool m_gotCaptureVideoSize; + bool m_doNotResizeCapturedImages; + vx_uint32 m_captureWidth; + vx_uint32 m_captureHeight; +}; + + 
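+// Illustrative note (not part of the original sources; syntax inferred from the parsing code above):
+// CVxParamImage::Initialize(), declared below and implemented in vxImage.cpp, accepts parameter
+// descriptions of the general form
+//     image|image-virtual|image-uniform|image-roi:<width>,<height>,<format>[:<io-params>]
+// where <io-params> is a colon-separated list of I/O requests handled by InitializeIO(), such as
+//     read,<fileName>   write,<fileName>   view,<windowName>   camera,<deviceNumber>
+//     compare,<fileName>[,rect{...}][,err{<min>;<max>}][,checksum]   directive,sync-cl-write
+// A hypothetical example, with values chosen only for illustration:
+//     image:640,480,RGB2:read,input.rgb:view,inputWindow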
+#endif /* __VX_IMAGE_H__ */ \ No newline at end of file diff --git a/runvx/vxLUT.cpp b/runvx/vxLUT.cpp new file mode 100644 index 0000000..fbea3d7 --- /dev/null +++ b/runvx/vxLUT.cpp @@ -0,0 +1,298 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxLUT.h" + +/////////////////////////////////////////////////////////////////////// +// class CVxParamLUT +// +CVxParamLUT::CVxParamLUT() +{ + // vx configuration + m_vxObjType = VX_TYPE_LUT; + m_data_type = VX_TYPE_UINT8; + m_count = 0; + // I/O configuration + m_readFileIsBinary = false; + m_writeFileIsBinary = false; + m_compareFileIsBinary = false; + m_compareCountMatches = 0; + m_compareCountMismatches = 0; + m_useSyncOpenCLWriteDirective = false; + // vx object + m_lut = nullptr; + m_vxObjRef = nullptr; +} + +CVxParamLUT::~CVxParamLUT() +{ + Shutdown(); +} + +int CVxParamLUT::Shutdown(void) +{ + if (m_compareCountMatches > 0 && m_compareCountMismatches == 0) { + printf("OK: lut COMPARE MATCHED for %d frame(s) of %s\n", m_compareCountMatches, GetVxObjectName()); + } + if (m_lut) { + vxReleaseLUT(&m_lut); + m_lut = nullptr; + } + return 0; +} + +int CVxParamLUT::Initialize(vx_context context, vx_graph graph, const char * desc) +{ + // get object parameters and create object + char objType[64], data_type[64]; + const char * ioParams = ScanParameters(desc, "lut:,", "s:s,D", objType, data_type, &m_count); + if (!_stricmp(objType, "lut")) { + m_data_type = ovxName2Enum(data_type); + m_lut = vxCreateLUT(context, m_data_type, m_count); + } + else ReportError("ERROR: unsupported lut type: %s\n", desc); + vx_status ovxStatus = vxGetStatus((vx_reference)m_lut); + if (ovxStatus != VX_SUCCESS){ + printf("ERROR: lut creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + if (m_lut) vxReleaseLUT(&m_lut); + throw - 1; + } + m_vxObjRef = (vx_reference)m_lut; + + // io initialize + return InitializeIO(context, graph, m_vxObjRef, ioParams); +} + +int CVxParamLUT::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) +{ + // save reference object and get object attributes + m_vxObjRef = ref; + m_lut = (vx_lut)m_vxObjRef; + ERROR_CHECK(vxQueryLUT(m_lut, VX_LUT_ATTRIBUTE_TYPE, &m_data_type, sizeof(m_data_type))); + ERROR_CHECK(vxQueryLUT(m_lut, VX_LUT_ATTRIBUTE_COUNT, &m_count, sizeof(m_count))); + + // process I/O parameters + if (*io_params == ':') io_params++; + while (*io_params) { + char ioType[64], 
fileName[256]; + io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName); + if (!_stricmp(ioType, "read")) + { // read request syntax: read,[,ascii|binary] + m_fileNameRead.assign(RootDirUpdated(fileName)); + m_fileNameForReadHasIndex = (m_fileNameRead.find("%") != m_fileNameRead.npos) ? true : false; + m_readFileIsBinary = (m_fileNameRead.find(".txt") != m_fileNameRead.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_readFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_readFileIsBinary = true; + } + else ReportError("ERROR: invalid lut read option: %s\n", option); + } + } + else if (!_stricmp(ioType, "write")) + { // write request syntax: write,[,ascii|binary] + m_fileNameWrite.assign(RootDirUpdated(fileName)); + m_writeFileIsBinary = (m_fileNameWrite.find(".txt") != m_fileNameWrite.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_writeFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_writeFileIsBinary = true; + } + else ReportError("ERROR: invalid lut write option: %s\n", option); + } + } + else if (!_stricmp(ioType, "compare")) + { // write request syntax: compare,[,ascii|binary] + m_fileNameCompare.assign(RootDirUpdated(fileName)); + m_compareFileIsBinary = (m_fileNameCompare.find(".txt") != m_fileNameCompare.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_compareFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_compareFileIsBinary = true; + } + else ReportError("ERROR: invalid lut compare option: %s\n", option); + } + } + else if (!_stricmp(ioType, "view")) { + m_displayName.assign(fileName); + m_paramList.push_back(this); + } + else ReportError("ERROR: invalid lut operation: %s\n", ioType); + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character sequence in parameter specification: %s\n", io_params); + } + + return 0; +} + +int CVxParamLUT::Finalize() +{ + return 0; +} + +int CVxParamLUT::ReadFrame(int frameNumber) +{ + // check if there is no user request to read + if (m_fileNameRead.length() < 1) return 0; + + // for single frame reads, there is no need to read the array again + // as it is already read into the object + if (!m_fileNameForReadHasIndex && frameNumber != m_captureFrameStart) { + return 0; + } + + // reading data from input file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameRead.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_readFileIsBinary ? 
"rb" : "r"); + if (!fp) { + if (frameNumber == m_captureFrameStart) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + else { + return 1; // end of sequence detected for multiframe sequences + } + } + vx_size size; ERROR_CHECK(vxQueryLUT(m_lut, VX_LUT_ATTRIBUTE_SIZE, &size, sizeof(size))); + vx_uint8 * data = nullptr; ERROR_CHECK(vxAccessLUT(m_lut, (void **)&data, VX_WRITE_ONLY)); + int status = 0; + if (m_readFileIsBinary) { + if (fread(data, 1, size, fp) != size) + status = -1; + } + else { + vx_size itemsize = size / m_count; + for (vx_uint32 x = 0; x < m_count; x++) { + vx_uint32 value; + if (fscanf(fp, "%i", &value) != 1) { + status = -1; + break; + } + memcpy(&data[x * itemsize], &value, itemsize); + } + } + ERROR_CHECK(vxCommitLUT(m_lut, data)); + fclose(fp); + if (status < 0) + ReportError("ERROR: detected EOF on lut input file: %s\n", fileName); + + // process user requested directives + if (m_useSyncOpenCLWriteDirective) { + ERROR_CHECK(vxDirective((vx_reference)m_lut, VX_DIRECTIVE_AMD_COPY_TO_OPENCL)); + } + + return status; +} + +int CVxParamLUT::WriteFrame(int frameNumber) +{ + // check if there is no user request to write + if (m_fileNameWrite.length() < 1) return 0; + // write data to output file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameWrite.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_writeFileIsBinary ? "wb" : "w"); + if (!fp) ReportError("ERROR: Unable to create: %s\n", fileName); + vx_size size; ERROR_CHECK(vxQueryLUT(m_lut, VX_LUT_ATTRIBUTE_SIZE, &size, sizeof(size))); + vx_uint8 * data = nullptr; ERROR_CHECK(vxAccessLUT(m_lut, (void **)&data, VX_READ_ONLY)); + if (m_writeFileIsBinary) { + fwrite(data, 1, size, fp); + } + else { + vx_size itemsize = size / m_count; + for (vx_uint32 x = 0; x < m_count; x++) { + char value[64]; + PutScalarValueToString(m_data_type, &data[x * itemsize], value); + fprintf(fp, "%s\n", value); + } + } + ERROR_CHECK(vxCommitLUT(m_lut, data)); + fclose(fp); + + return 0; +} + +int CVxParamLUT::CompareFrame(int frameNumber) +{ + // check if there is no user request to compare + if (m_fileNameCompare.length() < 1) return 0; + + // reading data from reference file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameCompare.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_compareFileIsBinary ? 
"rb" : "r"); + if (!fp) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + vx_size size; ERROR_CHECK(vxQueryLUT(m_lut, VX_LUT_ATTRIBUTE_SIZE, &size, sizeof(size))); + vx_size itemsize = size / m_count; + vx_uint8 * data = nullptr; ERROR_CHECK(vxAccessLUT(m_lut, (void **)&data, VX_WRITE_ONLY)); + int status = 0; + bool mismatchDetected = false; + for (vx_uint32 x = 0; x < m_count; x++) { + vx_uint32 value; + if (m_compareFileIsBinary) { + if (fread(&value, itemsize, 1, fp) != 1) { + status = -1; + break; + } + } + else { + if (fscanf(fp, "%i", &value) != 1) { + status = -1; + break; + } + } + if (memcmp(&data[x * itemsize], &value, itemsize) != 0) { + mismatchDetected = true; + break; + } + } + ERROR_CHECK(vxCommitLUT(m_lut, data)); + fclose(fp); + if (status < 0) + ReportError("ERROR: detected EOF on lut reference file: %s\n", fileName); + + if (mismatchDetected) { + m_compareCountMismatches++; + printf("ERROR: lut COMPARE MISMATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + if (m_abortOnCompareMismatch) return -1; + } + else { + m_compareCountMatches++; + if (m_verbose) printf("OK: lut COMPARE MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + } + + return 0; +} diff --git a/runvx/vxLUT.h b/runvx/vxLUT.h new file mode 100644 index 0000000..1c57b9e --- /dev/null +++ b/runvx/vxLUT.h @@ -0,0 +1,59 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#ifndef __VX_LUT_H__ +#define __VX_LUT_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +class CVxParamLUT : public CVxParameter +{ +public: + CVxParamLUT(); + virtual ~CVxParamLUT(); + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + virtual int Finalize(); + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + virtual int Shutdown(); + +private: + // vx configuration + vx_enum m_data_type; + vx_size m_count; + // I/O configuration + bool m_readFileIsBinary; + bool m_writeFileIsBinary; + bool m_compareFileIsBinary; + int m_compareCountMatches; + int m_compareCountMismatches; + bool m_useSyncOpenCLWriteDirective; + // vx object + vx_lut m_lut; +}; + +#endif /* __VX_LUT_H__ */ \ No newline at end of file diff --git a/runvx/vxMatrix.cpp b/runvx/vxMatrix.cpp new file mode 100644 index 0000000..7c83d60 --- /dev/null +++ b/runvx/vxMatrix.cpp @@ -0,0 +1,371 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxMatrix.h" + +/////////////////////////////////////////////////////////////////////// +// class CVxParamMatrix +// +CVxParamMatrix::CVxParamMatrix() +{ + // vx configuration + m_vxObjType = VX_TYPE_MATRIX; + m_data_type = VX_TYPE_INT32; + m_columns = 0; + m_rows = 0; + m_size = 0; + // I/O configuration + m_readFileIsBinary = false; + m_writeFileIsBinary = false; + m_compareFileIsBinary = false; + m_compareCountMatches = 0; + m_compareCountMismatches = 0; + m_errTolerance = 0.0f; + m_bufForAccess = nullptr; + // vx object + m_matrix = nullptr; +} + +CVxParamMatrix::~CVxParamMatrix() +{ + Shutdown(); +} + +int CVxParamMatrix::Shutdown(void) +{ + if (m_compareCountMatches > 0 && m_compareCountMismatches == 0) { + printf("OK: matrix COMPARE MATCHED for %d frame(s) of %s\n", m_compareCountMatches, GetVxObjectName()); + } + GuiTrackBarShutdown((vx_reference)m_matrix); + if (m_matrix) { + vxReleaseMatrix(&m_matrix); + m_matrix = nullptr; + } + if (m_bufForAccess) { + delete[] m_bufForAccess; + m_bufForAccess = nullptr; + } + return 0; +} + +int CVxParamMatrix::Initialize(vx_context context, vx_graph graph, const char * desc) +{ + // get object parameters and create object + char objType[64], data_type[64]; + const char * ioParams = ScanParameters(desc, "matrix:,,", "s:s,D,D", objType, data_type, &m_columns, &m_rows); + if (!_stricmp(objType, "matrix")) { + m_data_type = ovxName2Enum(data_type); + m_matrix = vxCreateMatrix(context, m_data_type, m_columns, m_rows); + } + else ReportError("ERROR: unsupported matrix type: %s\n", desc); + vx_status ovxStatus = vxGetStatus((vx_reference)m_matrix); + if (ovxStatus != VX_SUCCESS){ + printf("ERROR: matrix creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + if (m_matrix) vxReleaseMatrix(&m_matrix); + throw - 1; + } + m_vxObjRef = (vx_reference)m_matrix; + + // io initialize + return InitializeIO(context, graph, m_vxObjRef, ioParams); +} + +int CVxParamMatrix::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) +{ + // save reference object and get object attributes + m_vxObjRef = ref; + m_matrix = (vx_matrix)m_vxObjRef; + ERROR_CHECK(vxQueryMatrix(m_matrix, VX_MATRIX_ATTRIBUTE_TYPE, &m_data_type, sizeof(m_data_type))); + ERROR_CHECK(vxQueryMatrix(m_matrix, VX_MATRIX_ATTRIBUTE_COLUMNS, &m_columns, sizeof(m_columns))); + ERROR_CHECK(vxQueryMatrix(m_matrix, VX_MATRIX_ATTRIBUTE_ROWS, &m_rows, sizeof(m_rows))); + ERROR_CHECK(vxQueryMatrix(m_matrix, VX_MATRIX_ATTRIBUTE_SIZE, &m_size, sizeof(m_size))); + + // process I/O parameters + if (*io_params == ':') io_params++; + while (*io_params) { + char ioType[64], fileName[256]; + io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName); + if (!_stricmp(ioType, "read")) + { // read request syntax: read,[,ascii|binary] + m_fileNameRead.assign(RootDirUpdated(fileName)); + m_fileNameForReadHasIndex = (m_fileNameRead.find("%") != m_fileNameRead.npos) ? true : false; + m_readFileIsBinary = (m_fileNameRead.find(".txt") != m_fileNameRead.npos) ? 
false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_readFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_readFileIsBinary = true; + } + else ReportError("ERROR: invalid matrix read option: %s\n", option); + } + } + else if (!_stricmp(ioType, "write")) + { // write request syntax: write,[,ascii|binary] + m_fileNameWrite.assign(RootDirUpdated(fileName)); + m_writeFileIsBinary = (m_fileNameWrite.find(".txt") != m_fileNameWrite.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_writeFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_writeFileIsBinary = true; + } + else ReportError("ERROR: invalid matrix write option: %s\n", option); + } + } + else if (!_stricmp(ioType, "compare")) + { // compare request syntax: compare,[,ascii|binary][,err{}] + m_fileNameCompare.assign(RootDirUpdated(fileName)); + m_compareFileIsBinary = (m_fileNameCompare.find(".txt") != m_fileNameCompare.npos) ? false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary|err{}", ",s", option); + if (!_stricmp(option, "ascii")) { + m_compareFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_compareFileIsBinary = true; + } + else if (!_strnicmp(option, "err{", 4)) { + ScanParameters(&option[3], "{}", "{f}", &m_errTolerance); + } + else ReportError("ERROR: invalid matrix compare option: %s\n", option); + } + } + else if (!_stricmp(ioType, "init")) + { // write request syntax: init,{;;...} + NULLPTR_CHECK(m_bufForAccess = new vx_uint8[m_size]); + vx_size index = 0; char fmt[3] = { '{', (m_data_type == VX_TYPE_FLOAT32) ? 
'f' : 'd', 0 }; + for (const char * s = fileName; *s && index < (m_columns * m_rows); fmt[0] = ';', index++) { + if (m_data_type == VX_TYPE_INT32 || m_data_type == VX_TYPE_UINT8) { + vx_uint32 value; + s = ScanParameters(s, "", fmt, &value); + if (m_data_type == VX_TYPE_UINT8) ((vx_uint8 *)m_bufForAccess)[index] = (vx_uint8)value; + else ((vx_int32 *)m_bufForAccess)[index] = value; + } + else if (m_data_type == VX_TYPE_FLOAT32) { + s = ScanParameters(s, "", fmt, &((vx_float32 *)m_bufForAccess)[index]); + } + else ReportError("ERROR: matrix init option not support for data_type of %s\n", GetVxObjectName()); + } + if (index < (m_columns * m_rows)) ReportError("ERROR: matrix init have too few values: %s\n", fileName); + ERROR_CHECK(vxWriteMatrix(m_matrix, m_bufForAccess)); + } + else if (!_stricmp(ioType, "directive") && !_stricmp(fileName, "readonly")) { + ERROR_CHECK(vxDirective((vx_reference)m_matrix, VX_DIRECTIVE_AMD_READ_ONLY)); + } + else if (!_stricmp(ioType, "ui") && !_strnicmp(fileName, "f", 1) && m_data_type == VX_TYPE_FLOAT32 && m_columns == 3 && m_rows == 3) { + int id = 0; + float valueR = 200.0f, valueInc = 0.5f; + if (sscanf(&fileName[1], "%d,%g,%g", &id, &valueR, &valueInc) != 3) { + printf("ERROR: invalid matrix UI configuration '%s'\n", fileName); + return -1; + } + id--; + GuiTrackBarInitializeMatrix((vx_reference)m_matrix, id, valueR, valueInc); + GuiTrackBarProcessKey(0); // just initialize the matrix + } + else ReportError("ERROR: invalid matrix operation: %s\n", ioType); + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character sequence in parameter specification: %s\n", io_params); + } + + return 0; +} + +int CVxParamMatrix::Finalize() +{ + return 0; +} + +int CVxParamMatrix::ReadFrame(int frameNumber) +{ + // check if there is no user request to read + if (m_fileNameRead.length() < 1) return 0; + + // make sure buffer has been allocated + if (!m_bufForAccess) NULLPTR_CHECK(m_bufForAccess = new vx_uint8[m_size]); + + // for single frame reads, there is no need to read it again + // as it is already read into the object + if (!m_fileNameForReadHasIndex && frameNumber != m_captureFrameStart) { + return 0; + } + + // reading data from input file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameRead.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_readFileIsBinary ? 
"rb" : "r"); + if (!fp) { + if (frameNumber == m_captureFrameStart) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + else { + return 1; // end of sequence detected for multiframe sequences + } + } + int status = 0; + if (m_readFileIsBinary) { + if (fread(m_bufForAccess, 1, m_size, fp) != m_size) + status = -1; + } + else { + for (vx_size index = 0; index < (m_columns * m_rows); index++) { + if (m_data_type == VX_TYPE_INT32 || m_data_type == VX_TYPE_UINT8) { + vx_uint32 value; + if (fscanf(fp, "%i", &value) != 1) { + status = -1; + break; + } + if (m_data_type == VX_TYPE_UINT8) ((vx_uint8 *)m_bufForAccess)[index] = (vx_uint8)value; + else ((vx_int32 *)m_bufForAccess)[index] = value; + } + else if (m_data_type == VX_TYPE_FLOAT32) { + if (fscanf(fp, "%g", &((vx_float32 *)m_bufForAccess)[index]) != 1) { + status = -1; + break; + } + } + else ReportError("ERROR: matrix ascii read option not support for data_type of %s\n", GetVxObjectName()); + } + } + ERROR_CHECK(vxWriteMatrix(m_matrix, m_bufForAccess)); + fclose(fp); + if (status < 0) + ReportError("ERROR: detected EOF on matrix input file: %s\n", fileName); + + return status; +} + +int CVxParamMatrix::WriteFrame(int frameNumber) +{ + // check if there is no user request to write + if (m_fileNameWrite.length() < 1) return 0; + + // make sure buffer has been allocated and read the matrix data + if (!m_bufForAccess) NULLPTR_CHECK(m_bufForAccess = new vx_uint8[m_size]); + ERROR_CHECK(vxReadMatrix(m_matrix, m_bufForAccess)); + + // write data to output file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameWrite.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_writeFileIsBinary ? "wb" : "w"); + if (!fp) ReportError("ERROR: Unable to create: %s\n", fileName); + if (m_writeFileIsBinary) { + fwrite(m_bufForAccess, 1, m_size, fp); + } + else { + for (vx_size index = 0; index < m_columns * m_rows; index++) { + if (m_data_type == VX_TYPE_INT32) fprintf(fp, "%d ", ((vx_int32 *)m_bufForAccess)[index]); + else if (m_data_type == VX_TYPE_FLOAT32) fprintf(fp, "%g ", ((vx_float32 *)m_bufForAccess)[index]); + else if (m_data_type == VX_TYPE_UINT8) fprintf(fp, "%d ", ((vx_uint8 *)m_bufForAccess)[index]); + else ReportError("ERROR: matrix ascii write option not support for data_type of %s\n", GetVxObjectName()); + } + fprintf(fp, "\n"); + } + fclose(fp); + + return 0; +} + +int CVxParamMatrix::CompareFrame(int frameNumber) +{ + // check if there is no user request to compare + if (m_fileNameCompare.length() < 1) return 0; + + // make sure buffer has been allocated and read the matrix data + if (!m_bufForAccess) NULLPTR_CHECK(m_bufForAccess = new vx_uint8[m_size]); + ERROR_CHECK(vxReadMatrix(m_matrix, m_bufForAccess)); + + // reading data from reference file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameCompare.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_compareFileIsBinary ? 
"rb" : "r"); + if (!fp) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + bool mismatchDetected = false; + int status = 0, errTolerance = (int)m_errTolerance; + vx_size itemsize = m_size / (m_columns * m_rows); + for (vx_size index = 0; index < (m_columns * m_rows); index++) { + union { + vx_int32 i32; + vx_float32 f32; + vx_uint8 u8; + } item; + if (m_compareFileIsBinary) { + if (fread(&item, itemsize, 1, fp) != 1) { + status = -1; + break; + } + } + else { + if (m_data_type == VX_TYPE_INT32 || m_data_type == VX_TYPE_UINT8) { + if (fscanf(fp, "%i", &item.i32) != 1) { + status = -1; + break; + } + } + else if (m_data_type == VX_TYPE_FLOAT32) { + if (fscanf(fp, "%g", &item.f32) != 1) { + status = -1; + break; + } + } + else ReportError("ERROR: matrix ascii compare option not support for data_type of %s\n", GetVxObjectName()); + } + if (m_data_type == VX_TYPE_INT32) { + if (abs(item.i32 - ((vx_int32 *)m_bufForAccess)[index]) > errTolerance) + mismatchDetected = true; + } + else if (m_data_type == VX_TYPE_FLOAT32) { + if (fabsf(item.f32 - ((vx_float32 *)m_bufForAccess)[index]) > m_errTolerance) + mismatchDetected = true; + } + else if (m_data_type == VX_TYPE_UINT8) { + if (abs((int)item.u8 - (int)((vx_uint8 *)m_bufForAccess)[index]) > errTolerance) + mismatchDetected = true; + } + if (mismatchDetected) + break; + } + fclose(fp); + if (status < 0) + ReportError("ERROR: detected EOF on matrix comapre reference file: %s\n", fileName); + + if (mismatchDetected) { + m_compareCountMismatches++; + printf("ERROR: matrix COMPARE MISMATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + if (m_abortOnCompareMismatch) return -1; + } + else { + m_compareCountMatches++; + if (m_verbose) printf("OK: matrix COMPARE MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + } + + return 0; +} diff --git a/runvx/vxMatrix.h b/runvx/vxMatrix.h new file mode 100644 index 0000000..5d9d33f --- /dev/null +++ b/runvx/vxMatrix.h @@ -0,0 +1,62 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#ifndef __VX_MATRIX_H__ +#define __VX_MATRIX_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +class CVxParamMatrix : public CVxParameter +{ +public: + CVxParamMatrix(); + virtual ~CVxParamMatrix(); + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + virtual int Finalize(); + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + virtual int Shutdown(); + +private: + // vx configuration + vx_enum m_data_type; + vx_size m_columns; + vx_size m_rows; + vx_size m_size; + // I/O configuration + bool m_readFileIsBinary; + bool m_writeFileIsBinary; + bool m_compareFileIsBinary; + int m_compareCountMatches; + int m_compareCountMismatches; + float m_errTolerance; + vx_uint8 * m_bufForAccess; + // vx object + vx_matrix m_matrix; +}; + +#endif /* __VX_MATRIX_H__ */ \ No newline at end of file diff --git a/runvx/vxParamHelper.cpp b/runvx/vxParamHelper.cpp new file mode 100644 index 0000000..889a8f5 --- /dev/null +++ b/runvx/vxParamHelper.cpp @@ -0,0 +1,176 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxParamHelper.h" + +int g_numCvUse = 0; + +// input track bars connected to scalar objects +#define MAX_INPUT_TRACK_BARS 12 +static int g_trackBarActive = 0; +static vx_reference g_trackBarObj[MAX_INPUT_TRACK_BARS] = { 0 }; +static float g_trackBarValueMin[MAX_INPUT_TRACK_BARS], g_trackBarValueMax[MAX_INPUT_TRACK_BARS], g_trackBarValueInc[MAX_INPUT_TRACK_BARS]; +static float g_trackBarValueR[MAX_INPUT_TRACK_BARS] = { 0.0f }, g_trackBarAngle[MAX_INPUT_TRACK_BARS][3] = { 0.0f, 0.0f, 0.0f }; + +///////////////////////////////// +// input track bars connected to scalar objects +int GuiTrackBarInitializeScalar(vx_reference obj, int id, float valueMin, float valueMax, float valueInc) +{ + if (id < 0 || id >= MAX_INPUT_TRACK_BARS) + return -1; + if (!obj) { + g_trackBarObj[id] = nullptr; + return 0; + } + if (g_trackBarObj[id] && g_trackBarObj[id] != obj) + return -1; + g_trackBarObj[id] = obj; + g_trackBarValueMin[id] = valueMin; + g_trackBarValueMax[id] = valueMax; + g_trackBarValueInc[id] = valueInc; + return 0; +} +int GuiTrackBarInitializeMatrix(vx_reference obj, int id, float valueR, float valueInc) +{ + if (id < 0 || id >= MAX_INPUT_TRACK_BARS) + return -1; + if (!obj) { + g_trackBarObj[id] = nullptr; + return 0; + } + if (g_trackBarObj[id] && g_trackBarObj[id] != obj) + return -1; + g_trackBarObj[id] = obj; + g_trackBarValueR[id] = valueR; + g_trackBarValueInc[id] = valueInc; + return 0; +} +int GuiTrackBarShutdown(vx_reference obj) +{ + for (int id = 0; id < MAX_INPUT_TRACK_BARS; id++) { + if (g_trackBarObj[id] == obj) { + g_trackBarObj[id] = nullptr; + return 0; + } + } + return -1; +} +int GuiTrackBarProcessKey(int key) +{ + int keyInc = '+', keyDec = '-'; + int id = g_trackBarActive; + if (key >= 0x00700000 && key <= 0x007b0000) + { // use F1..F12 to select UIs + id = (key >> 16) & 15; + if (id >= 0 && id < MAX_INPUT_TRACK_BARS) { + g_trackBarActive = id; + } + return 0; + } + if (g_trackBarObj[id]) { + vx_enum obj_type = VX_ERROR_INVALID_TYPE; + vxQueryReference(g_trackBarObj[id], VX_REF_ATTRIBUTE_TYPE, &obj_type, sizeof(obj_type)); + if (obj_type == VX_TYPE_SCALAR) { + if (key == 0x00250000) id = 0, key = '-'; // left arrow: hardcoded to id#0 (F1) dec + else if (key == 0x00260000) id = 1, key = '+'; // up arrow: hardcoded to id#1 (F2) inc + else if (key == 0x00270000) id = 0, key = '+'; // right arrow: hardcoded to id#0 (F1) inc + else if (key == 0x00280000) id = 1, key = '-'; // down arrow: hardcoded to id#1 (F2) dec + else if (key == '_') key = '-'; // easy keys to avoid Shift dependency + else if (key == '=') key = '+'; // easy keys to avoid Shift dependency + if (key == keyInc || key == keyDec) { + vx_enum format = VX_TYPE_FLOAT32; + vxQueryScalar((vx_scalar)g_trackBarObj[id], VX_SCALAR_ATTRIBUTE_TYPE, &format, sizeof(format)); + float value = g_trackBarValueMin[id]; + if (format == VX_TYPE_FLOAT32) { vxReadScalarValue((vx_scalar)g_trackBarObj[id], &value); } + else if (format == VX_TYPE_INT32) { vx_int32 v; vxReadScalarValue((vx_scalar)g_trackBarObj[id], &v); value = (vx_float32)v; } + else if (format == VX_TYPE_UINT32) { vx_uint32 v; vxReadScalarValue((vx_scalar)g_trackBarObj[id], &v); value = (vx_float32)v; } + float value_earlier = value; + if (key == keyInc) value += g_trackBarValueInc[id]; + else if (key == keyDec) value -= g_trackBarValueInc[id]; + if (value < g_trackBarValueMin[id]) value = g_trackBarValueMin[id]; + else if (value > g_trackBarValueMax[id]) value = g_trackBarValueMax[id]; + if (format == 
VX_TYPE_FLOAT32) { vxWriteScalarValue((vx_scalar)g_trackBarObj[id], &value); } + else if (format == VX_TYPE_INT32) { vx_int32 v; vxWriteScalarValue((vx_scalar)g_trackBarObj[id], &v); value = (vx_float32)v; } + else if (format == VX_TYPE_UINT32) { vx_uint32 v; vxWriteScalarValue((vx_scalar)g_trackBarObj[id], &v); value = (vx_float32)v; } + if (value != value_earlier) printf("OK: Scalar:UI,F%-2d => %g\n", id + 1, value); + } + } + else if (obj_type == VX_TYPE_MATRIX) { + if (key == 0x00250000) g_trackBarAngle[id][0] -= g_trackBarValueInc[id]; // left arrow: H(yaw) dec + else if (key == 0x00270000) g_trackBarAngle[id][0] += g_trackBarValueInc[id]; // right arrow: H(yaw) inc + else if (key == 0x00280000) g_trackBarAngle[id][1] -= g_trackBarValueInc[id]; // down arrow: P(pitch) dec + else if (key == 0x00260000) g_trackBarAngle[id][1] += g_trackBarValueInc[id]; // up arrow: P(pitch) inc + else if (key == '-' || key == '_') g_trackBarAngle[id][2] -= g_trackBarValueInc[id]; // B(round) dec + else if (key == '+' || key == '=') g_trackBarAngle[id][2] += g_trackBarValueInc[id]; // B(round) inc + // convert angles to matrix + float H = g_trackBarAngle[id][0]; + float P = g_trackBarAngle[id][1]; + float B = g_trackBarAngle[id][2]; + printf("OK: Matrix:UI,F%-2d => H:%g P:%g B:%g\n", id + 1, H, P, B); + H *= (float)M_PI / 180.0f; + P *= (float)M_PI / 180.0f; + B *= (float)M_PI / 180.0f; + vx_float32 mat[3][3] = { { 0.0f } }; + vxReadMatrix((vx_matrix)g_trackBarObj[id], &mat); + // create perspective transform using H/P/B + // TBD + mat[0][0] = cosf(H); + mat[0][1] = sinf(H); + mat[0][2] = 0.0f; + mat[1][0] = -sinf(H); + mat[1][1] = cosf(H); + mat[1][2] = 0.0f; + mat[2][0] = 0.0f; + mat[2][1] = 0.0f; + mat[2][2] = 1.0f; + vxWriteMatrix((vx_matrix)g_trackBarObj[id], &mat); + } + } + return 0; +} + +///////////////////////////////// +// global OpenCV image count and specified read inputs count +int ProcessCvWindowKeyRefresh() +{ +#if USE_OPENCV + if (g_numCvUse > 0) { + // process keyboard + int key = waitKey(1); + if (key == 'q' || key == 27) { + return 1; + } + else if (key == ' ') { + printf("Paused: Press spacebar to continue...\n"); fflush(stdout); + while (waitKey(0) != ' ') + ; + } + else if (key >= 0) { + GuiTrackBarProcessKey(key); + } + } +#endif + return 0; +} \ No newline at end of file diff --git a/runvx/vxParamHelper.h b/runvx/vxParamHelper.h new file mode 100644 index 0000000..c850509 --- /dev/null +++ b/runvx/vxParamHelper.h @@ -0,0 +1,45 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __VX_PARAMHELPER_H__ +#define __VX_PARAMHELPER_H__ + +#include "vxUtils.h" + +// global OpenCV image count +extern int g_numCvUse; + +// process OpenCV window key refresh +int ProcessCvWindowKeyRefresh(); + +// input track bars connected to scalar objects +int GuiTrackBarInitializeScalar(vx_reference obj, int id, float valueMin, float valueMax, float valueInc); + +int GuiTrackBarInitializeMatrix(vx_reference obj, int id, float valueR, float valueInc); + +int GuiTrackBarShutdown(vx_reference obj); + +int GuiTrackBarProcessKey(int key); + + +#endif /* __VX_PARAMHELPER_H__ */ \ No newline at end of file diff --git a/runvx/vxParameter.cpp b/runvx/vxParameter.cpp new file mode 100644 index 0000000..6873fc5 --- /dev/null +++ b/runvx/vxParameter.cpp @@ -0,0 +1,409 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxArray.h" +#include "vxConvolution.h" +#include "vxDistribution.h" +#include "vxImage.h" +#include "vxLUT.h" +#include "vxMatrix.h" +#include "vxPyramid.h" +#include "vxRemap.h" +#include "vxScalar.h" +#include "vxThreshold.h" + +#include + +#define DEBUG_INFO 0 +#define DEBUG_FRAME 0 +#define DEBUG_COMPARE 0 +#define DEBUG_FILES 0 + +#define VX_MAX_FILE_NAME 128 + +//////////////////////////////////////// +// parameter objects +CVxParameter::CVxParameter() +{ + // initialize local variables + m_paramMap = nullptr; + m_userStructMap = nullptr; + m_vxObjType = VX_TYPE_INVALID; + m_vxObjRef = nullptr; + m_vxObjName[0] = '\0'; + m_fileNameForReadHasIndex = false; + m_fileNameForWriteHasIndex = false; + m_fileNameForCompareHasIndex = false; + m_fpRead = nullptr; + m_fpWrite = nullptr; + m_fpCompare = nullptr; + m_verbose = false; + m_abortOnCompareMismatch = false; + m_usingMultiFrameCapture = false; + m_captureFrameStart = false; +} + +CVxParameter::~CVxParameter() +{ + if (m_fpRead) fclose(m_fpRead); + if (m_fpWrite) fclose(m_fpWrite); + if (m_fpCompare) fclose(m_fpCompare); +} + +const char * CVxParameter::GetVxObjectName() +{ + if (m_vxObjRef) { + vxGetReferenceName(m_vxObjRef, m_vxObjName, sizeof(m_vxObjName)); + } + return m_vxObjName; +} + +void CVxParameter::DisableWaitForKeyPress() +{ +} + +void CVxParameter::ResetArrayListForView() +{ + m_arrayListForView.clear(); +} + +void CVxParameter::AddToArrayListForView(int colorIndex, int x, int y, float strength) +{ + if (m_displayName.length() > 0) { + ArrayItemForView kpItem = { VX_TYPE_KEYPOINT, colorIndex, x, y, strength, 0, 0 }; + m_arrayListForView.push_back(kpItem); + } +} + +void CVxParameter::AddToArrayListForView(int colorIndex, int x, int y) +{ + if (m_displayName.length() > 0) { + ArrayItemForView kpItem = { VX_TYPE_COORDINATES2D, colorIndex, x, y, 0.0f, 0, 0 }; + m_arrayListForView.push_back(kpItem); + } +} + +list CVxParameter::m_paramList; + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CVxParameter * CreateDataObject(vx_context context, vx_graph graph, std::map * m_paramMap, map * m_userStructMap, const char * desc, vx_uint32 captureFrameStart) +{ + // create the object based on the description + if (!strncmp(desc, "image:", 6) || !strncmp(desc, "image-virtual:", 14) || !strncmp(desc, "image-uniform:", 14) || !strncmp(desc, "image-roi:", 10)) { + CVxParamImage *this_image = new CVxParamImage(); + this_image->SetCaptureFrameStart(captureFrameStart); + this_image->SetParamMap(m_paramMap); + int status = this_image->Initialize(context, graph, desc); + if (status) + return NULL; + return this_image; + } + else if (!strncmp(desc, "array:", 6) || !strncmp(desc, "array-virtual:", 14)){ + CVxParamArray *this_array = new CVxParamArray(); + this_array->SetCaptureFrameStart(captureFrameStart); + this_array->SetUserStructMap(m_userStructMap); + int status = this_array->Initialize(context, graph, desc); + if (status) + return NULL; + return this_array; + } + else if (!strncmp(desc, "pyramid:", 8) || !strncmp(desc, "pyramid-virtual:", 16)) { + CVxParamPyramid *this_pyramid = new CVxParamPyramid(); + this_pyramid->SetCaptureFrameStart(captureFrameStart); + int status = this_pyramid->Initialize(context, graph, desc); + if (status) + return NULL; + return this_pyramid; + } + else if (!strncmp(desc, "distribution:", 13)){ + CVxParamDistribution 
*this_distribution = new CVxParamDistribution(); + this_distribution->SetCaptureFrameStart(captureFrameStart); + int status = this_distribution->Initialize(context, graph, desc); + if (status) + return NULL; + return this_distribution; + } + else if (!strncmp(desc, "convolution:", 12)){ + CVxParamConvolution *this_convolution = new CVxParamConvolution(); + this_convolution->SetCaptureFrameStart(captureFrameStart); + int status = this_convolution->Initialize(context, graph, desc); + if (status) + return NULL; + return this_convolution; + } + else if (!strncmp(desc, "lut:", 4)){ + CVxParamLUT *this_LUT = new CVxParamLUT(); + this_LUT->SetCaptureFrameStart(captureFrameStart); + int status = this_LUT->Initialize(context, graph, desc); + if (status) + return NULL; + return this_LUT; + } + else if (!strncmp(desc, "matrix:", 7)){ + CVxParamMatrix *this_matrix = new CVxParamMatrix(); + this_matrix->SetCaptureFrameStart(captureFrameStart); + int status = this_matrix->Initialize(context, graph, desc); + if (status) + return NULL; + return this_matrix; + } + else if (!strncmp(desc, "remap:", 6)){ + CVxParamRemap *this_remap = new CVxParamRemap(); + this_remap->SetCaptureFrameStart(captureFrameStart); + int status = this_remap->Initialize(context, graph, desc); + if (status) + return NULL; + return this_remap; + } + else if (!strncmp(desc, "scalar:", 7) || !strncmp(desc, "!", 1)){ + if (!strncmp(desc, "!", 1)){ + char enum_name[2048]; + char description[2048]; + char desc2[2048]; + int i = 1; + int j = 0; + while (desc[i] != '\0'){ + enum_name[j] = desc[i]; + i++; + j++; + } + enum_name[j] = '\0'; + strcpy(description, "scalar:enum,%s"); + sprintf(desc2, description, enum_name); + //printf("DEBUG: %s\n", desc3); + CVxParamScalar *this_scalar = new CVxParamScalar(); + this_scalar->SetCaptureFrameStart(captureFrameStart); + int status = this_scalar->Initialize(context, graph, desc2); + if (status) + return NULL; + return this_scalar; + } + else { + CVxParamScalar *this_scalar = new CVxParamScalar(); + this_scalar->SetCaptureFrameStart(captureFrameStart); + int status = this_scalar->Initialize(context, graph, desc); + if (status) + return NULL; + return this_scalar; + } + } + else if (!strncmp(desc, "threshold:", 10)){ + CVxParamThreshold *this_threshold = new CVxParamThreshold(); + this_threshold->SetCaptureFrameStart(captureFrameStart); + int status = this_threshold->Initialize(context, graph, desc); + if (status) + return NULL; + return this_threshold; + } + else return nullptr; +} + +CVxParameter * CreateDataObject(vx_context context, vx_graph graph, vx_reference ref, const char * params, vx_uint32 captureFrameStart) +{ + // create the object based on the ref + vx_enum type; + vx_status status = vxQueryReference(ref, VX_REF_ATTRIBUTE_TYPE, &type, sizeof(type)); + if (status) { + printf("ERROR: CreateDataObject: vxQueryReference(*,VX_REF_ATTRIBUTE_TYPE,...) 
failed(%d)\n", status); + throw -1; + } + if (type == VX_TYPE_IMAGE) { + CVxParamImage *this_image = new CVxParamImage(); + this_image->SetCaptureFrameStart(captureFrameStart); + if (this_image->InitializeIO(context, graph, ref, params)) + return NULL; + return this_image; + } + else if (type == VX_TYPE_ARRAY) { + CVxParamArray *this_array = new CVxParamArray(); + this_array->SetCaptureFrameStart(captureFrameStart); + if (this_array->InitializeIO(context, graph, ref, params)) + return NULL; + return this_array; + } + else if (type == VX_TYPE_PYRAMID) { + CVxParamPyramid *this_pyramid = new CVxParamPyramid(); + this_pyramid->SetCaptureFrameStart(captureFrameStart); + if (this_pyramid->InitializeIO(context, graph, ref, params)) + return NULL; + return this_pyramid; + } + else if (type == VX_TYPE_DISTRIBUTION) { + CVxParamDistribution *this_distribution = new CVxParamDistribution(); + this_distribution->SetCaptureFrameStart(captureFrameStart); + if (this_distribution->InitializeIO(context, graph, ref, params)) + return NULL; + return this_distribution; + } + else if (type == VX_TYPE_CONVOLUTION) { + CVxParamConvolution *this_convolution = new CVxParamConvolution(); + this_convolution->SetCaptureFrameStart(captureFrameStart); + if (this_convolution->InitializeIO(context, graph, ref, params)) + return NULL; + return this_convolution; + } + else if (type == VX_TYPE_LUT) { + CVxParamLUT *this_LUT = new CVxParamLUT(); + this_LUT->SetCaptureFrameStart(captureFrameStart); + if (this_LUT->InitializeIO(context, graph, ref, params)) + return NULL; + return this_LUT; + } + else if (type == VX_TYPE_MATRIX) { + CVxParamMatrix *this_matrix = new CVxParamMatrix(); + this_matrix->SetCaptureFrameStart(captureFrameStart); + if (this_matrix->InitializeIO(context, graph, ref, params)) + return NULL; + return this_matrix; + } + else if (type == VX_TYPE_REMAP) { + CVxParamRemap *this_remap = new CVxParamRemap(); + this_remap->SetCaptureFrameStart(captureFrameStart); + if (this_remap->InitializeIO(context, graph, ref, params)) + return NULL; + return this_remap; + } + else if (type == VX_TYPE_SCALAR) { + CVxParamScalar *this_scalar = new CVxParamScalar(); + this_scalar->SetCaptureFrameStart(captureFrameStart); + if (this_scalar->InitializeIO(context, graph, ref, params)) + return NULL; + return this_scalar; + } + else if (type == VX_TYPE_THRESHOLD) { + CVxParamThreshold *this_threshold = new CVxParamThreshold(); + this_threshold->SetCaptureFrameStart(captureFrameStart); + if (this_threshold->InitializeIO(context, graph, ref, params)) + return NULL; + return this_threshold; + } + else return nullptr; +} + +/*! \brief Parse parameter strings. +* \details This creates a top-level object context for OpenVX. +* \param [in] s The input string. +* \param [in] syntax The syntax description for error messaging. +* \param [in] fmt The format string: d(32-bit integer) D(64-bit integer) f(float) F(double) c(color-format) s(string upto 64-chars) S(string upto 256-chars). +* \param [in] ... Pointers to list of parameters. +* \return pointer to input string after processing the all the parameters +*/ +const char * ScanParameters(const char * s_, const char * syntax, const char * fmt_, ...) 
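// Illustrative usage of the format codes documented above (hypothetical
// descriptor string, not taken from a runvx script):
//   char name[64]; vx_uint32 width = 0, height = 0; vx_df_image format;
//   const char * rest = ScanParameters("image:1920,1080,NV12",
//                                      "image:<width>,<height>,<format>",
//                                      "s:d,d,c", name, &width, &height, &format);
//   // name="image", width=1920, height=1080, format=VX_DF_IMAGE_NV12;
//   // rest points just past the parsed text.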
+{ + va_list argp; + va_start(argp, fmt_); + const char *s = s_; + for (const char * fmt = fmt_; *fmt != '\0'; fmt++) { + const char * t = s; + if (*s != '\0') { + if (*fmt == 'd' || *fmt == 'D') { // 32-bit/64-bit integer in decimal or hexadecimal + int64_t value = 0; + if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { + // parse hexadecimal: 0x... + s += 2; + for (; (*s >= '0' && *s <= '9') || (*s >= 'a' && *s <= 'f') || (*s >= 'A' && *s <= 'F'); s++) { + if (*s >= 'a' && *s <= 'f') { + value = value * 16 + *s - 'a' + 10; + } + else if (*s >= 'A' && *s <= 'F') { + value = value * 16 + *s - 'A' + 10; + } + else { + value = value * 16 + *s - '0'; + } + } + } + else { + // parse decimal string + int sign = 1; + if (*s == '-') { + sign = -1; + s++; + } + for (; *s >= '0' && *s <= '9'; s++) + value = value * 10 + *s - '0'; + value *= sign; + } + if (*fmt == 'd') { + // 32-bit integer + *(va_arg(argp, int32_t *)) = (int32_t)value; + } + else { + // 64-bit integer + *(va_arg(argp, int64_t *)) = value; + } + } + else if (*fmt == 'f' || *fmt == 'F') { // 32-bit/64-bit floating-point + char buf[64] = { 0 }; + for (int i = 0; i < ((int)sizeof(buf) - 1) && ((*s >= '0' && *s <= '9') || *s == '.' || *s == '-' || *s == 'e'); i++) + buf[i] = *s++; + if (*fmt == 'f') { + // 32-bit float + *(va_arg(argp, float *)) = (float)atof(buf); + } + else { + // 64-bit double + *(va_arg(argp, double *)) = atof(buf); + } + } + else if (*fmt == 'c') { // color format + if (s[0] && s[1] && s[2] && s[3]) { + *(va_arg(argp, vx_df_image *)) = (vx_df_image)VX_DF_IMAGE(s[0], s[1], s[2], s[3]); + s += 4; + } + } + else if (*fmt == 's' || *fmt == 'S') { // string of upto 64-bytes/256-bytes until ',', ':', or end-of-string + int maxStringBufferLength = (*fmt == 'S') ? 256 : 64; + char * p = va_arg(argp, char *); + if (!_strnicmp(s, "https://", 8) || !_strnicmp(s, "http://", 7) || !_strnicmp(s, "file://", 7) || + (((s[0] >= 'a' && s[0] <= 'z') || (s[0] >= 'A' && s[0] <= 'Z')) && s[1] == ':' && s[2] == '\\')) + { + // started with drive letter or url, so copy prefix string to avoid use of ':' as end marker + int len = (s[1] == ':') ? 3 : ((s[4] == ':') ? 7 : 8); + strncpy(p, s, len); + p += len; s += len; + maxStringBufferLength -= len; + } + // copy till end of string or ',' or ':' + for (; (*s != '\0') && (*s != ',') && (*s != ':') && (--maxStringBufferLength > 0);) + *p++ = *s++; + *p = 0; + } + else if (*fmt == *s) { // skip matched seperators in fmt + s++; + } + } + // check to make sure that at least one character from input has been used for parsing the current parameter + if (s == t) { + printf("ERROR: ScanParameters: invalid string syntax=[%s] fmt=[%s] s=[%s]\n", syntax, fmt_, s_); + throw - 1; + } + } + va_end(argp); + return s; +} diff --git a/runvx/vxParameter.h b/runvx/vxParameter.h new file mode 100644 index 0000000..f91586d --- /dev/null +++ b/runvx/vxParameter.h @@ -0,0 +1,148 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __VX_PARAMETER_H__ +#define __VX_PARAMETER_H__ + +#include "vxUtils.h" + +// constants local to vxParameter.h/cpp +#define MAX_FILE_NAME_LENGTH 1024 +#define MAX_MODE_STRING_LENGTH 16 + +// CVxParameter: base-class for all type of objects +class CVxParameter +{ +public: + // constructor and destructor + CVxParameter(); + virtual ~CVxParameter(); + + // mechanism to pass global (vxEngine) parameter map to help access other objects by name + void SetParamMap(std::map * paramMap) { m_paramMap = paramMap; } + void SetUserStructMap(std::map * userStructMap){ m_userStructMap = userStructMap; } + bool IsUsingMultiFrameCapture(){ return m_usingMultiFrameCapture; } + void SetCaptureFrameStart(vx_uint32 frameStart) { m_captureFrameStart = frameStart; } + void SetVerbose(bool verbose) { m_verbose = verbose; } + void SetAbortOnMismatch(bool abortOnCompareMismatch) { m_abortOnCompareMismatch = abortOnCompareMismatch; } + + // Initialize: create OpenVX object and further uses InitializeIO to input/output initialization + // desc: object description as specified on command-line or in script + // returns 0 on SUCCESS, else error code + virtual int Initialize(vx_context context, vx_graph graph, const char * desc) = 0; + + // InitializeIO: performs I/O initialization using the OpenVX object already created + // ref: OpenVX object already created + // io_params: I/O description as specified on command-line or in script + // returns 0 on SUCCESS, else error code + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) = 0; + + // Finalize: for final initialization after vxVerifyGraph + // meant for querying object parameters which are not available until vxVerifyGraph + virtual int Finalize() = 0; + + // get OpenVX object reference + vx_reference& GetVxObject() { return m_vxObjRef; } + const char * GetVxObjectName(); + + // get OpenVX object type (e.g., VX_TYPE_IMAGE, VX_TYPE_SCALAR, ...) 
+ vx_enum GetVxObjectType() { return m_vxObjType; } + + // frame-level read, write, and compare + // returns 0 on SUCCESS, else error code + // ReadFrame() returns +ve value to indicate data unavailability + virtual int ReadFrame(int frameNumber) = 0; + virtual int WriteFrame(int frameNumber) = 0; + virtual int CompareFrame(int frameNumber) = 0; + + // helper functions + // GetDisplayName -- returns DISPLAY name specified as part of ":W,DISPLAY-" I/O request + // DisableWaitForKeyPress -- mark that there is no need to wait at the end + // showing the last image output on OpenCV window + // TBD: change getDisplayName to GetDisplayName + // TBD: remove DisableWaitForKeyPress and have the final wait in top-level (i.e,. vxEngine) + string getDisplayName() { return m_displayName; } + virtual void DisableWaitForKeyPress(); + +protected: + // global parameter map to access VX objects by name + std::map * m_paramMap; + // keep track of objects for cross referencing across them (e.g., image needs arrays for displaying keypoints) + static list m_paramList; + // global user defined struct map to access user defined structs + std::map * m_userStructMap; + // DISPLAY name specified as part of ":W,DISPLAY-" I/O request + // NOTE: when not specified, this will be an empty string + string m_displayName; + // VX Object Type + vx_enum m_vxObjType; + // VX Object Reference + vx_reference m_vxObjRef; + char m_vxObjName[64]; + // I/O variables + // TBD: add comment describing purpose of each of the variables below + string m_fileNameRead; + string m_fileNameWrite; + string m_fileNameCompare; + bool m_fileNameForReadHasIndex; + bool m_fileNameForWriteHasIndex; + bool m_fileNameForCompareHasIndex; + FILE * m_fpRead; + FILE * m_fpWrite; + FILE * m_fpCompare; + bool m_verbose; + bool m_abortOnCompareMismatch; + // for multi-frame capture support + bool m_usingMultiFrameCapture; + vx_uint32 m_captureFrameStart; + // Data shared for viewing + struct ArrayItemForView { vx_enum itemtype; int colorIndex; int x, y; float strength; int w, h; }; + std::vector m_arrayListForView; + +public: + // utility functions for m_arrayListForView + void ResetArrayListForView(); + void AddToArrayListForView(int colorIndex, int x, int y, float strength); // adds keypoint + void AddToArrayListForView(int colorIndex, int x, int y); // adds coordinates2d + size_t GetArrayListForViewCount() { return m_arrayListForView.size(); } + const ArrayItemForView * GetArrayListForViewItemAt(size_t index) { return &m_arrayListForView[index]; } +}; + +// parse the description of a data object and create parameter object: this function +// creates different kinds of CVxParamTYPE class objects depending upon the prefix +// in desc - for example when desc is "image:..." an object of type CVxParamImage +// will be created and initialized. It will return nullptr on error. +CVxParameter * CreateDataObject(vx_context context, vx_graph graph, map * m_paramMap, map * m_userStructMap, const char * desc, vx_uint32 captureFrameStart); +CVxParameter * CreateDataObject(vx_context context, vx_graph graph, vx_reference ref, const char * params, vx_uint32 captureFrameStart); + +/*! \brief Parse parameter strings. +* \details This creates a top-level object context for OpenVX. +* \param [in] s The input string. +* \param [in] syntax The syntax description for error messaging. +* \param [in] fmt The format string: d(32-bit integer) D(64-bit integer) f(float) F(double) c(color-format) s(string upto 64-chars) S(string upto 256-chars). +* \param [in] ... 
+
+#endif /* __VX_PARAMETER_H__ */
\ No newline at end of file
diff --git a/runvx/vxPyramid.cpp b/runvx/vxPyramid.cpp
new file mode 100644
index 0000000..f02e6ed
--- /dev/null
+++ b/runvx/vxPyramid.cpp
@@ -0,0 +1,395 @@
+/*
+Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include "vxPyramid.h"
+
+///////////////////////////////////////////////////////////////////////
+// class CVxParamPyramid
+//
+CVxParamPyramid::CVxParamPyramid()
+{
+    // vx configuration
+    m_vxObjType = VX_TYPE_PYRAMID;
+    m_format = VX_DF_IMAGE_VIRT;
+    m_width = 0;
+    m_height = 0;
+    m_numLevels = 0;
+    m_scale = 0;
+    // vx object
+    m_pyramid = nullptr;
+    // I/O configuration
+    m_comparePixelErrorMin = 0;
+    m_comparePixelErrorMax = 0;
+    m_useCheckSumForCompare = false;
+    m_generateCheckSumForCompare = false;
+    m_compareCountMatches = 0;
+    m_compareCountMismatches = 0;
+    m_bufForCompare = nullptr;
+    m_imageFrameSize = nullptr;
+    m_rectFullLevel = nullptr;
+    m_rectCompareLevel = nullptr;
+    m_fpReadImage = nullptr;
+    m_fpWriteImage = nullptr;
+    m_fpCompareImage = nullptr;
+}
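+
+// Illustrative runvx descriptor for this class (a sketch only -- the file name is invented):
+//     pyramid:4,half,640,480,U008:write,pyr_level%d.raw
+// This creates a 4-level half-scale U008 pyramid with a 640x480 base level and writes each
+// level to a file whose name is expanded from the printf-style template (one file per level).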
"CHECKSUM" : "COMPARE", m_compareCountMatches, GetVxObjectName()); + } + if (m_pyramid) { + vxReleasePyramid(&m_pyramid); + m_pyramid = nullptr; + } + if (m_bufForCompare) { + delete[] m_bufForCompare; + m_bufForCompare = nullptr; + } + if (m_imageFrameSize) { + delete[] m_imageFrameSize; + m_imageFrameSize = nullptr; + } + if (m_rectFullLevel) { + delete[] m_rectFullLevel; + m_rectFullLevel = nullptr; + } + if (m_rectCompareLevel) { + delete[] m_rectCompareLevel; + m_rectCompareLevel = nullptr; + } + if (m_fpReadImage) { + for (vx_size level = 0; level < m_numLevels; level++) + if (m_fpReadImage[level]) fclose(m_fpReadImage[level]); + m_fpReadImage = nullptr; + } + if (m_fpWriteImage) { + for (vx_size level = 0; level < m_numLevels; level++) + if (m_fpWriteImage[level]) fclose(m_fpWriteImage[level]); + m_fpWriteImage = nullptr; + } + if (m_fpCompareImage) { + for (vx_size level = 0; level < m_numLevels; level++) + if (m_fpCompareImage[level]) fclose(m_fpCompareImage[level]); + m_fpCompareImage = nullptr; + } + return 0; +} + +int CVxParamPyramid::Initialize(vx_context context, vx_graph graph, const char * desc) +{ + // get object parameters: syntax: pyramid[-virtual]:,half|orb|,,,[:] + char objType[64], scaleFactor[64]; + const char * ioParams = ScanParameters(desc, "pyramid|pyramid-virtual:,half|orb|,,,", "s:D,s,d,d,c", objType, &m_numLevels, scaleFactor, &m_width, &m_height, &m_format); + if (!_strnicmp(scaleFactor, "half", 4)) m_scale = VX_SCALE_PYRAMID_HALF; + else if (!_strnicmp(scaleFactor, "orb", 3)) m_scale = VX_SCALE_PYRAMID_ORB; + else m_scale = (float)atof(scaleFactor); + + // create pyarmid object + m_pyramid = nullptr; + if (!_stricmp(objType, "pyramid")) { + m_pyramid = vxCreatePyramid(context, m_numLevels, m_scale, m_width, m_height, m_format); + } + else if (!_stricmp(objType, "pyramid-virtual")) { + m_pyramid = vxCreateVirtualPyramid(graph, m_numLevels, m_scale, m_width, m_height, m_format); + } + else ReportError("ERROR: invalid pyramid type: %s\n", objType); + vx_status ovxStatus = vxGetStatus((vx_reference)m_pyramid); + if (ovxStatus != VX_SUCCESS){ + printf("ERROR: pyramid creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + if (m_pyramid) vxReleasePyramid(&m_pyramid); + throw - 1; + } + + // io initialize + return InitializeIO(context, graph, (vx_reference)m_pyramid, ioParams); +} + +int CVxParamPyramid::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) +{ + // save reference object and get object attributes + m_vxObjRef = ref; + m_pyramid = (vx_pyramid)m_vxObjRef; + ERROR_CHECK(vxQueryPyramid(m_pyramid, VX_PYRAMID_ATTRIBUTE_FORMAT, &m_format, sizeof(m_format))); + ERROR_CHECK(vxQueryPyramid(m_pyramid, VX_PYRAMID_ATTRIBUTE_WIDTH, &m_width, sizeof(m_width))); + ERROR_CHECK(vxQueryPyramid(m_pyramid, VX_PYRAMID_ATTRIBUTE_HEIGHT, &m_height, sizeof(m_height))); + ERROR_CHECK(vxQueryPyramid(m_pyramid, VX_PYRAMID_ATTRIBUTE_LEVELS, &m_numLevels, sizeof(m_numLevels))); + ERROR_CHECK(vxQueryPyramid(m_pyramid, VX_PYRAMID_ATTRIBUTE_SCALE, &m_scale, sizeof(m_scale))); + + // process I/O parameters + if (*io_params == ':') io_params++; + while (*io_params) { + char ioType[64], fileName[256]; + io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName); + if (!_stricmp(ioType, "read")) + { // read request syntax: read, + m_fileNameRead.assign(RootDirUpdated(fileName)); + m_fileNameForReadHasIndex = (m_fileNameRead.find("%") != m_fileNameRead.npos) ? 
true : false; + if (!m_fileNameForReadHasIndex) ReportError("ERROR: invalid pyramid input fileName (expects %%d format for each level): %s\n", ioType); + // mark multi-frame capture enabled + m_usingMultiFrameCapture = true; + } + else if (!_stricmp(ioType, "write")) + { // write request syntax: write, + bool needDisplay = false; + m_fileNameWrite.assign(RootDirUpdated(fileName)); + m_fileNameForWriteHasIndex = (m_fileNameWrite.find("%") != m_fileNameWrite.npos) ? true : false; + if (!m_fileNameForWriteHasIndex) ReportError("ERROR: invalid pyramid output fileName (expects %%d format for each level): %s\n", ioType); + } + else if (!_stricmp(ioType, "compare")) + { // compare syntax: compare,fileName[,rect{;;;}][,err{;}][,checksum|checksum-save-instead-of-test] + // save the reference image fileName + m_fileNameCompare.assign(RootDirUpdated(fileName)); + m_fileNameForCompareHasIndex = (m_fileNameCompare.find("%") != m_fileNameCompare.npos) ? true : false; + if (!m_fileNameForCompareHasIndex) ReportError("ERROR: invalid pyramid compare fileName (expects %%d format for each level): %s\n", ioType); + // initialize pixel error range for exact match + m_comparePixelErrorMin = 0; + m_comparePixelErrorMax = 0; + // set the compare region + m_rectCompare.start_x = 0; + m_rectCompare.start_y = 0; + m_rectCompare.end_x = m_width; + m_rectCompare.end_y = m_height; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",rect{;;;}|err{;}|checksum|checksum-save-instead-of-test", ",s", option); + if (!_strnicmp(option, "rect", 4)) { + ScanParameters(option + 4, "{;;;}", "{d;d;d;d}", &m_rectCompare.start_x, &m_rectCompare.start_y, &m_rectCompare.end_x, &m_rectCompare.end_y); + } + else if (!_strnicmp(option, "err", 3)) { + ScanParameters(option + 3, "{;}", "{f;f}", &m_comparePixelErrorMin, &m_comparePixelErrorMax); + if (m_useCheckSumForCompare) ReportError("ERROR: can't support error range with checksum\n"); + } + else if (!_stricmp(option, "checksum")) { + m_useCheckSumForCompare = true; + if (m_comparePixelErrorMin != m_comparePixelErrorMax) ReportError("ERROR: can't support error range with checksum\n"); + } + else if (!_stricmp(option, "checksum-save-instead-of-test")) { + m_generateCheckSumForCompare = true; + } + else ReportError("ERROR: invalid compare option: %s\n", option); + } + } + else ReportError("ERROR: invalid pyramid operation: %s\n", ioType); + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character sequence in parameter specification: %s\n", io_params); + } + + return 0; +} + +int CVxParamPyramid::Finalize() +{ + // get object attributes + ERROR_CHECK(vxQueryPyramid(m_pyramid, VX_PYRAMID_ATTRIBUTE_FORMAT, &m_format, sizeof(m_format))); + ERROR_CHECK(vxQueryPyramid(m_pyramid, VX_PYRAMID_ATTRIBUTE_WIDTH, &m_width, sizeof(m_width))); + ERROR_CHECK(vxQueryPyramid(m_pyramid, VX_PYRAMID_ATTRIBUTE_HEIGHT, &m_height, sizeof(m_height))); + + // initialize other parameters + m_compareCountMatches = 0; + m_compareCountMismatches = 0; + + // compute frame size in bytes + m_pyramidFrameSize = 0; + m_imageFrameSize = new size_t[m_numLevels]; + m_rectFullLevel = new vx_rectangle_t[m_numLevels]; + m_rectCompareLevel = new vx_rectangle_t[m_numLevels]; + for (vx_uint32 level = 0; level < (vx_uint32)m_numLevels; level++) { + // get image at current level + vx_image image = vxGetPyramidLevel(m_pyramid, level); + // get attributes and initialize pyramid level rectangles + vx_uint32 width, height; size_t num_planes; + 
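+            // NOTE: the per-plane patch sizes accumulated below add up to m_imageFrameSize[level],
+            // which is later used to size the raw-compare buffer and to detect end-of-file when
+            // reading reference data for this level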
ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_WIDTH, &width, sizeof(width))); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height))); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_PLANES, &num_planes, sizeof(num_planes))); + m_rectFullLevel[level].start_x = 0; + m_rectFullLevel[level].start_y = 0; + m_rectFullLevel[level].end_x = width; + m_rectFullLevel[level].end_y = height; + m_rectCompareLevel[level].start_x = (vx_uint32)ceil((double)m_rectCompare.start_x * width / m_width); + m_rectCompareLevel[level].start_y = (vx_uint32)ceil((double)m_rectCompare.start_y * height / m_height); + m_rectCompareLevel[level].end_x = (vx_uint32)floor((double)m_rectCompare.end_x * width / m_width); + m_rectCompareLevel[level].end_y = (vx_uint32)floor((double)m_rectCompare.end_y * height / m_height); + // compute image level frame size + m_imageFrameSize[level] = 0; + for (vx_uint32 plane = 0; plane < (vx_uint32)num_planes; plane++) { + vx_imagepatch_addressing_t addr = { 0 }; + vx_uint8 * dst = NULL; + if (vxAccessImagePatch(image, &m_rectFullLevel[level], plane, &addr, (void **)&dst, VX_READ_ONLY) == VX_SUCCESS) { + vx_size width = (addr.dim_x * addr.scale_x) / VX_SCALE_UNITY; + vx_size height = (addr.dim_y * addr.scale_y) / VX_SCALE_UNITY; + vx_size width_in_bytes = (m_format == VX_DF_IMAGE_U1_AMD) ? ((width + 7) >> 3) : (width * addr.stride_x); + m_imageFrameSize[level] += width_in_bytes * height; + ERROR_CHECK(vxCommitImagePatch(image, &m_rectFullLevel[level], plane, &addr, (void *)dst)); + } + } + ERROR_CHECK(vxReleaseImage(&image)); + // update pyramid level frame size + m_pyramidFrameSize += m_imageFrameSize[level]; + } + + // open files for read/write/compare + if (m_fileNameRead.length() > 0) { + m_fpReadImage = new FILE *[m_numLevels](); + for (vx_uint32 level = 0; level < (vx_uint32)m_numLevels; level++) { + // get width and height of current level + vx_uint32 width = 0, height = 0; + vx_image image = vxGetPyramidLevel(m_pyramid, level); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_WIDTH, &width, sizeof(width))); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height))); + ERROR_CHECK(vxReleaseImage(&image)); + // generate fileName with level, width, height in formatting and open the file + char fileName[256]; sprintf(fileName, m_fileNameRead.c_str(), level, width, height); + m_fpReadImage[level] = fopen(fileName, "rb"); + if (!m_fpReadImage[level]) ReportError("ERROR: Unable to open: %s\n", fileName); + } + } + if (m_fileNameWrite.length() > 0) { + m_fpWriteImage = new FILE *[m_numLevels](); + for (vx_uint32 level = 0; level < (vx_uint32)m_numLevels; level++) { + // get width and height of current level + vx_uint32 width = 0, height = 0; + vx_image image = vxGetPyramidLevel(m_pyramid, level); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_WIDTH, &width, sizeof(width))); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height))); + ERROR_CHECK(vxReleaseImage(&image)); + // generate fileName with level, width, height in formatting and open the file + char fileName[256]; sprintf(fileName, m_fileNameWrite.c_str(), level, width, height); + m_fpWriteImage[level] = fopen(fileName, "wb"); + if (!m_fpWriteImage[level]) ReportError("ERROR: Unable to create: %s\n", fileName); + } + } + if (m_fileNameCompare.length() > 0) { + m_fpCompareImage = new FILE *[m_numLevels](); + for (vx_uint32 level = 0; level < (vx_uint32)m_numLevels; level++) { + // get width and height of current level + 
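+            // (the level index and dimensions expand the printf-style %d fields of the compare
+            // fileName template below -- e.g., an illustrative template "ref_level%d.raw" opens
+            // ref_level0.raw, ref_level1.raw, ..., one reference file per level)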
vx_uint32 width = 0, height = 0; + vx_image image = vxGetPyramidLevel(m_pyramid, level); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_WIDTH, &width, sizeof(width))); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height))); + ERROR_CHECK(vxReleaseImage(&image)); + // generate fileName with level, width, height in formatting and open the file + char fileName[256]; sprintf(fileName, m_fileNameCompare.c_str(), level, width, height); + m_fpCompareImage[level] = fopen(fileName, m_generateCheckSumForCompare ? "w" : (m_useCheckSumForCompare ? "r" : "rb")); + if (!m_fpCompareImage[level]) ReportError("ERROR: Unable to %s: %s\n", m_generateCheckSumForCompare ? "create" : "open", fileName); + } + // allocate buffer for comparision + if (!m_useCheckSumForCompare && !m_generateCheckSumForCompare) { + NULLPTR_CHECK(m_bufForCompare = new vx_uint8[m_imageFrameSize[0]]); + } + } + + return 0; +} + +int CVxParamPyramid::ReadFrame(int frameNumber) +{ + if (!m_fpReadImage) return VX_SUCCESS; + + for (vx_uint32 level = 0; level < (vx_uint32)m_numLevels; level++) { + // get image for current level and read image + vx_image image = vxGetPyramidLevel(m_pyramid, level); + int status = ReadImage(image, &m_rectFullLevel[level], m_fpReadImage[level]); + vxReleaseImage(&image); + if (status) return status; + } + + return 0; +} + +int CVxParamPyramid::WriteFrame(int frameNumber) +{ + if (!m_fpWriteImage) return VX_SUCCESS; + + for (vx_uint32 level = 0; level < (vx_uint32)m_numLevels; level++) { + // get image for current level and write image + vx_image image = vxGetPyramidLevel(m_pyramid, level); + int status = WriteImage(image, &m_rectFullLevel[level], m_fpWriteImage[level]); + vxReleaseImage(&image); + if (status) return status; + } + + return 0; +} + +int CVxParamPyramid::CompareFrame(int frameNumber) +{ + if (!m_fpCompareImage) return VX_SUCCESS; + + for (vx_uint32 level = 0; level < (vx_uint32)m_numLevels; level++) { + // get image and fp for current level + vx_image image = vxGetPyramidLevel(m_pyramid, level); + FILE * fp = m_fpCompareImage[level]; + + if (m_generateCheckSumForCompare) + { // generate checksum ////////////////////////////////////////// + char checkSumString[64]; + ComputeChecksum(checkSumString, image, &m_rectCompareLevel[level]); + fprintf(fp, "%s\n", checkSumString); + } + else if (m_useCheckSumForCompare) + { // compare checksum ////////////////////////////////////////// + char checkSumStringRef[64] = { 0 }; + if (fscanf(fp, "%s", checkSumStringRef) != 1) { + printf("ERROR: pyramid level#%d checksum missing for %s with frame#%d\n", level, GetVxObjectName(), frameNumber); + throw - 1; + } + char checkSumString[64]; + ComputeChecksum(checkSumString, image, &m_rectCompareLevel[level]); + if (!strcmp(checkSumString, checkSumStringRef)) { + m_compareCountMatches++; + if (m_verbose) printf("OK: pyramid level#%d CHECKSUM MATCHED for %s with frame#%d\n", level, GetVxObjectName(), frameNumber); + } + else { + m_compareCountMismatches++; + printf("ERROR: pyramid level#%d CHECKSUM MISMATCHED for %s with frame#%d [%s instead of %s]\n", level, GetVxObjectName(), frameNumber, checkSumString, checkSumStringRef); + if (m_abortOnCompareMismatch) return -1; + } + } + else + { // compare raw frames ////////////////////////////////////////// + // read data from frame + size_t bytesRead = fread(m_bufForCompare, 1, m_imageFrameSize[level], fp); + if (m_imageFrameSize[level] != bytesRead) { + // no more data to compare + ReportError("ERROR: pyramid level#%d data missing for %s 
in frame#%d\n", level, GetVxObjectName(), frameNumber); + } + // compare image to reference from file + size_t errorPixelCountTotal = CompareImage(image, &m_rectCompareLevel[level], m_bufForCompare, m_comparePixelErrorMin, m_comparePixelErrorMax, frameNumber, nullptr); + if (!errorPixelCountTotal) { + m_compareCountMatches++; + if (m_verbose) printf("OK: pyramid level#%d COMPARE MATCHED for %s with frame#%d\n", level, GetVxObjectName(), frameNumber); + } + else { + m_compareCountMismatches++; + if (m_abortOnCompareMismatch) return -1; + } + } + vxReleaseImage(&image); + } + + return 0; +} diff --git a/runvx/vxPyramid.h b/runvx/vxPyramid.h new file mode 100644 index 0000000..f1908b5 --- /dev/null +++ b/runvx/vxPyramid.h @@ -0,0 +1,71 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#ifndef __VX_PYRAMID_H__ +#define __VX_PYRAMID_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +class CVxParamPyramid : public CVxParameter +{ +public: + CVxParamPyramid(); + virtual ~CVxParamPyramid(); + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + virtual int Finalize(); + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + virtual int Shutdown(); + +private: + // vx configuration + vx_df_image m_format; + vx_uint32 m_width; + vx_uint32 m_height; + vx_size m_numLevels; + vx_float32 m_scale; + // vx object + vx_pyramid m_pyramid; + // I/O configuration + float m_comparePixelErrorMin; + float m_comparePixelErrorMax; + vx_rectangle_t m_rectCompare; // rectangle used to save rectangular region used for compare + vx_rectangle_t * m_rectFullLevel; // rectangle for full image size for use by access/commit + vx_rectangle_t * m_rectCompareLevel; // rectangle for image compate at each level + vx_uint8 * m_bufForCompare; + bool m_useCheckSumForCompare; + bool m_generateCheckSumForCompare; + int m_compareCountMatches; + int m_compareCountMismatches; + size_t m_pyramidFrameSize; + size_t * m_imageFrameSize; + FILE ** m_fpReadImage; + FILE ** m_fpWriteImage; + FILE ** m_fpCompareImage; +}; + +#endif /* __VX_PYRAMID_H__ */ \ No newline at end of file diff --git a/runvx/vxRemap.cpp b/runvx/vxRemap.cpp new file mode 100644 index 0000000..d927ef4 --- /dev/null +++ b/runvx/vxRemap.cpp @@ -0,0 +1,344 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include "vxRemap.h"
+
+///////////////////////////////////////////////////////////////////////
+// class CVxParamRemap
+//
+CVxParamRemap::CVxParamRemap()
+{
+    // vx configuration
+    m_vxObjType = VX_TYPE_REMAP;
+    m_srcWidth = 0;
+    m_srcHeight = 0;
+    m_dstWidth = 0;
+    m_dstHeight = 0;
+    // I/O configuration
+    m_readFileIsBinary = false;
+    m_compareCountMatches = 0;
+    m_compareCountMismatches = 0;
+    m_useSyncOpenCLWriteDirective = false;
+    m_xyErr[0] = m_xyErr[1] = 0.0f;
+    // vx object
+    m_remap = nullptr;
+}
+
+CVxParamRemap::~CVxParamRemap()
+{
+    Shutdown();
+}
+
+int CVxParamRemap::Shutdown(void)
+{
+    if (m_compareCountMatches > 0 && m_compareCountMismatches == 0) {
+        printf("OK: remap COMPARE MATCHED for %d frame(s) of %s\n", m_compareCountMatches, GetVxObjectName());
+    }
+    if (m_remap) {
+        vxReleaseRemap(&m_remap);
+        m_remap = nullptr;
+    }
+    return 0;
+}
+
+int CVxParamRemap::Initialize(vx_context context, vx_graph graph, const char * desc)
+{
+    // get object parameters and create object
+    // syntax: remap:<srcWidth>,<srcHeight>,<dstWidth>,<dstHeight>[:<io-params>]
+    char objType[64];
+    const char * ioParams = ScanParameters(desc, "remap:<srcWidth>,<srcHeight>,<dstWidth>,<dstHeight>", "s:d,d,d,d", objType, &m_srcWidth, &m_srcHeight, &m_dstWidth, &m_dstHeight);
+    if (!_stricmp(objType, "remap")) {
+        m_remap = vxCreateRemap(context, m_srcWidth, m_srcHeight, m_dstWidth, m_dstHeight);
+    }
+    else ReportError("ERROR: invalid remap type: %s\n", objType);
+    vx_status ovxStatus = vxGetStatus((vx_reference)m_remap);
+    if (ovxStatus != VX_SUCCESS) {
+        printf("ERROR: remap creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus));
+        if (m_remap) vxReleaseRemap(&m_remap);
+        throw - 1;
+    }
+
+    // io initialize
+    return InitializeIO(context, graph, (vx_reference)m_remap, ioParams);
+}
+
+int CVxParamRemap::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params)
+{
+    // save reference object and get object attributes
+    m_vxObjRef = ref;
+    m_remap = (vx_remap)m_vxObjRef;
+    ERROR_CHECK(vxQueryRemap(m_remap, VX_REMAP_ATTRIBUTE_SOURCE_WIDTH, &m_srcWidth, sizeof(m_srcWidth)));
+    ERROR_CHECK(vxQueryRemap(m_remap, VX_REMAP_ATTRIBUTE_SOURCE_HEIGHT, &m_srcHeight, sizeof(m_srcHeight)));
+    ERROR_CHECK(vxQueryRemap(m_remap, VX_REMAP_ATTRIBUTE_DESTINATION_WIDTH, &m_dstWidth, sizeof(m_dstWidth)));
+    ERROR_CHECK(vxQueryRemap(m_remap, VX_REMAP_ATTRIBUTE_DESTINATION_HEIGHT, &m_dstHeight, sizeof(m_dstHeight)));
+
+    // process I/O parameters
+    if (*io_params == ':') io_params++;
+    while (*io_params) {
+        char ioType[64], fileName[256];
+        io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName);
+        if (!_stricmp(ioType, "read")) {
+            m_fileNameRead.assign(RootDirUpdated(fileName));
+            m_usingMultiFrameCapture = (m_fileNameRead.find("%") != std::string::npos) ? true : false;
+            m_readFileIsBinary = (m_fileNameRead.find(".txt") != m_fileNameRead.npos) ?
false : true; + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",ascii|binary", ",s", option); + if (!_stricmp(option, "ascii")) { + m_readFileIsBinary = false; + } + else if (!_stricmp(option, "binary")) { + m_readFileIsBinary = true; + } + else ReportError("ERROR: invalid remap read option: %s\n", option); + } + } + else if (!_stricmp(ioType, "write")) { + m_fileNameWrite.assign(RootDirUpdated(fileName)); + } + else if (!_stricmp(ioType, "compare")) + { // compare request syntax: compare,[,err{;}] + m_fileNameCompare.assign(RootDirUpdated(fileName)); + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",err{;}", ",s", option); + if (!_strnicmp(option, "err{", 4)) { + ScanParameters(&option[3], "{;}", "{f;f}", &m_xyErr[0], &m_xyErr[1]); + } + else ReportError("ERROR: invalid remap compare option: %s\n", option); + } + } + else if (!_stricmp(ioType, "directive,sync-cl-write")) { + m_useSyncOpenCLWriteDirective = true; + } + else if (!_stricmp(ioType, "init")) { + const char * patternName = fileName; + if (!_stricmp(patternName, "same")) { + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 sx = (vx_float32)x, sy = (vx_float32)y; + vx_status status = vxSetRemapPoint(m_remap, x, y, sx, sy); + if (status) { + printf("ERROR: vxSetRemapPoint(*,%d,%d,%g,%g) failed, status = %d\n", x, y, sx, sy, status); + return -1; + } + } + } + } + else if (!_stricmp(patternName, "rotate-90")) { + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 sx = (vx_float32)m_dstHeight - 1 - y, sy = (vx_float32)x; + vx_status status = vxSetRemapPoint(m_remap, x, y, sx, sy); + if (status) { + printf("ERROR: vxSetRemapPoint(*,%d,%d,%g,%g) failed, status = %d\n", x, y, sx, sy, status); + return -1; + } + } + } + } + else if (!_stricmp(patternName, "rotate-180")) { + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 sx = (vx_float32)m_dstWidth - 1 - x, sy = (vx_float32)m_dstHeight - 1 - y; + vx_status status = vxSetRemapPoint(m_remap, x, y, sx, sy); + if (status) { + printf("ERROR: vxSetRemapPoint(*,%d,%d,%g,%g) failed, status = %d\n", x, y, sx, sy, status); + return -1; + } + } + } + } + else if (!_stricmp(patternName, "rotate-270")) { + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 sx = (vx_float32)y, sy = (vx_float32)m_dstWidth - 1 - x; + vx_status status = vxSetRemapPoint(m_remap, x, y, sx, sy); + if (status) { + printf("ERROR: vxSetRemapPoint(*,%d,%d,%g,%g) failed, status = %d\n", x, y, sx, sy, status); + return -1; + } + } + } + } + else if (!_stricmp(patternName, "scale")) { + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 sx = (x + 0.5f) * (vx_float32)m_srcWidth / (vx_float32)m_dstWidth - 0.5f; + vx_float32 sy = (y + 0.5f) * (vx_float32)m_srcHeight / (vx_float32)m_dstHeight - 0.5f; + vx_status status = vxSetRemapPoint(m_remap, x, y, sx, sy); + if (status) { + printf("ERROR: vxSetRemapPoint(*,%d,%d,%g,%g) failed, status = %d\n", x, y, sx, sy, status); + return -1; + } + } + } + } + else if (!_stricmp(patternName, "hflip")) { + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 sx = (vx_float32)m_dstWidth - 1 - x, sy = (vx_float32)y; + vx_status status = vxSetRemapPoint(m_remap, x, y, sx, sy); + if (status) { + 
printf("ERROR: vxSetRemapPoint(*,%d,%d,%g,%g) failed, status = %d\n", x, y, sx, sy, status); + return -1; + } + } + } + } + else if (!_stricmp(patternName, "vflip")) { + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 sx = (vx_float32)x, sy = (vx_float32)m_dstHeight - 1 - y; + vx_status status = vxSetRemapPoint(m_remap, x, y, sx, sy); + if (status) { + printf("ERROR: vxSetRemapPoint(*,%d,%d,%g,%g) failed, status = %d\n", x, y, sx, sy, status); + return -1; + } + } + } + } + else { + printf("ERROR: invalid remap initiazation pattern name: %s\n", patternName); + return -1; + } + } + else { + printf("ERROR: invalid remap I/O operation: %s\n", ioType); + return -1; + } + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character sequence in parameter specification: %s\n", io_params); + } + return 0; +} + +int CVxParamRemap::Finalize() +{ + if (m_useSyncOpenCLWriteDirective) { + vxDirective((vx_reference)m_remap, VX_DIRECTIVE_AMD_COPY_TO_OPENCL); + } + return 0; +} + +int CVxParamRemap::ReadFrame(int frameNumber) +{ + if (m_fileNameRead.length() < 1) return 0; + + if (!m_usingMultiFrameCapture && frameNumber != m_captureFrameStart) { + // for single frame reads, there is no need to read the array again + // as it is already read into the object + return 0; + } + + // read from user specified file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameRead.c_str(), frameNumber); + FILE * fp = fopen(fileName, m_readFileIsBinary ? "rb" : "r"); + if (!fp) ReportError("ERROR: unable to open: %s\n", fileName); + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 src_xy[2]; + if (m_readFileIsBinary) { + if (fread(src_xy, sizeof(src_xy), 1, fp) != 1) + ReportError("ERROR: detected EOF at (%d,%d) on remap input file: %s\n", x, y, fileName); + } + else { + if (fscanf(fp, "%g%g", &src_xy[0], &src_xy[1]) != 2) + ReportError("ERROR: detected EOF at (%d,%d) on remap input file: %s (ASCII)\n", x, y, fileName); + } + ERROR_CHECK(vxSetRemapPoint(m_remap, x, y, src_xy[0], src_xy[1])); + } + } + fclose(fp); + + return 0; +} + +int CVxParamRemap::WriteFrame(int frameNumber) +{ + if (m_fileNameWrite.length() < 1) return 0; + + // write output into user specified file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameWrite.c_str(), frameNumber); + FILE * fp = fopen(fileName, "wb"); + if (!fp) ReportError("ERROR: unable to create: %s\n", fileName); + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 src_xy[2]; + ERROR_CHECK(vxGetRemapPoint(m_remap, x, y, &src_xy[0], &src_xy[1])); + fwrite(src_xy, sizeof(src_xy), 1, fp); + } + } + fclose(fp); + + return 0; +} + +int CVxParamRemap::CompareFrame(int frameNumber) +{ + // check if there is no user request to compare + if (m_fileNameCompare.length() < 1) return 0; + + // reading data from reference file + char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameCompare.c_str(), frameNumber); + FILE * fp = fopen(fileName, "rb"); + if (!fp) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + bool mismatchDetected = false; + int status = 0; + for (vx_uint32 y = 0; y < m_dstHeight; y++){ + for (vx_uint32 x = 0; x < m_dstWidth; x++){ + vx_float32 xy[2]; + ERROR_CHECK(vxGetRemapPoint(m_remap, x, y, &xy[0], &xy[1])); + vx_float32 xyRef[2]; + if (fread(xyRef, sizeof(xyRef), 1, fp) != 1) { + status = -1; + break; + } + if 
(fabsf(xy[0] - xyRef[0]) > m_xyErr[0] || fabsf(xy[1] - xyRef[1]) > m_xyErr[1]) { + mismatchDetected = true; + break; + } + } + if (status || mismatchDetected) + break; + } + fclose(fp); + if (status < 0) + ReportError("ERROR: detected EOF on remap comapre reference file: %s\n", fileName); + + if (mismatchDetected) { + m_compareCountMismatches++; + printf("ERROR: remap COMPARE MISMATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + if (m_abortOnCompareMismatch) return -1; + } + else { + m_compareCountMatches++; + if (m_verbose) printf("OK: remap COMPARE MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName); + } + + return 0; +} diff --git a/runvx/vxRemap.h b/runvx/vxRemap.h new file mode 100644 index 0000000..ea331ea --- /dev/null +++ b/runvx/vxRemap.h @@ -0,0 +1,60 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __VX_REMAP_H__ +#define __VX_REMAP_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +class CVxParamRemap : public CVxParameter +{ +public: + CVxParamRemap(); + virtual ~CVxParamRemap(); + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + virtual int Finalize(); + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + virtual int Shutdown(); + +private: + // vx configuration + vx_uint32 m_srcWidth; + vx_uint32 m_srcHeight; + vx_uint32 m_dstWidth; + vx_uint32 m_dstHeight; + // I/O configuration + bool m_readFileIsBinary; + int m_compareCountMatches; + int m_compareCountMismatches; + float m_xyErr[2]; + bool m_useSyncOpenCLWriteDirective; + // vx object + vx_remap m_remap; +}; + +#endif /* __VX_REMAP_H__ */ \ No newline at end of file diff --git a/runvx/vxScalar.cpp b/runvx/vxScalar.cpp new file mode 100644 index 0000000..4445170 --- /dev/null +++ b/runvx/vxScalar.cpp @@ -0,0 +1,268 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxScalar.h" + +/////////////////////////////////////////////////////////////////////// +// class CVxParamScalar +// +CVxParamScalar::CVxParamScalar() +{ + // vx configuration + m_vxObjType = VX_TYPE_SCALAR; + m_format = VX_TYPE_INVALID; + // I/O configuration + m_compareRangeInRef = false; + m_compareCountMatches = 0; + m_compareCountMismatches = 0; + // vx object + m_scalar = nullptr; +} + +CVxParamScalar::~CVxParamScalar() +{ + Shutdown(); +} + +int CVxParamScalar::Shutdown(void) +{ + if (m_compareCountMatches > 0 && m_compareCountMismatches == 0) { + printf("OK: scalar COMPARE MATCHED for %d frame(s) of %s\n", m_compareCountMatches, GetVxObjectName()); + } + GuiTrackBarShutdown((vx_reference)m_scalar); + if (m_scalar){ + vxReleaseScalar(&m_scalar); + m_scalar = nullptr; + } + return 0; +} + +int CVxParamScalar::Initialize(vx_context context, vx_graph graph, const char * desc) +{ + // get object parameters and create object + char objType[64], format[64], value[256]; + const char * ioParams = ScanParameters(desc, "scalar:,", "s:s,S", objType, format, value); + if (!_stricmp(objType, "scalar")) { + m_format = ovxName2Enum(format); + if (m_format == VX_TYPE_STRING_AMD) { + m_scalar = vxCreateScalar(context, m_format, value); + } + else { + vx_uint64 v = 0; + if (!GetScalarValueFromString(m_format, value, &v)) { + m_scalar = vxCreateScalar(context, m_format, &v); + } + else ReportError("ERROR: unsupported scalar value: %s [%s:0x%08x]\n", value, format, m_format); + } + } + else ReportError("ERROR: unsupported scalar type: %s\n", desc); + vx_status ovxStatus = vxGetStatus((vx_reference)m_scalar); + if (ovxStatus != VX_SUCCESS){ + printf("ERROR: scalar creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + if (m_scalar) vxReleaseScalar(&m_scalar); + throw - 1; + } + m_vxObjRef = (vx_reference)m_scalar; + + // io initialize + return InitializeIO(context, graph, m_vxObjRef, ioParams); +} + +int CVxParamScalar::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) +{ + // save reference object and get object attributes + m_vxObjRef = ref; + m_scalar = (vx_scalar)m_vxObjRef; + ERROR_CHECK(vxQueryScalar(m_scalar, VX_SCALAR_ATTRIBUTE_TYPE, &m_format, sizeof(m_format))); + + // process I/O parameters + if (*io_params == ':') io_params++; + while (*io_params) { + char ioType[64], fileName[256]; + io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName); + if 
(!_stricmp(ioType, "read")) + { // read request syntax: read, + m_fileNameRead.assign(RootDirUpdated(fileName)); + if (*io_params == ',') { + ReportError("ERROR: invalid scalar read option: %s\n", io_params); + } + } + else if (!_stricmp(ioType, "write")) + { // write request syntax: write, + m_fileNameWrite.assign(RootDirUpdated(fileName)); + if (*io_params == ',') { + ReportError("ERROR: invalid scalar read option: %s\n", io_params); + } + } + else if (!_stricmp(ioType, "compare")) + { // compare syntax: compare,fileName[,range] + m_fileNameCompare.assign(RootDirUpdated(fileName)); + while (*io_params == ',') { + char option[64]; + io_params = ScanParameters(io_params, ",range", ",s", option); + if (!_stricmp(option, "range")) { + m_compareRangeInRef = true; + } + else ReportError("ERROR: invalid scalar compare option: %s\n", option); + } + } + else if (!_stricmp(ioType, "view")) { + m_displayName.assign(fileName); + m_paramList.push_back(this); + } + else if (!_stricmp(ioType, "ui") && (m_format == VX_TYPE_FLOAT32)) { + int id = 0; + float valueMin = 0.0f, valueMax = 1.0f, valueInc = 0.1f; + ScanParameters(fileName, "{;;;}", "{d;f;f;f}", &id, &valueMin, &valueMax, &valueInc); + GuiTrackBarInitializeScalar((vx_reference)m_scalar, id-1, valueMin, valueMax, valueInc); + } + else ReportError("ERROR: invalid scalar operation: %s\n", ioType); + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character sequence in parameter specification: %s\n", io_params); + } + + return 0; +} + +int CVxParamScalar::Finalize() +{ + // get attributes + ERROR_CHECK(vxQueryScalar(m_scalar, VX_SCALAR_ATTRIBUTE_TYPE, &m_format, sizeof(m_format))); + + return 0; +} + +int CVxParamScalar::ReadFrame(int frameNumber) +{ + // check if there is no user request to read + if (m_fileNameRead.length() < 1) return 0; + + // make sure to open the input file + if (!m_fpRead) { + const char * fileName = m_fileNameRead.c_str(); + if (!(m_fpRead = fopen(fileName, "r"))) + ReportError("ERROR: unable to open: %s\n", fileName); + } + + // read the next word and set the scalar value + char str[256]; + if (fscanf(m_fpRead, "%s", str) != 1) { + // end of file reached + return 1; + } + return WriteScalarFromString(m_scalar, str); +} + +int CVxParamScalar::WriteFrame(int frameNumber) +{ + // check if there is no user request to write + if (m_fileNameWrite.length() < 1) return 0; + + // make sure to create the output file + if (!m_fpWrite) { + const char * fileName = m_fileNameWrite.c_str(); + if (!(m_fpWrite = fopen(fileName, "w"))) + ReportError("ERROR: unable to create: %s\n", fileName); + } + + // write scalar value + char str[256]; + if (ReadScalarToString(m_scalar, str) < 0) + return -1; + fprintf(m_fpWrite, "%s\n", str); + + return 0; +} + +int CVxParamScalar::CompareFrame(int frameNumber) +{ + // check if there is no user request to compare + if (m_fileNameCompare.length() < 1) return 0; + + // make sure to open the input file + if (!m_fpCompare) { + const char * fileName = m_fileNameCompare.c_str(); + if (!(m_fpCompare = fopen(fileName, "r"))) + ReportError("ERROR: unable to open: %s\n", fileName); + } + + // read the next item for compare + char strMin[256], strMax[256]; + vx_uint64 valueRefMin = 0, valueRefMax = 0; + if (!m_compareRangeInRef) { + // read one value and set it as min as well as max + if (fscanf(m_fpCompare, "%s", strMin) != 1) + ReportError("ERROR: compare: missing data item for %s\n", GetVxObjectName()); + if (GetScalarValueFromString(m_format, strMin, &valueRefMin) < 0) 
+ ReportError("ERROR: compare: invalid data item for %s: %s\n", GetVxObjectName(), strMin); + valueRefMax = valueRefMin; + strcpy(strMax, strMin); + } + else { + // read min and max values for range compare + if (fscanf(m_fpCompare, "%s%s", strMin, strMax) != 2) + ReportError("ERROR: compare: missing data item for %s\n", GetVxObjectName()); + if (GetScalarValueFromString(m_format, strMin, &valueRefMin) < 0) + ReportError("ERROR: compare: invalid data item for %s: %s\n", GetVxObjectName(), strMin); + if (GetScalarValueFromString(m_format, strMax, &valueRefMax) < 0) + ReportError("ERROR: compare: invalid data item for %s: %s\n", GetVxObjectName(), strMax); + } + // compare the value to be within the range + vx_uint64 value = 0; + ERROR_CHECK(vxReadScalarValue(m_scalar, &value)); + bool mismatchDetected = true; + if (((m_format == VX_TYPE_FLOAT32) && (*(vx_float32 *)&value >= *(vx_float32 *)&valueRefMin) && (*(vx_float32 *)&value <= *(vx_float32 *)&valueRefMax)) + || ((m_format == VX_TYPE_FLOAT64) && (*(vx_float64 *)&value >= *(vx_float64 *)&valueRefMin) && (*(vx_float64 *)&value <= *(vx_float64 *)&valueRefMax)) + || ((m_format == VX_TYPE_DF_IMAGE) && (*(vx_df_image *)&value >= *(vx_df_image *)&valueRefMin) && (*(vx_df_image *)&value <= *(vx_df_image *)&valueRefMax)) + || ((m_format == VX_TYPE_SIZE) && (*(vx_size *)&value >= *(vx_size *)&valueRefMin) && (*(vx_size *)&value <= *(vx_size *)&valueRefMax)) + || ((m_format == VX_TYPE_ENUM) && (*(vx_enum *)&value >= *(vx_enum *)&valueRefMin) && (*(vx_enum *)&value <= *(vx_enum *)&valueRefMax)) + || ((m_format == VX_TYPE_BOOL) && (*(vx_bool *)&value >= *(vx_bool *)&valueRefMin) && (*(vx_bool *)&value <= *(vx_bool *)&valueRefMax)) + || ((m_format == VX_TYPE_UINT64) && (*(vx_uint64 *)&value >= *(vx_uint64 *)&valueRefMin) && (*(vx_uint64 *)&value <= *(vx_uint64 *)&valueRefMax)) + || ((m_format == VX_TYPE_UINT32) && (*(vx_uint32 *)&value >= *(vx_uint32 *)&valueRefMin) && (*(vx_uint32 *)&value <= *(vx_uint32 *)&valueRefMax)) + || ((m_format == VX_TYPE_UINT16) && (*(vx_uint16 *)&value >= *(vx_uint16 *)&valueRefMin) && (*(vx_uint16 *)&value <= *(vx_uint16 *)&valueRefMax)) + || ((m_format == VX_TYPE_UINT8) && (*(vx_uint8 *)&value >= *(vx_uint8 *)&valueRefMin) && (*(vx_uint8 *)&value <= *(vx_uint8 *)&valueRefMax)) + || ((m_format == VX_TYPE_INT64) && (*(vx_int64 *)&value >= *(vx_int64 *)&valueRefMin) && (*(vx_int64 *)&value <= *(vx_int64 *)&valueRefMax)) + || ((m_format == VX_TYPE_INT32) && (*(vx_int32 *)&value >= *(vx_int32 *)&valueRefMin) && (*(vx_int32 *)&value <= *(vx_int32 *)&valueRefMax)) + || ((m_format == VX_TYPE_INT16) && (*(vx_int16 *)&value >= *(vx_int16 *)&valueRefMin) && (*(vx_int16 *)&value <= *(vx_int16 *)&valueRefMax)) + || ((m_format == VX_TYPE_INT8) && (*(vx_int8 *)&value >= *(vx_int8 *)&valueRefMin) && (*(vx_int8 *)&value <= *(vx_int8 *)&valueRefMax)) + || ((m_format == VX_TYPE_CHAR) && (*(vx_char *)&value >= *(vx_char *)&valueRefMin) && (*(vx_char *)&value <= *(vx_char *)&valueRefMax))) + { + mismatchDetected = false; + } + + char str[256]; + ReadScalarToString(m_scalar, str); + if (mismatchDetected) { + m_compareCountMismatches++; + printf("ERROR: scalar COMPARE MISMATCHED for %s with frame#%d: %s in [%s .. 
%s]\n", GetVxObjectName(), frameNumber, str, strMin, strMax); + if (m_abortOnCompareMismatch) return -1; + } + else { + m_compareCountMatches++; + if (m_verbose) printf("OK: scalar COMPARE MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, str); + } + + return 0; +} diff --git a/runvx/vxScalar.h b/runvx/vxScalar.h new file mode 100644 index 0000000..e7d5674 --- /dev/null +++ b/runvx/vxScalar.h @@ -0,0 +1,56 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef __VX_SCALAR_H__ +#define __VX_SCALAR_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +class CVxParamScalar : public CVxParameter +{ +public: + CVxParamScalar(); + virtual ~CVxParamScalar(); + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + virtual int Finalize(); + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + virtual int Shutdown(); + +private: + // vx configuration + vx_enum m_format; + // I/O configuration + bool m_compareRangeInRef; + int m_compareCountMatches; + int m_compareCountMismatches; + // vx object + vx_scalar m_scalar; +}; + + +#endif /* __VX_SCALAR_H__ */ \ No newline at end of file diff --git a/runvx/vxThreshold.cpp b/runvx/vxThreshold.cpp new file mode 100644 index 0000000..e52d59a --- /dev/null +++ b/runvx/vxThreshold.cpp @@ -0,0 +1,163 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxThreshold.h" + +/////////////////////////////////////////////////////////////////////// +// class CVxParamThreshold +// +CVxParamThreshold::CVxParamThreshold() +{ + // vx configuration + m_vxObjType = VX_TYPE_THRESHOLD; + m_thresh_type = VX_THRESHOLD_TYPE_BINARY; + m_data_type = VX_TYPE_UINT8; + // vx object + m_threshold = nullptr; +} + +CVxParamThreshold::~CVxParamThreshold() +{ + Shutdown(); +} + +int CVxParamThreshold::Shutdown(void) +{ + if (m_threshold) { + vxReleaseThreshold(&m_threshold); + m_threshold = nullptr; + } + return 0; +} + +int CVxParamThreshold::Initialize(vx_context context, vx_graph graph, const char * desc) +{ + // get object parameters and create object + char objType[64], thresh_type[64], data_type[64]; + const char * ioParams = ScanParameters(desc, "threshold:,", "s:s,s", objType, thresh_type, data_type); + if (!_stricmp(objType, "threshold")) { + m_thresh_type = ovxName2Enum(thresh_type); + m_data_type = ovxName2Enum(data_type); + m_threshold = vxCreateThreshold(context, m_thresh_type, m_data_type); + } + else ReportError("ERROR: unsupported threshold type: %s\n", desc); + vx_status ovxStatus = vxGetStatus((vx_reference)m_threshold); + if (ovxStatus != VX_SUCCESS){ + printf("ERROR: threshold creation failed => %d (%s)\n", ovxStatus, ovxEnum2Name(ovxStatus)); + if (m_threshold) vxReleaseThreshold(&m_threshold); + throw - 1; + } + m_vxObjRef = (vx_reference)m_threshold; + + // io initialize + return InitializeIO(context, graph, m_vxObjRef, ioParams); +} + +int CVxParamThreshold::InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params) +{ + // save reference object and get object attributes + m_vxObjRef = ref; + m_threshold = (vx_threshold)m_vxObjRef; + ERROR_CHECK(vxQueryThreshold(m_threshold, VX_THRESHOLD_ATTRIBUTE_TYPE, &m_thresh_type, sizeof(m_thresh_type))); + ERROR_CHECK(vxQueryThreshold(m_threshold, VX_THRESHOLD_ATTRIBUTE_DATA_TYPE, &m_data_type, sizeof(m_data_type))); + + // process I/O parameters + if (*io_params == ':') io_params++; + while (*io_params) { + char ioType[64], fileName[256]; + io_params = ScanParameters(io_params, ",", "s,S", ioType, fileName); + if (!_stricmp(ioType, "read")) + { // read request syntax: read, + m_fileNameRead.assign(RootDirUpdated(fileName)); + if (*io_params == ',') { + ReportError("ERROR: invalid threshold read option: %s\n", io_params); + } + } + else if (!_stricmp(ioType, "init")) { + if (m_thresh_type == VX_THRESHOLD_TYPE_RANGE) { + vx_int32 lower = 0, upper = 0; + ScanParameters(fileName, "", "d", &lower); + io_params = ScanParameters(io_params, ",", ",d", &upper); + ERROR_CHECK(vxSetThresholdAttribute(m_threshold, VX_THRESHOLD_ATTRIBUTE_THRESHOLD_LOWER, &lower, sizeof(vx_int32))); + ERROR_CHECK(vxSetThresholdAttribute(m_threshold, VX_THRESHOLD_ATTRIBUTE_THRESHOLD_UPPER, &upper, sizeof(vx_int32))); + } + else { + vx_int32 value = 0; + ScanParameters(fileName, "", "d", &value); + ERROR_CHECK(vxSetThresholdAttribute(m_threshold, VX_THRESHOLD_ATTRIBUTE_THRESHOLD_VALUE, &value, sizeof(vx_int32))); + } + } + else ReportError("ERROR: invalid threshold operation: %s\n", ioType); + if (*io_params == ':') io_params++; + else if (*io_params) ReportError("ERROR: unexpected character 
sequence in parameter specification: %s\n", io_params); + } + + return 0; +} + +int CVxParamThreshold::Finalize() +{ + return 0; +} + +int CVxParamThreshold::ReadFrame(int frameNumber) +{ + // check if there is no user request to read + if (m_fileNameRead.length() < 1) return 0; + + // make sure to open the input file + if (!m_fpRead) { + const char * fileName = m_fileNameRead.c_str(); + if (!(m_fpRead = fopen(fileName, "r"))) + ReportError("ERROR: unable to open: %s\n", fileName); + } + + // read the next word(s) and set the threshold + if (m_thresh_type == VX_THRESHOLD_TYPE_RANGE) { + vx_int32 lower = 0, upper = 0; + if (fscanf(m_fpRead, "%i%i", &lower, &upper) != 2) + return 1; // end of file reached + ERROR_CHECK(vxSetThresholdAttribute(m_threshold, VX_THRESHOLD_ATTRIBUTE_THRESHOLD_LOWER, &lower, sizeof(vx_int32))); + ERROR_CHECK(vxSetThresholdAttribute(m_threshold, VX_THRESHOLD_ATTRIBUTE_THRESHOLD_UPPER, &upper, sizeof(vx_int32))); + } + else { + vx_int32 value = 0; + if (fscanf(m_fpRead, "%i", &value) != 1) + return 1; // end of file reached + ERROR_CHECK(vxSetThresholdAttribute(m_threshold, VX_THRESHOLD_ATTRIBUTE_THRESHOLD_VALUE, &value, sizeof(vx_int32))); + } + + return 0; +} + +int CVxParamThreshold::WriteFrame(int frameNumber) +{ + return 0; +} + +int CVxParamThreshold::CompareFrame(int frameNumber) +{ + return 0; +} diff --git a/runvx/vxThreshold.h b/runvx/vxThreshold.h new file mode 100644 index 0000000..f615760 --- /dev/null +++ b/runvx/vxThreshold.h @@ -0,0 +1,52 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#ifndef __VX_THRESHOLD_H__ +#define __VX_THRESHOLD_H__ + +#include "vxParameter.h" +#include "vxParamHelper.h" +#include "vxUtils.h" + +class CVxParamThreshold : public CVxParameter +{ +public: + CVxParamThreshold(); + ~CVxParamThreshold(); + virtual int Initialize(vx_context context, vx_graph graph, const char * desc); + virtual int InitializeIO(vx_context context, vx_graph graph, vx_reference ref, const char * io_params); + virtual int Finalize(); + virtual int ReadFrame(int frameNumber); + virtual int WriteFrame(int frameNumber); + virtual int CompareFrame(int frameNumber); + virtual int Shutdown(); + +private: + // vx configuration + vx_enum m_thresh_type; + vx_enum m_data_type; + // vx object + vx_threshold m_threshold; +}; + +#endif /* __VX_THRESHOLD_H__ */ \ No newline at end of file diff --git a/runvx/vxUtils.cpp b/runvx/vxUtils.cpp new file mode 100644 index 0000000..0ea59ac --- /dev/null +++ b/runvx/vxUtils.cpp @@ -0,0 +1,722 @@ +/* +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include "vxUtils.h" + +#define IS_ALPHA(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z')) +#define TO_UPPER(c) ((c) & 0xDF) + +// enumeration constants +static struct { const char * name; vx_enum value; } s_table_constants[] = { + { "CHANNEL_0|VX_CHANNEL_0", VX_CHANNEL_0 }, + { "CHANNEL_1|VX_CHANNEL_1", VX_CHANNEL_1 }, + { "CHANNEL_2|VX_CHANNEL_2", VX_CHANNEL_2 }, + { "CHANNEL_3|VX_CHANNEL_3", VX_CHANNEL_3 }, + { "CHANNEL_R|VX_CHANNEL_R", VX_CHANNEL_R }, + { "CHANNEL_G|VX_CHANNEL_G", VX_CHANNEL_G }, + { "CHANNEL_B|VX_CHANNEL_B", VX_CHANNEL_B }, + { "CHANNEL_A|VX_CHANNEL_A", VX_CHANNEL_A }, + { "CHANNEL_Y|VX_CHANNEL_Y", VX_CHANNEL_Y }, + { "CHANNEL_U|VX_CHANNEL_U", VX_CHANNEL_U }, + { "CHANNEL_V|VX_CHANNEL_V", VX_CHANNEL_V }, + { "WRAP|VX_CONVERT_POLICY_WRAP", VX_CONVERT_POLICY_WRAP }, + { "SATURATE|VX_CONVERT_POLICY_SATURATE", VX_CONVERT_POLICY_SATURATE }, + { "NEAREST_NEIGHBOR|VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR", VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR }, + { "BILINEAR|VX_INTERPOLATION_TYPE_BILINEAR", VX_INTERPOLATION_TYPE_BILINEAR }, + { "AREA|VX_INTERPOLATION_TYPE_AREA", VX_INTERPOLATION_TYPE_AREA }, + { "BINARY|VX_THRESHOLD_TYPE_BINARY", VX_THRESHOLD_TYPE_BINARY }, + { "RANGE|VX_THRESHOLD_TYPE_RANGE", VX_THRESHOLD_TYPE_RANGE }, + { "NORM_L1|VX_NORM_L1", VX_NORM_L1 }, + { "NORM_L2|VX_NORM_L2", VX_NORM_L2 }, + { "ROUND_POLICY_TO_ZERO|VX_ROUND_POLICY_TO_ZERO", VX_ROUND_POLICY_TO_ZERO }, + { "ROUND_POLICY_TO_NEAREST_EVEN|VX_ROUND_POLICY_TO_NEAREST_EVEN", VX_ROUND_POLICY_TO_NEAREST_EVEN }, + { "CRITERIA_ITERATIONS|VX_TERM_CRITERIA_ITERATIONS", VX_TERM_CRITERIA_ITERATIONS }, + { "CRITERIA_EPSILON|VX_TERM_CRITERIA_EPSILON", VX_TERM_CRITERIA_EPSILON }, + { "CRITERIA_BOTH|VX_TERM_CRITERIA_BOTH", VX_TERM_CRITERIA_BOTH }, + { "RECTANGLE|VX_TYPE_RECTANGLE", VX_TYPE_RECTANGLE }, + { "KEYPOINT|VX_TYPE_KEYPOINT", VX_TYPE_KEYPOINT }, + { "COORDINATES2D|VX_TYPE_COORDINATES2D", VX_TYPE_COORDINATES2D }, + { "COORDINATES3D|VX_TYPE_COORDINATES3D", VX_TYPE_COORDINATES3D }, + { "ENUM|VX_TYPE_ENUM", VX_TYPE_ENUM }, + { "UINT64|VX_TYPE_UINT64", VX_TYPE_UINT64 }, + { "INT64|VX_TYPE_INT64", VX_TYPE_INT64 }, + { "UINT32|VX_TYPE_UINT32", VX_TYPE_UINT32 }, + { "INT32|VX_TYPE_INT32", VX_TYPE_INT32 }, + { "UINT16|VX_TYPE_UINT16", VX_TYPE_UINT16 }, + { "INT16|VX_TYPE_INT16", VX_TYPE_INT16 }, + { "UINT8|VX_TYPE_UINT8", VX_TYPE_UINT8 }, + { "INT8|VX_TYPE_INT8", VX_TYPE_INT8 }, + { "FLOAT32|VX_TYPE_FLOAT32", VX_TYPE_FLOAT32 }, + { "FLOAT64|VX_TYPE_FLOAT64", VX_TYPE_FLOAT64 }, + { "SIZE|VX_TYPE_SIZE", VX_TYPE_SIZE }, + { "BOOL|VX_TYPE_BOOL", VX_TYPE_BOOL }, + { "CHAR|VX_TYPE_CHAR", VX_TYPE_CHAR }, + { "STRING|VX_TYPE_STRING_AMD", VX_TYPE_STRING_AMD }, + { "BORDER_MODE_UNDEFINED|VX_BORDER_MODE_UNDEFINED", VX_BORDER_MODE_UNDEFINED }, + { "BORDER_MODE_REPLICATE|VX_BORDER_MODE_REPLICATE", VX_BORDER_MODE_REPLICATE }, + { "BORDER_MODE_CONSTANT|VX_BORDER_MODE_CONSTANT", VX_BORDER_MODE_CONSTANT }, + { "VX_DIRECTIVE_DISABLE_LOGGING", VX_DIRECTIVE_DISABLE_LOGGING }, + { "VX_DIRECTIVE_ENABLE_LOGGING", VX_DIRECTIVE_ENABLE_LOGGING }, + { "VX_DIRECTIVE_READ_ONLY", VX_DIRECTIVE_AMD_READ_ONLY }, + // error codes + { "VX_ERROR_REFERENCE_NONZERO", VX_ERROR_REFERENCE_NONZERO }, + { "VX_ERROR_MULTIPLE_WRITERS", VX_ERROR_MULTIPLE_WRITERS }, + { "VX_ERROR_GRAPH_ABANDONED", VX_ERROR_GRAPH_ABANDONED }, + { "VX_ERROR_GRAPH_SCHEDULED", VX_ERROR_GRAPH_SCHEDULED }, + { "VX_ERROR_INVALID_SCOPE", VX_ERROR_INVALID_SCOPE }, + { "VX_ERROR_INVALID_NODE", VX_ERROR_INVALID_NODE }, + { 
"VX_ERROR_INVALID_GRAPH", VX_ERROR_INVALID_GRAPH }, + { "VX_ERROR_INVALID_TYPE", VX_ERROR_INVALID_TYPE }, + { "VX_ERROR_INVALID_VALUE", VX_ERROR_INVALID_VALUE }, + { "VX_ERROR_INVALID_DIMENSION", VX_ERROR_INVALID_DIMENSION }, + { "VX_ERROR_INVALID_FORMAT", VX_ERROR_INVALID_FORMAT }, + { "VX_ERROR_INVALID_LINK", VX_ERROR_INVALID_LINK }, + { "VX_ERROR_INVALID_REFERENCE", VX_ERROR_INVALID_REFERENCE }, + { "VX_ERROR_INVALID_MODULE", VX_ERROR_INVALID_MODULE }, + { "VX_ERROR_INVALID_PARAMETERS", VX_ERROR_INVALID_PARAMETERS }, + { "VX_ERROR_OPTIMIZED_AWAY", VX_ERROR_OPTIMIZED_AWAY }, + { "VX_ERROR_NO_MEMORY", VX_ERROR_NO_MEMORY }, + { "VX_ERROR_NO_RESOURCES", VX_ERROR_NO_RESOURCES }, + { "VX_ERROR_NOT_COMPATIBLE", VX_ERROR_NOT_COMPATIBLE }, + { "VX_ERROR_NOT_ALLOCATED", VX_ERROR_NOT_ALLOCATED }, + { "VX_ERROR_NOT_SUFFICIENT", VX_ERROR_NOT_SUFFICIENT }, + { "VX_ERROR_NOT_SUPPORTED", VX_ERROR_NOT_SUPPORTED }, + { "VX_ERROR_NOT_IMPLEMENTED", VX_ERROR_NOT_IMPLEMENTED }, + // for debug purposes only + { "KEYPOINT_XYS", AGO_TYPE_KEYPOINT_XYS }, + { "VX_TYPE_LUT", VX_TYPE_LUT }, + { "VX_TYPE_DISTRIBUTION", VX_TYPE_DISTRIBUTION }, + { "VX_TYPE_PYRAMID", VX_TYPE_PYRAMID }, + { "VX_TYPE_THRESHOLD", VX_TYPE_THRESHOLD }, + { "VX_TYPE_MATRIX", VX_TYPE_MATRIX }, + { "VX_TYPE_CONVOLUTION", VX_TYPE_CONVOLUTION }, + { "VX_TYPE_SCALAR", VX_TYPE_SCALAR }, + { "VX_TYPE_ARRAY", VX_TYPE_ARRAY }, + { "VX_TYPE_IMAGE", VX_TYPE_IMAGE }, + { "VX_TYPE_REMAP", VX_TYPE_REMAP }, + { "VX_TYPE_STRING", VX_TYPE_STRING_AMD }, + { "AGO_TYPE_MEANSTDDEV_DATA", AGO_TYPE_MEANSTDDEV_DATA }, + { "AGO_TYPE_MINMAXLOC_DATA", AGO_TYPE_MINMAXLOC_DATA }, + { "AGO_TYPE_CANNY_STACK", AGO_TYPE_CANNY_STACK }, + { "AGO_TYPE_SCALE_MATRIX", AGO_TYPE_SCALE_MATRIX }, + { NULL, 0 } +}; + +// ovxEnum2Name -- the returns a global pointer, so returned string has to be saved by caller immediately +const char * ovxEnum2Name(vx_enum e) +{ + for (vx_uint32 i = 0; s_table_constants[i].name; i++) { + if (s_table_constants[i].value == e) { + static char name[128]; strcpy(name, s_table_constants[i].name); + for (int j = 0; name[j]; j++) { + if (name[j] == '|') { + name[j] = '\0'; + break; + } + } + return s_table_constants[i].name; + } + } + return NULL; +} + +// ovxEnum2String -- return enum name or hex value as a string +void ovxEnum2String(vx_enum e, char str[]) +{ + const char * name = ovxEnum2Name(e); + if (e) strcpy(str, name); + else sprintf(str, "0x%x", e); +} + +// ovxName2Enum -- returns enum corresponding to name or hex value in the input string +vx_enum ovxName2Enum(const char * name) +{ + for (vx_uint32 i = 0; s_table_constants[i].name; i++) { + char nameList[128]; strcpy(nameList, s_table_constants[i].name); + // search for name in '|' separated nameList: + // s - points to beginning of current name in nameList + // t - running pointer in nameList + for (char * s = nameList, *t = nameList;; t++) { + if (*t == '|' || *t == '\0') { + char tc = *t; *t = '\0'; + if (!_stricmp(s, name)) { + // found name, so return corresponding enum value + return s_table_constants[i].value; + } + if (tc == '\0') + break; // reached end of nameList, so abort searching + else + s = t + 1; // make s point to beginning of next name in nameList + } + } + } + // if none found, try reading as an integer (may be user wanted to specify a hex value for enum) + vx_enum value = 0; + (void)sscanf(name, "%i", &value); + return value; +} + +const char * stristr(const char * str1, const char * str2) +{ + if (!*str2) return str1; + for (const char * cp = str1; *cp; cp++) { + const char 
+ const char * s2 = str2;
+ for (const char * s1 = cp; *s1 && *s2 && (IS_ALPHA(*s1) && IS_ALPHA(*s2)) ? !(TO_UPPER(*s1) - TO_UPPER(*s2)) : !(*s1 - *s2); s1++)
+ ++s2;
+ if (!*s2)
+ return cp;
+ }
+ return nullptr;
+}
+
+///////////////////////////////////////////
+// For supporting ~ in R/W file names
+static char s_rootDir[512] = ".";
+void SetRootDir(const char * rootDir)
+{
+ strcpy(s_rootDir, rootDir);
+}
+// RootDirUpdated -- expand every '~' in filePath to the current root directory (returns a static buffer)
+const char * RootDirUpdated(const char * filePath)
+{
+ static char updatedFilePath[8192];
+ int j = 0;
+ for (int i = 0; filePath[i]; i++) {
+ if (filePath[i] != '~')
+ updatedFilePath[j++] = filePath[i];
+ else {
+ for (int k = 0; s_rootDir[k]; k++)
+ updatedFilePath[j++] = s_rootDir[k];
+ }
+ }
+ updatedFilePath[j] = 0;
+ return updatedFilePath;
+}
+
+// split -- tokenize string s on delim (when delim is ' ', any run of whitespace is treated as a separator)
+vector<string> &split(const string &s, char delim, vector<string> &elems){
+ if (delim == ' ') {
+ const char * p = s.c_str();
+ while (*p) {
+ while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')
+ p++;
+ const char * q = p;
+ while (*p && !(*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r'))
+ p++;
+ if (*q){
+ char item[1024];
+ strncpy(item, q, p - q); item[p - q] = 0;
+ elems.push_back(item);
+ }
+ }
+ }
+ else {
+ stringstream ss(s);
+ string item;
+ while (getline(ss, item, delim)){
+ elems.push_back(item);
+ }
+ }
+ return elems;
+}
+
+// convert_image_format -- pack a 4-character format string (e.g. "U008") into a VX_DF_IMAGE FOURCC value
+int convert_image_format(string format){
+ if (format.size() == 4){
+ return ((format[0]) | (format[1] << 8) | (format[2] << 16) | (format[3] << 24));
+ }
+ else{
+ printf("ERROR: %s is not a proper image format\n", format.c_str());
+ throw -1;
+ }
+}
+
+// CHasher -- MD5 checksum helper (Win32 CryptoAPI on Windows, MD5_* API elsewhere)
+CHasher::CHasher(){
+ for (int i = 0; i < 32; i++)
+ m_checkSum[i] = '0';
+ m_checkSum[32] = '\0';
+}
+
+CHasher::~CHasher(){
+ Shutdown();
+}
+
+void CHasher::Initialize(){
+#if _WIN32
+
+ DWORD dwStatus = 0;
+
+ if (!CryptAcquireContext(&m_cryptProv, NULL, MS_DEF_PROV, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT))
+ {
+ dwStatus = GetLastError();
+ printf("CryptAcquireContext failed: %d\n", dwStatus);
+ throw -1;
+ }
+
+ if (!CryptCreateHash(m_cryptProv, CALG_MD5, 0, 0, &m_cryptHash))
+ {
+ dwStatus = GetLastError();
+ printf("CryptCreateHash failed: %d\n", dwStatus);
+ CryptReleaseContext(m_cryptProv, 0);
+ throw -1;
+ }
+#else
+ if (!MD5_Init(&m_handle)) {
+ printf("ERROR: MD5_Init() failed\n");
+ }
+#endif
+}
+
+void CHasher::Process(vx_uint8 * data_ptr, vx_size count){
+#if _WIN32
+
+ DWORD dwStatus = 0;
+ if (!CryptHashData(m_cryptHash, (BYTE*)data_ptr, (DWORD)count, 0))
+ {
+ dwStatus = GetLastError();
+ printf("CryptHashData failed: %d\n", dwStatus);
+ CryptReleaseContext(m_cryptProv, 0);
+ CryptDestroyHash(m_cryptHash);
+ throw -1;
+ }
+#else
+ if (!MD5_Update(&m_handle, (unsigned char*)data_ptr, count)) {
+ printf("ERROR: MD5_Update(*,*,%d) failed\n", (int)count);
+ }
+#endif
+}
+
+const char * CHasher::GetCheckSum(){
+
+#if _WIN32
+ DWORD cbHash = 16;
+ DWORD dwStatus = 0;
+ if (!CryptGetHashParam(m_cryptHash, HP_HASHVAL, m_hash, &cbHash, 0)){
+ dwStatus = GetLastError();
+ printf("CryptGetHashParam failed: %d\n", dwStatus);
+ CryptReleaseContext(m_cryptProv, 0);
+ CryptDestroyHash(m_cryptHash);
+ throw -1;
+ }
+#else
+ if (!MD5_Final(m_hash, &m_handle)) {
+ printf("ERROR: MD5_Final() failed\n");
+ }
+#endif
+ // convert the 16-byte digest into a 32-character lowercase hex string
+ char hex[] = "0123456789abcdef";
+ for (int i = 0; i < 16; i++){
+ m_checkSum[i * 2] = hex[m_hash[i] >> 4];
+ m_checkSum[(i * 2) + 1] = hex[m_hash[i] & 0xF];
+ }
+ return m_checkSum;
+}
+
+void CHasher::Shutdown(){
+#if _WIN32
+ CryptReleaseContext(m_cryptProv, 0);
+ CryptDestroyHash(m_cryptHash);
+#endif
+}
+
+// Compute checksum of rectangular
region specified within an image +void ComputeChecksum(char checkSumString[64], vx_image image, vx_rectangle_t * rectRegion) +{ + // get number of planes + vx_df_image format = VX_DF_IMAGE_VIRT; + vx_size num_planes = 0; + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format))); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_PLANES, &num_planes, sizeof(num_planes))); + // compute checksum + CHasher checksum; checksum.Initialize(); + for (vx_uint32 plane = 0; plane < (vx_uint32)num_planes; plane++) { + vx_imagepatch_addressing_t addr; + vx_uint8 * base_ptr = nullptr; + ERROR_CHECK(vxAccessImagePatch(image, rectRegion, plane, &addr, (void **)&base_ptr, VX_READ_ONLY)); + vx_uint32 width = ((addr.dim_x * addr.scale_x) / VX_SCALE_UNITY); + vx_uint32 height = ((addr.dim_y * addr.scale_y) / VX_SCALE_UNITY); + vx_uint32 width_in_bytes = (format == VX_DF_IMAGE_U1_AMD) ? ((width + 7) >> 3) : (width * addr.stride_x); + for (vx_uint32 y = 0; y < height; y++) { + checksum.Process(base_ptr + y * addr.stride_y, width_in_bytes); + } + ERROR_CHECK(vxCommitImagePatch(image, rectRegion, plane, &addr, base_ptr)); + } + // copy the checksum string + strcpy(checkSumString, checksum.GetCheckSum()); +} + +//
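+// Illustrative usage sketch of the helpers above (a minimal sketch, not taken from these sources):
+// it assumes the caller already owns a valid vx_image 'image' of dimensions 'width' x 'height'.
+// The enum helpers map script tokens to OpenVX enums, and ComputeChecksum fingerprints a region:
+//
+//   vx_enum policy = ovxName2Enum("SATURATE");      // "SATURATE" and "VX_CONVERT_POLICY_SATURATE" both match
+//   char name[64]; ovxEnum2String(policy, name);    // -> "SATURATE"
+//   vx_rectangle_t rect = { 0, 0, width, height };  // whole image
+//   char checkSum[64];
+//   ComputeChecksum(checkSum, image, &rect);        // -> 32-character MD5 hex string for the region
+//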