diff --git a/mvect-cuda.py b/mvect-cuda.py new file mode 100644 index 0000000000000000000000000000000000000000..05e418641d4df1d43f593dddf2d109525417980e --- /dev/null +++ b/mvect-cuda.py @@ -0,0 +1,255 @@ +import pycuda.driver as cuda +import pycuda.autoinit +import pycuda.gpuarray +from pycuda.compiler import SourceModule +import cv2 as cv +import numpy as np +import sys +import os +import time +import math + +# +# Simplified Motion vector code that does the color coding and velocity coding using Jetson NANO GPUs. +# Primarily done as a test of functionality and speed, without regard to programming style or completeness +# Specifically in the areas of error checking and recovery. Also, I'm a complete neophyte in Python. so no laughing! + +downsizeRatio = 0.5 + +# +# CUDA Code: +# +mod = SourceModule(""" + #include <stdint.h> + #include <cuda.h> + #include <math_constants.h> + __global__ void imageProc(int32_t n, // input - count of items in the arrays + float *flow, // input - coordinants from the optical flow + uint8_t *color, // output - color angles + uint8_t *velocity, // output - velocity + uint8_t *prev_velocity, // output/input - previous velocity + uint8_t *deltav) // output - delta velocity + { + const float vmax = 255.0/12.0; + const uint8_t vCutOff = 3; // minimum pixel velocity to display colors for + uint8_t vel; + float clr; + + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if(idx < n) + { + int color_idx = idx * 3; + int flow_idx = idx * 2; + + float v = vmax * sqrtf((flow[flow_idx]*flow[flow_idx]) + (flow[flow_idx+1]*flow[flow_idx+1])); + + // OpenCV returns 0-360, rather than the more common -180 - 180 degrees. so, make it like opencv + float ang = (180.0/CUDART_PI_F) * (CUDART_PI_F + atan2f(flow[flow_idx+1],flow[flow_idx])); + + vel = 0xff & __float2uint_rn(v); + if(v > 255.0) { + vel = 255; + } + velocity[idx] = vel; + deltav[idx] = 0xff & __float2uint_rn(abs(vel - prev_velocity[idx])); + prev_velocity[idx] = velocity[idx]; + color[color_idx + 0] = 0; // red + color[color_idx + 1] = 0; // green + color[color_idx + 2] = 0; // blue + + if(vel > vCutOff) { + clr=(ang/360.) * 4./6.; + if(clr<=(1./6.)) { + color[color_idx + 0] = 0; // red + color[color_idx + 1] = 255 & int(255*(6*clr)); // green + color[color_idx + 2] = 255; // blue + } else if(clr<=(2./6.)) { + color[color_idx + 0] = 0; // red + color[color_idx + 1] = 255; // green + color[color_idx + 2] = 255 & int(255*(2-6*clr)); // blue + } else if(clr<=(3./6.)) { + color[color_idx + 0] = 255 & int(255*(6*clr-2)); // red + color[color_idx + 1] = 255; // green + color[color_idx + 2] = 0; // blue + } else if(clr<=(4./6.)) { + color[color_idx + 0] = 255; // red + color[color_idx + 1] = 255 & int(255*(4-6*clr)); // green + color[color_idx + 2] = 0; // blue + } else if(clr<=(5./6.)) { + color[color_idx + 0] = 255; // red + color[color_idx + 1] = 0; // green + color[color_idx + 2] = 255 & int(255*(6*clr-4)); // blue + } else { + color[color_idx + 0] = 255 & int(255*(6-6*clr)); // red + color[color_idx + 1] = 0; // green + color[color_idx + 2] = 255; // blue + } + } + } + } + """) + + + +atitle = (sys.argv) +title = atitle[1] +#cap = cv.VideoCapture('http://login:tinkering@132.239.4.196/control/faststream.jpg?stream=full') +cap = cv.VideoCapture(title) +#ret, first_frame = cap.read() +#motion_frame = cv.resize(first_frame, (0,0), fx=downsizeRatio, fy=downsizeRatio) +ret, motion_frame = cap.read() + +prev_gray = cv.cvtColor(motion_frame, cv.COLOR_BGR2GRAY) +mask = np.zeros_like(motion_frame) + +color_array = pycuda.gpuarray.zeros(prev_gray.size * 3, np.uint8) +velocity_array = pycuda.gpuarray.zeros(prev_gray.size, np.uint8) +prev_velocity = pycuda.gpuarray.zeros(prev_gray.size, np.uint8) +prev_velocity.fill(0xaa); +deltav_array = pycuda.gpuarray.zeros(prev_gray.shape, np.uint8) +block_size = np.int32(prev_gray.size) + +print("Block Size: {}".format(block_size)) +print("motion_frame shape: {}".format(motion_frame.shape)) +print("motion_frame size: {}".format(motion_frame.size)) + +print("color_array size: {}".format(color_array.size)) +print("color_array shape: {}".format(color_array.shape)) + +print("deltav size: {}".format(deltav_array.size)) +print("deltav shape: {}".format(deltav_array.shape)) + +print("color_array size: {}".format(color_array.size)) +print("color_array shape: {}".format(color_array.shape)) + +print("prev_gray size: {}".format(prev_gray.size)) +print("prev_gray shape: {}".format(prev_gray.shape)) + + + +mask[..., 1] = 255 + +frameCount = 0 +totalProcessingTime = 0 + +while(cap.isOpened()): + ret, newFrame = cap.read() + frameCount += 1 + start = time.time() + # Opens a new window and displays the input frame + try: +# newFrame = cv.resize(frame, (0,0), fx=downsizeRatio, fy=downsizeRatio) + + # cv.imshow("input", newFrame) + + # Converts each frame to grayscale - we previously only converted the first frame to grayscale + gray = cv.cvtColor(newFrame, cv.COLOR_BGR2GRAY) + # Calculates dense optical flow by Farneback method + # https://docs.opencv.org/3.0-beta/modules/video/doc/motion_analysis_and_object_tracking.html#calcopticalflowfarneback + flow = cv.calcOpticalFlowFarneback(prev_gray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0) + flow_array = pycuda.gpuarray.to_gpu(flow) + # Computes the magnitude and angle of the 2D vectors +# +# Output the magnitudes and the angles. +# angle colors are black unless the veliocity exceeds vCutoff +# + a = open("Data/angles-{:06d}.ppm".format(frameCount),"w+b") + v = open("Data/velocities-{:06d}.pgm".format(frameCount),"w+b") + dv = open("Data/deltav-{:06d}.pgm".format(frameCount),"w+b") +# +# Write out the file headers +# + a.write("P6\n{:d} {:d}\n255\n".format(len(flow[0]),len(flow))) + v.write("P5\n{:d} {:d}\n255\n".format(len(flow[0]),len(flow))) + dv.write("P5\n{:d} {:d}\n255\n".format(len(flow[0]),len(flow))) + +#Device 0: "NVIDIA Tegra X1" +# CUDA Driver Version / Runtime Version 10.0 / 10.0 +# CUDA Capability Major/Minor version number: 5.3 +# Total amount of global memory: 3957 MBytes (4148756480 bytes) +# ( 1) Multiprocessors, (128) CUDA Cores/MP: 128 CUDA Cores +# GPU Max Clock rate: 922 MHz (0.92 GHz) +# Memory Clock rate: 13 Mhz +# Memory Bus Width: 64-bit +# L2 Cache Size: 262144 bytes +# Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096) +# Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers +# Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers +# Total amount of constant memory: 65536 bytes +# Total amount of shared memory per block: 49152 bytes +# Total number of registers available per block: 32768 +# Warp size: 32 +# Maximum number of threads per multiprocessor: 2048 +# Maximum number of threads per block: 1024 +# Max dimension size of a thread block (x,y,z): (1024, 1024, 64) +# Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) +# Maximum memory pitch: 2147483647 bytes +# Texture alignment: 512 bytes +# Concurrent copy and kernel execution: Yes with 1 copy engine(s) +# Run time limit on kernels: Yes +# Integrated GPU sharing Host Memory: Yes +# Support host page-locked memory mapping: Yes +# Alignment requirement for Surfaces: Yes +# Device has ECC support: Disabled +# Device supports Unified Addressing (UVA): Yes +# Device supports Compute Preemption: No +# Supports Cooperative Kernel Launch: No +# Supports MultiDevice Co-op Kernel Launch: No +# Device PCI Domain ID / Bus ID / location ID: 0 / 0 / 0 +# Compute Mode: +# < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > +# +#deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.0, CUDA Runtime Version = 10.0, NumDevs = 1 +# +# Give the GPU pointers to color, velocity and delta V outputs +# and the flow inputs +# + + + + bdim = (1024,1,1) + gridSize = (block_size+1024-1)/1024 + + gdim = (gridSize,1) +# print("gdim: {}".format(gdim)) +# print("Velocity_array.nbytes = {}".format(velocity_array.nbytes)); +# print("deltav_array.nbytes = {}".format(deltav_array.nbytes)); + + arg_types = ('I','P','P','P','P','P') # Tell the prepare function what to expect + func = mod.get_function("imageProc") + func.prepare(arg_types) + func.prepared_call(gdim, # grid Dimension (x,y) + bdim, # Block Dimension (x,y,z) + velocity_array.nbytes, # Size of our data block + flow_array.gpudata, # GPU Pointer to the flow + color_array.gpudata, # GPU Pointer to the color array to return + velocity_array.gpudata, # GPU Pointer to the velocity data + prev_velocity.gpudata, # GPU Pointer to the previous velocity (Persists between frames) + deltav_array.gpudata) # GPU Pointer to the delta V array + + + b = color_array.get() + b.tofile(a) + a.close() + velocity_array.get().tofile(v) + v.close() + deltav_array.get().tofile(dv) + dv.close() + + wname = 'Data/org'+str(frameCount).zfill(6)+'.ppm' + cv.imwrite(wname,newFrame) + + # Updates previous frame + prev_gray = gray + # Frames are read by intervals of 1 millisecond. The programs breaks out of the while loop when the user presses the 'q' key + if cv.waitKey(1) & 0xFF == ord('q'): + break + print("Frame: {} Elapsed: {}".format(frameCount, time.time() - start)) + totalProcessingTime += time.time() - start + print("Average frame processing time: {}".format(totalProcessingTime/frameCount)) + except: + raise + break +# The following frees up resources and closes all windows +cap.release() +cv.destroyAllWindows()