diff --git a/mvect-cuda.py b/mvect-cuda.py
new file mode 100644
index 0000000000000000000000000000000000000000..05e418641d4df1d43f593dddf2d109525417980e
--- /dev/null
+++ b/mvect-cuda.py
@@ -0,0 +1,255 @@
+import pycuda.driver as cuda
+import pycuda.autoinit
+import pycuda.gpuarray
+from pycuda.compiler import SourceModule
+import cv2 as cv
+import numpy as np
+import sys
+import os
+import time
+import math
+
+#
+# Simplified Motion vector code that does the color coding and velocity coding using Jetson NANO GPUs.
+# Primarily done as a test of functionality and speed, without regard to programming style or completeness 
+# Specifically in the areas of error checking and recovery. Also, I'm a complete neophyte in Python. so no laughing!
+
+downsizeRatio    = 0.5
+
+#
+# CUDA Code:
+#
+mod = SourceModule("""
+           #include <stdint.h>
+           #include <cuda.h>
+           #include <math_constants.h>
+           __global__ void imageProc(int32_t n,              // input - count of items in the arrays
+                                     float   *flow,          // input - coordinants from the optical flow
+                                     uint8_t *color,         // output - color angles
+                                     uint8_t *velocity,      // output - velocity
+                                     uint8_t *prev_velocity, // output/input - previous velocity
+                                     uint8_t *deltav)        // output - delta velocity
+           {
+               const float   vmax = 255.0/12.0;
+               const uint8_t vCutOff = 3;   // minimum pixel velocity to display colors for
+               uint8_t       vel;
+               float         clr;
+
+               int idx = (blockIdx.x * blockDim.x) + threadIdx.x; 
+               if(idx < n) 
+               {
+                   int color_idx = idx * 3;
+                   int flow_idx  = idx * 2;
+
+                   float v   = vmax * sqrtf((flow[flow_idx]*flow[flow_idx]) + (flow[flow_idx+1]*flow[flow_idx+1]));
+
+                   // OpenCV returns 0-360, rather than the more common -180 - 180 degrees. so, make it like opencv
+                   float ang = (180.0/CUDART_PI_F) * (CUDART_PI_F + atan2f(flow[flow_idx+1],flow[flow_idx]));
+                   
+                   vel = 0xff & __float2uint_rn(v);
+                   if(v > 255.0) {
+                       vel = 255;
+                   }
+                   velocity[idx]        = vel;
+                   deltav[idx]          = 0xff &  __float2uint_rn(abs(vel - prev_velocity[idx]));
+                   prev_velocity[idx]   = velocity[idx];
+                   color[color_idx + 0] = 0; // red
+                   color[color_idx + 1] = 0; // green
+                   color[color_idx + 2] = 0; // blue
+
+                   if(vel > vCutOff) {
+                       clr=(ang/360.) * 4./6.;
+                       if(clr<=(1./6.)) {
+                           color[color_idx + 0] = 0; // red
+                           color[color_idx + 1] = 255 & int(255*(6*clr)); // green
+                           color[color_idx + 2] = 255; // blue
+                       } else if(clr<=(2./6.)) {
+                           color[color_idx + 0] = 0; // red
+                           color[color_idx + 1] = 255; // green
+                           color[color_idx + 2] = 255 & int(255*(2-6*clr)); // blue
+                       } else if(clr<=(3./6.)) {
+                           color[color_idx + 0] = 255 & int(255*(6*clr-2)); // red
+                           color[color_idx + 1] = 255; // green
+                           color[color_idx + 2] = 0; // blue
+                       } else if(clr<=(4./6.)) {
+                           color[color_idx + 0] = 255; // red
+                           color[color_idx + 1] = 255 & int(255*(4-6*clr)); // green
+                           color[color_idx + 2] = 0; // blue
+                       } else if(clr<=(5./6.)) {
+                           color[color_idx + 0] = 255; // red
+                           color[color_idx + 1] = 0; // green
+                           color[color_idx + 2] = 255 & int(255*(6*clr-4)); // blue
+                       } else {
+                           color[color_idx + 0] = 255 & int(255*(6-6*clr)); // red
+                           color[color_idx + 1] = 0; // green
+                           color[color_idx + 2] = 255; // blue
+                       }
+                   }
+               }
+           }
+           """)
+
+
+
+atitle           = (sys.argv)
+title            = atitle[1]
+#cap = cv.VideoCapture('http://login:tinkering@132.239.4.196/control/faststream.jpg?stream=full')
+cap              = cv.VideoCapture(title)
+#ret, first_frame = cap.read()
+#motion_frame     = cv.resize(first_frame, (0,0), fx=downsizeRatio, fy=downsizeRatio)
+ret, motion_frame = cap.read()
+
+prev_gray        = cv.cvtColor(motion_frame, cv.COLOR_BGR2GRAY)
+mask             = np.zeros_like(motion_frame)
+
+color_array      = pycuda.gpuarray.zeros(prev_gray.size * 3, np.uint8)
+velocity_array   = pycuda.gpuarray.zeros(prev_gray.size,     np.uint8)
+prev_velocity    = pycuda.gpuarray.zeros(prev_gray.size,     np.uint8)
+prev_velocity.fill(0xaa);
+deltav_array     = pycuda.gpuarray.zeros(prev_gray.shape,     np.uint8)
+block_size       = np.int32(prev_gray.size)
+
+print("Block Size:         {}".format(block_size))
+print("motion_frame shape: {}".format(motion_frame.shape))
+print("motion_frame size:  {}".format(motion_frame.size))
+
+print("color_array size:   {}".format(color_array.size))
+print("color_array shape:  {}".format(color_array.shape))
+
+print("deltav size:        {}".format(deltav_array.size))
+print("deltav shape:       {}".format(deltav_array.shape))
+
+print("color_array size:   {}".format(color_array.size))
+print("color_array shape:  {}".format(color_array.shape))
+
+print("prev_gray size:     {}".format(prev_gray.size))
+print("prev_gray shape:    {}".format(prev_gray.shape))
+
+
+
+mask[..., 1]     = 255
+
+frameCount          = 0
+totalProcessingTime = 0
+
+while(cap.isOpened()):
+    ret, newFrame = cap.read()
+    frameCount += 1
+    start = time.time()
+    # Opens a new window and displays the input frame
+    try:
+#        newFrame = cv.resize(frame, (0,0), fx=downsizeRatio, fy=downsizeRatio)
+
+        # cv.imshow("input", newFrame)
+
+        # Converts each frame to grayscale - we previously only converted the first frame to grayscale
+        gray = cv.cvtColor(newFrame, cv.COLOR_BGR2GRAY)
+        # Calculates dense optical flow by Farneback method
+        # https://docs.opencv.org/3.0-beta/modules/video/doc/motion_analysis_and_object_tracking.html#calcopticalflowfarneback
+        flow = cv.calcOpticalFlowFarneback(prev_gray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
+        flow_array = pycuda.gpuarray.to_gpu(flow)
+        # Computes the magnitude and angle of the 2D vectors
+# 
+# Output the magnitudes and the angles.
+# angle colors are black unless the veliocity exceeds vCutoff
+#
+        a  = open("Data/angles-{:06d}.ppm".format(frameCount),"w+b")
+        v  = open("Data/velocities-{:06d}.pgm".format(frameCount),"w+b")
+        dv = open("Data/deltav-{:06d}.pgm".format(frameCount),"w+b")
+#
+# Write out the file headers
+#
+        a.write("P6\n{:d} {:d}\n255\n".format(len(flow[0]),len(flow)))
+        v.write("P5\n{:d} {:d}\n255\n".format(len(flow[0]),len(flow)))
+        dv.write("P5\n{:d} {:d}\n255\n".format(len(flow[0]),len(flow)))
+
+#Device 0: "NVIDIA Tegra X1"
+#  CUDA Driver Version / Runtime Version          10.0 / 10.0
+#  CUDA Capability Major/Minor version number:    5.3
+#  Total amount of global memory:                 3957 MBytes (4148756480 bytes)
+#  ( 1) Multiprocessors, (128) CUDA Cores/MP:     128 CUDA Cores
+#  GPU Max Clock rate:                            922 MHz (0.92 GHz)
+#  Memory Clock rate:                             13 Mhz
+#  Memory Bus Width:                              64-bit
+#  L2 Cache Size:                                 262144 bytes
+#  Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
+#  Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
+#  Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
+#  Total amount of constant memory:               65536 bytes
+#  Total amount of shared memory per block:       49152 bytes
+#  Total number of registers available per block: 32768
+#  Warp size:                                     32
+#  Maximum number of threads per multiprocessor:  2048
+#  Maximum number of threads per block:           1024
+#  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
+#  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
+#  Maximum memory pitch:                          2147483647 bytes
+#  Texture alignment:                             512 bytes
+#  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
+#  Run time limit on kernels:                     Yes
+#  Integrated GPU sharing Host Memory:            Yes
+#  Support host page-locked memory mapping:       Yes
+#  Alignment requirement for Surfaces:            Yes
+#  Device has ECC support:                        Disabled
+#  Device supports Unified Addressing (UVA):      Yes
+#  Device supports Compute Preemption:            No
+#  Supports Cooperative Kernel Launch:            No
+#  Supports MultiDevice Co-op Kernel Launch:      No
+#  Device PCI Domain ID / Bus ID / location ID:   0 / 0 / 0
+#  Compute Mode:
+#     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
+#
+#deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.0, CUDA Runtime Version = 10.0, NumDevs = 1
+#
+# Give the GPU pointers to color, velocity and delta V outputs
+# and the flow inputs
+#
+
+
+
+        bdim     = (1024,1,1)
+        gridSize = (block_size+1024-1)/1024
+
+        gdim = (gridSize,1)
+#        print("gdim: {}".format(gdim))
+#        print("Velocity_array.nbytes = {}".format(velocity_array.nbytes));
+#        print("deltav_array.nbytes = {}".format(deltav_array.nbytes));
+
+        arg_types = ('I','P','P','P','P','P')  # Tell the prepare function what to expect
+        func = mod.get_function("imageProc")
+        func.prepare(arg_types)
+        func.prepared_call(gdim,     # grid Dimension (x,y)
+             bdim,                   # Block Dimension (x,y,z)
+             velocity_array.nbytes,  # Size of our data block
+             flow_array.gpudata,     # GPU Pointer to the flow
+             color_array.gpudata,    # GPU Pointer to the color array to return
+             velocity_array.gpudata, # GPU Pointer to the velocity data
+             prev_velocity.gpudata,  # GPU Pointer to the previous velocity (Persists between frames)
+             deltav_array.gpudata)   # GPU Pointer to the delta V array
+
+
+        b = color_array.get()
+        b.tofile(a)
+        a.close()
+        velocity_array.get().tofile(v)
+        v.close()
+        deltav_array.get().tofile(dv)
+        dv.close()
+
+        wname = 'Data/org'+str(frameCount).zfill(6)+'.ppm'
+        cv.imwrite(wname,newFrame)
+
+        # Updates previous frame
+        prev_gray = gray
+        # Frames are read by intervals of 1 millisecond. The programs breaks out of the while loop when the user presses the 'q' key
+        if cv.waitKey(1) & 0xFF == ord('q'):
+            break
+        print("Frame: {} Elapsed: {}".format(frameCount, time.time() - start))
+        totalProcessingTime += time.time() - start
+        print("Average frame processing time: {}".format(totalProcessingTime/frameCount))
+    except:
+        raise
+        break
+# The following frees up resources and closes all windows
+cap.release()
+cv.destroyAllWindows()