diff --git a/algorithm_config.yaml b/algorithm_config.yaml
index ad845f094e0a6cc55d9ac0c04753cd42c583132b..59d38f2bab77beee5f10de52ddf2cd7d6cd60b0a 100644
--- a/algorithm_config.yaml
+++ b/algorithm_config.yaml
@@ -1,6 +1,6 @@
 description: Estimates canopy height from InSAR coherence and LiDAR data
 algo_name: ich
-version: 0.4
+version: 0.4.1
 environment: ubuntu
 repository_url: https://repo.maap-project.org/bnarayanarao/insar_forest_height.git
 docker_url: mas.maap-project.org/root/maap-workspaces/base_images/python:v4.1.0
diff --git a/src/ich/algo.py b/src/ich/algo.py
index 91972b5d683f5f3c66bde67d9ab5ef4d4c7f9354..f457595751430e7f3e0fd4e4c94b805cc767c310 100644
--- a/src/ich/algo.py
+++ b/src/ich/algo.py
@@ -24,95 +24,86 @@ from scipy.interpolate import interpn
 from sklearn.metrics import mean_squared_error as mse
 from concurrent.futures import ProcessPoolExecutor, as_completed
 import os
+from concurrent.futures import ThreadPoolExecutor
+from scipy.interpolate import PchipInterpolator
+
+# Precompute XX and YY
+XX = np.linspace(0, np.pi, num=100, endpoint=True)
+XX[0] = np.spacing(1)  # Avoid division by zero
+YY = np.sin(XX) / XX
+YY[0] = 1  # Correct first value
+YY[-1] = 0  # Correct last value
+XX = XX[::-1]  # Flip arrays
+YY = YY[::-1]
+
+# Precompute the interpolation function
+interp_func = PchipInterpolator(YY, XX)
+
+def arc_sinc_fast(x, c_param):
+    # Clip x to the range [0, 1], works for both 1D and 2D arrays
+    x_clipped = np.clip(x, 0, 1)
+    
+    # Interpolate and apply c_param scaling (works for both 1D and 2D arrays)
+    y = interp_func(x_clipped) * c_param
+    
+    # Clip final result to the range [0, inf), works for both 1D and 2D arrays
+    return np.maximum(y, 0)
 
-def arc_sinc(x, c_param):
-    x = np.clip(x, 0, 1)  # Ensure x is between 0 and 1
+
+def arc_sinc_(x, c_param):
+    # Get rid of extreme values by set all values where x > 1 equal to 1, and x < 0 equal to 0 
+    if x >1:
+        x=1
+    elif x<0:
+        x=0
+    # x[(x > 1)] = 1
+    # x[(x < 0)] = 0
+
+    # Create array of increments between 0 and pi of size pi/100
     XX = np.linspace(0, np.pi, num=100, endpoint=True)
-    XX[0] = np.spacing(1)  # Avoid division by zero issues
+
+    # Set the first value of XX to eps to avoid division by zero issues -> Paul's suggestion
+    XX[0] = np.spacing(1)
+
+    # Calculate sinc for XX and save it to YY
+    ## YY = sinc(XX / math.pi)
     YY = np.sin(XX) / XX
+
+    # Reset the first value of XX to zero and the first value of YY to the corresponding output
     XX[0] = 0
     YY[0] = 1
+    
+    # Set the last value of YY to 0 to avoid NaN issues
     YY[-1] = 0
+    # Flip XX and YY left to right
     XX = XX[::-1]
     YY = YY[::-1]
-    interp_func = interpolate.interp1d(YY, XX * c_param, kind='slinear', bounds_error=False, fill_value=0)
+    # print(XX.shape,YY.shape)
+     # Run interpolation
+    # XX and YY are your original values, x is the query values, and y is the interpolated values that correspond to x
+    interp_func = interpolate.interp1d(YY, XX * c_param, kind='slinear') 
     y = interp_func(x)
-    return np.clip(y, 0, None)  # Ensure no negative heights
-
-from concurrent.futures import ThreadPoolExecutor
-import os
-
-
-def arc_sinc_block(x_chunk, c_param_chunk, XX, YY):
-    # Initialize result array for the chunk
-    y_chunk = np.zeros_like(x_chunk)
-    
-    # Handle the interpolation for each unique c_param value in the chunk
-    for unique_c in np.unique(c_param_chunk):
-        # Interpolation function for the current unique c_param
-        interp_func = interpolate.interp1d(YY, XX * unique_c, kind='slinear', bounds_error=False, fill_value=0)
-        
-        # Apply the interpolation
-        mask = (c_param_chunk == unique_c)
-        y_chunk[mask] = interp_func(x_chunk[mask])
-    
-    # Clip the result to ensure non-negative values
-    return np.clip(y_chunk, 0, None)
-
 
+    # Set all values in y less than 0 equal to 0
+    y[(y < 0)] = 0
+    # return y
+    return y
 
 
-def arc_sinc_vectorized_parallel(x, c_param, chunk_size=100000):
-    # Ensure x and c_param are numpy arrays
-    x = np.asarray(x)
-    c_param = np.asarray(c_param)
-    
-    # Clip x to the range [0, 1]
-    x = np.clip(x, 0, 1)
-    
-    # Create array of increments between 0 and pi
+def arc_sinc(x, c_param):
+    x = np.clip(x, 0, 1)  # Ensure x is between 0 and 1
     XX = np.linspace(0, np.pi, num=100, endpoint=True)
-    
-    # Set the first value of XX to eps to avoid division by zero issues
-    XX[0] = np.spacing(1)
-    
-    # Calculate sinc for XX and save it to YY
+    XX[0] = np.spacing(1)  # Avoid division by zero issues
     YY = np.sin(XX) / XX
-    
-    # Reset the first value of XX and YY
     XX[0] = 0
     YY[0] = 1
     YY[-1] = 0
-    
-    # Reverse arrays for interpolation
     XX = XX[::-1]
     YY = YY[::-1]
-    
-    # Get the length of the input arrays
-    n = len(x)
-    
-    # Automatically get the number of available CPU cores
-    num_workers = os.cpu_count()
-    
-    # Split input arrays into chunks and process in parallel
-    y = np.zeros_like(x)
-    with ThreadPoolExecutor(max_workers=num_workers) as executor:
-        futures = []
-        
-        # Loop over chunks
-        for i in range(0, n, chunk_size):
-            x_chunk = x[i:i + chunk_size]
-            c_param_chunk = c_param[i:i + chunk_size]
-            
-            # Submit chunk processing to the thread pool
-            futures.append(executor.submit(arc_sinc_block, x_chunk, c_param_chunk, XX, YY))
-        
-        # Collect results
-        for idx, future in enumerate(futures):
-            i = idx * chunk_size
-            y[i:i + chunk_size] = future.result()
-    
-    return y
+    interp_func = interpolate.interp1d(YY, XX * c_param, kind='slinear', bounds_error=False, fill_value=0)
+    y = interp_func(x)
+    return np.clip(y, 0, None)  # Ensure no negative heights
+
 
 def arc_sinc_vectorized(x, c_param):
     # Ensure x and c_param are numpy arrays
@@ -170,6 +161,9 @@ def cal_(cor, gedi, htl, htg):
             for C_param in C_range:
                 gama = np.clip(temp_cor / S_param, 0, 1)
                 height = arc_sinc(gama, C_param)
+                # height = arc_sinc_fast(gama, C_param)
+
+                
                 height[(height >= htg) | (height <= htl)] = np.nan
                 x, y = temp_gedi.flatten(), height.flatten()
                 valid = ~np.isnan(x) & ~np.isnan(y)
@@ -204,6 +198,7 @@ def cal_(cor, gedi, htl, htg):
                 for C_param in np.arange(cCoarse - 1, cCoarse + 1, 0.2):
                     gama = np.clip(temp_cor / S_param, 0, 1)
                     height = arc_sinc(gama, C_param)
+                    # height = arc_sinc_fast(gama, C_param)
                     height[(height >= htg) | (height <= htl)] = np.nan
                     x, y = temp_gedi.flatten(), height.flatten()
                     valid = ~np.isnan(x) & ~np.isnan(y)
@@ -267,6 +262,7 @@ def process_block(i, j, cohArray, lidarArray, initial_ws, htl, htg, parm_):
 
             gama = cohBlock / parm[1]
             ht = arc_sinc(gama, parm[2]) * mask
+            # ht = arc_sinc_fast(gama, parm[2]) * mask
 
             return start_i, end_i, start_j, end_j, s_parm, c_parm, rmse_parm, ht, count
         else:
diff --git a/src/ich/args_in.py b/src/ich/args_in.py
index 7e269072e81e8e717bceb27a6854a872811648d2..498c3a1ca9601edde075cc162206e28a5a8a5170 100644
--- a/src/ich/args_in.py
+++ b/src/ich/args_in.py
@@ -26,7 +26,7 @@ from scipy.interpolate import interpn
 from empatches import EMPatches
 from sklearn.model_selection import train_test_split
 from plotting import density_scatter, plot_data, rmse
-from algo import arc_sinc, arc_sinc_vectorized, cal_, dynamicWindow,arc_sinc_vectorized_parallel
+from algo import arc_sinc_fast,arc_sinc, arc_sinc_, cal_, dynamicWindow
 from rst_io import read_bin, write_tif, write_bin
 from utils import blockshaped, unblockshaped, unpad, spatial_intp_lin, split_sample
 
@@ -191,7 +191,14 @@ def rvog_inverse(args):
     height = []
     print('...')
     # height = arc_sinc_vectorized(gama, c_temp)
-    height = arc_sinc_vectorized_parallel(gama, c_temp)
+    # height = arc_sinc_vectorized_parallel(gama, c_temp)
+    height = arc_sinc_fast(gama, c_temp)
+
+    # for ii in tqdm(range(np.size(c_temp))):
+    # # for ii in (range(np.size(c_temp))):
+    #     height.append(arc_sinc_(gama[ii], c_temp[ii]))
+
+
     height = np.array(height).reshape(np.shape(temp_cor))
     height[height < 0] = 0
     print("%0.2f sec"%(time.time()-t2))