77import sys
88import warnings
99
10+
# Parse custom build arguments
def parse_build_args():
    """Parse custom build arguments for CPU/GPU selection.

    Usage:
        python setup.py develop cpu   # Force CPU-only build
        python setup.py develop gpu   # Force GPU build (fallback to CPU if not available)
        python setup.py develop       # Auto-detect (prefer GPU if available)

    Returns:
        str: one of 'cpu', 'gpu', or 'auto'.
    """
    argv = sys.argv

    # Show our custom options whenever any help flag is present; setuptools
    # still handles its own --help output afterwards.
    if 'help' in argv or '--help' in argv:
        print("\nSparse Transformers Build Options:")
        print("  python setup.py develop cpu    # Force CPU-only build")
        print("  python setup.py develop gpu    # Force GPU build")
        print("  python setup.py develop        # Auto-detect (prefer GPU)")
        print()

    # Consume our custom tokens so setuptools never sees them as commands.
    if 'cpu' in argv:
        argv.remove('cpu')
        print("Forced CPU-only build mode")
        return 'cpu'
    if 'gpu' in argv:
        argv.remove('gpu')
        print("Forced GPU build mode")
        return 'gpu'

    print("Auto-detecting build mode (default: GPU if available)")
    return 'auto'
43+
44+
# Check PyTorch C++ ABI compatibility
def get_pytorch_abi_flag():
    """Return the -D_GLIBCXX_USE_CXX11_ABI define matching this PyTorch build.

    Extensions must be compiled with the same C++ ABI setting as libtorch,
    otherwise symbol resolution fails at import time.
    """
    # torch._C._GLIBCXX_USE_CXX11_ABI is a bool; render it as 0/1.
    abi_enabled = int(torch._C._GLIBCXX_USE_CXX11_ABI)
    return '-D_GLIBCXX_USE_CXX11_ABI=%d' % abi_enabled
1449
50+
# Resolve the PyTorch ABI flag once and echo it so build logs record the ABI.
pytorch_abi_flag = get_pytorch_abi_flag()
print(f"Using PyTorch C++ ABI flag: {pytorch_abi_flag}")

# Parse build mode from command line (consumes our custom 'cpu'/'gpu' tokens).
build_mode = parse_build_args()
57+
1958# Create build directory if it doesn't exist
2059build_dir = Path (__file__ ).parent / 'build'
2160if build_dir .exists ():
@@ -36,12 +75,15 @@ def get_pytorch_abi_flag():
3675 for i in range (torch .cuda .device_count ()):
3776 arch_list .append (torch .cuda .get_device_capability (i ))
3877 arch_list = sorted (list (set (arch_list )))
39- arch_flags = [f"-gencode=arch=compute_{ arch [0 ]} { arch [1 ]} ,code=sm_{ arch [0 ]} { arch [1 ]} " for arch in arch_list ]
78+ arch_flags = [
79+ f"-gencode=arch=compute_{ arch [0 ]} { arch [1 ]} ,code=sm_{ arch [0 ]} { arch [1 ]} "
80+ for arch in arch_list
81+ ]
4082 print (f"CUDA architectures detected: { arch_list } " )
4183 except Exception as e :
4284 warnings .warn (f"Error detecting CUDA architecture: { e } " )
4385 # Use a common architecture as fallback
44- arch_flags = [" -gencode=arch=compute_86,code=sm_86" ]
86+ arch_flags = [' -gencode=arch=compute_86,code=sm_86' ]
4587
4688# Common optimization flags (compatible with both old and new ABI)
4789common_compile_args = [
@@ -59,6 +101,7 @@ def get_pytorch_abi_flag():
59101# Try to detect if we can use advanced CPU optimizations safely
60102try :
61103 import platform
104+
62105 if platform .machine () in ['x86_64' , 'AMD64' ]:
63106 advanced_cpu_flags = [
64107 '-march=native' , # Optimize for local CPU architecture
@@ -72,28 +115,39 @@ def get_pytorch_abi_flag():
72115 advanced_cpu_flags = []
73116
# CPU-specific optimization flags
# NOTE(fix): the previous list passed both '-fvisibility=hidden' and
# '-fvisibility=default'. With GCC/Clang the last visibility flag wins, so
# the 'hidden' entry never took effect and only misled readers; it has been
# removed. Effective compiler behavior is unchanged (visibility=default).
cpu_compile_args = common_compile_args + advanced_cpu_flags + [
    '-flto',                 # Link-time optimization
    '-funroll-loops',        # Unroll loops
    '-fno-math-errno',       # Assume math functions never set errno
    '-fno-trapping-math',    # Assume FP ops don't generate traps
    '-fno-plt',              # Improve indirect call performance
    '-fuse-linker-plugin',   # Enable LTO plugin
    '-fomit-frame-pointer',  # Remove frame pointers
    '-fno-stack-protector',  # Disable stack protector
    '-fdata-sections',       # Place each data item into its own section
    '-ffunction-sections',   # Place each function into its own section
    '-fvisibility=default',  # Keep symbols visible for extension loading
]
89136
# CUDA-specific optimization flags (ensure C++17 compatibility and ABI matching)
# Host-compiler flags are forwarded through nvcc via --compiler-options.
_nvcc_host_flags = [
    '--compiler-options', "'-fPIC'",
    '--compiler-options', "'-O3'",
    '-std=c++17',  # Force C++17 for compatibility
    '--compiler-options', "'-fvisibility=default'",
]
cuda_compile_args = ['-O3', '--use_fast_math', *arch_flags, *_nvcc_host_flags]
97151
98152# Add advanced CPU flags to CUDA compilation if available
99153if advanced_cpu_flags :
@@ -112,28 +166,30 @@ def get_pytorch_abi_flag():
112166 '-Wl,--exclude-libs,ALL' , # Don't export any symbols from libraries
113167]
114168
169+
# Get CUDA include paths
def get_cuda_include_dirs():
    """Locate a CUDA toolkit and return its include directories.

    Search order: $CUDA_HOME (default /usr/local/cuda), then $CUDA_PATH
    (Windows convention), then a few common install locations.

    Returns:
        list[str]: include directories, or [] when no CUDA install is found
        (a warning is emitted in that case).
    """
    cuda_home = os.getenv('CUDA_HOME', '/usr/local/cuda')
    if not os.path.exists(cuda_home):
        cuda_home = os.getenv('CUDA_PATH')  # Windows
        # Fix: an exported-but-nonexistent CUDA_PATH previously slipped
        # through unvalidated, skipping the fallback search below and
        # producing bogus include paths. Validate it like the others.
        if cuda_home is not None and not os.path.exists(cuda_home):
            cuda_home = None

    if cuda_home is None:
        # Try common CUDA locations
        for path in ['/usr/local/cuda', '/opt/cuda', '/usr/cuda']:
            if os.path.exists(path):
                cuda_home = path
                break

    if cuda_home is None:
        warnings.warn('CUDA installation not found. CUDA extensions will not be built.')
        return []

    return [
        os.path.join(cuda_home, 'include'),
        os.path.join(cuda_home, 'samples', 'common', 'inc'),
    ]
136191
192+
137193# Base extension configuration
138194base_include_dirs = [
139195 os .path .dirname (torch .__file__ ) + '/include' ,
@@ -152,7 +208,34 @@ def get_cuda_include_dirs():
152208 warnings .warn (f"C++ source file not found: { cpp_source } " )
153209 raise FileNotFoundError (f"Missing source file: { cpp_source } " )
154210
# Determine whether the CUDA extension should be built for this build mode.
# 'cpu' never builds CUDA; 'gpu' demands it but degrades gracefully; 'auto'
# builds it opportunistically when both the runtime and source are present.
should_build_cuda = False

if build_mode == 'cpu':
    print("CPU-only build requested - skipping CUDA")
elif build_mode == 'gpu':
    print("GPU build requested")
    if not torch.cuda.is_available():
        print("WARNING: GPU build requested but PyTorch CUDA not available")
        print("   Falling back to CPU-only build")
    elif not os.path.exists(cuda_source):
        print("WARNING: GPU build requested but CUDA source file not found")
        print("   Falling back to CPU-only build")
    else:
        should_build_cuda = True
else:  # auto mode
    # Default behavior: prefer GPU if available, otherwise CPU
    if torch.cuda.is_available() and os.path.exists(cuda_source):
        print("Auto-detected: Building GPU extension (CUDA available)")
        should_build_cuda = True
    else:
        print("Auto-detected: Building CPU-only extension (CUDA not available)")
237+
238+ if should_build_cuda :
156239 print ("Building CUDA extension..." )
157240 cuda_include_dirs = get_cuda_include_dirs ()
158241 if cuda_include_dirs :
@@ -161,35 +244,34 @@ def get_cuda_include_dirs():
161244 name = 'sparse_transformers.sparse_transformers' ,
162245 sources = [cpp_source , cuda_source ],
163246 include_dirs = base_include_dirs ,
164- extra_compile_args = {
165- 'cxx' : cpu_compile_args ,
166- 'nvcc' : cuda_compile_args
167- },
247+ extra_compile_args = {'cxx' : cpu_compile_args , 'nvcc' : cuda_compile_args },
168248 extra_link_args = extra_link_args ,
169249 libraries = ['gomp' , 'cudart' ],
170250 library_dirs = [str (build_dir / 'lib' )],
171- define_macros = [('WITH_CUDA' , None )]
251+ define_macros = [('WITH_CUDA' , None )],
172252 )
173253 else :
174- print ("CUDA include directories not found, falling back to CPU-only extension..." )
175- raise RuntimeError ("CUDA headers not found" )
176- else :
254+ print (
255+ "CUDA include directories not found, falling back to CPU-only extension..."
256+ )
257+ should_build_cuda = False
258+
259+ if not should_build_cuda :
177260 print ("Building CPU-only extension..." )
178- cuda_include_dirs = get_cuda_include_dirs ()
179- if cuda_include_dirs :
180- base_include_dirs .extend (cuda_include_dirs )
181261 extension = CppExtension (
182262 name = 'sparse_transformers.sparse_transformers' ,
183263 sources = [cpp_source ],
184264 extra_compile_args = cpu_compile_args ,
185265 extra_link_args = extra_link_args ,
186266 library_dirs = [str (build_dir / 'lib' )],
187267 include_dirs = base_include_dirs ,
188- libraries = ['gomp' ]
268+ libraries = ['gomp' ],
269+ define_macros = [('CPU_ONLY' , None )],
189270 )
190271
191272ext_modules .append (extension )
192- print (f"Extension configured successfully: { extension .name } " )
273+ build_type = "CUDA" if should_build_cuda else "CPU-only"
274+ print (f"Extension configured successfully: { extension .name } ({ build_type } )" )
193275
194276
195277# Custom build extension to handle clean builds and ABI compatibility
@@ -203,18 +285,19 @@ def get_ext_fullpath(self, ext_name):
203285 # Override to ensure extension is built in our build directory
204286 filename = self .get_ext_filename (ext_name )
205287 return str (build_dir / 'lib' / filename )
206-
288+
207289 def build_extensions (self ):
208290 # Disable parallel build for better error reporting and CUDA compatibility
209291 if self .parallel :
210292 self .parallel = False
211-
293+
212294 # Print compilation info for debugging
213295 print (f"Building extensions with PyTorch { torch .__version__ } " )
214296 print (f"PyTorch C++ ABI: { pytorch_abi_flag } " )
215297 super ().build_extensions ()
216298 print ("C++ extension built successfully!" )
217299
300+
218301# Read requirements from requirements.txt
219302def read_requirements ():
220303 requirements_path = Path (__file__ ).parent / 'requirements.txt'
@@ -228,6 +311,7 @@ def read_requirements():
228311 return requirements
229312 return []
230313
314+
231315setup (
232316 name = 'sparse_transformers' ,
233317 version = '0.0.1' ,
@@ -241,4 +325,4 @@ def read_requirements():
241325 python_requires = '>=3.8' ,
242326 include_package_data = True ,
243327 zip_safe = False , # Required for C++ extensions
244- )
328+ )
0 commit comments