12.4. Parallel Processing and Performance#
12.4.1. Common Pitfalls#
1. Too many parallel processes overwhelming the system
# Bad: No limit on parallelism
find . -type f | xargs process_file
# Can spawn thousands of processes, freezing system
# NOTE(review): plain xargs actually runs invocations *sequentially*,
# batching many filenames per call — unbounded process storms come from
# `xargs -P 0` or an un-throttled `cmd &` loop, not from bare xargs.
# Good: Set reasonable limit
find . -type f | xargs -P 4 process_file
2. Not handling failures in parallel execution
# Bad: Errors silently disappear
for file in *.txt; do
process_file "$file" &
done
wait
# Better: Capture and report failures
# The job must run in the background, but the bookkeeping must run in the
# parent shell.  An `if ! process_file ...; fi &` construct would push the
# whole test into a subshell, so the parent's failed_files array would
# never change and every failure would be lost.  Instead: launch the jobs,
# remember each PID, then check each one's exit status via wait.
declare -a pids=() names=() failed_files=()
for file in *.txt; do
    process_file "$file" &
    pids+=("$!")
    names+=("$file")
done
# wait PID returns that job's exit status, one job at a time
for i in "${!pids[@]}"; do
    if ! wait "${pids[$i]}"; then
        failed_files+=("${names[$i]}")
    fi
done
if [[ ${#failed_files[@]} -gt 0 ]]; then
    echo "Failed: ${failed_files[*]}" >&2
    exit 1
fi
3. Race conditions with shared resources
# Bad: Multiple processes writing to same file
find . -name "*.txt" | \
xargs -P 5 -I {} bash -c 'wc -l "$1" >> output.txt' _ {}
# Race condition: output.txt gets corrupted
# Good: Write each result to a unique temp file, then append it in one go.
# mktemp avoids collisions and symlink attacks; a hand-rolled temp_$$.txt
# is predictable and unsafe in shared directories.
find . -name "*.txt" -print0 | \
xargs -0 -P 5 -I {} bash -c '
    tmp=$(mktemp) || exit 1
    wc -l "$1" > "$tmp"
    cat "$tmp" >> output.txt
    rm -f "$tmp"
' _ {}
# Or better still: let a single process do all the writing
find . -name "*.txt" -exec wc -l {} + > output.txt
4. Excessive memory usage with large data
# Bad: Each parallel job loads entire file
# NOTE(review): awk streams line-by-line, so "loads entire file" is not
# accurate — the real cost of -P 10 is ten concurrent readers competing
# for I/O and cache.  Also note each job prints its OWN per-file sum;
# the outputs are not combined.
find . -name "*.csv" | \
xargs -P 10 -I {} awk '{sum += $2} END {print sum}' {}
# Better: Stream data, limit parallelism
find . -name "*.csv" | \
xargs -P 2 -I {} awk '{sum += $2} END {print sum}' {}
5. Not waiting for background jobs
# Bad: Script exits before jobs complete
for file in *.txt; do
process_file "$file" &
done
# Script may exit while jobs still running
# Good: Always wait
for file in *.txt; do
process_file "$file" &
done
# A bare `wait` blocks until every child of this shell has exited.
wait # Don't exit until all jobs done
12.4.2. Real-World Example: Parallel File Processing Pipeline#
#!/bin/bash
set -euo pipefail
# PARALLEL FILE PROCESSING PIPELINE
# Compress, verify, and archive large log files
# Usage: script [num_workers]   (worker count defaults to 4)
readonly LOG_DIR="/var/log"
readonly ARCHIVE_DIR="/backup/logs"
readonly NUM_WORKERS=${1:-4}
readonly MAX_SIZE=$((1024 * 1024 * 1024)) # 1GB
create_worker_pool() {
    # Create one named pipe per worker and reset the global PID registry.
    # $1 - number of workers in the pool
    # Side effects: declares global array WORKER_PIDS; creates worker_N.fifo
    # files in the current directory (existing FIFOs are silently reused).
    local -i pool_size=$1
    local -i idx
    declare -ag WORKER_PIDS=()
    for ((idx = 0; idx < pool_size; idx++)); do
        mkfifo "worker_${idx}.fifo" 2>/dev/null || true
    done
}
process_file_worker() {
    # Compress one log file into $ARCHIVE_DIR, verify the archive, and
    # delete the original on success.
    # Globals:   ARCHIVE_DIR (read) - destination directory for .gz files
    # Arguments: $1 - path of the file to process
    # Outputs:   progress tags ([SKIP]/[ZIP]/[VERIFY]) to stdout, errors to stderr
    # Returns:   0 on success or skip, 1 on compression/verification failure
    local file=$1
    local name size
    # Declarations are separated from $(...) so a failing command is not
    # masked by `local` always returning 0.
    name=$(basename "$file")
    # stat can fail if the file vanished between find and now; treat as size 0
    size=$(stat -c%s "$file" 2>/dev/null || echo 0)
    if [[ $size -lt 1000 ]]; then
        echo "[SKIP] $name (too small)"
        return 0
    fi
    # Compress
    if gzip -9 -c "$file" > "$ARCHIVE_DIR/$name.gz"; then
        echo "[ZIP] $name"
    else
        echo "[ERROR] Failed to compress $name" >&2
        return 1
    fi
    # Verify BEFORE deleting the original; a corrupt archive is removed so
    # a rerun can retry this file.  `--` guards against names starting with '-'.
    if gzip -t "$ARCHIVE_DIR/$name.gz"; then
        echo "[VERIFY] $name OK"
        rm -f -- "$file"
    else
        echo "[ERROR] Verification failed: $name" >&2
        rm -f -- "$ARCHIVE_DIR/$name.gz"
        return 1
    fi
}
# Export the worker function and the variable it reads so the bash -c
# children spawned by xargs can see them.  `export -f` accepts ONLY
# function names — the original `export -f process_file_worker ARCHIVE_DIR`
# fails on ARCHIVE_DIR, leaving it unexported and the children writing
# to "/name.gz".
export -f process_file_worker
export ARCHIVE_DIR
# Main processing
mkdir -p "$ARCHIVE_DIR"
echo "Starting parallel file processing with $NUM_WORKERS workers..."
# find -size counts 512-byte blocks by default; the "c" suffix makes
# MAX_SIZE mean bytes as intended.  -print0/-0 keeps filenames with
# spaces or newlines intact.
find "$LOG_DIR" -maxdepth 1 -type f -name "*.log" -size +"${MAX_SIZE}"c -print0 | \
xargs -0 -P "$NUM_WORKERS" -I {} bash -c 'process_file_worker "$@"' _ {}
echo "Compression complete"
# Report statistics; tolerate an empty archive dir (du with no operands
# would otherwise fail under `set -o pipefail`)
total_compressed=$(find "$ARCHIVE_DIR" -name "*.gz" -exec du -ch {} + 2>/dev/null | tail -1 || true)
echo "Total archived: ${total_compressed:-0}"
12.4.3. Performance Optimization Techniques#
12.4.3.1. Tuning Parallelism#
#!/bin/bash
# Determine optimal parallelism
# Echo the number of CPUs, falling back to 1 when nproc is unavailable.
# Without 2>/dev/null a missing nproc would print "command not found"
# to stderr even though the fallback succeeds.
get_cpu_count() {
    nproc 2>/dev/null || echo 1
}
# General rule: I/O-bound tasks can use 2-4x CPU count
# CPU-bound tasks should use 1x CPU count
# Network tasks might need more
# Adaptive parallelism based on load
run_parallel_task() {
    # Process *.txt files in parallel, scaling the job count down when the
    # 1-minute load average already exceeds the core count.
    # Globals: none written.  Requires /proc/loadavg (Linux).
    local max_jobs current_load cpu_count
    # Split declaration from assignment so command failures are not
    # masked by `local` returning 0.
    cpu_count=$(nproc 2>/dev/null || echo 1)
    max_jobs=$((cpu_count * 2))
    current_load=$(cut -d' ' -f1 /proc/loadavg 2>/dev/null || echo 0)
    # Float comparison via awk instead of bc — bc is often not installed.
    if awk -v load="$current_load" -v cores="$cpu_count" \
        'BEGIN { exit !(load > cores) }'; then
        max_jobs=$cpu_count
    fi
    # NUL-delimit so filenames with spaces/newlines survive the pipe.
    find . -name "*.txt" -print0 | \
        xargs -0 -P "$max_jobs" process_file
}
# Memory-aware parallelism
run_memory_safe_parallel() {
    # Run mysql over each *.sql file, limiting parallelism by available RAM.
    # Bug fixes vs. original:
    #  * `free -h` prints human-readable values ("1.5Gi"), so int($7) was
    #    meaningless — use `free -m` to get plain megabytes.
    #  * `mysql < {}` had the redirection opened by the PARENT shell as a
    #    literal file named "{}" before xargs ever substituted the name;
    #    the redirection must happen inside the child bash -c.
    local available_mem max_jobs
    local per_task_mem=500 # MB per task
    # NR==2 is the "Mem:" row; column 7 is "available" (modern procps)
    available_mem=$(free -m | awk 'NR==2 {print int($7)}')
    max_jobs=$((available_mem / per_task_mem))
    [[ $max_jobs -lt 1 ]] && max_jobs=1
    echo "Running $max_jobs parallel jobs (${available_mem}M available)"
    find . -name "*.sql" -print0 | \
        xargs -0 -P "$max_jobs" -I {} bash -c 'mysql < "$1"' _ {}
}
12.4.3.2. Avoiding Common Performance Issues#
#!/bin/bash
# Problem 1: Too many processes saturate I/O
# Bad: Unlimited parallelism
# (-P 0 spawns as many processes as xargs can — one per pending input)
find . -type f | xargs -P 0 process_file
# Better: Limited to reasonable number
find . -type f | xargs -P 4 process_file
# Problem 2: Context switching overhead
# Bad: More processes than cores
parallel_tasks=$(nproc)
find . -type f | xargs -P $((parallel_tasks * 10)) process_file
# Better: Match core count
find . -type f | xargs -P "$parallel_tasks" process_file
# Problem 3: Memory exhaustion with large parallelism
# Bad: Each process copies large data
for i in {1..100}; do
process_large_dataset &
done
wait
# Better: Queue-based approach with limited parallelism
# (wait -n requires bash 4.3+)
queue=()
max_queue=4
for dataset in large_*.txt; do
if [[ ${#queue[@]} -ge $max_queue ]]; then
# NOTE(review): wait -n reaps whichever job finishes FIRST, which may
# not be queue[0]; the shift below only keeps the element count right,
# not the exact set of live PIDs — acceptable as a throttle, not as
# precise PID tracking.
wait -n
queue=("${queue[@]:1}")
fi
process_large_dataset "$dataset" &
# NOTE(review): prefer queue+=("$!") — quoting is harmless here but
# consistent with the rest of the chapter.
queue+=($!)
done
wait "${queue[@]}"
12.4.4. Parallel Processing with xargs#
The xargs command builds command lines from standard input; with the -P option it can run those commands in parallel.
12.4.4.1. xargs for Parallelism#
#!/bin/bash
# Sequential (default)
find . -name "*.log" | xargs gzip
# Equivalent to: gzip file1 file2 file3... (all at once)
# Parallel: Run 4 processes at a time
find . -name "*.log" | xargs -P 4 -I {} gzip {}
# -P 4: Use 4 parallel processes
# -I {}: Replace {} with the input line
# Practical examples
# Resize images in parallel (4 at a time)
find . -name "*.jpg" | \
xargs -P 4 -I {} convert {} -resize 800x600 {}.resized
# Download multiple URLs in parallel
cat urls.txt | \
xargs -P 5 -I {} wget -q {}
# Process files with a custom function
# NOTE(review): process_data must be exported first (export -f process_data)
# or the child bash -c will not find it.
find . -name "*.csv" | \
xargs -P 3 -I {} bash -c 'process_data "$1"' _ {}
# Kill processes by pattern in parallel
pgrep nginx | \
xargs -P 0 kill
# -P 0: Use as many parallel processes as needed
# (sends SIGTERM; destructive — shown for illustration only)
12.4.4.2. Advanced xargs Patterns#
#!/bin/bash
# Batch processing: Process N items per invocation
find . -type f | \
xargs -L 10 batch_process
# -L 10: Pass 10 lines per invocation
# (compare -n 10, which counts arguments rather than input lines)
# Handle special characters safely
find . -name "*.txt" -print0 | \
xargs -0 -P 4 process_file
# -print0: NUL-terminate filenames
# -0: Accept NUL-terminated input
# Parallel with error handling
# Each child reports its own failures to stderr instead of aborting the batch.
find . -name "*.log" | \
xargs -P 4 -I {} bash -c '
if ! gzip "$1"; then
echo "Failed: $1" >&2
fi
' _ {}
# Limit resource usage with xargs
# Process in batches with size limit
find . -size +1G -type f | \
xargs -P 2 -I {} bash -c '
# Only 2 processes to avoid I/O saturation
rsync -av "$1" /backup/
' _ {}
12.4.5. Parallel Processing Fundamentals#
Running multiple tasks simultaneously can dramatically improve performance for I/O-bound and embarrassingly parallel workloads.
12.4.5.1. Basic Parallel Execution with &#
#!/bin/bash
# BASIC PARALLEL PROCESSING WITH &
# Sequential (slow)
for file in *.txt; do
process_file "$file" # Takes 10 seconds each
done
# Total: 100 seconds for 10 files
# Parallel (fast)
for file in *.txt; do
process_file "$file" & # Run in background
done
wait # Wait for all to complete
# Total: ~10 seconds (all files run concurrently; overall time is that of the slowest file)
# Controlled parallelism (optimal)
# Run at most 4 jobs at a time (wait -n requires bash 4.3+)
counter=0
for file in *.txt; do
if [[ $counter -ge 4 ]]; then
wait -n # Wait for one job to finish
((counter--))
fi
process_file "$file" &
# NOTE(review): under `set -e`, ((counter++)) exits when counter is 0
# because the expression evaluates to 0; this snippet does not use set -e.
((counter++))
done
wait # Wait for remaining jobs
12.4.5.2. Managing Background Jobs#
#!/bin/bash
# Start multiple background jobs
long_task1 &
pid1=$!
long_task2 &
pid2=$!
long_task3 &
pid3=$!
# Wait for all to complete
wait $pid1 $pid2 $pid3
# Get exit codes
# NOTE(review): waiting on a PID that was already reaped by the `wait`
# above returns 127 ("not a child of this shell"), not the job's status —
# capture per-PID statuses with individual `wait $pid; code=$?` calls
# BEFORE any collective wait.
wait $pid1
code1=$?
# Kill job if it takes too long
# timeout itself terminates the task after 30s (wait then returns 124);
# the kill below targets the timeout wrapper if it is still running.
timeout 30 long_running_task &
pid=$!
wait $pid || {
echo "Task timed out, killing..."
kill $pid 2>/dev/null
}