12.4. Parallel Processing and Performance#
12.4.1. Common Pitfalls#
1. Too many parallel processes overwhelming the system
# Bad: No limit on parallelism
find . -type f | xargs process_file
# Can spawn thousands of processes, freezing system
# NOTE(review): plain xargs actually runs invocations *sequentially*,
# batching many filenames per call — unbounded process storms come from
# `xargs -P 0` or an un-throttled `cmd &` loop, not from bare xargs.
# Good: Set reasonable limit
find . -type f | xargs -P 4 process_file
2. Not handling failures in parallel execution
# Bad: Errors silently disappear
for file in *.txt; do
process_file "$file" &
done
wait
# Better: Capture and report failures
# The job must run in the background, but the bookkeeping must run in the
# parent shell.  An `if ! process_file ...; fi &` construct would push the
# whole test into a subshell, so the parent's failed_files array would
# never change and every failure would be lost.  Instead: launch the jobs,
# remember each PID, then check each one's exit status via wait.
declare -a pids=() names=() failed_files=()
for file in *.txt; do
    process_file "$file" &
    pids+=("$!")
    names+=("$file")
done
# wait PID returns that job's exit status, one job at a time
for i in "${!pids[@]}"; do
    if ! wait "${pids[$i]}"; then
        failed_files+=("${names[$i]}")
    fi
done
if [[ ${#failed_files[@]} -gt 0 ]]; then
    echo "Failed: ${failed_files[*]}" >&2
    exit 1
fi
3. Race conditions with shared resources
# Bad: Multiple processes writing to same file
find . -name "*.txt" | \
xargs -P 5 -I {} bash -c 'wc -l "$1" >> output.txt' _ {}
# Race condition: output.txt gets corrupted
# Good: Write each result to a unique temp file, then append it in one go.
# mktemp avoids collisions and symlink attacks; a hand-rolled temp_$$.txt
# is predictable and unsafe in shared directories.
find . -name "*.txt" -print0 | \
xargs -0 -P 5 -I {} bash -c '
    tmp=$(mktemp) || exit 1
    wc -l "$1" > "$tmp"
    cat "$tmp" >> output.txt
    rm -f "$tmp"
' _ {}
# Or better still: let a single process do all the writing
find . -name "*.txt" -exec wc -l {} + > output.txt
4. Excessive memory usage with large data
# Bad: Each parallel job loads entire file
# NOTE(review): awk streams line-by-line, so "loads entire file" is not
# accurate — the real cost of -P 10 is ten concurrent readers competing
# for I/O and cache.  Also note each job prints its OWN per-file sum;
# the outputs are not combined.
find . -name "*.csv" | \
xargs -P 10 -I {} awk '{sum += $2} END {print sum}' {}
# Better: Stream data, limit parallelism
find . -name "*.csv" | \
xargs -P 2 -I {} awk '{sum += $2} END {print sum}' {}
5. Not waiting for background jobs
# Bad: Script exits before jobs complete
for file in *.txt; do
process_file "$file" &
done
# Script may exit while jobs still running
# Good: Always wait
for file in *.txt; do
process_file "$file" &
done
# A bare `wait` blocks until every child of this shell has exited.
wait # Don't exit until all jobs done
12.4.2. Real-World Example: Parallel File Processing Pipeline#
#!/bin/bash
set -euo pipefail
# PARALLEL FILE PROCESSING PIPELINE
# Compress, verify, and archive large log files
# Usage: script [num_workers]   (worker count defaults to 4)
readonly LOG_DIR="/var/log"
readonly ARCHIVE_DIR="/backup/logs"
readonly NUM_WORKERS=${1:-4}
readonly MAX_SIZE=$((1024 * 1024 * 1024)) # 1GB
create_worker_pool() {
    # Create one named pipe per worker and reset the global PID registry.
    # $1 - number of workers in the pool
    # Side effects: declares global array WORKER_PIDS; creates worker_N.fifo
    # files in the current directory (existing FIFOs are silently reused).
    local -i pool_size=$1
    local -i idx
    declare -ag WORKER_PIDS=()
    for ((idx = 0; idx < pool_size; idx++)); do
        mkfifo "worker_${idx}.fifo" 2>/dev/null || true
    done
}
process_file_worker() {
    # Compress one log file into $ARCHIVE_DIR, verify the archive, and
    # delete the original on success.
    # Globals:   ARCHIVE_DIR (read) - destination directory for .gz files
    # Arguments: $1 - path of the file to process
    # Outputs:   progress tags ([SKIP]/[ZIP]/[VERIFY]) to stdout, errors to stderr
    # Returns:   0 on success or skip, 1 on compression/verification failure
    local file=$1
    local name size
    # Declarations are separated from $(...) so a failing command is not
    # masked by `local` always returning 0.
    name=$(basename "$file")
    # stat can fail if the file vanished between find and now; treat as size 0
    size=$(stat -c%s "$file" 2>/dev/null || echo 0)
    if [[ $size -lt 1000 ]]; then
        echo "[SKIP] $name (too small)"
        return 0
    fi
    # Compress
    if gzip -9 -c "$file" > "$ARCHIVE_DIR/$name.gz"; then
        echo "[ZIP] $name"
    else
        echo "[ERROR] Failed to compress $name" >&2
        return 1
    fi
    # Verify BEFORE deleting the original; a corrupt archive is removed so
    # a rerun can retry this file.  `--` guards against names starting with '-'.
    if gzip -t "$ARCHIVE_DIR/$name.gz"; then
        echo "[VERIFY] $name OK"
        rm -f -- "$file"
    else
        echo "[ERROR] Verification failed: $name" >&2
        rm -f -- "$ARCHIVE_DIR/$name.gz"
        return 1
    fi
}
# Export the worker function and the variable it reads so the bash -c
# children spawned by xargs can see them.  `export -f` accepts ONLY
# function names — the original `export -f process_file_worker ARCHIVE_DIR`
# fails on ARCHIVE_DIR, leaving it unexported and the children writing
# to "/name.gz".
export -f process_file_worker
export ARCHIVE_DIR
# Main processing
mkdir -p "$ARCHIVE_DIR"
echo "Starting parallel file processing with $NUM_WORKERS workers..."
# find -size counts 512-byte blocks by default; the "c" suffix makes
# MAX_SIZE mean bytes as intended.  -print0/-0 keeps filenames with
# spaces or newlines intact.
find "$LOG_DIR" -maxdepth 1 -type f -name "*.log" -size +"${MAX_SIZE}"c -print0 | \
xargs -0 -P "$NUM_WORKERS" -I {} bash -c 'process_file_worker "$@"' _ {}
echo "Compression complete"
# Report statistics; tolerate an empty archive dir (du with no operands
# would otherwise fail under `set -o pipefail`)
total_compressed=$(find "$ARCHIVE_DIR" -name "*.gz" -exec du -ch {} + 2>/dev/null | tail -1 || true)
echo "Total archived: ${total_compressed:-0}"
12.4.3. Performance Optimization Techniques#
12.4.3.1. Tuning Parallelism#
#!/bin/bash
# Determine optimal parallelism
# Echo the number of CPUs, falling back to 1 when nproc is unavailable.
# Without 2>/dev/null a missing nproc would print "command not found"
# to stderr even though the fallback succeeds.
get_cpu_count() {
    nproc 2>/dev/null || echo 1
}
# General rule: I/O-bound tasks can use 2-4x CPU count
# CPU-bound tasks should use 1x CPU count
# Network tasks might need more
# Adaptive parallelism based on load
run_parallel_task() {
    # Process *.txt files in parallel, scaling the job count down when the
    # 1-minute load average already exceeds the core count.
    # Globals: none written.  Requires /proc/loadavg (Linux).
    local max_jobs current_load cpu_count
    # Split declaration from assignment so command failures are not
    # masked by `local` returning 0.
    cpu_count=$(nproc 2>/dev/null || echo 1)
    max_jobs=$((cpu_count * 2))
    current_load=$(cut -d' ' -f1 /proc/loadavg 2>/dev/null || echo 0)
    # Float comparison via awk instead of bc — bc is often not installed.
    if awk -v load="$current_load" -v cores="$cpu_count" \
        'BEGIN { exit !(load > cores) }'; then
        max_jobs=$cpu_count
    fi
    # NUL-delimit so filenames with spaces/newlines survive the pipe.
    find . -name "*.txt" -print0 | \
        xargs -0 -P "$max_jobs" process_file
}
# Memory-aware parallelism
run_memory_safe_parallel() {
    # Run mysql over each *.sql file, limiting parallelism by available RAM.
    # Bug fixes vs. original:
    #  * `free -h` prints human-readable values ("1.5Gi"), so int($7) was
    #    meaningless — use `free -m` to get plain megabytes.
    #  * `mysql < {}` had the redirection opened by the PARENT shell as a
    #    literal file named "{}" before xargs ever substituted the name;
    #    the redirection must happen inside the child bash -c.
    local available_mem max_jobs
    local per_task_mem=500 # MB per task
    # NR==2 is the "Mem:" row; column 7 is "available" (modern procps)
    available_mem=$(free -m | awk 'NR==2 {print int($7)}')
    max_jobs=$((available_mem / per_task_mem))
    [[ $max_jobs -lt 1 ]] && max_jobs=1
    echo "Running $max_jobs parallel jobs (${available_mem}M available)"
    find . -name "*.sql" -print0 | \
        xargs -0 -P "$max_jobs" -I {} bash -c 'mysql < "$1"' _ {}
}
12.4.3.2. Avoiding Common Performance Issues#
#!/bin/bash
# Problem 1: Too many processes saturate I/O
# Bad: Unlimited parallelism
# (-P 0 spawns as many processes as xargs can — one per pending input)
find . -type f | xargs -P 0 process_file
# Better: Limited to reasonable number
find . -type f | xargs -P 4 process_file
# Problem 2: Context switching overhead
# Bad: More processes than cores
parallel_tasks=$(nproc)
find . -type f | xargs -P $((parallel_tasks * 10)) process_file
# Better: Match core count
find . -type f | xargs -P "$parallel_tasks" process_file
# Problem 3: Memory exhaustion with large parallelism
# Bad: Each process copies large data
for i in {1..100}; do
process_large_dataset &
done
wait
# Better: Queue-based approach with limited parallelism
# (wait -n requires bash 4.3+)
queue=()
max_queue=4
for dataset in large_*.txt; do
if [[ ${#queue[@]} -ge $max_queue ]]; then
# NOTE(review): wait -n reaps whichever job finishes FIRST, which may
# not be queue[0]; the shift below only keeps the element count right,
# not the exact set of live PIDs — acceptable as a throttle, not as
# precise PID tracking.
wait -n
queue=("${queue[@]:1}")
fi
process_large_dataset "$dataset" &
# NOTE(review): prefer queue+=("$!") — quoting is harmless here but
# consistent with the rest of the chapter.
queue+=($!)
done
wait "${queue[@]}"
12.4.4. Parallel Processing with xargs#
The xargs command builds command lines from standard input; with the -P option it can run those commands in parallel.
12.4.4.1. xargs for Parallelism#
#!/bin/bash
# Sequential (default)
find . -name "*.log" | xargs gzip
# Equivalent to: gzip file1 file2 file3... (all at once)
# Parallel: Run 4 processes at a time
find . -name "*.log" | xargs -P 4 -I {} gzip {}
# -P 4: Use 4 parallel processes
# -I {}: Replace {} with the input line
# Practical examples
# Resize images in parallel (4 at a time)
find . -name "*.jpg" | \
xargs -P 4 -I {} convert {} -resize 800x600 {}.resized
# Download multiple URLs in parallel
cat urls.txt | \
xargs -P 5 -I {} wget -q {}
# Process files with a custom function
# NOTE(review): process_data must be exported first (export -f process_data)
# or the child bash -c will not find it.
find . -name "*.csv" | \
xargs -P 3 -I {} bash -c 'process_data "$1"' _ {}
# Kill processes by pattern in parallel
pgrep nginx | \
xargs -P 0 kill
# -P 0: Use as many parallel processes as needed
# (sends SIGTERM; destructive — shown for illustration only)
12.4.4.2. Advanced xargs Patterns#
#!/bin/bash
# Batch processing: Process N items per invocation
find . -type f | \
xargs -L 10 batch_process
# -L 10: Pass 10 lines per invocation
# (compare -n 10, which counts arguments rather than input lines)
# Handle special characters safely
find . -name "*.txt" -print0 | \
xargs -0 -P 4 process_file
# -print0: NUL-terminate filenames
# -0: Accept NUL-terminated input
# Parallel with error handling
# Each child reports its own failures to stderr instead of aborting the batch.
find . -name "*.log" | \
xargs -P 4 -I {} bash -c '
if ! gzip "$1"; then
echo "Failed: $1" >&2
fi
' _ {}
# Limit resource usage with xargs
# Process in batches with size limit
find . -size +1G -type f | \
xargs -P 2 -I {} bash -c '
# Only 2 processes to avoid I/O saturation
rsync -av "$1" /backup/
' _ {}
12.4.5. Parallel Processing Fundamentals#
Running multiple tasks simultaneously can dramatically improve performance for I/O-bound and embarrassingly parallel workloads.
12.4.5.1. Basic Parallel Execution with &#
#!/bin/bash
# BASIC PARALLEL PROCESSING WITH &
# Sequential (slow)
for file in *.txt; do
process_file "$file" # Takes 10 seconds each
done
# Total: 100 seconds for 10 files
# Parallel (fast)
for file in *.txt; do
process_file "$file" & # Run in background
done
wait # Wait for all to complete
# Total: ~10 seconds (all files run concurrently; overall time is that of the slowest file)
# Controlled parallelism (optimal)
# Run at most 4 jobs at a time (wait -n requires bash 4.3+)
counter=0
for file in *.txt; do
if [[ $counter -ge 4 ]]; then
wait -n # Wait for one job to finish
((counter--))
fi
process_file "$file" &
# NOTE(review): under `set -e`, ((counter++)) exits when counter is 0
# because the expression evaluates to 0; this snippet does not use set -e.
((counter++))
done
wait # Wait for remaining jobs
12.4.5.2. Managing Background Jobs#
#!/bin/bash
# Start multiple background jobs
long_task1 &
pid1=$!
long_task2 &
pid2=$!
long_task3 &
pid3=$!
# Wait for all to complete
wait $pid1 $pid2 $pid3
# Get exit codes
# NOTE(review): waiting on a PID that was already reaped by the `wait`
# above returns 127 ("not a child of this shell"), not the job's status —
# capture per-PID statuses with individual `wait $pid; code=$?` calls
# BEFORE any collective wait.
wait $pid1
code1=$?
# Kill job if it takes too long
# timeout itself terminates the task after 30s (wait then returns 124);
# the kill below targets the timeout wrapper if it is still running.
timeout 30 long_running_task &
pid=$!
wait $pid || {
echo "Task timed out, killing..."
kill $pid 2>/dev/null
}