Python os Module: File System Operations Every Script Needs
Every automation script eventually needs to touch the file system — create a directory, check if a file exists, read a config value from the environment.
The os module is Python's interface to the operating system. No shell required.
Free: AI Publishing Checklist — 7 steps in Python · Full pipeline: germy5.gumroad.com/l/xhxkzz (pay what you want, min $9.99)
What os gives you
import os
# Where am I right now?
print(os.getcwd()) # /Users/yamil/projects/pipeline
# What's in this directory?
print(os.listdir(".")) # ['main.py', 'state.json', 'chapters']
# Does this path exist?
print(os.path.exists("state.json")) # True or False
Three things you can do without touching the shell, without subprocess, and without any external packages. os ships with Python and works the same on macOS, Linux, and Windows.
os.path — the building blocks
os.path handles path string manipulation. You give it strings, it gives you strings back.
import os
path = "/Users/yamil/projects/pipeline/state.json"
os.path.dirname(path) # '/Users/yamil/projects/pipeline'
os.path.basename(path) # 'state.json'
os.path.splitext(path) # ('/Users/yamil/projects/pipeline/state', '.json')
os.path.abspath("state.json") # resolves relative path to absolute
# Join paths safely (handles slashes correctly on all platforms)
os.path.join("output", "2026", "chapters", "ch01.md")
# 'output/2026/chapters/ch01.md'
# Check what kind of path it is
os.path.exists(path) # True if file or directory exists
os.path.isfile(path) # True only for files
os.path.isdir(path) # True only for directories
The most common gotcha: never build paths with string concatenation ("dir" + "/" + "file"). Use os.path.join() — it handles the separator correctly regardless of operating system.
Working with directories
import os
# Where are you?
print(os.getcwd())
# Change working directory
os.chdir("/Users/yamil/projects")
print(os.getcwd()) # /Users/yamil/projects
# List directory contents (returns filenames as strings)
entries = os.listdir(".")
# ['pipeline', 'notes.md', 'env']
# Filter to just files
files = [e for e in os.listdir(".") if os.path.isfile(e)]
# Create one directory (raises FileExistsError if it already exists)
os.mkdir("output")
# Create nested directories — equivalent to mkdir -p
os.makedirs("output/2026/chapters", exist_ok=True)
# exist_ok=True: no error if the directory already exists
os.makedirs() with exist_ok=True is the pattern you want in almost every script. It's idempotent — safe to call every time the script runs.
Creating, removing, and renaming
import os
# Create a file (via open, not os — but os handles the rest)
with open("draft.txt", "w") as f:
f.write("content")
# Rename a file (fails if destination exists on some systems)
os.rename("draft.txt", "final.txt")
# Remove a file
os.remove("final.txt")
# Remove an empty directory
os.rmdir("empty_dir")
# For non-empty directories, use shutil
import shutil
shutil.rmtree("output_dir") # deletes everything inside
Before vs after renaming with a safety check:
# Before: risky — what if the destination already exists?
os.rename("temp_output.json", "state.json")
# After: explicit check first
if os.path.exists("state.json"):
os.remove("state.json")
os.rename("temp_output.json", "state.json")
There's a better way to do this — covered in the next section.
os.replace() — atomic file replacement
This is the most important os function most beginners don't know about.
import os, json
def save_state(state: dict, state_file: str) -> None:
    """Persist *state* as JSON without ever exposing a half-written file.

    The data is first written to a sibling ``.tmp`` file, then swapped
    into place with ``os.replace``. On POSIX systems that swap is a
    single atomic syscall: a crash before it leaves the old file intact,
    and after it the new file is fully in place — never an in-between
    state.
    """
    scratch = state_file + ".tmp"
    # Stage the full payload in the temp file before touching the target.
    with open(scratch, "w", encoding="utf-8") as handle:
        json.dump(state, handle, indent=2)
    # All-or-nothing swap; readers see either the old or the new content.
    os.replace(scratch, state_file)
Why this matters:
# Naive approach — dangerous
with open("state.json", "w") as f:
json.dump(state, f) # if Python crashes mid-write, file is corrupted
# Safe approach — atomic
with open("state.json.tmp", "w") as f:
json.dump(state, f)
os.replace("state.json.tmp", "state.json") # all-or-nothing
Unlike os.rename(), os.replace() silently overwrites the destination if it exists. On POSIX systems (macOS, Linux), the replacement is atomic at the OS level โ readers either see the old file or the new one, never a partial write.
Use this pattern any time you're writing state files, config files, or any output that another process might be reading concurrently.
Environment variables
import os
# Read a variable (KeyError if not set)
api_key = os.environ["OPENAI_API_KEY"]
# Read with a default (no error if not set)
model = os.getenv("MODEL", "gpt-4o")
debug = os.getenv("DEBUG", "false").lower() == "true"
# Check if a variable exists
if "DATABASE_URL" in os.environ:
print("Database configured")
# Set a variable for the current process (and subprocesses)
os.environ["LOG_LEVEL"] = "INFO"
# Get all environment variables as a dict
env_copy = os.environ.copy()
env_copy["EXTRA_VAR"] = "injected"
# Pass env_copy to subprocess.run() to give subprocess a modified environment
The os.getenv() pattern with a default is the right approach for optional config. Never hardcode API keys or paths โ read them from the environment.
# Pipeline config pattern
import os
API_KEY = os.environ["OPENAI_API_KEY"] # required — fail fast if missing
MODEL = os.getenv("PIPELINE_MODEL", "gpt-4o") # optional with default
MAX_RETRY = int(os.getenv("MAX_RETRY", "3")) # type conversion after getenv
OUTPUT = os.getenv("OUTPUT_DIR", "chapters") # directory override
Walking directory trees: os.walk()
os.listdir() gives you one directory level. os.walk() gives you everything recursively.
import os
# Walk the entire directory tree
for dirpath, dirnames, filenames in os.walk("chapters"):
print(f"Dir: {dirpath}")
print(f"Dirs: {dirnames}")
print(f"Files: {filenames}")
print()
Dir: chapters
Dirs: ['en', 'es']
Files: []
Dir: chapters/en
Dirs: []
Files: ['ch01.md', 'ch02.md', 'ch03.md']
Dir: chapters/es
Dirs: []
Files: ['ch01.md', 'ch02.md']
Practical pattern โ collect all Markdown files:
import os
def find_markdown_files(root: str) -> list[str]:
    """Recursively collect the path of every ``.md`` file under *root*.

    Paths are assembled with ``os.path.join`` starting from *root*, so
    they are absolute only when *root* itself is absolute.
    """
    found: list[str] = []
    for current_dir, _subdirs, names in os.walk(root):
        found.extend(
            os.path.join(current_dir, name)
            for name in names
            if name.endswith(".md")
        )
    return found
md_files = find_markdown_files("chapters")
print(f"Found {len(md_files)} markdown files")
os.path vs pathlib — when to use each
pathlib is the modern alternative. For new code, it's usually the better choice.
# os.path — verbose but explicit
import os
base = os.path.dirname(os.path.abspath(__file__))
config = os.path.join(base, "config", "settings.json")
os.makedirs(os.path.dirname(config), exist_ok=True)
# pathlib — cleaner, same result
from pathlib import Path
base = Path(__file__).parent
config = base / "config" / "settings.json"
config.parent.mkdir(parents=True, exist_ok=True)
Use os when you need:
- os.replace() for atomic writes — pathlib has no equivalent
- os.environ / os.getenv() for environment variables
- os.walk() (pathlib has rglob(), which is usually cleaner, but os.walk() gives you more control)
- Compatibility with code or libraries that only accept strings
Use pathlib when you need:
- Path manipulation in new code
- Reading/writing files (path.read_text(), path.write_text())
- Glob patterns (path.glob("*.json"))
- Cleaner API in general
In practice, most real automation scripts use both.
Real pipeline example: organizing generated chapter files
This is the pattern from the ebook publishing pipeline — generating chapter files and organizing them into per-language directories.
import os
import json
def organize_chapters(
    source_dir: str,
    output_base: str,
    languages: list[str],
) -> dict[str, list[str]]:
    """Move generated chapter files from a flat source directory
    into ``output_base/<lang>/`` subdirectories.

    Expects filenames like ``ch01_en.md``, ``ch01_es.md``: the text after
    the last underscore in the stem is the language code. Files that are
    not ``.md``, have no underscore-separated suffix, or use a language
    not listed in *languages* are skipped and left in place.

    Returns a mapping of language code -> list of destination paths for
    the files that were moved.
    """
    organized: dict[str, list[str]] = {lang: [] for lang in languages}
    # Create output directories up front — exist_ok makes this idempotent.
    for lang in languages:
        lang_dir = os.path.join(output_base, lang)
        os.makedirs(lang_dir, exist_ok=True)
    # Scan the flat source directory and route each file by its suffix.
    # (os.listdir, not os.walk: the source layout is one level deep.)
    for filename in os.listdir(source_dir):
        if not filename.endswith(".md"):
            continue
        stem, _ = os.path.splitext(filename)  # 'ch01_en'
        parts = stem.rsplit("_", 1)           # ['ch01', 'en']
        if len(parts) != 2:
            continue
        chapter_id, lang = parts
        if lang not in languages:
            continue
        src = os.path.join(source_dir, filename)
        dst = os.path.join(output_base, lang, filename)
        # Atomic replace — safe even if the destination exists from a
        # previous partial run; it is silently overwritten.
        os.replace(src, dst)
        organized[lang].append(dst)
        # Fixed: the original print had a garbled placeholder instead of
        # the filename being moved.
        print(f"  {filename} -> {lang}/")
    return organized
def save_manifest(organized: dict, manifest_path: str) -> None:
    """Write the chapter manifest atomically.

    Dumps *organized* as JSON into a ``.tmp`` sibling file first, then
    swaps it into place with ``os.replace`` so a concurrent reader never
    observes a partially written manifest.
    """
    scratch = manifest_path + ".tmp"
    with open(scratch, "w", encoding="utf-8") as out:
        json.dump(organized, out, indent=2)
    os.replace(scratch, manifest_path)
# Usage
result = organize_chapters(
source_dir="generated",
output_base="chapters",
languages=["en", "es"],
)
save_manifest(result, "chapters/manifest.json")
print(f"Organized {sum(len(v) for v in result.values())} files")
This script is safe to re-run. os.makedirs(..., exist_ok=True) won't fail if directories already exist. os.replace() won't fail if the destination already exists from a previous partial run.
The pipeline uses os and pathlib together for every file operation — organizing chapters, managing state files, and cleaning up temp dirs: germy5.gumroad.com/l/xhxkzz — pay what you want, min $9.99.
Top comments (0)