## 1. How Python Manages Memory
```
┌─────────────────────────────────────────┐
│           Python Memory Model           │
│                                         │
│  ┌─────────────┐    ┌─────────────────┐ │
│  │    Stack    │    │      Heap       │ │
│  │             │    │                 │ │
│  │ name → ref──┼───►│ Object [42]     │ │
│  │ name → ref──┼───►│ Object ["hi"]   │ │
│  │             │    │ Object [list]   │ │
│  └─────────────┘    └─────────────────┘ │
│                                         │
│  Variables hold REFERENCES, not values  │
└─────────────────────────────────────────┘
```

```python
# Variables are just references (labels) to objects
a = [1, 2, 3]
b = a          # b points to the SAME object, not a copy

b.append(4)
print(a)       # [1, 2, 3, 4] ← a is affected!
print(a is b)  # True — same object in memory

# To copy:
b = a.copy()          # Shallow copy — new list, same inner objects
import copy
b = copy.deepcopy(a)  # Deep copy — fully independent
```

## 2. Reference Counting
Python’s primary memory management strategy:
```python
import sys

a = [1, 2, 3]
print(sys.getrefcount(a))  # 2 (a + getrefcount's argument)

b = a
print(sys.getrefcount(a))  # 3 (a + b + argument)

c = a
print(sys.getrefcount(a))  # 4

del b
print(sys.getrefcount(a))  # 3

# When refcount hits 0 → object is immediately deallocated
del c
del a
# Object freed from memory
```

## 3. Garbage Collection — Handling Cycles
Reference counting alone can’t handle circular references:
```python
import gc

class Node:
    def __init__(self, value):
        self.value = value
        self.next = None

# Circular reference — refcount never hits 0!
a = Node(1)
b = Node(2)
a.next = b
b.next = a  # Cycle!

del a
del b
# Both objects still in memory — each refcount is 1, not 0!
# Python's cyclic GC detects and cleans these up
```
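You can watch the cyclic collector do this cleanup. The following is a minimal sketch (the `Node` class mirrors the one above) that uses `weakref` to observe the objects without keeping them alive:

```python
import gc
import weakref

class Node:
    def __init__(self, value):
        self.value = value
        self.next = None

a = Node(1)
b = Node(2)
a.next = b
b.next = a                # cycle — refcounts alone can never reach 0

watcher = weakref.ref(a)  # observe the object without owning it

del a
del b
gc.collect()              # the cyclic collector finds the unreachable pair

print(watcher())          # None — the cycle was reclaimed
```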
```python
# Manual GC control
print(gc.isenabled())      # True — runs automatically
gc.collect()               # Force a collection
print(gc.get_count())      # (gen0, gen1, gen2) object counts

# Three generations — objects that survive a collection
# get promoted to older generations (collected less frequently)
print(gc.get_threshold())  # (700, 10, 10) — default thresholds
```

## 4. Interning — Python’s Memory Optimization
Python reuses objects for small integers and strings:
```python
# Small integers (-5 to 256) are interned (cached)
a = 100
b = 100
print(a is b)  # True — same object!

a = 1000
b = 1000
print(a is b)  # Often False — different objects (may be True in a
               # script, where constants in one module can be shared)

# Short, identifier-like strings are often interned
a = "hello"
b = "hello"
print(a is b)  # True — interned

a = "hello world"
b = "hello world"
print(a is b)  # May be False — not guaranteed to be interned

# Force string interning
import sys
a = sys.intern("my long string")
b = sys.intern("my long string")
print(a is b)  # True — explicitly interned
```

Always use `==` to compare values; use `is` only to check identity.
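One practical use of `sys.intern`: interned duplicates collapse to a single shared object, so comparisons on repeated tokens can short-circuit to an identity check. A small sketch (the sentence is made up for illustration):

```python
import sys

# Intern every token — duplicate strings collapse to one shared object
tokens = [sys.intern(t) for t in "the cat sat on the mat".split()]

print(tokens[0] is tokens[4])  # True — both "the" are the same object

# Dict lookups with interned keys can compare by identity first,
# which helps when the same keys are compared millions of times
```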
## 5. `__slots__` — Dramatic Memory Reduction
By default, every instance stores its attributes in a `__dict__`:
```python
class RegularPoint:
    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z

class SlottedPoint:
    __slots__ = ["x", "y", "z"]  # Pre-declare all attributes

    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z

import sys
import tracemalloc

# Memory comparison
rp = RegularPoint(1.0, 2.0, 3.0)
sp = SlottedPoint(1.0, 2.0, 3.0)

print(sys.getsizeof(rp))           # ~48 bytes
print(sys.getsizeof(rp.__dict__))  # ~232 bytes ← the hidden cost
print(sys.getsizeof(sp))           # ~72 bytes, no __dict__!
```
```python
# Real-world impact at scale
tracemalloc.start()

regular = [RegularPoint(i, i, i) for i in range(100_000)]
snap1 = tracemalloc.take_snapshot()

slotted = [SlottedPoint(i, i, i) for i in range(100_000)]
snap2 = tracemalloc.take_snapshot()

# Slotted uses ~40-50% less memory for large collections
```

### `__slots__` trade-offs
```python
class Slotted:
    __slots__ = ["x", "y"]

    def __init__(self, x, y):
        self.x = x
        self.y = y

s = Slotted(1, 2)

# ✅ Benefits
# - Less memory (no __dict__ overhead)
# - Faster attribute access
# - Prevents accidental new attributes

# ❌ Limitations
s.z = 3            # AttributeError — can't add new attributes
print(s.__dict__)  # AttributeError — no __dict__!
```
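Another limitation worth knowing: a slotted class has no `__weakref__` slot unless you declare one, so weak references to its instances raise `TypeError`. A small sketch with hypothetical class names:

```python
import weakref

class NoWeakref:
    __slots__ = ["x"]                 # no __weakref__ slot

class WithWeakref:
    __slots__ = ["x", "__weakref__"]  # opt back in explicitly

obj = WithWeakref()
r = weakref.ref(obj)                  # works

try:
    weakref.ref(NoWeakref())
except TypeError:
    print("NoWeakref instances can't be weakly referenced")
```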
```python
# Inheritance with __slots__ requires care
class Base:
    __slots__ = ["x"]

class Child(Base):
    __slots__ = ["y"]  # Only declare NEW attributes here

c = Child()
c.x = 1  # ✅ from Base
c.y = 2  # ✅ from Child
```

## 6. Measuring Performance — `timeit`
```python
import timeit

# Quick one-liner timing
t = timeit.timeit(
    stmt="[x**2 for x in range(1000)]",
    number=10_000,  # Run 10,000 times
)
print(f"List comprehension: {t:.3f}s")

# Compare two approaches
list_comp = timeit.timeit(
    "[x**2 for x in range(1000)]", number=10_000
)
map_func = timeit.timeit(
    "list(map(lambda x: x**2, range(1000)))", number=10_000
)

print(f"List comp : {list_comp:.3f}s")
print(f"map()     : {map_func:.3f}s")
```
```python
# Using repeat for more reliable results
results = timeit.repeat(
    stmt="sorted(range(1000, 0, -1))",
    repeat=5,       # Run 5 separate timings
    number=10_000,
)
print(f"Min: {min(results):.3f}s")
print(f"Avg: {sum(results)/len(results):.3f}s")
```
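One gotcha worth noting: a `stmt` string runs in a fresh namespace and cannot see your variables unless you pass `globals=` (or you time a callable instead). A minimal sketch:

```python
import timeit

data = list(range(1000))

# Without globals=, the string below would raise NameError: data
t_str = timeit.timeit("sum(data)", globals=globals(), number=10_000)

# Or skip strings entirely and time a callable
t_fn = timeit.timeit(lambda: sum(data), number=10_000)

print(f"string stmt: {t_str:.3f}s, callable: {t_fn:.3f}s")
```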
```python
# In a script — using the Timer class
timer = timeit.Timer(
    stmt="sum(x**2 for x in range(1000))",  # generator
)
print(timer.timeit(10_000))
```

## 7. Profiling — Finding Bottlenecks
### `cProfile` — Function-level profiling
```python
import cProfile
import pstats
import io

def slow_function():
    total = 0
    for i in range(10_000):
        total += sum(x**2 for x in range(100))
    return total

# Profile it
profiler = cProfile.Profile()
profiler.enable()

slow_function()

profiler.disable()

# Pretty-print results
stream = io.StringIO()
stats = pstats.Stats(profiler, stream=stream)
stats.sort_stats("cumulative")  # Sort by cumulative time
stats.print_stats(10)           # Top 10 functions
print(stream.getvalue())
```
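On Python 3.8+, `cProfile.Profile` also works as a context manager, which removes the enable/disable boilerplate. A shorter sketch of the same idea, reusing a `slow_function` like the one above:

```python
import cProfile
import pstats

def slow_function():
    return sum(sum(x**2 for x in range(100)) for _ in range(1_000))

# enable()/disable() handled automatically by the with-block
with cProfile.Profile() as profiler:
    slow_function()

pstats.Stats(profiler).sort_stats("cumulative").print_stats(5)
```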
```python
# Output shows:
# ncalls  tottime  percall  cumtime  percall  filename:lineno(function)
#  10000    0.123    0.000    0.456    0.000  ...genexpr...
#      1    0.001    0.001    0.457    0.457  ...slow_function...
```

### Line-by-line profiling with `line_profiler`
```python
# pip install line_profiler
from line_profiler import LineProfiler

def process_data(data):
    result = []
    for item in data:
        cleaned = item.strip().lower()  # Line A
        words = cleaned.split()         # Line B
        filtered = [w for w in words    # Line C
                    if len(w) > 3]
        result.extend(filtered)         # Line D
    return result

profiler = LineProfiler()
profiler.add_function(process_data)

data = ["  Hello World  "] * 10_000
profiler.runcall(process_data, data)
profiler.print_stats()

# Shows time spent on EACH LINE:
# Line A: 45% of time
# Line B: 20% of time
# Line C: 30% of time ← bottleneck
```

### Memory profiling with `tracemalloc`
```python
import tracemalloc

tracemalloc.start()

# Code to profile
data = {i: [j**2 for j in range(100)] for i in range(1000)}

snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics("lineno")

print("Top memory consumers:")
for stat in top_stats[:5]:
    print(stat)
```
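Snapshots can also be diffed, which is the usual way to hunt a leak: take one snapshot before and one after the suspect code, then `compare_to` reports the per-line growth. A minimal sketch (the ~1 MB allocation is made up for illustration):

```python
import tracemalloc

tracemalloc.start()
before = tracemalloc.take_snapshot()

suspect = [bytearray(1_000) for _ in range(1_000)]  # allocate ~1 MB

after = tracemalloc.take_snapshot()
for stat in after.compare_to(before, "lineno")[:3]:
    print(stat)  # size diff and count diff per source line
```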
```python
# Output:
# file.py:5: size=8.5 MiB, count=101000, average=88 B
```

## 8. Performance Optimization Patterns
### Use built-ins — they’re implemented in C
```python
import timeit

data = list(range(10_000))

# ❌ Manual loop
def manual_sum(lst):
    total = 0
    for x in lst:
        total += x
    return total

# ✅ Built-in
builtin_sum = sum(data)

print(timeit.timeit(lambda: manual_sum(data), number=1000))  # ~0.5s
print(timeit.timeit(lambda: sum(data), number=1000))         # ~0.05s — 10x faster!

# Same principle — prefer built-ins
strings = ["a", "b", "c"]
min(data)                 # faster than a manual loop
max(data)                 # faster than a manual loop
sorted(data)              # faster than a hand-written sort (e.g. bubble sort)
"".join(strings)          # MUCH faster than += in a loop
any(x > 5 for x in data)  # short-circuits — stops early
all(x > 0 for x in data)  # short-circuits — stops early
```

### String concatenation
```python
parts = ["word"] * 10_000

# ❌ String += in a loop — creates a new string every time! O(n²)
def bad_concat(parts):
    result = ""
    for p in parts:
        result += p
    return result

# ✅ join — single allocation, O(n)
def good_concat(parts):
    return "".join(parts)

# ✅ f-string for small fixed concatenations
name = "Alice"
age = 30
s = f"Name: {name}, Age: {age}"  # Faster than "Name: " + name + ...
```

### Data structure choice matters
```python
import timeit

needle = 999_999

# List — O(n) lookup
lst = list(range(1_000_000))
print(timeit.timeit(lambda: needle in lst, number=100))  # ~2.5s

# Set — O(1) lookup
st = set(range(1_000_000))
print(timeit.timeit(lambda: needle in st, number=100))   # ~0.0001s !!

# Dict — O(1) lookup by key
d = {i: i for i in range(1_000_000)}
print(timeit.timeit(lambda: needle in d, number=100))    # ~0.0001s

# Use sets/dicts for membership testing, lists for ordered sequences
```

### Local variables are faster
```python
import math
import timeit

# ❌ Global lookup every iteration
def slow():
    result = 0
    for i in range(10_000):
        result += math.sqrt(i)  # Looks up math, then sqrt, each time
    return result

# ✅ Cache as a local variable
def fast():
    sqrt = math.sqrt  # One lookup, stored locally
    result = 0
    for i in range(10_000):
        result += sqrt(i)  # Local variable lookup — much faster
    return result

print(timeit.timeit(slow, number=1000))  # ~1.2s
print(timeit.timeit(fast, number=1000))  # ~0.8s ← 33% faster
```

### Generator vs list when you only iterate once
```python
import sys

data = range(1_000_000)

# If you only need to iterate once — use a generator
total_gen = sum(x**2 for x in data)     # ~8 MB saved, same result
total_list = sum([x**2 for x in data])  # Builds the full list first

gen = (x**2 for x in data)
lst = [x**2 for x in data]

print(sys.getsizeof(gen))  # ~200 bytes
print(sys.getsizeof(lst))  # ~8 MB
```

## 9. NumPy for Numerical Performance
When pure Python isn’t fast enough for numerical work:
```python
import numpy as np
import timeit

data_list = list(range(1_000_000))
data_np = np.arange(1_000_000)

# Python loop
def python_squares(lst):
    return [x**2 for x in lst]

# NumPy vectorized
def numpy_squares(arr):
    return arr ** 2  # Operates on the entire array at once

print(timeit.timeit(lambda: python_squares(data_list), number=10))  # ~0.8s
print(timeit.timeit(lambda: numpy_squares(data_np), number=10))     # ~0.01s — 80x!

# NumPy avoids Python overhead entirely — operations run in C
result = data_np[data_np > 500_000]  # Vectorized filtering
mean = data_np.mean()                # C-level computation
```

## 10. Optimization Decision Process
```
Is it actually slow?
        │
        ▼
Profile first (cProfile / timeit)
        │
        ▼
Found the bottleneck?
        │
        ├── Algorithm/data structure wrong? → Fix that first (biggest gains)
        │       list → set for lookups
        │       O(n²) → O(n log n)
        │
        ├── Python overhead? → Use built-ins, local vars, generators
        │
        ├── Numerical? → NumPy / SciPy
        │
        ├── Memory? → __slots__, generators, lazy evaluation
        │
        └── Still not enough?
                ├── Cython — compile Python to C
                ├── Numba — JIT compile numerical code
                └── C extension — write hot path in C
```

## Quick Reference
| Tool / Technique | Purpose |
|---|---|
| `sys.getrefcount` | Check reference count |
| `gc` module | Control garbage collection |
| `__slots__` | Reduce instance memory by 40–50% |
| `timeit` | Benchmark small code snippets |
| `cProfile` | Function-level performance profiling |
| `line_profiler` | Line-level performance profiling |
| `tracemalloc` | Track memory allocations |
| `"".join()` | Fast string concatenation |
| `set` / `dict` | O(1) membership testing |
| Generator expressions | Memory-efficient iteration |
| NumPy | Vectorized numerical computation |
You now understand Python’s memory model from reference counting all the way to performance tuning.
Want to wrap up the series with Type Hints & Static Analysis — writing self-documenting, IDE-friendly, bug-resistant Python?