ext4recovery/test/scan.py

#!/usr/bin/env python3
"""
Strict ext4 directory entry scanner for pterodactyl paths.
"""
import struct

# Sizes and offsets are in bytes.
CHUNK = 128 * 512            # read unit: 128 sectors of 512 bytes (64 KiB)
LV_START = 5_120_000 * 512   # per-disk byte offset where scanning begins
                             # (presumably the start of the LV - TODO confirm)
BSIZE = 4096                 # block size used to slice chunks and bound entries
DISKS = [
    '/dev/sda',
    '/dev/sdd',
    '/dev/sdc',
    '/dev/sdb',
]

# Directory-entry names searched for; a hit must match one of these exactly.
EXACT_TARGETS = [b'pterodactyl', b'volumes', b'wings']

def is_valid_dirent(block, off, name):
    """Strictly check whether an ext4 directory entry for *name* starts at *off*.

    The 8 bytes at ``block[off:]`` are decoded as a little-endian directory
    entry header: inode (u32), rec_len (u16), name_len (u8), file type (u8),
    followed by the name bytes.

    Args:
        block: bytes-like block contents, expected to hold BSIZE bytes.
        off: byte offset inside *block* of the candidate entry header.
        name: exact name (bytes) the entry must carry.

    Returns:
        True when every check below passes, otherwise False.
    """
    name_end = off + 8 + len(name)
    # Header plus name must fit inside the block.
    if name_end > BSIZE:
        return False

    inode, rec_len, name_len, ftype = struct.unpack_from('<IHBB', block, off)

    # Inode must be plausible: above the low reserved numbers and not
    # absurdly large.
    if not (10 < inode < 500_000_000):
        return False

    # name_len must exactly match the target.
    if name_len != len(name):
        return False

    # rec_len must cover header + name, not exceed the block size, and be a
    # multiple of 4.
    if rec_len < 8 + name_len or rec_len > BSIZE or rec_len % 4 != 0:
        return False

    # Only regular file (1), directory (2) and symlink (7) are accepted.
    if ftype not in (1, 2, 7):
        return False

    # The name bytes must match exactly.
    if block[off + 8:name_end] != name:
        return False

    # Entries are padded up to a 4-byte boundary, and the first padding byte
    # is expected to be 0.  When header + name is already a multiple of 4
    # there is no padding at all: the next byte belongs to whatever follows
    # (for example the next entry's inode), so it must not be inspected.
    has_padding = (8 + name_len) % 4 != 0
    if has_padding and name_end < BSIZE and block[name_end] != 0:
        return False

    # NOTE: the preceding entry in the block is deliberately not validated.
    return True


def scan_block(block, phys_base):
    """Find directory entries for the EXACT_TARGETS names inside one block.

    Args:
        block: bytes (or bytearray) holding one block of raw disk data.
        phys_base: byte offset of ``block[0]`` on the disk; added to the
            in-block offset to report each hit's physical position.

    Returns:
        A list of dicts, ordered by offset within the block, with keys
        ``phys``, ``inode``, ``name``, ``ftype``, ``group``, ``intact`` and
        ``rec_len``.  ``group`` is ``(inode - 1) // 8192`` (assumes 8192
        inodes per group - TODO confirm for this filesystem) and ``intact``
        is ``group >= 13`` (threshold specific to this recovery).
    """
    # Locate each occurrence of a target name with bytes.find (one C-level
    # search per occurrence) instead of slicing the block at every byte
    # offset.  A name found at position p belongs to a candidate entry whose
    # 8-byte header starts at p - 8, hence the search starts at 8.
    candidates = []
    for order, target in enumerate(EXACT_TARGETS):
        pos = block.find(target, 8)
        while pos != -1:
            candidates.append((pos - 8, order, target))
            # Resume one byte later so overlapping occurrences are kept.
            pos = block.find(target, pos + 1)

    # Report hits by ascending offset, then in EXACT_TARGETS order.
    candidates.sort(key=lambda cand: cand[:2])

    ftype_names = {1: 'file', 2: 'dir', 7: 'symlink'}
    hits = []
    for off, _order, target in candidates:
        # is_valid_dirent also rejects candidates that run past the block.
        if not is_valid_dirent(block, off, target):
            continue
        inode, rec_len = struct.unpack_from('<IH', block, off)
        ftype = block[off + 7]
        grp = (inode - 1) // 8192
        hits.append({
            'phys':    phys_base + off,
            'inode':   inode,
            'name':    target.decode(),
            'ftype':   ftype_names.get(ftype, '?'),
            'group':   grp,
            'intact':  grp >= 13,
            'rec_len': rec_len,
        })
    return hits


def iter_data_chunks(disk_path):
    """Yield ``(phys, data)`` for the scanned chunks of one disk.

    Chunks are CHUNK bytes long and start at byte offset LV_START.  Only
    chunks that fit entirely before the end of the device are visited, and
    every fifth chunk (``chunk_num % 5 == 4``) is skipped - presumably a
    non-data chunk in the array layout; TODO confirm.

    Args:
        disk_path: path of the device or image file to read.

    Yields:
        Tuples ``(phys, data)`` where ``phys`` is the chunk's byte offset on
        the disk and ``data`` is whatever ``read(CHUNK)`` returned there.
    """
    import os

    with open(disk_path, 'rb') as f:
        # Seeking to the end reports the size; this also works for block
        # devices.  One handle serves both the size probe and the reads.
        disk_size = f.seek(0, os.SEEK_END)

        # Last valid start is disk_size - CHUNK, so the range stops one past it.
        starts = range(LV_START, disk_size - CHUNK + 1, CHUNK)
        for chunk_num, phys in enumerate(starts):
            if chunk_num % 5 == 4:
                continue
            f.seek(phys)
            yield phys, f.read(CHUNK)


def main():
    """Scan every disk in DISKS and report entries named in EXACT_TARGETS.

    Each hit is printed as it is found.  After all disks are scanned, a
    summary lists every unique (inode, name) pair with up to three of the
    locations where it was seen.
    """
    from collections import defaultdict

    # (inode, name) -> [(disk, phys), ...] in discovery order.
    locations_by_key = defaultdict(list)
    # (inode, name) -> (group, status) exactly as reported by scan_block, so
    # the summary does not have to re-derive them.
    verdict_by_key = {}

    for disk in DISKS:
        print(f'\nScanning {disk}...', flush=True)
        chunks = 0
        hits   = 0

        for phys, chunk_data in iter_data_chunks(disk):
            # Cheap pre-filter: only walk the blocks of a chunk that contains
            # at least one target name somewhere.
            if any(t in chunk_data for t in EXACT_TARGETS):
                for blk in range(0, len(chunk_data), BSIZE):
                    block = chunk_data[blk:blk + BSIZE]
                    for hit in scan_block(block, phys + blk):
                        status = 'INTACT' if hit['intact'] else 'LOST'
                        print(f"  [{status}] '{hit['name']}' "
                              f"inode={hit['inode']} "
                              f"group={hit['group']} "
                              f"type={hit['ftype']} "
                              f"phys={hit['phys']}")
                        key = (hit['inode'], hit['name'])
                        locations_by_key[key].append((disk, hit['phys']))
                        verdict_by_key[key] = (hit['group'], status)
                        hits += 1

            # Count and report progress for every chunk, including those the
            # pre-filter rejected; otherwise progress is almost never shown.
            chunks += 1
            if chunks % 5000 == 0:
                gb = (phys - LV_START) / 1024**3
                print(f'  {disk}: {gb:.1f}GB, {hits} hits', flush=True)

        print(f'  Finished: {hits} hits')

    print('\n=== RESULTS ===')
    print(f'\nUnique (inode, name) pairs: {len(locations_by_key)}')
    for key in sorted(locations_by_key):
        inode, name = key
        grp, status = verdict_by_key[key]
        print(f"  '{name}' inode={inode} group={grp} [{status}]")
        # Show at most three locations per entry to keep the summary short.
        for disk, phys in locations_by_key[key][:3]:
            print(f"    {disk} phys={phys}")

# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()