If you've ever wanted to do anything with the vast amount of data e621 holds, nothing compares to the daily database exports. Over the last year or so, I've slowly accumulated an extensive suite of homegrown software designed to ease parsing the idiosyncratic not-technically-CSV format the data is provided in, and today I voluntarily spent most of my Sunday rewriting most of what I had written and expanding its capabilities to handle every field available, even the boring ones. This is because I don't do drugs.
This supports every export currently available except the wiki. All functions are generator iterators which take a file handle, and return a dictionary object where the key names correspond to the first line of the file. The types of the values vary depending on the column. Some non-exhaustive notes:
- Stuff like post IDs and fav counts are integers, or None where not applicable.
- Datestamps use Python's datetime library, and also may be None for early entries.
- Some fields have custom enums.
- Tags are returned as a list of strings, and pool contents as a list of integers.
- Boolean values (like is_deleted) may be True, False or None, because the early years of e621's database are full of Fun.
Some examples of usage follow in the replies.
e6csv.py
import sys from enum import Enum, IntEnum from datetime import datetime import io class Rating(IntEnum): SAFE = 0 QUESTIONABLE = 1 EXPLICIT = 2 ratingtable = {"s": Rating.SAFE, "q": Rating.QUESTIONABLE, "e": Rating.EXPLICIT} class PoolCategory(IntEnum): SERIES = 0 COLLECTION = 1 # ACTIVE is assigned 0 here to ease boolean comparisons, since in most cases # it's the only one you care about class RelationStatus(IntEnum): ACTIVE = 0 PENDING = 1 DELETED = 2 APPROVED = 3 QUEUED = 4 PROCESSING = 5 RETIRED = 6 class TagCategory(IntEnum): GENERAL = 0 ARTIST = 1 # 2 is unused COPYRIGHT = 3 CHARACTER = 4 SPECIES = 5 INVALID = 6 META = 7 LORE = 8 Schema = Enum("Schema", "INT STR TIME MQUOTE SQUOTE RATING BOOL STRLIST " \ + "INTLIST FLOAT PCAT RSTAT TCAT") def commaorend(l, o): n = l.find(",", o) if n == -1: n = len(l)-1 return n def readcsv(f, schema): lineno = 1 try: colnames = f.readline().strip().split(",") l = f.readline() # e621's string mangling algorithm works as follows: # If the first character is not a double quote, the string does not # contain any newlines, commas or double quotes, and is terminated by # the end of the field as normal. # If the first character is a double quote, the string continues until # the next unescaped double quote. # All characters are valid including commas and newlines, and double # quotes are escaped by following each one with a second double quote. # The string returned from this function replaces escaped double quotes # within the string with unescaped double quotes, and does not include # the enclosing double quotes (if any). def unescape(o, multiline): nonlocal l, lineno buf = io.StringIO() if l[o] != '"': n = commaorend(l, o) buf.write(l[o:n]) else: while True: o += 1 n = l.find('"', o) while n == -1: if not multiline: raise Exception("Expected \"") buf.write(l[o:]) l = f.readline() lineno += 1 o = 0 n = l.find('"') buf.write(l[o:n]) n += 1 if l[n] != '"': break buf.write('"') o = n return (buf.getvalue(), n) while len(l) > 0: row = {} o = 0 for col in zip(colnames, schema, strict=True): if o != 0: o += 1 if o == len(l): raise Exception("Reached end of line unexpectedly") match col[1]: case Schema.INT: # Integer (may be blank) n = commaorend(l, o) v = int(l[o:n]) if o != n else None case Schema.STR: # String (cannot contain commas, newlines or quotes) n = commaorend(l, o) v = l[o:n] case Schema.TIME: # Datestamp (these SUCK) # The microsecond part may not be present or may have # fewer than 6 decimal places, which causes the # datetime library to throw a wobbly. # May also be blank. n = commaorend(l, o) if n == o: v = None else: datestr = l[o:n] if n-o == 19: datestr += ".000000" else: datestr += "0" * (26-(n-o)) v = datetime.fromisoformat(datestr) case Schema.MQUOTE: # Arbitrary data (may contain anything including # newlines) v, n = unescape(o, True) case Schema.SQUOTE: # Arbitrary line (may contain anything except newlines) v, n = unescape(o, False) case Schema.RATING: # Rating # Must be one of "s", "q" or "e", and cannot be blank n = commaorend(l, o) v = ratingtable[l[o:n]] case Schema.BOOL: # Boolean # Must be "t", "f", or blank n = commaorend(l, o) boolstr = l[o:n] match boolstr: case "t": v = True case "f": v = False case "": v = None case _: raise Exception("Invalid bool: "+boolstr) case Schema.STRLIST: # Space-separated string list v, n = unescape(o, False) v = v.split(" ") if n != o else [] case Schema.INTLIST: # Comma-separated integer list in curly brackets # Also enclosed in double quotes unless the list has 1 # or 0 entries (i.e. no commas). if l[o] == '"': o += 1 if l[o] != "{": raise Exception("Expected {") o += 1 n = l.find("}", o) if n == -1: raise Exception("Expected }") v = [int(s) for s in l[o:n].split(",")] if n != o else [] n += 1 if l[n] == '"': n += 1 case Schema.FLOAT: # Float (may be blank) n = commaorend(l, o) v = float(l[o:n]) if o != n else None case Schema.PCAT: # Pool category (cannot be blank) n = commaorend(l, o) v = PoolCategory[l[o:n].upper()] case Schema.RSTAT: # Relation status (cannot be blank) n = commaorend(l, o) v = RelationStatus[l[o:n].upper()] case Schema.TCAT: # Tag category (numeric, cannot be blank) n = commaorend(l, o) v = TagCategory(int(l[o:n])) case _: raise Exception("Unknown schema: "+str(col[1])) row[col[0]] = v o = n yield row l = f.readline() lineno += 1 except: raise Exception("Error at line "+str(lineno)) from sys.exc_info()[1] def readposts(f): return readcsv(f, [Schema.INT, Schema.INT, Schema.TIME, Schema.STR, \ Schema.MQUOTE, Schema.RATING, Schema.INT, Schema.INT, Schema.STRLIST, \ Schema.STRLIST, Schema.INT, Schema.STR, Schema.INT, Schema.INT, \ Schema.INT, Schema.INT, Schema.INT, Schema.MQUOTE, Schema.FLOAT, \ Schema.TIME, Schema.BOOL, Schema.BOOL, Schema.BOOL, Schema.INT, \ Schema.INT, Schema.INT, Schema.BOOL, Schema.BOOL, Schema.BOOL]) def readpools(f): return readcsv(f, [Schema.INT, Schema.SQUOTE, Schema.TIME, Schema.TIME, \ Schema.INT, Schema.MQUOTE, Schema.BOOL, Schema.PCAT, Schema.INTLIST]) def readrelations(f): return readcsv(f, [Schema.INT, Schema.SQUOTE, Schema.SQUOTE, Schema.TIME, \ Schema.RSTAT]) def readtags(f): return readcsv(f, [Schema.INT, Schema.SQUOTE, Schema.TCAT, Schema.INT])
Updated