From d98285e367c29ec9eb1cacf5cf424d6910270efd Mon Sep 17 00:00:00 2001 From: Jari Vetoniemi Date: Sun, 16 Sep 2018 10:54:51 +0300 Subject: redesign --- doc/fspec-guide.adoc | 192 +++++++++++++++++++++++++++++++++++---------------- spec/ability.fspec | 18 ++--- spec/eaf.fspec | 24 +++---- spec/elf.fspec | 57 ++++++++------- spec/emz.fspec | 10 +-- spec/ftable.fspec | 2 +- spec/item.fspec | 16 ++--- spec/model.fspec | 2 +- spec/name.fspec | 6 +- spec/spell.fspec | 34 ++++----- spec/vtable.fspec | 2 +- vim/filespec.vim | 9 +-- 12 files changed, 220 insertions(+), 152 deletions(-) diff --git a/doc/fspec-guide.adoc b/doc/fspec-guide.adoc index 576c734..8a61609 100644 --- a/doc/fspec-guide.adoc +++ b/doc/fspec-guide.adoc @@ -29,33 +29,38 @@ Filespec Version 0.1 === Abstract -Writeup about how writing and reading structured data is mostly done manually. +Often while developing software, it has to understand and access some sort of +binary data. Often there might be a library or something that helps you with +this task. However, many times these solutions are lacking or don't work +properly on different platforms. Sometimes there might not be a solution at all +for your particular environment and you have to do it yourself again. Or even +if you are creating a new format, there's a lack of tools to prototype it, and +every change needs refactoring of the packing/unpacking code. === Motivation -Writeup how boring it is to write similar code each time when trying to read or -write structured data. How easy it is to make mistakes or cause unportable and -unoptimized code. Write how filespec can help with reverse engineering and -figuring out data structures, how it can be used to generate both packers and -unpackers giving you powerful tools for working with structured data. +Filespec lets you describe the format itself, so that you can generate +portable and efficient code for reading and writing the specified data in +a simple way. 
It also provides a bunch of utilities that help you develop +formats, and can even be used as a tool for reading and writing binary files. === Overview Goal of Filespec is to document the structured data and relationships within, -so the data can be understood and accessed completely. +so the data can be understood completely. === Related Work ==== Kaitai Kaitai is probably not very well known utility, that has similar goal to -filespec. +Filespec. Explain cons: - Depends on runtime - Can only model data which runtime supports (only certain - compression/decompression available for example, while in filespec + compression/decompression available for example, while in Filespec filters can express anything) - Mainly designed for generated code, not general utility - Uses YAML for modelling structured data which is quite wordy and akward @@ -81,60 +86,115 @@ Brief of Filespec specifications and syntax include::../spec/elf.fspec[] ---- -=== Keywords +=== Top-level keywords -|============================================================================= +Top-level keywords, they can't be used inside struct declarations. + +[options="header"] +|======================================================== +| Keyword | Description +| enum { ... } | Declares enumeration | struct _name_ { ... } | Declares structured data -| enum _name_ { ... } | Declares enumeration -| union _name_ (_var_) { ... } | Declares union, can be used to model variants -|============================================================================= +|======================================================== + +=== Enums -.Struct member declaration syntax -Parenthesis indicate optional fields ---- -member_name: member_type (array ...) (| filter ...) (visual hint); +enum { + first, + second, + seventh = 7, + eight +}; ---- -=== Types +=== Structs + +---- +struct blob { + type name (array ...) (| filter ...) (visual hint); +}; +---- + +==== Types Basic types to express binary data. 
-|================================================================ -| struct _name_ | Named structured data (Struct member only) -| enum _name_ | Value range is limited to the named enumeration -| u8, s8 | Unsigned, signed 8bit integer -| u16, s16 | Unsigned, signed 16bit integer -| u32, s32 | Unsigned, signed 32bit integer -| u64, s64 | Unsigned, signed 64bit integer -|================================================================ +[options="header"] +|================================================================================================== +| Type | Description +| if (_expr_) { ... } else { ... } | Conditional +| select (_expr_) { ... } _name_ | Tagged union +| struct _name_ | Substructure +| u??, s?? | Unsigned, signed ??bit integer (e.g. u8 for 8bit unsigned integer) +|================================================================================================== + +==== If -=== Arrays +Conditional reading/writing of fields depending on the result of _expr_. -Valid values that can be used inside array subscript operation. +---- +u8 version; +if (version >= 2) { + u8 ver2_field; +} +if (version >= 3) { + u8 ver3_field; +} else { + u8 removed_old_field; +} +---- + +==== Select + +Conditionally pack/unpack field depending on the result of _expr_. +This is identical to tagged union, variant, etc... and generates into union in C. -|================================================= -| _expr_ | Uses result of expression as array size -| \'str' | Grow array until occurance of str -| $ | Grow array until end of data is reached -|================================================= +There may not be duplicate cases inside single select. + +---- +u8 type; +select (type) { + 0) struct string string (array ...) (| filter ...) (visual hint); + 1) u1 bool; + *) u32 any; +} value; +---- + +==== Arrays + +Valid expressions that can be used to define array size during declaration. 
+ +[options="header"] +|======================================================================== +| Expression | Description +| _expr_ | Result of expression +| \'str' | Grow array until occurrence of str in binary data +| until (_expr_) | Grow array until condition has been reached +|======================================================================== .Reading length prefixed data ---- -num_items: u16 dec; -items: struct item[num_items]; +u16 num_items dec; +struct item items[num_items]; ---- .Reading null terminated string ---- -cstr: u8['\0'] str; +u8 cstr['\0'] str; ---- -.Reading repeating pattern +.Reading repeating pattern until we hit stop condition ---- -pattern: struct pattern[$]; +struct pattern pattern[until (pattern.last_block)]; ---- -=== Filters +.Reading repeating pattern until the data ends +---- +struct pattern pattern[until (false)]; +---- + +==== Filters Filters can be used to sanity check and transform data into more sensible format while still maintaining compatible data layout for both packing and @@ -153,35 +213,40 @@ consider contributing your filter to the interpeter. Filters for official interepter are implemented as command pairs (Thus filters are merely optional dependency in interpeter) -|======================================================================== +[options="header"] +|============================================================================ +| Filter | Description | matches(_str_) | Data matches _str_ +| range(_min_, _max_) | Data is within the range of _min_ and _max_ | encoding(_str_, ...) | Data is encoded with algorithm _str_ | compression(_str_, ...) | Data is compressed with algorithm _str_ | encryption(_str_, _key_, ...) 
| Data is encrypted with algorithm _str_ -|======================================================================== +|============================================================================ .Validating file headers ---- -header: u8[4] | matches('\x7fELF') str; +u8 header[4] | matches('\x7fELF') str; ---- .Decoding strings ---- -name: u8[32] | encoding('sjis') str; +u8 name[32] | encoding('sjis') str; ---- .Decompressing data ---- -data_sz: u32; -data: u8[$] | compression('deflate', data_sz) hex; +u32 data_sz; +u8 data[until (false)] | compression('deflate', data_sz) hex; ---- -=== Visual hints +==== Visual hints Visual hints can be used to advice tools how data should be presented to human, as well as provide small documentation what kind of data to expect. +[options="header"] |=========================================== +| Hint | Description | nul | Do not visualize data | dec | Visualize data as decimal | hex | Visualize data as hexdecimal @@ -210,26 +275,18 @@ value of _len_ from the length of _str_ if it has been filled. We can also use this information to verify that length of _str_ matches the value of _len_, if both have been filled. ---- -len: u16; -str: u8[len] str; +u16 len; +u8 str[len] str; ---- .Parameter relationship In packing case, the same rules apply as in array relationship. Implicit relationship is formed between _decompressed_sz_ member and compression filter. ---- -decompressed_sz: u32 dec; -data: u8[$] | compression('zlib', decompressed_sz); +u32 decompressed_sz dec; +u8 data[until (false)] | compression('zlib', decompressed_sz); ---- -=== Explicit Relationships - -Sometimes we need to form explicit relationships when the structure is more -complicated. - -TODO: When we can actually model FFXI string tables correctly, it will be a -good example. - == Implementation === Compiler @@ -240,8 +297,8 @@ as optimizations would be done on the bytecode level instead the source level. 
=== Validator -Validator takes the output of compiler and checks the bytecode follows a -standard pattern, and isn't invalid. Having validator pass simplifies the +Validator takes the output of compiler and checks the bytecode for validity +and that it follows a standard pattern. Having validator pass simplifies the code of translators, as they can assume their input is valid and don't need to do constant error checking. It also helps catch bugs from compiler early on. @@ -256,7 +313,20 @@ To make sure all source level attributes such as mathematical expressions can be translated losslessly to target language, the bytecode may contain special attributes. -TODO: Document bytecode operations and the predictable pattern here +[options="header"] +|===================================== +| Opcode | Decimal | Description +| OP_ADD | 0 | a + b +| OP_SUB | 1 | a - b +| OP_MUL | 2 | a * b +| OP_DIV | 3 | a / b +| OP_MOD | 4 | a % b +| OP_BIT_AND | 5 | a & b +| OP_BIT_OR | 6 | a \| b +| OP_BIT_XOR | 7 | a ^ b +| OP_BIT_LEFT | 8 | a << b +| OP_BIT_RIGHT | 9 | a >> b +|===================================== === Translators diff --git a/spec/ability.fspec b/spec/ability.fspec index 4498d69..67a9965 100644 --- a/spec/ability.fspec +++ b/spec/ability.fspec @@ -1,14 +1,14 @@ struct ability { - index: u16; - icon_id: u16; - mp_cost: u16; - unknown: u16; - targets: u16; - name: u8[32] | encoding('sjis') str; // The encoding actually depends on ROM region - description: u8[256] | encoding('sjis') str; // ^ Ditto, we can't express this (we need parser options) - padding: u8[726] nul; + u16 index; + u16 icon_id; + u16 mp_cost; + u16 unknown; + u16 targets; + u8 name[32] | encoding('sjis') str; // The encoding actually depends on ROM region + u8 description[256] | encoding('sjis') str; // ^ Ditto, we can't express this (we need parser options) + u8 padding[726] nul; }; struct dat { - ability: struct ability[$]; + struct ability ability[$]; }; diff --git a/spec/eaf.fspec b/spec/eaf.fspec 
index e9c5702..139539d 100644 --- a/spec/eaf.fspec +++ b/spec/eaf.fspec @@ -1,17 +1,17 @@ struct file { - path: u8[256] | encoding('ascii') str; - offset: u64; - size: u64; - padding: u8[16] nul; + u8 path[256] | encoding('ascii') str; + u64 offset; + u64 size; + u8 padding[16] nul; }; struct eaf { - header: u8[4] | matches('#EAF') str; - major: u16; - minor: u16; - size: u64; - count: u32; - unknown: u64; - padding: u8[100] nul; - files: struct file[count]; + u8 header[4] | matches('#EAF') str; + u16 major; + u16 minor; + u64 size; + u32 count; + u64 unknown; + u8 padding[100] nul; + struct file files[count]; }; diff --git a/spec/elf.fspec b/spec/elf.fspec index e6059c7..5bd9954 100644 --- a/spec/elf.fspec +++ b/spec/elf.fspec @@ -1,36 +1,35 @@ -1 + 5 + 2 * 5 / 2; - -enum foo { - foo: 0x1; - bar: 0x2; - eaf: 0x3; - eaf: 0xDEADBEEF; - bar; +struct elf32 { + u32 e_entry hex; + u32 e_phoff; + u32 e_shoff; }; struct elf64 { - e_entry: u64 hex; - e_phoff: u64; - e_shoff: u64; + u64 e_entry hex; + u64 e_phoff; + u64 e_shoff; }; struct elf { - ei_magic: u8[4] | matches('\x7fELF') str; - ei_class: u8 hex; // word size - ei_data: u8 hex; // endianess - ei_version: u8; - ei_osabi: u8; - ei_abi_version: u8; - padding: u8[7] nul; - e_type: u16 hex; - e_machine: u16 hex; - e_version: u32; - elf64: struct elf64; // fspec needs union to parse ei_class != 2 type - e_flags: u32 hex; - e_ehsz: u16; - e_phentsize: u16; - e_phnum: u16; - e_shentsize: u16; - e_shnum: u16; - e_shstrndx: u16; + u8 ei_magic[4] | matches('\x7fELF') str; + u8 ei_class hex; // word size + u8 ei_data hex; // endianess + u8 ei_version; + u8 ei_osabi; + u8 ei_abi_version; + u8 padding[7] nul; + u16 e_type hex; + u16 e_machine hex; + u32 e_version; + select (ei_class) { + 1) struct elf32 elf32; + 2) struct elf64 elf64; + } arch; + u32 e_flags hex; + u16 e_ehsz; + u16 e_phentsize; + u16 e_phnum; + u16 e_shentsize; + u16 e_shnum; + u16 e_shstrndx; }; diff --git a/spec/emz.fspec b/spec/emz.fspec index 
0fe02a1..9eb90a4 100644 --- a/spec/emz.fspec +++ b/spec/emz.fspec @@ -1,7 +1,7 @@ struct emz { - header: u8[4] | matches('#EMZ') str; - unknown: u32 hex; // most likely redunancy check (crc32?) - size: u32; - offset: u32; // always 16? - data: u8[$] | compression('deflate', size) hex; + u8 header[4] | matches('#EMZ') str; + u32 unknown hex; // most likely redunancy check (crc32?) + u32 size; + u32 offset; // always 16? + u8 data[$] | compression('deflate', size) hex; }; diff --git a/spec/ftable.fspec b/spec/ftable.fspec index 39fdd26..c051a6a 100644 --- a/spec/ftable.fspec +++ b/spec/ftable.fspec @@ -1,3 +1,3 @@ struct ftable { - id: u16[$] hex; + u16 id[$] hex; }; diff --git a/spec/item.fspec b/spec/item.fspec index c4d1767..31bd3f5 100644 --- a/spec/item.fspec +++ b/spec/item.fspec @@ -15,14 +15,12 @@ struct item { u16 type; u16 resource; u16 targets; - - union data (type) { - 4 => struct weapon weapon; - 5 => struct armor armor; - 7 => struct usable usable; - 12 => struct puppet puppet; - * => struct general general; - }; - + select (type) { + 4) struct weapon weapon; + 5) struct armor armor; + 7) struct usable usable; + 12) struct puppet puppet; + *) struct general general; + } data; struct strings strings; }; diff --git a/spec/model.fspec b/spec/model.fspec index afa8281..aade45c 100644 --- a/spec/model.fspec +++ b/spec/model.fspec @@ -1,6 +1,6 @@ struct texture { u8 type; - u8 name[16] = ascii; + u8 name[16] | encoding('ascii') str; u32 version; u32 width; u32 height; diff --git a/spec/name.fspec b/spec/name.fspec index d4e0f7c..491c8da 100644 --- a/spec/name.fspec +++ b/spec/name.fspec @@ -1,8 +1,8 @@ struct name { - name: u8[28] | encoding('ascii') str; // The encoding actually depends on ROM region - id: u32; + u8 name[28] | encoding('ascii') str; // The encoding actually depends on ROM region + u32 id; }; struct dat { - name: struct name[$]; + struct name name[$]; }; diff --git a/spec/spell.fspec b/spec/spell.fspec index 68aa5fb..8c9c894 100644 --- 
a/spec/spell.fspec +++ b/spec/spell.fspec @@ -1,22 +1,22 @@ struct spell { - index: u16; - type: u16; // 1-6 for White/Black/Summon/Ninja/Bard/Blue - element: u16; - targets: u16; - skill: u16; - mp_cost: u16; - casting_time: u8; // in quarter of seconds - recast_delay: u8; // in quarter of seconds - level: u8[24] hex; // 1 byte per job, 0xxFF if not learnable, first slot is NONE job so always 0xFF - id: u16; // 0 for "unused" spells; often, but not always, equal to index - unknown: u8; - jp_name: u8[20] | encoding('sjis') str; - en_name: u8[20] | encoding('ascii') str; - jp_description: u8[128] | encoding('sjis') str; - en_description: u8[128] | encoding('ascii') str; - padding: u8[687] nul; + u16 index; + u16 type; // 1-6 for White/Black/Summon/Ninja/Bard/Blue + u16 element; + u16 targets; + u16 skill; + u16 mp_cost; + u8 casting_time; // in quarter of seconds + u8 recast_delay; // in quarter of seconds + u8 level[24] hex; // 1 byte per job, 0xFF if not learnable, first slot is NONE job so always 0xFF + u16 id; // 0 for "unused" spells; often, but not always, equal to index + u8 unknown; + u8 jp_name[20] | encoding('sjis') str; + u8 en_name[20] | encoding('ascii') str; + u8 jp_description[128] | encoding('sjis') str; + u8 en_description[128] | encoding('ascii') str; + u8 padding[687] nul; }; struct dat { - spell: struct spell[$]; + struct spell spell[$]; }; diff --git a/spec/vtable.fspec b/spec/vtable.fspec index de281b8..a6be4eb 100644 --- a/spec/vtable.fspec +++ b/spec/vtable.fspec @@ -1,3 +1,3 @@ struct vtable { - exist: u8[$] hex; + u8 exist[$] hex; }; diff --git a/vim/filespec.vim b/vim/filespec.vim index 19c9945..1e29d91 100644 --- a/vim/filespec.vim +++ b/vim/filespec.vim @@ -9,10 +9,9 @@ syn keyword fsTodo contained TODO FIXME XXX syn cluster fsCommentGroup contains=fsTodo,fsBadContinuation syn region fsComment start="//" skip="\\$" end="$" keepend contains=@fsCommentGroup,@Spell -syn keyword fsStructure enum struct union -syn keyword fsType s8 s16 s32 
s64 -syn keyword fsType u8 u16 u32 u64 -syn keyword fsConstant nul dec hex str +syn keyword fsStructure enum struct select until +syn match fsType "[su][1-9][0-9]*" +syn keyword fsConstant nul dec hex str be le true false syn case ignore syn match fsNumbers display transparent "\<\d\|\.\d" contains=fsNumber,fsFloat,fsOctalError,fsOctal @@ -31,6 +30,7 @@ syn case match syn match fsSpecial display contained "\\\(x\x\+\|\o\{1,3}\|.\|$\)" syn match fsString1 "'[^']*'" contains=fsSpecial syn match fsString2 '"[^"]*"' contains=fsSpecial +syn match fsBinary "b[0-1x]\+" syn match fsBlock "[{}]" syn match fsBracket "[\[\]]" @@ -43,6 +43,7 @@ hi def link fsComment Comment hi def link fsStructure Structure hi def link fsType Type hi def link fsConstant Constant +hi def link fsBinary Number hi def link fsNumber Number hi def link fsOctal Number hi def link fsOctalZero PreProc -- cgit v1.2.3-70-g09d2