Browse Source

ft:代码优化

master
lijie 2 years ago
parent
commit
33bfe8d679
5 changed files with 815 additions and 655 deletions
  1. +315
    -206
      VelocyPack.md
  2. +396
    -342
      VelocyPack_zh.md
  3. +12
    -13
      include/eVPack.hrl
  4. +2
    -2
      src/deTest.erl
  5. +90
    -92
      src/eVPack.erl

+ 315
- 206
VelocyPack.md View File

@ -24,57 +24,86 @@ indicates the type (and often the length) of the VPack value at hand:
We first give an overview with a brief but accurate description for
reference, for arrays and objects see below for details:
- `0x00` : none - this indicates absence of any type and value, this is not allowed in VPack values
- `0x00` : none - this indicates absence of any type and value,
this is not allowed in VPack values
- `0x01` : empty array
- `0x02` : array without index table (all subitems have the same byte length), 1-byte byte length
- `0x03` : array without index table (all subitems have the same byte length), 2-byte byte length
- `0x04` : array without index table (all subitems have the same byte length), 4-byte byte length
- `0x05` : array without index table (all subitems have the same byte length), 8-byte byte length
- `0x02` : array without index table (all subitems have the same
byte length), 1-byte byte length
- `0x03` : array without index table (all subitems have the same
byte length), 2-byte byte length
- `0x04` : array without index table (all subitems have the same
byte length), 4-byte byte length
- `0x05` : array without index table (all subitems have the same
byte length), 8-byte byte length
- `0x06` : array with 1-byte index table offsets, bytelen and # subvals
- `0x07` : array with 2-byte index table offsets, bytelen and # subvals
- `0x08` : array with 4-byte index table offsets, bytelen and # subvals
- `0x09` : array with 8-byte index table offsets, bytelen and # subvals
- `0x0a` : empty object
- `0x0b` : object with 1-byte index table offsets, sorted by attribute name, 1-byte bytelen and # subvals
- `0x0c` : object with 2-byte index table offsets, sorted by attribute name, 2-byte bytelen and # subvals
- `0x0d` : object with 4-byte index table offsets, sorted by attribute name, 4-byte bytelen and # subvals
- `0x0e` : object with 8-byte index table offsets, sorted by attribute name, 8-byte bytelen and # subvals
- `0x0f` : object with 1-byte index table offsets, not sorted by attribute name, 1-byte bytelen and # subvals
- `0x10` : object with 2-byte index table offsets, not sorted by attribute name, 2-byte bytelen and # subvals
- `0x11` : object with 4-byte index table offsets, not sorted by attribute name, 4-byte bytelen and # subvals
- `0x12` : object with 8-byte index table offsets, not sorted by attribute name, 8-byte bytelen and # subvals
- `0x0b` : object with 1-byte index table offsets, sorted by
attribute name, 1-byte bytelen and # subvals
- `0x0c` : object with 2-byte index table offsets, sorted by
attribute name, 2-byte bytelen and # subvals
- `0x0d` : object with 4-byte index table offsets, sorted by
attribute name, 4-byte bytelen and # subvals
- `0x0e` : object with 8-byte index table offsets, sorted by
attribute name, 8-byte bytelen and # subvals
- `0x0f` : object with 1-byte index table offsets, not sorted by
attribute name, 1-byte bytelen and # subvals - OBSOLETE
- `0x10` : object with 2-byte index table offsets, not sorted by
attribute name, 2-byte bytelen and # subvals - OBSOLETE
- `0x11` : object with 4-byte index table offsets, not sorted by
attribute name, 4-byte bytelen and # subvals - OBSOLETE
- `0x12` : object with 8-byte index table offsets, not sorted by
attribute name, 8-byte bytelen and # subvals - OBSOLETE
- `0x13` : compact array, no index table
- `0x14` : compact object, no index table
- `0x15`-`0x16` : reserved
- `0x17` : illegal - this type can be used to indicate a value that is illegal in the embedding application
- `0x17` : illegal - this type can be used to indicate a value that
is illegal in the embedding application
- `0x18` : null
- `0x19` : false
- `0x1a` : true
- `0x1b` : double IEEE-754, 8 bytes follow, stored as little endian uint64 equivalent
- `0x1c` : UTC-date in milliseconds since the epoch, stored as 8 byte signed int, little endian, two's complement
- `0x1d` : external (only in memory): a char* pointing to the actual place in memory, where another VPack item
resides, not allowed in VPack values on disk or on the network
- `0x1b` : double IEEE-754, 8 bytes follow, stored as little
endian uint64 equivalent
- `0x1c` : UTC-date in milliseconds since the epoch, stored as 8 byte
signed int, little endian, two's complement
- `0x1d` : external (only in memory): a char* pointing to the actual
place in memory, where another VPack item resides, not
allowed in VPack values on disk or on the network
- `0x1e` : minKey, nonsensical value that compares less than all other values
- `0x1f` : maxKey, nonsensical value that compares greater than all other values
- `0x20`-`0x27` : signed int, little endian, 1 to 8 bytes, number is V - `0x1f`, two's complement
- `0x20`-`0x27` : signed int, little endian, 1 to 8 bytes, number is V - `0x1f`,
two's complement
- `0x28`-`0x2f` : uint, little endian, 1 to 8 bytes, number is V - `0x27`
- `0x30`-`0x39` : small integers 0, 1, ... 9
- `0x3a`-`0x3f` : small negative integers -6, -5, ..., -1
- `0x40`-`0xbe` : UTF-8-string, using V - `0x40` bytes (not Unicode characters!), length 0 is possible, so `0x40` is the
empty string, maximal length is 126, note that strings here are not zero-terminated and may contain NUL bytes
- `0xbf` : long UTF-8-string, next 8 bytes are length of string in bytes (not Unicode characters) as little
endian unsigned integer, note that long strings are not zero-terminated and may contain NUL bytes
- `0xc0`-`0xc7` : binary blob, next V - `0xbf` bytes are the length of blob in bytes, note that binary blobs are not
zero-terminated
- `0xc8`-`0xcf` : positive long packed BCD-encoded float, V - `0xc7` bytes follow that encode in a little endian way the
length of the mantissa in bytes. Directly after that follow 4 bytes encoding the (power of 10) exponent, by which the
mantissa is to be multiplied, stored as little endian two's complement signed 32-bit integer. After that, as many
bytes follow as the length information at the beginning has specified, each byte encodes two digits in big-endian
packed BCD. Example: 12345 decimal can be encoded as
- `0x40`-`0xbe` : UTF-8-string, using V - `0x40` bytes (not Unicode characters!),
length 0 is possible, so `0x40` is the empty string,
maximal length is 126, note that strings here are not
zero-terminated and may contain NUL bytes
- `0xbf` : long UTF-8-string, next 8 bytes are length of string in
bytes (not Unicode characters) as little endian unsigned
integer, note that long strings are not zero-terminated
and may contain NUL bytes
- `0xc0`-`0xc7` : binary blob, next V - `0xbf` bytes are the length of blob in
bytes, note that binary blobs are not zero-terminated
- `0xc8`-`0xcf` : positive long packed BCD-encoded float, V - `0xc7` bytes follow
that encode in a little endian way the length of the
mantissa in bytes. Directly after that follow 4 bytes
encoding the (power of 10) exponent, by which the mantissa
is to be multiplied, stored as little endian two's
complement signed 32-bit integer. After that, as many
bytes follow as the length information at the beginning
has specified, each byte encodes two digits in
big-endian packed BCD.
Example: 12345 decimal can be encoded as
`c8 03 00 00 00 00 01 23 45` or
`c8 03 ff ff ff ff 12 34 50`
- `0xd0`-`0xd7` : negative long packed BCD-encoded float, V - `0xcf` bytes follow that encode in a little endian way the
length of the mantissa in bytes. After that, same as positive long packed BCD-encoded float above.
- `0xd0`-`0xd7` : negative long packed BCD-encoded float, V - `0xcf` bytes
follow that encode in a little endian way the length of
the mantissa in bytes. After that, same as positive long
packed BCD-encoded float above.
- `0xd8`-`0xed` : reserved
- `0xee`-`0xef` : value tagging for logical types
- `0xf0`-`0xff` : custom types
@ -84,9 +113,10 @@ reference, for arrays and objects see below for details:
Empty arrays are simply a single byte `0x01`.
We next describe the type cases `0x02` to `0x09`, see below for the special compact type `0x13`.
We next describe the type cases `0x02` to `0x09`, see below for the
special compact type `0x13`.
Non-empty arrays look like one of the following:
Non-empty arrays look like one of the following:
one of 0x02 to 0x05
BYTELENGTH
@ -127,48 +157,64 @@ or
INDEXTABLE with 8 byte per entry
NRITEMS in 8 bytes
If any optional padding is allowed for a type, the padding must consist of exactly that many bytes that the length of
the padding, the length of BYTELENGTH and the length of NRITEMS (if present) sums up to 8. If the length of BYTELENGTH
is already 8, there is no padding allowed. The entire padding must consist of zero bytes (ASCII NUL).
If any optional padding is allowed for a type, the padding must consist
of exactly that many bytes that the length of the padding, the length of
BYTELENGTH and the length of NRITEMS (if present) sums up to 8. If the
length of BYTELENGTH is already 8, there is no padding allowed. The
entire padding must consist of zero bytes (ASCII NUL).
Numbers (for byte length, number of subvalues and offsets in the INDEXTABLE) are little endian unsigned integers, using
1 byte for types `0x02` and `0x06`, 2 bytes for types `0x03` and `0x07`, 4 bytes for types
Numbers (for byte length, number of subvalues and offsets in the
INDEXTABLE) are little endian unsigned integers, using 1 byte for
types `0x02` and `0x06`, 2 bytes for types `0x03` and `0x07`, 4 bytes for types
`0x04` and `0x08`, and 8 bytes for types `0x05` and `0x09`.
NRITEMS is a single number as described above.
The INDEXTABLE consists of:
- for types `0x06`-`0x09` an array of offsets (unaligned, in the number format described above) earlier offsets reside
at lower addresses. Offsets are measured from the start of the VPack value.
Non-empty arrays of types `0x06` to `0x09` have a small header including their byte length, the number of subvalues,
then all the subvalues and finally an index table containing offsets to the subvalues. To find the index table, find the
number of subvalues, then the end, and from that the base of the index table, considering how wide its entries are.
For types `0x02` to `0x05` there is no offset table and no number of items. The first item begins at address A+2, A+3,
A+5 or respectively A+9, depending on the type and thus the width of the byte length field. Note the following special
rule: The actual position of the first subvalue is allowed to be further back, after some run of padding zero bytes.
For example, if 2 bytes are used for both the byte length (BYTELENGTH), then an optional padding of 4 zero bytes is then
allowed to follow, and the actual VPack subvalues can start at A+9. This is to give a program that builds a VPack value
the opportunity to reserve 8 bytes in the beginning and only later find out that fewer bytes suffice to write the byte
length. One can determine the number of subvalues by finding the first subvalue, its byte length, and dividing the
amount of available space by it.
For types `0x06` to `0x09` the offset table describes where the subvalues reside. It is not necessary for the subvalues
to start immediately after the number of subvalues field.
As above, it is allowed to include optional padding. Again here, any padding must consist of a run of consecutive zero
bytes (ASCII NUL) and must be as long that it fills up the length of BYTELENGTH and the length of NRITEMS to 8.
For example, if both BYTELENGTH and NRITEMS can be expressed using 2 bytes each, the sum of their lengths is 4. It is
therefore allowed to add 4 bytes of padding here, so that the first subvalue could be at address A+9.
- for types `0x06`-`0x09` an array of offsets (unaligned, in the number
format described above) earlier offsets reside at lower addresses.
Offsets are measured from the start of the VPack value.
Non-empty arrays of types `0x06` to `0x09` have a small header including
their byte length, the number of subvalues, then all the subvalues and
finally an index table containing offsets to the subvalues. To find the
index table, find the number of subvalues, then the end, and from that
the base of the index table, considering how wide its entries are.
For types `0x02` to `0x05` there is no offset table and no number of items.
The first item begins at address A+2, A+3, A+5 or respectively A+9,
depending on the type and thus the width of the byte length field. Note
the following special rule: The actual position of the first subvalue
is allowed to be further back, after some run of padding zero bytes.
For example, if 2 bytes are used for the byte length (BYTELENGTH),
then an optional padding of 6 zero bytes is allowed to follow, and
the actual VPack subvalues can start at A+9.
This is to give a program that builds a VPack value the opportunity to
reserve 8 bytes in the beginning and only later find out that fewer bytes
suffice to write the byte length. One can determine the number of
subvalues by finding the first subvalue, its byte length, and
dividing the amount of available space by it.
For types `0x06` to `0x09` the offset table describes where the subvalues
reside. It is not necessary for the subvalues to start immediately after
the number of subvalues field.
As above, it is allowed to include optional padding. Again here, any
padding must consist of a run of consecutive zero bytes (ASCII NUL) and
must be as long that it fills up the length of BYTELENGTH and the length
of NRITEMS to 8.
For example, if both BYTELENGTH and NRITEMS can be expressed using 2 bytes
each, the sum of their lengths is 4. It is therefore allowed to add 4
bytes of padding here, so that the first subvalue could be at address A+9.
There is one exception for the 8-byte numbers case (type `0x09`):
In this case the number of elements is moved behind the index table. This is to get away without moving memory when one
has reserved 8 bytes in the beginning and later noticed that all 8 bytes are needed for the byte length. For this case
it is not allowed to include any padding.
In this case the number of elements is moved behind the index table.
This is to get away without moving memory when one has reserved 8 bytes
in the beginning and later noticed that all 8 bytes are needed for the
byte length. For this case it is not allowed to include any padding.
All offsets are measured from base A.
@ -204,12 +250,16 @@ possible, though not necessarily advised to use:
0b 00 00 00 00 00 00 00
03 00 00 00 00 00 00 00
Note that it is not recommended to encode short arrays in too long a format.
Note that it is not recommended to encode short arrays in too long a
format.
We now describe the special type `0x13`, which is useful for a particularly compact array representation. Note that to
some extent this goes against the principles of the VelocyPack format, since quick access to subvalues is no longer
possible, all items in the array must be scanned to find a particular one. However, there are certain use cases for
VelocyPack which only require sequential access (for example JSON dumping) and have a particular need for compactness.
We now describe the special type `0x13`, which is useful for a
particularly compact array representation. Note that to some extent this
goes against the principles of the VelocyPack format, since quick access
to subvalues is no longer possible, all items in the array must be
scanned to find a particular one. However, there are certain use cases
for VelocyPack which only require sequential access (for example JSON
dumping) and have a particular need for compactness.
The overall format of this array type is
@ -249,7 +299,8 @@ Here is an example, the array [1, 16] can be encoded as follows:
Empty objects are simply a single byte `0x0a`.
We next describe the type cases `0x0b` to `0x12`, see below for the special compact type `0x14`.
We next describe the type cases `0x0b` to `0x12`, see below for the
special compact type `0x14`.
Non-empty objects look like this:
@ -260,8 +311,9 @@ Non-empty objects look like this:
optional INDEXTABLE
NRITEMS for the 8-byte case
Numbers (for byte length, number of subvalues and offsets in the INDEXTABLE) are little endian unsigned integers, using
1 byte for types `0x0b` and `0x0f`, 2 bytes for types `0x0c` and `0x10`, 4 bytes for types
Numbers (for byte length, number of subvalues and offsets in the
INDEXTABLE) are little endian unsigned integers, using 1 byte for
types `0x0b` and `0x0f`, 2 bytes for types `0x0c` and `0x10`, 4 bytes for types
`0x0d` and `0x11`, and 8 bytes for types `0x0e` and `0x12`.
NRITEMS is a single number as described above.
@ -271,36 +323,48 @@ The INDEXTABLE consists of:
above) earlier offsets reside at lower addresses.
Offsets are measured from the beginning of the VPack value.
Non-empty objects have a small header including their byte length, the number of subvalues, then all the subvalues and
finally an index table containing offsets to the subvalues. To find the index table, find number of subvalues, then the
end, and from that the base of the index table, considering how wide its entries are.
Non-empty objects have a small header including their byte length, the
number of subvalues, then all the subvalues and finally an index table
containing offsets to the subvalues. To find the index table, find
number of subvalues, then the end, and from that the base of the index
table, considering how wide its entries are.
For all types the offset table describes where the subvalues reside. It is not necessary for the subvalues to start
immediately after the number of subvalues field. For performance reasons when building the value, it could be desirable
to reserve 8 bytes for the byte length and the number of subvalues and not fill the gap, even though it turns out later
that offsets and thus the byte length only uses 2 bytes, say.
For all types the offset table describes where the subvalues reside. It
is not necessary for the subvalues to start immediately after the number
of subvalues field. For performance reasons when building the value, it
could be desirable to reserve 8 bytes for the byte length and the number
of subvalues and not fill the gap, even though it turns out later that
offsets and thus the byte length only uses 2 bytes, say.
There is one special case: the empty object is simply stored as the single byte `0x0a`.
There is one special case: the empty object is simply stored as the
single byte `0x0a`.
There is another exception: For 8-byte numbers (`0x12`) the number of subvalues is stored behind the INDEXTABLE. This is
to get away without moving memory when one has reserved 8 bytes in the beginning and later noticed that all 8 bytes are
needed for the byte length.
There is another exception: For 8-byte numbers (`0x0e` and `0x12`) the number of
subvalues is stored behind the INDEXTABLE. This is to get away without
moving memory when one has reserved 8 bytes in the beginning and later
noticed that all 8 bytes are needed for the byte length.
All offsets are measured from base A.
Each entry consists of two parts, the key and the value, they are encoded as normal VPack values as above, the first is
always a short or long UTF-8 string starting with a byte `0x40`-`0xbf` as described below. The second is any other VPack
value.
There is one extension: For the key it is possible to use the positive small integer values `0x30`-`0x39` or an unsigned
integer starting with a type byte of `0x28`-`0x2f`. Any such integer value is an index into an outside-given table of
attribute names. These are convenient when only very few attribute names occur or some are repeated very often. The
standard way to encode such an attribute name table is as a VPack array of strings as specified here.
Objects are always stored with sorted key/value pairs, sorted by bytewise comparisons of the keys on each nesting level.
Sorting has some overhead but will allow looking up keys in logarithmic time later. Note that only the index table needs
to be sorted, it is not required that the offsets in these tables are increasing. Since the index table resides after
the actual subvalues, one can build up a complex VPack value by writing linearly.
Each entry consists of two parts, the key and the value, they are
encoded as normal VPack values as above, the first is always a short or
long UTF-8 string starting with a byte `0x40`-`0xbf` as described below. The
second is any other VPack value.
There is one extension: For the key it is possible to use the positive
small integer values `0x30`-`0x39` or an unsigned integer starting with a
type byte of `0x28`-`0x2f`. Any such integer value is an index into an
outside-given table of attribute names. These are convenient when only
very few attribute names occur or some are repeated very often. The
standard way to encode such an attribute name table is as a VPack array
of strings as specified here.
Objects are always stored with sorted key/value pairs, sorted by bytewise
comparisons of the keys on each nesting level. Sorting has some overhead
but will allow looking up keys in logarithmic time later. Note that only the
index table needs to be sorted, it is not required that the offsets in
these tables are increasing. Since the index table resides after the actual
subvalues, one can build up a complex VPack value by writing linearly.
Example: the object `{"a": 12, "b": true, "c": "xyz"}` can have the hexdump:
@ -322,17 +386,20 @@ entries, as in this example:
41 63 43 78 79 7a
0c 00 00 00 09 00 00 00 10 00 00 00
Similarly with type `0x0c` and 2-byte offsets, byte length and number of subvalues, or with type `0x0e` and 8-byte
numbers.
Similarly with type `0x0c` and 2-byte offsets, byte length and number of
subvalues, or with type `0x0e` and 8-byte numbers.
Note that it is not recommended to encode short objects with too long index tables.
Note that it is not recommended to encode short objects with too long
index tables.
### Special compact objects
We now describe the special type `0x14`, which is useful for a particularly compact object representation. Note that to
some extent this goes against the principles of the VelocyPack format, since quick access to subvalues is no longer
possible, all key/value pairs in the object must be scanned to find a particular one. However, there are certain use
cases for VelocyPack which only require sequential access
We now describe the special type `0x14`, which is useful for a
particularly compact object representation. Note that to some extent
this goes against the principles of the VelocyPack format, since quick
access to subvalues is no longer possible, all key/value pairs in the
object must be scanned to find a particular one. However, there are
certain use cases for VelocyPack which only require sequential access
(for example JSON dumping) and have a particular need for compactness.
The overall format of this object type is
@ -342,21 +409,28 @@ The overall format of this object type is
sub VPack key/value pairs
NRPAIRS
There is no index table at all, although the sub VelocyPack values can have different byte sizes. The BYTELENGTH and
NRPAIRS are encoded in a special format, which we describe now. It is the same as for the special compact array
type `0x13`, which we repeat here for the sake of completeness.
The BYTELENGTH consists of 1 to 8 bytes, of which all but the last one have their high bit set. Thus, the high bits
determine, how many bytes are actually used. The lower 7 bits of all these bits together comprise the actual byte length
in a little endian fashion. That is, the byte at address A+1 contains the least significant 7 bits (0 to 6) of the byte
length, the following byte at address A+2 contains the bits 7 to 13, and so on. Since the total number of bytes is
limited to 8, this encodes unsigned integers of up to 56 bits, which is the overall limit for the size of such a compact
array representation.
There is no index table at all, although the sub VelocyPack values can
have different byte sizes. The BYTELENGTH and NRPAIRS are encoded in a
special format, which we describe now. It is the same as for the special
compact array type `0x13`, which we repeat here for the sake of
completeness.
The NRPAIRS entry is encoded essentially the same, except that it is laid out in reverse order in memory. That is, one
has to use the BYTELENGTH to find the end of the array value and go back bytes until one finds a byte with high bit
reset. The last byte (at the highest memory address) contains the least significant 7 bits of the NRPAIRS value, the
second one bits 7 to 13 and so on.
The BYTELENGTH consists of 1 to 8 bytes, of which all but the last one
have their high bit set. Thus, the high bits determine, how many bytes
are actually used. The lower 7 bits of all these bytes together comprise
the actual byte length in a little endian fashion. That is, the byte at
address A+1 contains the least significant 7 bits (0 to 6) of the byte
length, the following byte at address A+2 contains the bits 7 to 13, and
so on. Since the total number of bytes is limited to 8, this encodes
unsigned integers of up to 56 bits, which is the overall limit for the
size of such a compact object representation.
The NRPAIRS entry is encoded essentially the same, except that it
is laid out in reverse order in memory. That is, one has to use the
BYTELENGTH to find the end of the object value and go back bytes until
one finds a byte with high bit reset. The last byte (at the highest
memory address) contains the least significant 7 bits of the NRPAIRS
value, the second one bits 7 to 13 and so on.
Here is an example, the object `{"a":1, "b":16}` can be encoded as follows:
@ -367,18 +441,21 @@ Here is an example, the object `{"a":1, "b":16}` can be encoded as follows:
## Doubles
Type `0x1b` indicates a double IEEE-754 value using the 8 bytes following the type byte. To guarantee
platform-independentness the details of the byte order are as follows. Encoding is done by using memcpy to copy the
internal double value to an uint64\_t. This 64-bit unsigned integer is then stored as little endian 8 byte integer in
the VPack value. Decoding works in the opposite direction. This should sort out the undetermined byte order in IEEE-754
in practice.
Type `0x1b` indicates a double IEEE-754 value using the 8 bytes following
the type byte. To guarantee platform independence, the details of the
byte order are as follows. Encoding is done by using memcpy to copy the
internal double value to an uint64\_t. This 64-bit unsigned integer is
then stored as little endian 8 byte integer in the VPack value. Decoding
works in the opposite direction. This should sort out the undetermined
byte order in IEEE-754 in practice.
## Dates
Type `0x1c` indicates a signed 64-int integer stored in 8 bytes little endian two's complement notation directly after
the type. The value means a universal UTC-time measured in milliseconds since the epoch, which is 00:00 on 1 January
1970 UTC.
Type `0x1c` indicates a signed 64-bit integer stored in 8 bytes little
endian two's complement notation directly after the type. The value means
a universal UTC-time measured in milliseconds since the epoch, which is
00:00 on 1 January 1970 UTC.
## External VPack values
@ -390,41 +467,48 @@ points to the actual VPack value elsewhere in memory.
## Artificial minimal and maximal keys
These values of types `0x1e` and `0x1f` have no meaning other than comparing smaller or greater respectively than any
other VPack value. The idea is that these can be used in systems that define a total order on all VPack values to
specify left or right ends of infinite intervals.
These values of types `0x1e` and `0x1f` have no meaning other than comparing
smaller or greater respectively than any other VPack value. The idea is
that these can be used in systems that define a total order on all VPack
values to specify left or right ends of infinite intervals.
## Integer types
There are different ways to specify integers. For small values -6 to 9 inclusively there are specific type bytes in the
range `0x30` to `0x3f` to allow for storage in a single byte. After that there are signed and unsigned integer types
that can code in the type byte the number of bytes used (ranges `0x20`-`0x27` for signed and `0x28`-`0x2f` for unsigned)
.
There are different ways to specify integers. For small values -6 to 9
inclusively there are specific type bytes in the range `0x30` to `0x3f` to
allow for storage in a single byte. After that there are signed and
unsigned integer types that can code in the type byte the number of
bytes used (ranges `0x20`-`0x27` for signed and `0x28`-`0x2f` for unsigned).
## Null and boolean values
These three values use a single byte to store the corresponding JSON values.
These three values use a single byte to store the corresponding JSON
values.
## Strings
Strings are stored as UTF-8 encoded byte sequences. There are two variants, a short one and a long one. In the short
one, the byte length
(not the number of UTF-8 characters) is directly encoded in the type, and this works up to and including byte length
Strings are stored as UTF-8 encoded byte sequences. There are two
variants, a short one and a long one. In the short one, the byte length
(not the number of UTF-8 characters) is directly encoded in the type,
and this works up to and including byte length 126. Types `0x40` to `0xbe`
are used for this and the byte length is V - `0x40`, if V is the type
byte. For strings longer than 126 bytes, the type byte is `0xbf` and the
byte length of the string is stored in the first 8 bytes after the type
byte, using a little endian unsigned integer representation. The actual
string follows after these 8 bytes. There is no terminating zero byte in
either case and the string may contain zero bytes.
126. Types `0x40` to `0xbe`
are used for this and the byte length is V - `0x3f`, if V is the type byte. For strings longer than 126 bytes, the
type byte is `0xbf` and the byte length of the string is stored in the first 8 bytes after the type byte, using a
little endian unsigned integer representation. The actual string follows after these 8 bytes. There is no
terminating zero byte in either case and the string may contain zero bytes.
## Binary data
The type bytes `0xc0` to `0xc7` allow to store arbitrary binary byte sequences as a VPack value. The format is as
follows: If V is the type byte, then V - `0xbf` bytes follow it to make a little endian unsigned integer representing
the length of the binary data, which directly follows these length bytes. No alignment is guaranteed. The content is
The type bytes `0xc0` to `0xc7` allow to store arbitrary binary byte
sequences as a VPack value. The format is as follows: If V is the type
byte, then V - `0xbf` bytes follow it to make a little endian unsigned
integer representing the length of the binary data, which directly
follows these length bytes. No alignment is guaranteed. The content is
entirely up to the user.
## Packed BCD long floats
These types are used to represent arbitrary precision decimal numbers.
@ -436,44 +520,56 @@ format of these values is:
EXPONENT (as 4-byte little endian signed two's complement integer)
MANTISSA (as packed BCD-encoded integer, big-endian)
The type byte describes the sign of the number as well as the number of bytes used to specify the byte length of the
mantissa. As usual, if V is the type byte, then V - `0xc7` (in the positive case) or V - `0xcf` (in the negative case)
bytes are used for the length of the mantissa, stored as little endian unsigned integer directly after the type byte.
After this follow exactly 4 bytes (little endian signed two's complement integer) to specify the exponent. After the
exponent, the actual mantissa bytes follow.
Packed BCD is used, so that each byte stores exactly 2 decimal digits as in `0x34` for the decimal digits 34. Therefore,
the mantissa always has an even number of decimal digits. Note that the mantissa is stored in big endian form, to make
parsing and dumping efficient. This leads to the
"unholy nibble problem": When a JSON parser sees the beginning of a longish number, it does not know whether an even or
odd number of digits follow. However, for efficiency reasons it wants to start writing bytes to the output as it reads
the input. This is, where the exponent comes to the rescue, which is illustrated by the following example. 12345 decimal
can be encoded as:
The type byte describes the sign of the number as well as the number of
bytes used to specify the byte length of the mantissa. As usual, if V is
the type byte, then V - `0xc7` (in the positive case) or V - `0xcf` (in the
negative case) bytes are used for the length of the mantissa, stored as
little endian unsigned integer directly after the type byte. After
this follow exactly 4 bytes (little endian signed two's complement
integer) to specify the exponent. After the exponent, the actual
mantissa bytes follow.
Packed BCD is used, so that each byte stores exactly 2 decimal digits as
in `0x34` for the decimal digits 34. Therefore, the mantissa always has an
even number of decimal digits. Note that the mantissa is stored in big
endian form, to make parsing and dumping efficient. This leads to the
"unholy nibble problem": When a JSON parser sees the beginning of a
longish number, it does not know whether an even or odd number of digits
follow. However, for efficiency reasons it wants to start writing bytes
to the output as it reads the input. This is where the exponent comes
to the rescue, which is illustrated by the following example.
12345 decimal can be encoded as:
c8 03 00 00 00 00 01 23 45
c8 03 ff ff ff ff 12 34 50
The former encoding puts a leading 0 in the first byte and uses exponent 0, the latter encoding directly starts putting
two decimal digits in one byte and then in the end has to "erase" the trailing 0 by using exponent -1, encoded by the 4
byte sequence `ff ff ff ff`.
The former encoding puts a leading 0 in the first byte and uses exponent
0, the latter encoding directly starts putting two decimal digits in one
byte and then in the end has to "erase" the trailing 0 by using exponent
-1, encoded by the 4 byte sequence `ff ff ff ff`.
Therefore, the unholy nibble problem is solved and parsing (and indeed dumping) can be efficient.
Therefore, the unholy nibble problem is solved and parsing (and indeed
dumping) can be efficient.
## Tagging
Types `0xee`-`0xef` are used for tagging of values to implement logical types.
Types `0xee`-`0xef` are used for tagging of values to implement logical
types.
For example, if type `0x1c` did not exist, the database driver could serialize a timestamp object (Date in JavaScript,
Instant in Java, etc)
into a Unix timestamp, a 64-bit integer. Assuming the lack of schema, upon deserialization it would not be possible to
tell an integer from a timestamp and deserialize the value accordingly.
For example, if type `0x1c` did not exist, the database driver could
serialize a timestamp object (Date in JavaScript, Instant in Java, etc)
into a Unix timestamp, a 64-bit integer. Assuming the lack of schema,
upon deserialization it would not be possible to tell an integer from
a timestamp and deserialize the value accordingly.
Type tagging resolves this by attaching an integer tag to values that can then be read when deserializing the value,
e.g. that tag=1 is a timestamp and the relevant timestamp class should be used.
Type tagging resolves this by attaching an integer tag to values that
can then be read when deserializing the value, e.g. that tag=1 is a
timestamp and the relevant timestamp class should be used.
The tag values are specified separately and applications can also specify their own to have the database driver
deserialize their specific data types into the appropriate classes (including models).
The tag values are specified separately and applications can also
specify their own to have the database driver deserialize their specific
data types into the appropriate classes (including models).
Essentially this is object-relational mapping for parts of documents.
@ -503,43 +599,56 @@ The following user-defined types exist:
- `0xf1` : 2 bytes payload, directly following the type byte
- `0xf2` : 4 bytes payload, directly following the type byte
- `0xf3` : 8 bytes payload, directly following the type byte
- `0xf4`-`0xf6` : length of the payload is described by a single further unsigned byte directly following the type byte,
the payload of that many bytes follows
- `0xf7`-`0xf9` : length of the payload is described by two bytes (little endian unsigned integer) directly following
the type byte, the payload of that many bytes follows
- `0xfa`-`0xfc` : length of the payload is described by four bytes (little endian unsigned integer) directly following
the type byte, the payload of that many bytes follows
- `0xfd`-`0xff` : length of the payload is described by eight bytes (little endian unsigned integer) directly following
the type byte, the payload of that many bytes follows
Note: In types `0xf4` to `0xff` the "payload" refers to the actual data not including the length specification.
- `0xf4`-`0xf6` : length of the payload is described by a single further
unsigned byte directly following the type byte, the
payload of that many bytes follows
- `0xf7`-`0xf9` : length of the payload is described by two bytes (little
endian unsigned integer) directly following the type
byte, the payload of that many bytes follows
- `0xfa`-`0xfc` : length of the payload is described by four bytes (little
endian unsigned integer) directly following the type
byte, the payload of that many bytes follows
- `0xfd`-`0xff` : length of the payload is described by eight bytes (little
endian unsigned integer) directly following the type
byte, the payload of that many bytes follows
Note: In types `0xf4` to `0xff` the "payload" refers to the actual data not
including the length specification.
## Portability
Serialized booleans, integers, strings, arrays, objects etc. all have a defined endianness and length, which is
platform-independent. These types are fully portable in serialized VelocyPack.
Serialized booleans, integers, strings, arrays, objects etc. all have a
defined endianness and length, which is platform-independent. These types are
fully portable in serialized VelocyPack.
There are still a few caveats when it comes to portability:
It is possible to build up very large values on a 64 bit system, but it may not be possible to read them back on a 32
bit system. This is because the maximum memory allocation size on a 32 bit system may be severely limited compared to a
64 bit system, i.e. a 32 bit OS may simply not allow to allocate buffers larger than 4 GB. This is not a limitation of
VelocyPack, but a limitation of 32 bit architectures. If all VelocyPack values are kept small enough so that they are
well below the 32 bit length boundaries, this should not matter though.
The VelocyPack type *External* contains just a raw pointer to memory, which should only be used during the buildup of
VelocyPack values in memory. The *External* type is not supposed to be used in VelocyPack values that are serialized and
stored persistently, and then later read back from persistence. Doing it anyway is not portable and will also pose a
security risk. Not using the *External* type for any data that is serialized will avoid this problem entirely.
The VelocyPack type *Custom* is completely user-defined, and there is no default implementation for them. So it is up to
the embedder to make these custom type bindings portable if portability of them is a concern.
VelocyPack *Double* values are serialized as integer equivalents in a specific way, and unserialized back into integers
that overlay a IEEE-754 double-precision floating point value in memory. We found this to be sufficiently portable for
our needs, although at least in theory there may be portability issues with some systems.
The [following](https://en.wikipedia.org/wiki/Endianness#Floating_point) was used as a backing for our "reasonably
portable in the real world" assumptions:
It is possible to build up very large values on a 64 bit system, but it may not be
possible to read them back on a 32 bit system. This is because the maximum memory
allocation size on a 32 bit system may be severely limited compared to a 64 bit system,
i.e. a 32 bit OS may simply not allow to allocate buffers larger than 4 GB. This
is not a limitation of VelocyPack, but a limitation of 32 bit architectures.
If all VelocyPack values are kept small enough so that they are well below the
32 bit length boundaries, this should not matter though.
The VelocyPack type *External* contains just a raw pointer to memory, which should
only be used during the buildup of VelocyPack values in memory. The *External* type
is not supposed to be used in VelocyPack values that are serialized and stored
persistently, and then later read back from persistence. Doing it anyway is not
portable and will also pose a security risk.
Not using the *External* type for any data that is serialized will avoid this problem
entirely.
The VelocyPack type *Custom* is completely user-defined, and there is no default
implementation for them. So it is up to the embedder to make these custom type
bindings portable if portability of them is a concern.
VelocyPack *Double* values are serialized as integer equivalents in a specific way,
and unserialized back into integers that overlay an IEEE-754 double-precision
floating point value in memory. We found this to be sufficiently portable for our
needs, although at least in theory there may be portability issues with some systems.
The [following](https://en.wikipedia.org/wiki/Endianness#Floating_point) was used as
a backing for our "reasonably portable in the real world" assumptions:
> It may therefore appear strange that the widespread IEEE 754 floating-point standard does not specify endianness.[17] Theoretically, this means that even standard IEEE floating-point data written by one machine might not be readable by another. However, on modern standard computers (i.e., implementing IEEE 754), one may in practice safely assume that the endianness is the same for floating-point numbers as for integers, making the conversion straightforward regardless of data type.

+ 396
- 342
VelocyPack_zh.md View File

@ -1,342 +1,396 @@
# VelocyPack(VPack)
Version 1
VelocyPack(VPack)是一种快速而紧凑的序列化格式
# 共性
VPack是面向(无符号)字节的,因此VPack值只是字节序列,并且与平台无关。值不一定要对齐,因此必须正确组织对较大子值的所有访问,以避免CPU的对齐假设。
# 值类型
我们描述了一个VPack值,该值本质上是递归的,但驻留在一个连续的内存块中(有两个例外,请参见下文)。假设该值从地址A开始,则第一个字节V指示当前VPack值的类型(通常是长度):
# 总述 我们首先给出一个简短而准确的概述作为参考,以供参考,有关数组和对象的详细信息,请参见下文:
0x00:0 无-表示不存在任何类型和值,VPack值中不允许
0x01:1 空数组
0x02:2 不带索引表的数组(所有子项具有相同的字节长度),1字节字节长度
0x03:3 不带索引表的数组(所有子项具有相同的字节长度),2字节字节长度
0x04:4 不带索引表的数组(所有子项具有相同的字节长),4字节字节长
0x05:5 不带索引表的数组(所有子项具有相同的字节长度),8字节字节长度
0x06:6 具有1字节索引表偏移量,bytelen和#个子区间的数组
0x07:7 具有2字节索引表偏移量,bytelen和#个子区间的数组
0x08:8 具有4字节索引表偏移量,bytelen和#个子区间的数组
0x09:9 具有8字节索引表偏移量,bytelen和#个子区间的数组
0x0a:10 空对象
0x0b:11 具有1字节索引表偏移量的对象,按属性名称排序,1字节bytelen和#个子值
0x0c:12 具有2字节索引表偏移量的对象,按属性名称排序,2字节bytelen和#个子值
0x0d:13 具有4字节索引表偏移量的对象,按属性名称排序,4字节bytelen和#个子值
0x0e:14 具有8字节索引表偏移量的对象,按属性名称排序,8字节bytelen和#个子值
0x0f:15 具有1字节索引表偏移量的对象,未按属性名称排序,1字节bytelen和#个子值
0x10:16 具有2字节索引表偏移量的对象,未按属性名称排序,2字节bytelen和#个子值
0x11:17 具有4字节索引表偏移量的对象,未按属性名称排序,4字节bytelen和#个子值
0x12:18 具有8字节索引表偏移量的对象,未按属性名称排序,8字节字节数和#个子值
0x13:19 紧凑数组,没有索引表
0x14:20 紧凑对象,没有索引表
0x15-0x16:21-22保留
0x17:23 不合法-此类型可用于指示嵌入应用程序中不合法的值
0x18:24 null
0x19:25 false
0x1a:26 true
0x1b:27 双IEEE-754,后跟8个字节,存储为与uint64等效的小字节序
0x1c:28 UTC日期(自纪元以来)毫秒数,以8字节有符号int,little endian,二进制补码形式存储
0x1d:29 外部(仅在内存中):一个char *,指向另一个VPack项在内存中的实际位置,磁盘或网络上的VPack值中不允许
0x1e:30 minKey,比较所有其他值<的无意义的值
0x1f:31 maxKey,与所有其他值相比>的无意义值
0x20-0x27:32-39带符号的int,小字节序,1至8个字节,数字为V-0x1f,二进制补码
0x28-0x2f:40-47 uint,小端,1至8字节,数字为V-0x27
0x30-0x39:48-57 ,小整数0,1,... 9
0x3a-0x3f:58- 63 小负整数-6,-5,...,-1
0x40-0xbe:64- 190 UTF-8字符串,使用V-0x40字节(不是Unicode字符!),长度为0,所以0x40是空字符串,最大长度为126,请注意,这里的字符串不是以0结尾的,并且可以包含NUL个字节
0xbf:191 长UTF-8字符串,接下来的8个字节是字符串的长度(以字节为单位)(不是Unicode字符),是小端无符号整数,请注意,长字符串不以0结尾,并且可以包含NUL字节
0xc0-0xc7:192-199 二进制blob,下一个V-0xbf字节是blob的长度(以字节为单位),请注意二进制blob不会以零结尾
0xc8-0xcf:200-207 正长打包BCD编码的浮点数,后跟V-0xc7字节,以一点尾数方式对尾数的长度(以字节为单位)进行编码。之后紧随其后的是4个字节(乘以10的幂),然后乘以尾数,将其存储为小尾数2的补码有符号32位整数。
之后,跟着指定的开始处的长度信息一样多的字节,每个字节以大端字节序打包的BCD编码两位。
示例:12345十进制可以编码为0xc8 0x03 0x00 0x00 0x00 0x00 0x01 0x23 0x45或0xc8 0x03 0xff 0xff 0xff 0xff 0x12 0x34 0x50
0xd0-0xd7:208- 215 负长打包BCD编码的浮点数,后跟V-0xcf字节,以一点尾数法对尾数的长度(以字节为单位)进行编码。之后,与上面的正长打包BCD编码浮点相同。
0xd8-0xed:216-237 保留
0xee-0xef:238-239 逻辑类型的值标记
0xf0-0xff:240-255 自定义类型
# 详述
## Arrays
空数组只是一个字节0x01。
接下来,我们将描述类型为0x02到0x09的情况,请参见下面的特殊紧凑型0x13。
非空数组看起来像以下之一:
one of 0x02 to 0x05
BYTELENGTH
OPTIONAL UNUSED: padding
sub VPack values
要么
0x06
BYTELENGTH in 1 byte
NRITEMS in 1 byte
OPTIONAL UNUSED: 6 bytes of padding
sub VPack values
INDEXTABLE with 1 byte per entry
要么
0x07
BYTELENGTH in 2 bytes
NRITEMS in 2 bytes
OPTIONAL UNUSED: 4 bytes of padding
sub VPack values
INDEXTABLE with 2 byte per entry
要么
0x08
BYTELENGTH in 4 bytes
NRITEMS in 4 bytes
sub VPack values
INDEXTABLE with 4 byte per entry
要么
0x09
BYTELENGTH in 8 bytes
sub VPack values
INDEXTABLE with 8 byte per entry
NRITEMS in 8 bytes
如果类型允许使用任何可选的填充,则填充必须完全由填充字节的长度,BYTELENGTH的长度和NRITEMS的长度(如果存在)之和等于8的字节数组成。如果BYTELENGTH的长度为已经是8,不允许填充。整个填充必须由零字节(ASCII NUL)组成。
数字(对于字节长度,INDEXTABLE中的子值数量和偏移量)是小端无符号整数,对于类型0x02和0x06使用1字节,对于类型0x03和0x07使用2字节,对于类型0x04和0x08使用4字节,对于类型0x05和0x09使用8字节。
NRITEMS是如上所述的单个数字。
INDEXTABLE包含:
对于类型0x06-0x09,偏移量数组(未对齐,采用上述数字格式)较早的偏移量位于较低地址。偏移量从VPack值的开头开始测量。
类型为0x06到0x09的非空数组具有一个小的标头,包括它们的字节长度,子值数量,所有子值以及最后一个包含这些子值偏移量的索引表。要找到索引表,请先考虑子值的数量,然后是末尾,再从索引表的底部开始,并考虑其条目的宽度。
对于类型0x02至0x05,没有偏移表,也没有项目数。第一项根据字节长度字段的类型和宽度而定,从地址A + 2,A + 3,A + 5或分别为A + 9开始。请注意以下特殊规则:在填充零字节之后,允许第一个子值的实际位置再退一步。
例如,如果两个字节长度(BYTELENGTH)都使用了2个字节,则随后可以选择填充4个零字节,并且实际的VPack子值可以从A + 9开始。这是为了给构建VPack值的程序提供在开始时保留8个字节的机会,直到以后才发现可以写出字节长度的字节更少。可以通过找到第一个子值,其字节长度并将可用空间除以它来确定子值的数量。
对于类型0x06至0x09,偏移量表描述了子值所在的位置。子值不必在“子值数”字段之后立即开始。
如上所述,允许包括可选的填充。同样在这里,任何填充都必须由连续的零字节(ASCII NUL)组成,并且填充长度必须足以填充BYTELENGTH的长度和NRITEMS的长度为8。
例如,如果BYTELENGTH和NRITEMS都可以用2个字节表示,则它们的长度之和为4。因此,允许在此处添加4个字节的填充,以便第一个子值可以位于地址A + 9。
对于8字节数字情况(类型0x09),有一个例外:在这种情况下,元素数被移到索引表的后面。这样做是为了在一开始已经保留了8个字节、后来又发现字节长度确实需要全部8个字节时,无需移动内存。在这种情况下,不允许包含任何填充。
所有偏移量均从基数A开始测量。
范例:
[1,2,3] 有十六进制转储
02 05 31 32 33
以最紧凑的形式表示,但以下情况同样可行,尽管不一定建议使用:
例子:
03 06 00 31 32 33
04 08 00 00 00 31 32 33
05 0c 00 00 00 00 00 00 00 31 32 33
06 09 03 31 32 33 03 04 05
07 0e 00 03 00 31 32 33 05 00 06 00 07 00
08 18 00 00 00 03 00 00 00 31 32 33 09 00 00 00 0a 00 00 00 0b 00 00 00
09
2c 00 00 00 00 00 00 00
31 32 33
09 00 00 00 00 00 00 00
0a 00 00 00 00 00 00 00
0b 00 00 00 00 00 00 00
03 00 00 00 00 00 00 00
请注意,不建议以太长的格式编码短数组。
现在我们描述特殊类型0x13,它对于特别紧凑的数组表示很有用。请注意,在某种程度上这与VelocyPack格式的原理背道而驰,因为不再能够快速访问子值,因此必须扫描数组中的所有项以找到特定项。但是,VelocyPack的某些用例只需要顺序访问(例如JSON转储),并且对紧凑性有特殊的需求。
此数组类型的整体格式为
0x13作为类型字节BYTELENGTH子VPack值NRITEMS
尽管子VelocyPack值可以具有不同的字节大小,但根本没有索引表。BYTELENGTH和NRITEMS以特殊格式编码,我们现在将对其进行描述。
BYTELENGTH由1到8个字节组成,除最后一个字节外,所有字节均已设置其高位。因此,高位确定实际使用了多少个字节。所有这些位的低7位以一点字节序的形式一起构成了实际的字节长度。也就是说,地址A + 1处的字节包含字节长度的最低有效7位(0至6),地址A + 2之后的字节包含位7至13,依此类推。由于字节的总数限制为8,因此可以对多达56位的无符号整数进行编码,这是此类紧凑数组表示形式的大小的总体限制。
NRITEMS条目的编码方式基本上相同,只是它以相反的顺序排列在内存中。也就是说,必须使用BYTELENGTH来查找数组值的末尾并返回字节,直到找到高复位位的字节为止。最后一个字节(在最高的存储器地址处)包含NRITEMS值的最低有效7位,后一个7至13位,依此类推。
这是一个示例,可以将数组[1,16]编码如下:
13 06
31 28 10
02
## Objects
空对象只是一个字节0x0a。
接下来,我们描述类型为0x0b到0x12的情况,有关特殊的紧凑型0x14,请参见下文。
非空对象如下所示:
one of 0x0b - 0x12
BYTELENGTH
optional NRITEMS
sub VPack values as pairs of attribute and value
optional INDEXTABLE
NRITEMS for the 8-byte case
数字(对于字节长度,INDEXTABLE中的子值数量和偏移量)是小端无符号整数,对于类型0x0b和0x0f使用1个字节,对于类型0x0c和0x10使用2个字节,对于类型0x0d和0x11使用4个字节,对于类型0x0e和0x12使用8个字节。
NRITEMS是如上所述的单个数字。
INDEXTABLE包含:
较早的偏移量数组(未对齐,采用上述数字格式)位于较低的地址。偏移量是从VPack值的开头开始测量的。
非空对象的标头很小,包括字节长度,子值数量,所有子值以及最后一个包含子值偏移量的索引表。要查找索引表,请先考虑子值的数量,然后查找末尾,再从索引表的底部开始,考虑其条目的宽度。
对于所有类型,偏移量表都描述了子值所在的位置。子值不必在“子值数”字段之后立即开始。出于性能原因,在构建值时,可能希望为字节长度和子值的数量保留8个字节,而不填补空白,即使后来发现偏移量(因此字节长度仅使用2个字节)也是如此。 。
有一种特殊情况:空对象仅存储为单个字节0x0a。
还有一个例外:对于8字节数字(0x12),子值的数量存储在INDEXTABLE的后面。当一个人在开始时已经保留了8个字节,后来又注意到字节长度需要全部8个字节时,这是为了不移动内存而逃脱。
所有偏移量均从基数A开始测量。
每个条目都由键和值两部分组成,它们如上所述被编码为普通的VPack值,第一个始终是长或短的UTF-8字符串,从字节0x40-0xbf开始,如下所述。第二个是任何其他VPack值。
有一个扩展:对于密钥,可以使用正的小整数值0x30-0x39或以类型字节0x28-0x2f开头的无符号整数。任何此类整数值都是属性名称的外部表的索引。当仅出现很少的属性名称或经常重复某些属性名称时,这些方法很方便。编码此类属性名称表的标准方法是使用此处指定的VPack字符串数组。
对象总是存储有排序的键/值对,并按每个嵌套级别上键的按字节比较排序。排序有一些开销,但允许在以后的对数时间内查找键。注意,仅索引表需要排序,不需要这些表中的偏移量增加。由于索引表位于实际的子值之后,因此可以通过线性写入来构建复杂的VPack值。
示例:对象{"a": 12, "b": true, "c": "xyz"}可以具有hexdump:
0b
13 03
41 62 1a
41 61 28 0c
41 63 43 78 79 7a
06 03 0a
可以使用具有更长条目的索引表来完成相同的对象,如以下示例所示:
0d
22 00 00 00
03 00 00 00
41 62 1a
41 61 28 0c
41 63 43 78 79 7a
0c 00 00 00 09 00 00 00 10 00 00 00
类似地,对于类型0x0c和2个字节的偏移量,字节长度和子值数量,或者对于类型0x0e和8个字节的数字。
请注意,不建议对索引表太长的短对象进行编码。
特殊的紧凑物体
现在我们描述特殊类型0x14,它对于特别紧凑的对象表示很有用。请注意,在某种程度上这与VelocyPack格式的原理背道而驰,因为不再能够快速访问子值,因此必须扫描对象中的所有键/值对以找到特定的键/值对。但是,VelocyPack的某些用例只需要顺序访问(例如JSON转储),并且对紧凑性有特殊的需求。
该对象类型的整体格式为
0x14作为字节BYTELENGTH子VPack键/值对的类型字节NRPAIRS
尽管子VelocyPack值可以具有不同的字节大小,但根本没有索引表。BYTELENGTH和NRPAIRS以特殊格式编码,我们现在将对其进行描述。它与特殊紧凑型数组0x13相同,为完整起见,在此重复。
BYTELENGTH由1到8个字节组成,除最后一个字节外,所有字节均已设置其高位。因此,高位确定实际使用了多少个字节。所有这些位的低7位以一点字节序的形式一起构成了实际的字节长度。也就是说,地址A + 1处的字节包含字节长度的最低有效7位(0至6),地址A + 2之后的字节包含位7至13,依此类推。由于字节的总数限制为8,因此可以对多达56位的无符号整数进行编码,这是此类紧凑数组表示形式的大小的总体限制。
NRPAIRS条目的编码方式基本上相同,只是它在内存中的排列顺序相反。也就是说,必须使用BYTELENGTH来查找数组值的末尾并返回字节,直到找到高复位位的字节为止。最后一个字节(在最高的存储器地址处)包含NRPAIRS值的最低有效7位,后一个7至13位,依此类推。
这是一个示例,对象{“ a”:1,“ b”:16}可以编码如下:
14 0a
41 61 31 42 62 28 10
02
## Doubles
类型0x1b使用类型字节后的8个字节指示双IEEE-754值。为了保证平台独立性,字节顺序的详细信息如下。通过使用memcpy将内部double值复制到uint64_t来完成编码。然后,将此64位无符号整数存储为VPack值中的8位小字节序。解码的方向相反。实际上,这应该整理出IEEE-754中未确定的字节顺序。
## Dates
类型0x1c指示在类型之后紧接着以8字节小尾数补码表示的有符号64位整数。该值表示自该时期以来的通用UTC时间(以毫秒为单位),该时间是1970年1月1日UTC的00:00。
## External VPack values
此类型仅用于内存中,而不用于通过磁盘或网络进行数据交换。因此,我们仅需要指定以下k个字节为当前体系结构上char *的memcpy。该char *指向内存中其他位置的实际VPack值。
## Artificial minimal and maximal keys
这些类型0x1e和0x1f的值除了分别比较小于或大于任何其他VPack值之外没有其他意义。这个想法是可以在定义所有VPack值的总顺序以指定无限间隔的左端或右端的系统中使用它们。
## Integer types
有多种指定整数的方法。对于-6到9的小数值(包括-6)(包括0x30到0x3f),可以在单个字节中存储。此后,可以在字节类型中对有符号和无符号整数类型进行编码,这些字节类型使用的字节数(有符号的范围为0x20-0x27,无符号的范围为0x28-0x2f)。
## Null and boolean values
这三个值使用单个字节存储相应的JSON值。
## Strings
字符串存储为UTF-8编码的字节序列。有两种变体,短的和长的。在短变体中,字节长度(不是UTF-8字符的数量)直接在该类型中编码,并且可以工作至字节长度126,包括字节长度126。为此使用类型0x40至0xbe,字节长度为V - 0x40(如果V是类型字节)。对于长度超过126个字节的字符串,类型字节为0xbf,并且字符串的字节长度使用小尾数无符号整数表示形式存储在类型字节之后的前8个字节中。实际的字符串在这8个字节之后。两种情况都没有终止的零字节,并且字符串可能包含零字节。
## Binary data
字节类型0xc0至0xc7允许将任意二进制字节序列存储为VPack值。格式如下:如果V是类型字节,则V-0xbf字节跟在其后,以一个小端无符号整数表示二进制数据的长度,该整数紧随这些长度字节之后。不保证对齐。内容完全取决于用户。
## Packed BCD long floats
这些类型用于表示任意精度的十进制数字。正数和负数有不同的类型。这些值的整体格式为:
one of 0xc8 - 0xcf (positive) or of 0xd0 - 0xd7 (negative)
LENGTH OF MANTISSA in bytes
EXPONENT (as 4-byte little endian signed two's complement integer)
MANTISSA (as packed BCD-encoded integer, big-endian)
字节类型描述数字的符号以及用于指定尾数字节长度的字节数。通常,如果V是类型字节,则将V-0xc7(在正数情况下)或V-0xcf(在负数情况下)字节用作尾数的长度,并在该字节后直接存储为小端无符号整数长度。在此之后,正好跟随4个字节(小尾数有符号二进制补码整数)来指定指数。指数之后,是实际的尾数字节。
使用压缩的BCD,以便每个字节正好存储2个十进制数字,如0x34中的十进制数字34一样。因此,尾数始终为偶数个十进制数字。请注意,尾数以大尾数形式存储,以提高解析和转储效率。这导致“邪恶的半字节问题”:当JSON解析器看到一个长数字的开头时,它不知道后面是偶数还是奇数。但是,出于效率原因,它希望在读取输入时开始将字节写入输出。这就是救援的关键所在,如以下示例所示:
12345 decimal can be encoded as:
0xc8 0x03 0x00 0x00 0x00 0x00 0x01 0x23 0x45
0xc8 0x03 0xff 0xff 0xff 0xff 0x12 0x34 0x50
前一种编码在第一个字节中放置前导0并使用指数0,后一种编码直接开始在一个字节中放置两个十进制数字,然后最后必须使用由-1编码的指数-1“擦除”尾随0。 4字节序列0xff 0xff 0xff 0xff。
因此,邪恶的半字节问题得以解决,并且解析(实际上还有转储)都可以很高效。
## Tagging
类型0xee-0xef用于标记值以实现逻辑类型。
例如,如果类型0x1c不存在,则数据库驱动程序可以将时间戳对象(JavaScript中的Date,Java中的Instant等)序列化为Unix时间戳(64位整数)。假设缺少模式,则在反序列化时,不可能从时间戳中分辨出整数并相应地反序列化该值。
类型标记通过将整数标记附加到值上来解决此问题,这些值可在反序列化值时读取,例如,tag = 1是时间戳,应使用相关的时间戳类。
标记值是分别指定的,应用程序也可以指定它们自己的名称,以使数据库驱动程序将其特定数据类型反序列化为适当的类(包括模型)。
本质上,这是文档各部分的对象关系映射。
类型的格式为:
0xee
TAG number in 1 byte
sub VPack value
要么
0xef
TAG number in 8 bytes, little-endian encoding
sub VPack value
## Custom types
请注意,自定义类型通常不应用于数据交换,而只能用于系统内部。尽管如此,本规范的这一部分仍进行了设计,使得可以通过通用方法得出每种自定义数据类型的字节长度。
存在以下用户定义的类型:
0xf0:1字节有效负载,紧随类型字节之后
0xf1:2字节有效负载,紧随类型字节之后
0xf2:4字节有效负载,紧随类型字节之后
0xf3:8字节有效负载,紧随类型字节之后
0xf4-0xf6:有效载荷的长度由紧随类型字节之后的另一个无符号字节描述,该多个字节的有效载荷如下
0xf7-0xf9:有效负载的长度由紧随类型字节之后的两个字节(小尾数无符号整数)描述,该字节的有效负载紧随其后
0xfa-0xfc:有效载荷的长度由紧随类型字节之后的四个字节(小尾数无符号整数)描述,该字节的有效载荷如下
0xfd-0xff:有效载荷的长度由紧随类型字节之后的八个字节(小尾数无符号整数)描述,该字节的有效载荷如下
注意:在类型0xf4至0xff中,“有效负载”是指不包括长度说明的实际数据。
## Portability
序列化的布尔值、整数、字符串、数组、对象等都具有定义的字节序和长度,这与平台无关。这些类型在序列化的 VelocyPack 中是完全可移植的。
在可移植性方面仍有一些注意事项:
在 64 位系统上可以建立非常大的值,但在 32 位系统上可能无法读取它们。这是因为与 64 位系统相比,32 位系统上的最大内存分配大小可能受到严重限制,即 32 位操作系统可能根本不允许分配大于 4 GB 的缓冲区。这不是 VelocyPack 的限制,而是 32 位架构的限制。如果所有 VelocyPack 值都保持足够小,以至于它们远低于 32 位长度边界,但这应该无关紧要。
VelocyPack 类型External只包含一个指向内存的原始指针,它应该只在内存中 VelocyPack 值的构建期间使用。在外部类型是不应该在被序列化和永久存储,再后来从持续读回VelocyPack值使用。无论如何这样做是不可移植的,还会带来安全风险。不对任何序列化的数据使用External类型将完全避免这个问题。
VelocyPack 类型Custom完全由用户定义,并且没有默认实现。因此,如果它们的可移植性是一个问题,则由嵌入器来使这些自定义类型绑定具有可移植性。
VelocyPack Double值以特定方式序列化为整数等价物,并反序列化为覆盖内存中 IEEE-754 双精度浮点值的整数。我们发现这足以满足我们的需求,尽管至少在理论上某些系统可能存在可移植性问题。
在下面被用作我们的“现实世界中的合理便携式”假设一个后盾:
因此,广泛使用的 IEEE 754 浮点标准没有指定字节顺序可能会显得很奇怪。 [17] 从理论上讲,这意味着即使是由一台机器编写的标准 IEEE 浮点数据也可能无法被另一台机器读取。然而,在现代标准计算机(即实现 IEEE 754)上,人们实际上可以安全地假设浮点数与整数的字节序相同,从而使转换变得简单,而不管数据类型如何。
VelocyPack (VPack)
Version 1
VelocyPack (VPack) 是一种快速紧凑的序列化格式
概论
VPack 是面向(无符号)字节的,因此 VPack 值只是字节序列并且与平台无关。值不一定对齐,因此必须正确组织对较大子值的所有访问,以避免
CPU 的对齐假设。
值类型
我们描述了一个单一的 VPack 值,它本质上是递归的,但驻留在一个连续的内存块中(有两个例外,见下文)。假设值从地址 A 开始,第一个字节
V 指示手头 VPack 值的类型(通常是长度):
我们首先给出一个概述,并提供一个简短但准确的描述以供参考,对于数组和对象,请参见下面的详细信息:
0x00 : none - 这表示没有任何类型和值,这在 VPack 值中是不允许的
0x01 : 空数组
0x02 : 无索引表的数组(所有子项字节长度相同),1-byte字节长度
0x03 : 无索引表的数组(所有子项字节长度相同),2-byte字节长度
0x04 :无索引表的数组(所有子项字节长度相同),4字节字节长度
0x05 : 无索引表的数组(所有子项字节长度相同),8字节字节长度
0x06 : 具有 1 字节索引表偏移量、bytelen 和 # subvals 的数组
0x07 : 具有 2 字节索引表偏移量、bytelen 和 # subvals 的数组
0x08 : 具有 4 字节索引表偏移量、bytelen 和 # subvals 的数组
0x09 : 具有 8 字节索引表偏移量、bytelen 和 # subvals 的数组
0x0a : 空对象
0x0b :具有 1 字节索引表偏移量的对象,按属性名称、1 字节 bytelen 和 # subvals 排序
0x0c :具有 2 字节索引表偏移量的对象,按属性名称、2 字节 bytelen 和 # subvals 排序
0x0d : 具有 4 字节索引表偏移量的对象,按属性名称、4 字节 bytelen 和 # subvals 排序
0x0e : 具有 8 字节索引表偏移量的对象,按属性名称、8 字节 bytelen 和 # subvals 排序
0x0f :具有 1 字节索引表偏移量的对象,未按属性名称、1 字节 bytelen 和 # subvals 排序 -废弃的
0x10 :具有 2 字节索引表偏移量的对象,未按属性名称、2 字节 bytelen 和 #subvals 排序 - 废弃的
0x11 :具有 4 字节索引表偏移量的对象,未按属性名称、4 字节 bytelen 和 #subvals 排序 - 废弃的
0x12 :具有 8 字节索引表偏移量的对象,未按属性名称、8 字节 bytelen 和 #subvals 排序 - 废弃的
0x13 : 紧凑型数组,无索引表
0x14 : 紧凑型对象,无索引表
0x15- 0x16:保留
0x17 : illegal - 此类型可用于指示嵌入应用程序中非法的值
0x18 : null
0x19 : false
0x1a : true
0x1b : double IEEE-754, 8 bytes followed, stored as little endian uint64 equivalent
0x1c :自纪元以来以毫秒为单位的 UTC 日期,存储为 8 字节有符号整数,小端,二进制补码
0x1d : external (only in memory): 一个 char* 指向内存中的实际位置,另一个 VPack 项所在的位置,不允许出现在磁盘或网络上的
VPack 值中
0x1e : minKey,将 < 与所有其他值进行比较的无意义值
0x1f : maxKey,比较 > 比所有其他值的无意义值
0x20- 0x27: signed int, little endian, 1 to 8 bytes, number is V - 0x1f, twos complement
0x28- 0x2f: uint, little endian, 1 to 8 bytes, number is V -0x27
0x30- 0x39: 小整数 0, 1, ... 9
0x3a- 0x3f: 小的负整数 -6, -5, ..., -1
0x40- 0xbe: UTF-8 字符串,使用 V - 0x40 个字节(不是 Unicode 字符数!),长度可以为 0,因此 0x40 是空字符串,最大长度是 126,注意这里的字符串不是以零结尾的,可能包含 NUL 字节
0xbf : long UTF-8-string,接下来的 8 个字节是字符串的长度,以字节为单位(不是 Unicode 字符)作为小端无符号整数,注意长字符串不是零终止的,可能包含
NUL 字节
0xc0- 0xc7:二进制 blob,接下来的 V -0xbf字节是 blob 的字节长度,注意二进制 blob 不是零终止的
0xc8- 0xcf:正长压缩 BCD 编码浮点数,V -0xc7字节跟随以小端方式编码尾数的长度(以字节为单位)。紧随其后的是编码(10 次方)指数的
4 个字节,尾数将乘以该指数,存储为小端二进制补码带符号的 32 位整数。之后,按照开头的长度信息指定的字节数,每个字节用big-endian
packed BCD编码两个数字。示例:12345 十进制可以编码为 c8 03 00 00 00 00 01 23 45或 c8 03 ff ff ff ff 12 34 50
0xd0- 0xd7:负长压缩 BCD 编码浮点数,V -0xcf字节跟随以小端方式编码尾数的长度(以字节为单位)。之后,与上面的正长打包 BCD
编码浮点数相同。
0xd8- 0xed:保留
0xee- 0xef:逻辑类型的值标记
0xf0- 0xff:自定义类型
数组
空数组只是一个字节0x01。
接下来我们将描述类型0x02到0x09的情况,特殊紧凑类型0x13请参阅下文。
非空数组类似于以下之一:
one of 0x02 to 0x05
BYTELENGTH
OPTIONAL UNUSED: padding
sub VPack values
或者
0x06
BYTELENGTH in 1 byte
NRITEMS in 1 byte
OPTIONAL UNUSED: 6 bytes of padding
sub VPack values
INDEXTABLE with 1 byte per entry
或者
0x07
BYTELENGTH in 2 bytes
NRITEMS in 2 bytes
OPTIONAL UNUSED: 4 bytes of padding
sub VPack values
INDEXTABLE with 2 byte per entry
或者
0x08
BYTELENGTH in 4 bytes
NRITEMS in 4 bytes
sub VPack values
INDEXTABLE with 4 byte per entry
或者
0x09
BYTELENGTH in 8 bytes
sub VPack values
INDEXTABLE with 8 byte per entry
NRITEMS in 8 bytes
如果一个类型允许任何可选填充,则填充必须包含恰好等于填充长度、BYTELENGTH 长度和 NRITEMS(如果存在)长度总和为 8 的字节数。如果
BYTELENGTH 的长度是已经 8,不允许填充。整个填充必须包含零字节 (ASCII NUL)。
数字(用于字节长度、子值的数量以及 INDEXTABLE 中的偏移量)是小端无符号整数,类型0x02和0x06使用 1 个字节,类型0x03和0x07使用 2
个字节,类型0x04和0x08使用 4 个字节,类型0x05和0x09使用 8 个字节。
NRITEMS 是如上所述的单个数字。
INDEXTABLE 包括:
对于类型0x06-0x09一个偏移量数组(未对齐,采用上述数字格式)较早的偏移量位于较低的地址。偏移量是从 VPack 值的开始测量的。
类型0x06到0x09的非空数组有一个小头,包括它们的字节长度、子值的数量,然后是所有子值,最后是包含子值偏移量的索引表。要找到索引表,请先找到子值的数量,再找到值的末尾,并结合条目的宽度由此推算出索引表的起始位置。
对于类型0x02到0x05,没有偏移表,也没有项目数。第一项从地址 A+2、A+3、A+5 或 A+9
开始,具体取决于字节长度字段的类型和宽度。请注意以下特殊规则:在填充零字节的一些运行之后,允许第一个子值的实际位置更靠后。
例如,如果两个字节长度 (BYTELENGTH) 都使用 2 个字节,则随后允许有 4 个零字节的可选填充,并且实际的 VPack 子值可以从 A+9
开始。这是为了让构建 VPack 值的程序有机会在开始时保留 8 个字节,然后才发现更少的字节足以写入字节长度。可以通过找到第一个子值及其字节长度并将可用空间量除以它来确定子值的数量。
对于类型0x06到0x09,偏移表描述了子值所在的位置。子值不必在子值数字段之后立即开始。
如上所述,允许包含可选的填充。在这里,任何填充都必须由一系列连续的零字节 (ASCII NUL) 组成,并且长度必须能够将 BYTELENGTH
的长度和 NRITEMS 的长度填满为 8。
例如,如果 BYTELENGTH 和 NRITEMS 都可以用 2 个字节表示,则它们的长度之和为 4。因此可以在此处添加 4 个字节的填充,以便第一个子值可以位于地址
A+9。
8 字节数字的情况(类型 0x09)有一个例外:在这种情况下,元素的数量被移到索引表后面。这是为了在一开始保留 8 个字节并且后来注意到字节长度需要所有
8 个字节时不移动内存。对于这种情况,不允许包含任何填充。
所有偏移量均从基准 A 开始测量。
示例:
[1,2,3]有十六进制转储
02 05 31 32 33
在最紧凑的表示中,但以下同样可能,但不一定建议使用:
例子:
03 06 00 31 32 33
04 08 00 00 00 31 32 33
05 0c 00 00 00 00 00 00 00 31 32 33
06 09 03 31 32 33 03 04 05
07 0e 00 03 00 31 32 33 05 00 06 00 07 00
08 18 00 00 00 03 00 00 00 31 32 33 09 00 00 00 0a 00 00 00 0b 00 00 00
09
2c 00 00 00 00 00 00 00
31 32 33
09 00 00 00 00 00 00 00
0a 00 00 00 00 00 00 00
0b 00 00 00 00 00 00 00
03 00 00 00 00 00 00 00
请注意,不建议以太长的格式对短数组进行编码。
我们现在描述特殊类型0x13,它对于特别紧凑的数组表示很有用。请注意,这在某种程度上违反了 VelocyPack
格式的原则,因为不再可能快速访问子值,因此必须扫描数组中的所有项目以找到特定的项目。但是,VelocyPack 的某些用例只需要顺序访问(例如
JSON 转储)并且对紧凑性有特殊需求。
这种数组类型的整体格式是
0x13 as type byte
BYTELENGTH
sub VPack values
NRITEMS
根本没有索引表,尽管子 VelocyPack 值可以有不同的字节大小。BYTELENGTH 和 NRITEMS 以我们现在描述的特殊格式编码。
BYTELENGTH 由 1 到 8 个字节组成,除最后一个字节外,所有字节都设置了高位。因此,高位决定实际使用了多少字节。所有这些位的低 7
位以小端方式共同构成实际字节长度。也就是说,地址 A+1 的字节包含字节长度的最低有效 7 位(0 到 6),地址 A+2 的后续字节包含位 7
到 13,依此类推。由于字节总数限制为 8,因此它对最多 56 位的无符号整数进行编码,这是这种紧凑数组表示的大小的总体限制。
NRITEMS 条目的编码基本相同,只是它在内存中的排列顺序相反。也就是说,必须使用 BYTELENGTH
找到数组值的末尾并返回字节,直到找到高位重置的字节。最后一个字节(在最高内存地址处)包含 NRITEMS 值的最低有效 7 位,第二个字节包含
7 到 13 位,依此类推。
下面是一个例子,数组 [1, 16] 可以编码如下:
13 06
31 28 10
02
对象
空对象只是一个字节0x0a。
接下来我们将描述类型0x0b到0x12的情况,特殊紧凑类型0x14请参阅下文。
非空对象如下所示:
one of 0x0b - 0x12
BYTELENGTH
optional NRITEMS
sub VPack values as pairs of attribute and value
optional INDEXTABLE
NRITEMS for the 8-byte case
数字(用于字节长度、子值的数量以及 INDEXTABLE 中的偏移量)是小端无符号整数,类型0x0b和0x0f使用 1 个字节,类型0x0c和0x10使用 2
个字节,类型0x0d和0x11使用 4 个字节,类型0x0e和0x12使用 8 个字节。
NRITEMS 是如上所述的单个数字。
INDEXTABLE 包括:
偏移量数组(未对齐,采用上述数字格式)较早的偏移量位于较低的地址。偏移量是从 VPack 值的开头开始测量的。
非空对象有一个小头,包括它们的字节长度、子值的数量,然后是所有子值,最后是包含子值偏移量的索引表。要查找索引表,请查找子值的数量,然后是结尾,然后根据索引表的基数,考虑其条目的宽度。
对于所有类型,偏移表描述了子值所在的位置。子值不必在子值数字段之后立即开始。出于性能原因,在构建值时,可能需要为字节长度和子值的数量保留
8 个字节而不是填充间隙,即使后来证明偏移量和字节长度仅使用 2 个字节,比如.
有一种特殊情况:空对象被简单地存储为单个字节0x0a。
还有一个例外:对于 8 字节数字 ( 0x12),子值的数量存储在 INDEXTABLE 后面。这是为了在一开始保留 8 个字节并且后来注意到字节长度需要所有
8 个字节时不移动内存。
所有偏移量均从基准 A 开始测量。
每个条目由两部分组成,键和值,它们被编码为如上所述的普通 VPack 值,第一个总是以字节开头的短或长 UTF-8
字符串0x40-0xbf如下所述。第二个是任何其他 VPack 值。
有一个扩展:对于键,可以使用正小整数值0x30-0x39,或以类型字节0x28-0x2f开头的无符号整数。任何此类整数值都是外部给定属性名称表的索引。当只有很少的属性名称出现或某些属性名称经常重复时,这些很方便。编码此类属性名称表的标准方法是使用此处指定的
VPack 字符串数组。
对象总是与排序的键/值对一起存储,通过每个嵌套级别上的键的字节比较来排序。排序有一些开销,但将允许稍后在对数时间内查找键。注意,只需要对索引表进行排序,并不要求这些表中的偏移量都是递增的。由于索引表位于实际子值之后,因此可以通过线性写入来构建复杂的
VPack 值。
示例:对象{"a": 12, "b": true, "c": "xyz"}可以具有 hexdump:
0b
13 03
41 62 1a
41 61 28 0c
41 63 43 78 79 7a
06 03 0a
相同的对象可以用具有更长条目的索引表完成,如本例所示:
0d
22 00 00 00
03 00 00 00
41 62 1a
41 61 28 0c
41 63 43 78 79 7a
0c 00 00 00 09 00 00 00 10 00 00 00
与类型0x0c和 2 字节偏移量、字节长度和子值数量或类型0x0e和 8 字节数字类似。
请注意,不建议使用太长的索引表对短对象进行编码。
特殊紧凑对象
我们现在描述特殊类型0x14,它对于特别紧凑的对象表示很有用。请注意,这在某种程度上违反了 VelocyPack
格式的原则,因为不再可能快速访问子值,因此必须扫描对象中的所有键/值对以找到特定的键/值对。但是,VelocyPack 的某些用例只需要顺序访问(例如
JSON 转储)并且对紧凑性有特殊需求。
这个对象类型的整体格式是
0x14 as type byte
BYTELENGTH
sub VPack key/value pairs
NRPAIRS
根本没有索引表,尽管子 VelocyPack 值可以有不同的字节大小。BYTELENGTH 和 NRPAIRS
以我们现在描述的特殊格式编码。它与特殊的紧凑数组类型相同0x13,为了完整起见,我们在此重复。
BYTELENGTH 由 1 到 8 个字节组成,除最后一个字节外,所有字节都设置了高位。因此,高位决定实际使用了多少字节。所有这些位的低 7
位以小端方式共同构成实际字节长度。也就是说,地址 A+1 的字节包含字节长度的最低有效 7 位(0 到 6),地址 A+2 的后续字节包含位 7
到 13,依此类推。由于字节总数限制为 8,因此它对最多 56 位的无符号整数进行编码,这是这种紧凑数组表示的大小的总体限制。
NRPAIRS 条目的编码基本相同,只是它在内存中的排列顺序相反。也就是说,必须使用 BYTELENGTH
找到数组值的末尾并返回字节,直到找到高位重置的字节。最后一个字节(在最高内存地址处)包含 NRPAIRS 值的最低有效 7 位,第二个字节包含
7 到 13 位,依此类推。
这是一个例子,对象{"a":1, "b":16}可以编码如下:
14 0a
41 61 31 42 62 28 10
02
双精度浮点数(Doubles)
类型0x1b指示使用类型字节后的 8 个字节的双精度 IEEE-754 值。为了保证平台独立性,字节顺序的细节如下。编码是通过使用 memcpy
将内部双精度值复制到 uint64_t 来完成的。这个 64 位无符号整数然后作为小端 8 字节整数存储在 VPack 值中。解码工作在相反的方向。这应该在实践中解决
IEEE-754 中未确定的字节顺序。
日期
类型0x1c表示一个带符号的 64 整数,存储在紧跟在类型之后的 8 字节小端二进制补码符号中。该值表示自纪元以来以毫秒为单位测量的通用
UTC 时间,即 1970 年 1 月 1 日世界标准时间 00:00。
外部 VPack 值
此类型仅用于内存中,不适用于磁盘或网络上的数据交换。因此,我们只需要指定后面的k字节是一个char*在当前架构上的memcpy即可。该
char* 指向内存中其他地方的实际 VPack 值。
人工最小和最大键
类型0x1e和0x1f的这些值除了分别比较小于或大于任何其他 VPack 值之外没有任何意义。这个想法是,这些可以用于定义了所有 VPack
值的总顺序的系统,以指定无限间隔的左端或右端。
整数类型
有不同的方法来指定整数。对于 -6 到
9(含)的小值,有范围在0x30到0x3f内的特定类型字节,允许将其存储在单个字节中。之后是有符号和无符号整数类型,它们可以在类型字节中编码使用的字节数(范围0x20-0x27为有符号,0x28-0x2f为无符号)。
空值和布尔值
这三个值使用单个字节来存储相应的 JSON 值。
字符串
字符串存储为 UTF-8 编码的字节序列。有两种变体,一种是短的,一种是长的。在短的一个中,字节长度(不是 UTF-8
字符的数量)直接在类型中编码,这适用于不超过 126 的字节长度(含 126)。为此使用类型0x40到0xbe,字节长度是 V - 0x40,其中 V 是类型字节。对于长度超过
126 字节的字符串,类型字节是0xbf,字符串的字节长度存储在类型字节之后的前 8 个字节中,使用小端无符号整数表示。实际字符串跟在这
8 个字节之后。在这两种情况下都没有终止零字节,并且字符串可能包含零字节。
二进制数据
类型字节0xc0到0xc7允许将任意二进制字节序列存储为 VPack 值。格式如下:如果 V 是类型字节,那么其后跟随 V - 0xbf 个字节,构成一个小端
无符号整数,表示二进制数据的长度,二进制数据紧跟在这些长度字节之后。不保证对齐。内容完全取决于用户。
压缩 BCD 长浮点数
这些类型用于表示任意精度的十进制数。正数和负数有不同的类型。这些值的整体格式是:
one of 0xc8 - 0xcf (positive) or of 0xd0 - 0xd7 (negative)
LENGTH OF MANTISSA in bytes
EXPONENT (as 4-byte little endian signed two's complement integer)
MANTISSA (as packed BCD-encoded integer, big-endian)
类型字节描述了数字的符号以及用于指定尾数字节长度的字节数。通常,如果 V 是类型字节,则 V - 0xc7(在正例中)或 V -
0xcf(在负例中)字节用于尾数的长度,直接在字节长度之后存储为小端无符号整数。在此之后紧跟 4
个字节(小端符号二进制补码整数)来指定指数。指数之后是实际的尾数字节。
使用压缩 BCD,以便每个字节存储恰好 2 个十进制数字,如0x34十进制数字
34。因此,尾数始终具有偶数个十进制数字。请注意,尾数以大端形式存储,以提高解析和转储的效率。这导致了“邪恶的半字节问题”:当 JSON
解析器看到一个较长数字的开头时,它不知道后面是偶数还是奇数。但是,出于效率原因,它希望在读取输入时开始将字节写入输出。这就是指数派上用场的地方,下面的例子说明了这一点。12345
十进制可以编码为:
c8 03 00 00 00 00 01 23 45
c8 03 ff ff ff ff 12 34 50
前一种编码在第一个字节中放置一个前导 0 并使用指数 0,后者编码直接开始将两个十进制数字放入一个字节然后最后必须使用指数 -1
来“擦除”尾随 0,由4字节序列ff ff ff ff。
因此,邪恶的半字节问题得到解决,解析(实际上是转储)可以变得高效。
标记
类型0xee-0xef用于标记值以实现逻辑类型。
例如,如果类型0x1c不存在,数据库驱动程序可以将时间戳对象(JavaScript 中的 Date、Java 中的 Instant 等)序列化为 Unix 时间戳,一个
64 位整数。假设缺少模式,在反序列化时就不可能从时间戳中分辨出整数并相应地反序列化该值。
类型标记通过将整数标记附加到值来解决这个问题,然后在反序列化值时可以读取该标记,例如,tag=1 是时间戳,应该使用相关的时间戳类。
标记值是单独指定的,应用程序也可以指定它们自己的值,以使数据库驱动程序将它们的特定数据类型反序列化为适当的类(包括模型)。
本质上这是文档部分的对象关系映射。
类型的格式是:
0xee
TAG number in 1 byte
sub VPack value
或者
0xef
TAG number in 8 bytes, little-endian encoding
sub VPack value
自定义类型
请注意,自定义类型通常不应用于数据交换,而只能在系统内部使用。尽管如此,规范这一部分的设计使得可以通过通用方法导出每个自定义数据类型的字节长度。
存在以下用户定义类型:
0xf0 : 1 字节有效载荷,紧跟在类型字节之后
0xf1 : 2字节有效载荷,紧跟在类型字节之后
0xf2 :4字节有效载荷,紧跟在类型字节之后
0xf3 : 8 字节有效载荷,紧跟在类型字节之后
0xf4- 0xf6: 有效载荷的长度由紧跟在类型字节之后的单个无符号字节描述,那么多字节的有效载荷紧随其后
0xf7- 0xf9:有效载荷的长度由紧跟在类型字节之后的两个字节(小端无符号整数)描述,那么多字节的有效载荷如下
0xfa- 0xfc:有效载荷的长度由紧跟在类型字节之后的四个字节(小端无符号整数)描述,那么多字节的有效载荷如下
0xfd- 0xff:有效载荷的长度由紧跟在类型字节之后的八个字节(小端无符号整数)描述,那么多字节的有效载荷如下
注:在类型0xf4到0xff中,“payload”(有效载荷)是指不包括长度说明的实际数据。
可移植性
序列化的布尔值、整数、字符串、数组、对象等都有定义的字节顺序和长度,这是平台无关的。这些类型在序列化的 VelocyPack 中是完全可移植的。
在可移植性方面仍有一些注意事项:
可以在 64 位系统上构建非常大的值,但可能无法在 32 位系统上读回它们。这是因为与 64 位系统相比,32 位系统上的最大内存分配大小可能受到严格限制,即
32 位操作系统可能根本不允许分配大于 4 GB 的缓冲区。这不是 VelocyPack 的限制,而是 32 位架构的限制。如果所有 VelocyPack
值都保持足够小,以便它们远低于 32 位长度边界,那么这应该无关紧要。
VelocyPack 类型External只包含一个指向内存的原始指针,它应该只在内存中建立 VelocyPack 值期间使用。External类型不应该用于
VelocyPack 值,这些值被序列化并持久存储,然后从持久性中读回。无论如何都这样做是不可移植的,并且还会带来安全风险。不对任何序列化的数据使用外部类型将完全避免此问题。
VelocyPack 类型Custom完全是用户自定义的,它们没有默认实现。因此,如果考虑到这些自定义类型绑定的可移植性,则由嵌入器来实现可移植性。
VelocyPack Double值以特定方式序列化为整数等价物,然后反序列化回覆盖内存中 IEEE-754
双精度浮点值的整数。我们发现这足以满足我们的需求,尽管至少在理论上某些系统可能存在可移植性问题。
以下内容用作我们“在现实世界中合理便携”假设的支持:
因此,广泛使用的 IEEE 754 浮点标准没有指定字节序可能显得很奇怪。 [17] 从理论上讲,这意味着即使是一台机器写入的标准 IEEE
浮点数据也可能无法被另一台机器读取。然而,在现代标准计算机(即,实现 IEEE
754)上,实际上可以安全地假设浮点数的字节序与整数的字节顺序相同,从而使转换直接进行,而不管数据类型如何。

+ 12
- 13
include/eVPack.hrl View File

@ -1,22 +1,21 @@
-ifndef(eVPack_H_).
-define(eVPack_H_, true).
-type vpack() :: binary() | iodata().
-type vpOpt() :: pos_integer().
-export_type([vpack/0, vpOpt/0]).
-define(VpObjNcYs, 0). %% Key
-define(VpObjYc, 1). %%
-define(VpObjNcNs, 2). %% Key object
-define(VpArrNc, 0). %%
-define(VpArrYc, 1). %%
-define(VpObjNcYs, 0). %% Key
-define(VpObjYc, 1). %%
%% -define(VpObjNcNs, 2). %% Key object
-define(VpArrDef, ?VpArrNc). %% Arr不压缩排序
-define(VpObjDef, ?VpObjNcYs). %% Obj key Obj不压缩
-define(VpArrNc, 0). %%
-define(VpArrYc, 1). %%
%% Option packing helpers. The combined option value keeps the Arr option in
%% bit 0 and the Obj option in the bits above it.
%% Arguments and bodies are fully parenthesised so that the macros expand
%% correctly inside larger expressions: without the parentheses,
%% `?VpObjOpts(X) * 2` would expand to `X bsr 1 * 2`, i.e. `X bsr 2`.
-define(VpAllOpts(Arr, Obj), (((Obj) bsl 1) bor (Arr))). %% combine Obj and Arr options
-define(VpObjOpts(VpAllOpts), ((VpAllOpts) bsr 1)). %% extract the Obj option
-define(VpArrOpts(VpAllOpts), ((VpAllOpts) band 1)). %% extract the Arr option
-define(VpArrDef, ?VpArrNc). %% Arr不压缩排序
-define(VpObjDef, ?VpObjNcYs). %% Obj key Obj不压缩
-define(blob, blob). %% tag
-define(VpBinaryCopyRatio, 1.2).
-define(VpBinaryCopyRatio, 1.2).
-endif.

+ 2
- 2
src/deTest.erl View File

@ -143,9 +143,9 @@ do() ->
Ex8 = eVPack:decode(<<20, 16, 65, 97, 40, 12, 65, 98, 26, 65, 99, 67, 120, 121, 122, 3>>),
Ex9 = [#{<<"a">> => 12, <<"b">> => true, <<"c">> => <<"xyz">>}, #{<<"a">> => 12, <<"b">> => true, <<"c">> => <<"xyz">>}],
Ex9 = eVPack:decode(<<19, 35, 20, 16, 65, 97, 40, 12, 65, 98, 26, 65, 99, 67, 120, 121, 122, 3, 20, 16, 65, 97, 40, 12, 65, 98, 26, 65, 99, 67, 120, 121, 122, 3, 2>>),
Ex10 = [#{<<"key">> => 42}, <<"fooooobar">>, <<"x">>, {?blob, <<1, 2, 3, 4, 5, 6, 7, 8>>}],
Ex10 = [#{<<"key">> => 42}, <<"fooooobar">>, <<"x">>, <<1, 2, 3, 4, 5, 6, 7, 8>>],
Ex10Bin = <<2, 42, 11, 10, 1, 67, 107, 101, 121, 40, 42, 3, 73, 102, 111, 111, 111, 111, 111,
98, 97, 114, 191, 1, 0, 0, 0, 0, 0, 0, 0, 120, 192, 8, 1, 2, 3, 4, 5, 6, 7, 8>>,
98, 97, 114, 191, 1, 0, 0, 0, 0, 0, 0, 0, 120, 192, 8, 1, 2, 3, 4, 5, 6, 7, 8>>,
Ex10 = eVPack:decode(Ex10Bin),
_Ex11 = #{<<"0">> =>
#{<<"0">> => <<"test">>, <<"1">> => <<"test">>,

+ 90
- 92
src/eVPack.erl View File

@ -53,22 +53,13 @@ encodeIol(Term, ArrOpt, ObjOpt) ->
%% Encodes Term and returns the result as one flat binary.
%% ArrOpt and ObjOpt are passed through unchanged to encoder/3.
encodeBin(Term, ArrOpt, ObjOpt) ->
    IoData = encoder(Term, ArrOpt, ObjOpt),
    erlang:iolist_to_binary(IoData).
encoder(Map, ArrOpt, ObjOpt) when erlang:is_map(Map) ->
encodeMap(ObjOpt, Map, ArrOpt);
encoder(Atom, _, _) when erlang:is_atom(Atom) ->
encodeAtom(Atom);
encoder(Binary, _, _) when erlang:is_binary(Binary) ->
encodeString(Binary);
encoder(Integer, _, _) when erlang:is_integer(Integer) ->
encodeInteger(Integer);
encoder(Float, _, _) when erlang:is_float(Float) ->
encodeFloat(Float);
encoder(List, ArrOpt, ObjOpt) when erlang:is_list(List) ->
encodeList(ArrOpt, List, ObjOpt);
encoder({?blob, Blob}, _, _) when erlang:is_binary(Blob) ->
encodeBlob(Blob);
encoder(_Value, _, _) ->
erlang:throw({error, {invalid_type, dataType(_Value), _Value}}).
%% Dispatches a term to the encoder matching its Erlang type.
%% ArrOpt and ObjOpt are only forwarded to the list and map encoders; the
%% scalar encoders ignore them.
%% Throws {error, {invalid_type, Type, Value}} for any term that is not a
%% map, atom, binary, integer, float or list.
encoder(Map, ArrOpt, ObjOpt) when erlang:is_map(Map) -> encodeMap(ObjOpt, Map, ArrOpt);
encoder(Atom, _, _) when erlang:is_atom(Atom) -> encodeAtom(Atom);
encoder(Binary, _, _) when erlang:is_binary(Binary) -> encodeBinary(Binary);
encoder(Integer, _, _) when erlang:is_integer(Integer) -> encodeInteger(Integer);
encoder(Float, _, _) when erlang:is_float(Float) -> encodeFloat(Float);
encoder(List, ArrOpt, ObjOpt) when erlang:is_list(List) -> encodeList(ArrOpt, List, ObjOpt);
%% The rejected term is reported back to the caller, so it is bound to a
%% normal variable name rather than an underscore-prefixed one.
encoder(Value, _, _) -> erlang:throw({error, {invalid_type, dataType(Value), Value}}).
dataType(Data) when is_list(Data) -> list;
dataType(Data) when is_integer(Data) -> integer;
@ -90,7 +81,11 @@ encodeAtom(true) -> setSV(1), <<26/integer>>;
encodeAtom(minKey) -> setSV(1), <<30/integer>>;
encodeAtom(maxKey) -> setSV(1), <<31/integer>>;
%% Fallback clause: any atom not handled by an earlier clause is written as
%% custom type 0xf4 (244), i.e. one unsigned length byte followed by the
%% atom's UTF-8 text.
%% An atom may hold up to 255 characters, but its UTF-8 form can be longer
%% than 255 bytes. Such a length does not fit the single length byte and would
%% be silently truncated, so it is rejected with {error, too_max_atom}.
encodeAtom(Atom) ->
    AtomBin = erlang:atom_to_binary(Atom, utf8),
    StrSize = erlang:byte_size(AtomBin),
    if
        StrSize =< 255 ->
            %% total size = type byte + length byte + payload
            setSV(StrSize + 2),
            <<244/integer, StrSize:8/integer-little-unsigned, AtomBin/binary>>;
        true ->
            erlang:throw({error, too_max_atom})
    end.
encodeInteger(0) ->
setSV(1), <<48/integer>>;
@ -190,42 +185,42 @@ encodeString(BinStr) ->
StrSize =< 126 ->
setSV(StrSize + 1),
<<(StrSize + 64)/integer, BinStr/binary>>;
StrSize < 18446744073709551616 ->
StrSize =< 18446744073709551616 ->
setSV(StrSize + 9),
<<191/integer, StrSize:64/integer-little-unsigned, BinStr/binary>>;
true ->
erlang:throw({error, too_max_str})
end.
%% Encodes a binary as a VPack binary blob: type byte 192..199 (0xc0..0xc7),
%% then the payload length as a 1..8 byte little-endian unsigned integer,
%% then the payload itself.
%% The smallest length field that can hold the size is chosen. Every bound is
%% strict (`<`): an N-byte field can represent at most 2^(8N) - 1, so a size
%% of exactly 2^(8N) must move to the next wider field or it would wrap to 0.
%% setSV/1 is called with the total encoded size (type byte + length bytes +
%% payload).
%% Throws {error, too_max_binary} if the size does not fit in 8 bytes.
encodeBinary(Blob) ->
    StrSize = erlang:byte_size(Blob),
    if
        StrSize < 256 ->
            setSV(StrSize + 2),
            <<192/integer, StrSize:8/integer-little-unsigned, Blob/binary>>;
        StrSize < 65536 ->
            setSV(StrSize + 3),
            <<193/integer, StrSize:16/integer-little-unsigned, Blob/binary>>;
        StrSize < 16777216 ->
            setSV(StrSize + 4),
            <<194/integer, StrSize:24/integer-little-unsigned, Blob/binary>>;
        StrSize < 4294967296 ->
            setSV(StrSize + 5),
            <<195/integer, StrSize:32/integer-little-unsigned, Blob/binary>>;
        StrSize < 1099511627776 ->
            setSV(StrSize + 6),
            <<196/integer, StrSize:40/integer-little-unsigned, Blob/binary>>;
        StrSize < 281474976710656 ->
            setSV(StrSize + 7),
            <<197/integer, StrSize:48/integer-little-unsigned, Blob/binary>>;
        StrSize < 72057594037927936 ->
            setSV(StrSize + 8),
            <<198/integer, StrSize:56/integer-little-unsigned, Blob/binary>>;
        StrSize < 18446744073709551616 ->
            setSV(StrSize + 9),
            <<199/integer, StrSize:64/integer-little-unsigned, Blob/binary>>;
        true ->
            erlang:throw({error, too_max_binary})
    end.
doEncodeMap(Iterator, ArrOpt, ObjOpt, AccList, SumSize) ->
@ -240,17 +235,17 @@ doEncodeMap(Iterator, ArrOpt, ObjOpt, AccList, SumSize) ->
{AccList, SumSize}
end.
doEncodeMap(Iterator, ArrOpt, ObjOpt, AccList, Offsets, SumSize) ->
case maps:next(Iterator) of
{Key, Value, NextIter} ->
KeyEn = encodeString(asKey(Key)),
KeySize = getSV(),
ValueEn = encoder(Value, ArrOpt, ObjOpt),
ValueSize = getSV(),
doEncodeMap(NextIter, ArrOpt, ObjOpt, [ValueEn, KeyEn | AccList], [SumSize | Offsets], SumSize + KeySize + ValueSize);
none ->
{AccList, Offsets, SumSize}
end.
%% doEncodeMap(Iterator, ArrOpt, ObjOpt, AccList, Offsets, SumSize) ->
%% case maps:next(Iterator) of
%% {Key, Value, NextIter} ->
%% KeyEn = encodeString(asKey(Key)),
%% KeySize = getSV(),
%% ValueEn = encoder(Value, ArrOpt, ObjOpt),
%% ValueSize = getSV(),
%% doEncodeMap(NextIter, ArrOpt, ObjOpt, [ValueEn, KeyEn | AccList], [SumSize | Offsets], SumSize + KeySize + ValueSize);
%% none ->
%% {AccList, Offsets, SumSize}
%% end.
doEncodeSortMap(Iterator, ArrOpt, ObjOpt, AccList, Offsets, SumSize) ->
case maps:next(Iterator) of
@ -286,18 +281,18 @@ encodeMap(?VpObjYc, Map, ArrOpt) ->
{AccList, SumSize} = doEncodeMap(maps:iterator(Map), ArrOpt, ?VpObjYc, [], 0),
IoData = lists:reverse(AccList),
encodeCompactData(<<20/integer>>, IoData, SumSize, MapSize)
end;
encodeMap(?VpObjNcNs, Map, ArrOpt) ->
MapSize = erlang:map_size(Map),
case MapSize == 0 of
true ->
setSV(1),
<<10/integer>>;
_ ->
{AccList, Offsets, SumSize} = doEncodeMap(maps:iterator(Map), ArrOpt, ?VpObjNcNs, [], [], 0),
IoData = lists:reverse(AccList),
encodeUnSortMapIndexTable(IoData, MapSize, Offsets, SumSize)
end.
%% encodeMap(?VpObjNcNs, Map, ArrOpt) ->
%% MapSize = erlang:map_size(Map),
%% case MapSize == 0 of
%% true ->
%% setSV(1),
%% <<10/integer>>;
%% _ ->
%% {AccList, Offsets, SumSize} = doEncodeMap(maps:iterator(Map), ArrOpt, ?VpObjNcNs, [], [], 0),
%% IoData = lists:reverse(AccList),
%% encodeUnSortMapIndexTable(IoData, MapSize, Offsets, SumSize)
%% end.
encodeSortMapIndexTable(IoData, Count, Offsets, SumSize) ->
TemSize = SumSize + Count,
@ -326,32 +321,32 @@ encodeSortMapIndexTable(IoData, Count, Offsets, SumSize) ->
erlang:throw({error, too_much_sort_map_size})
end.
encodeUnSortMapIndexTable(IoData, Count, Offsets, SumSize) ->
TemSize = SumSize + Count,
if
TemSize < 253 ->
AllSize = TemSize + 3,
Header = <<15/integer, AllSize:8/integer-unsigned, Count:8/integer-unsigned>>,
setSV(AllSize),
[Header, IoData, buildIndexTable_1(Offsets, 3)];
TemSize + Count < 65531 ->
AllSize = TemSize + Count + 5,
Header = <<16/integer, AllSize:16/integer-little-unsigned, Count:16/integer-little-unsigned>>,
setSV(AllSize),
[Header, IoData, buildIndexTable_2(Offsets, 5)];
TemSize + Count * 3 < 4294967287 ->
AllSize = TemSize + Count * 3 + 9,
Header = <<17/integer, AllSize:32/integer-little-unsigned, Count:32/integer-little-unsigned>>,
setSV(AllSize),
[Header, IoData, buildIndexTable_4(Offsets, 9)];
TemSize + Count * 7 < 18446744073709551599 ->
AllSize = TemSize + Count * 7 + 17,
Header = <<18/integer, AllSize:64/integer-little-unsigned>>,
setSV(AllSize),
[Header, IoData, buildIndexTable_8(Offsets, 9), <<Count:64/integer-little-unsigned>>];
true ->
erlang:throw({error, too_much_unsort_map_size})
end.
%% encodeUnSortMapIndexTable(IoData, Count, Offsets, SumSize) ->
%% TemSize = SumSize + Count,
%% if
%% TemSize < 253 ->
%% AllSize = TemSize + 3,
%% Header = <<15/integer, AllSize:8/integer-unsigned, Count:8/integer-unsigned>>,
%% setSV(AllSize),
%% [Header, IoData, buildIndexTable_1(Offsets, 3)];
%% TemSize + Count < 65531 ->
%% AllSize = TemSize + Count + 5,
%% Header = <<16/integer, AllSize:16/integer-little-unsigned, Count:16/integer-little-unsigned>>,
%% setSV(AllSize),
%% [Header, IoData, buildIndexTable_2(Offsets, 5)];
%% TemSize + Count * 3 < 4294967287 ->
%% AllSize = TemSize + Count * 3 + 9,
%% Header = <<17/integer, AllSize:32/integer-little-unsigned, Count:32/integer-little-unsigned>>,
%% setSV(AllSize),
%% [Header, IoData, buildIndexTable_4(Offsets, 9)];
%% TemSize + Count * 7 < 18446744073709551599 ->
%% AllSize = TemSize + Count * 7 + 17,
%% Header = <<18/integer, AllSize:64/integer-little-unsigned>>,
%% setSV(AllSize),
%% [Header, IoData, buildIndexTable_8(Offsets, 9), <<Count:64/integer-little-unsigned>>];
%% true ->
%% erlang:throw({error, too_much_unsort_map_size})
%% end.
%% Builds an index table with one byte per entry.
%% Offsets is walked in reverse order, and StartSize is added to every offset
%% before it is written as a single unsigned byte.
buildIndexTable_1(Offsets, StartSize) ->
    Ordered = lists:reverse(Offsets),
    << <<(Off + StartSize):8/integer-little-unsigned>> || Off <- Ordered >>.
@ -1434,73 +1429,76 @@ decoder(192, RestBin) ->
RefSize = binary:referenced_byte_size(RestBin),
case RefSize / Length > ?VpBinaryCopyRatio of
true ->
{{?blob, binary:copy(BinStr)}, LeftBin};
{binary:copy(BinStr), LeftBin};
_ ->
{{?blob, BinStr}, LeftBin}
{BinStr, LeftBin}
end;
%% Binary blob with a 2-byte little-endian length (type 0xc1 = 193).
%% Returns {Blob, LeftBin}, where Blob is the bare payload binary.
decoder(193, RestBin) ->
    <<Length:2/integer-little-unsigned-unit:8, BinStr:Length/binary, LeftBin/bitstring>> = RestBin,
    %% Copy the payload when the binary it references is more than
    %% ?VpBinaryCopyRatio times larger than the payload itself; otherwise
    %% return the sub-binary as is.
    %% A zero-length blob skips the ratio test: dividing by Length = 0 would
    %% raise badarith.
    case Length > 0 andalso binary:referenced_byte_size(RestBin) / Length > ?VpBinaryCopyRatio of
        true ->
            {binary:copy(BinStr), LeftBin};
        false ->
            {BinStr, LeftBin}
    end;
%% Binary blob with a 3-byte little-endian length (type 0xc2 = 194).
%% Returns {Blob, LeftBin}, where Blob is the bare payload binary.
decoder(194, RestBin) ->
    <<Length:3/integer-little-unsigned-unit:8, BinStr:Length/binary, LeftBin/bitstring>> = RestBin,
    %% Copy the payload when the binary it references is more than
    %% ?VpBinaryCopyRatio times larger than the payload itself; otherwise
    %% return the sub-binary as is.
    %% A zero-length blob skips the ratio test: dividing by Length = 0 would
    %% raise badarith.
    case Length > 0 andalso binary:referenced_byte_size(RestBin) / Length > ?VpBinaryCopyRatio of
        true ->
            {binary:copy(BinStr), LeftBin};
        false ->
            {BinStr, LeftBin}
    end;
%% Binary blob with a 4-byte little-endian length (type 0xc3 = 195).
%% Returns {Blob, LeftBin}, where Blob is the bare payload binary.
decoder(195, RestBin) ->
    <<Length:4/integer-little-unsigned-unit:8, BinStr:Length/binary, LeftBin/bitstring>> = RestBin,
    %% Copy the payload when the binary it references is more than
    %% ?VpBinaryCopyRatio times larger than the payload itself; otherwise
    %% return the sub-binary as is.
    %% A zero-length blob skips the ratio test: dividing by Length = 0 would
    %% raise badarith.
    case Length > 0 andalso binary:referenced_byte_size(RestBin) / Length > ?VpBinaryCopyRatio of
        true ->
            {binary:copy(BinStr), LeftBin};
        false ->
            {BinStr, LeftBin}
    end;
%% Binary blob with a 5-byte little-endian length (type 0xc4 = 196).
%% Returns {Blob, LeftBin}, where Blob is the bare payload binary.
decoder(196, RestBin) ->
    <<Length:5/integer-little-unsigned-unit:8, BinStr:Length/binary, LeftBin/bitstring>> = RestBin,
    %% Copy the payload when the binary it references is more than
    %% ?VpBinaryCopyRatio times larger than the payload itself; otherwise
    %% return the sub-binary as is.
    %% A zero-length blob skips the ratio test: dividing by Length = 0 would
    %% raise badarith.
    case Length > 0 andalso binary:referenced_byte_size(RestBin) / Length > ?VpBinaryCopyRatio of
        true ->
            {binary:copy(BinStr), LeftBin};
        false ->
            {BinStr, LeftBin}
    end;
%% Binary blob with a 6-byte little-endian length (type 0xc5 = 197).
%% Returns {Blob, LeftBin}, where Blob is the bare payload binary.
decoder(197, RestBin) ->
    <<Length:6/integer-little-unsigned-unit:8, BinStr:Length/binary, LeftBin/bitstring>> = RestBin,
    %% Copy the payload when the binary it references is more than
    %% ?VpBinaryCopyRatio times larger than the payload itself; otherwise
    %% return the sub-binary as is.
    %% A zero-length blob skips the ratio test: dividing by Length = 0 would
    %% raise badarith.
    case Length > 0 andalso binary:referenced_byte_size(RestBin) / Length > ?VpBinaryCopyRatio of
        true ->
            {binary:copy(BinStr), LeftBin};
        false ->
            {BinStr, LeftBin}
    end;
%% Binary blob with a 7-byte little-endian length (type 0xc6 = 198).
%% Returns {Blob, LeftBin}, where Blob is the bare payload binary.
decoder(198, RestBin) ->
    <<Length:7/integer-little-unsigned-unit:8, BinStr:Length/binary, LeftBin/bitstring>> = RestBin,
    %% Copy the payload when the binary it references is more than
    %% ?VpBinaryCopyRatio times larger than the payload itself; otherwise
    %% return the sub-binary as is.
    %% A zero-length blob skips the ratio test: dividing by Length = 0 would
    %% raise badarith.
    case Length > 0 andalso binary:referenced_byte_size(RestBin) / Length > ?VpBinaryCopyRatio of
        true ->
            {binary:copy(BinStr), LeftBin};
        false ->
            {BinStr, LeftBin}
    end;
%% Binary blob with an 8-byte little-endian length (type 0xc7 = 199).
%% Returns {Blob, LeftBin}, where Blob is the bare payload binary.
decoder(199, RestBin) ->
    <<Length:8/integer-little-unsigned-unit:8, BinStr:Length/binary, LeftBin/bitstring>> = RestBin,
    %% Copy the payload when the binary it references is more than
    %% ?VpBinaryCopyRatio times larger than the payload itself; otherwise
    %% return the sub-binary as is.
    %% A zero-length blob skips the ratio test: dividing by Length = 0 would
    %% raise badarith.
    case Length > 0 andalso binary:referenced_byte_size(RestBin) / Length > ?VpBinaryCopyRatio of
        true ->
            {binary:copy(BinStr), LeftBin};
        false ->
            {BinStr, LeftBin}
    end;
%% Custom type 0xf4 (244): one unsigned length byte followed by that many
%% bytes of atom text. Returns {Atom, LeftBin}.
%% The length is read as exactly 1 byte, matching the `StrSize:8` (8-bit)
%% length field that encodeAtom/1 writes. Reading 8 bytes here would consume
%% the atom text as part of the length.
%% NOTE(review): binary_to_atom/1 creates new atoms. Atoms are never garbage
%% collected, so decoding untrusted input can exhaust the atom table; consider
%% binary_to_existing_atom/2 if the input is not trusted.
decoder(244, RestBin) ->
    <<Length:1/integer-little-unsigned-unit:8, BinStr:Length/binary, LeftBin/bitstring>> = RestBin,
    {binary_to_atom(BinStr), LeftBin};
decoder(_, _) ->
erlang:throw({error, unexpected_end}).

Loading…
Cancel
Save