mod lazy_stream;
use self::lazy_stream::*;
use super::dictionary::{Fetch, LinearTable, TableRef};
use super::rw::*;
use bytes::lengthwriter::LengthWriter;
use bytes::varnum::WriteVarNum;
use io::statistics::{Bytes, Instances, PerUserExtensibleKind, Rational};
use io::{Path, TokenWriter};
use TokenWriterError;
use binjs_shared::{
FieldName, IdentifierName, InterfaceName, Node, PropertyKey, SharedString, F64,
};
use std::fs;
use std::io::Write;
use itertools::Itertools;
use range_encoding::opus;
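/// Initial capacity of the buffers used to store the compressed data.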
const INITIAL_BUFFER_SIZE_BYTES: usize = 32768;
impl PerUserExtensibleKind<opus::Writer<LengthWriter>> {
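/// Finalize each per-kind length writer, returning the number of bytes
/// that the corresponding stream would occupy.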
pub fn into_statistics(self) -> PerUserExtensibleKind<Bytes> {
self.into_with(|_, value| value.done().unwrap().len().into())
}
}
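/// An entropy encoder based on the `opus` range coder.
///
/// Grammar-predictable symbols (booleans, string enums, interface names) are
/// entropy-coded to a single main stream, using the probability tables of the
/// current dictionary. User-extensible values (strings, floats, ...) are
/// written as indices into per-kind content streams; values absent from the
/// dictionary additionally get their definition written to prelude streams.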
pub struct Encoder {
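/// Shared encoding options, including the dictionaries and the
/// statistics accumulators.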
options: ::entropy::Options,
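/// The main stream, entropy-coded in memory.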
writer: opus::Writer<Vec<u8>>,
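/// If `Some(path)`, the raw main stream is also dumped to `main.entropy`
/// under the `*.streams` directory derived from the destination path.
/// Set only when `split_streams` is on.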
dump_path: Option<std::path::PathBuf>,
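/// For each user-extensible kind, the sequence of table references
/// (indices) emitted so far.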
content_streams: PerUserExtensibleKind<Vec<TableRef>>,
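/// Streams holding the definitions of values not found in the dictionary.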
prelude_streams: PreludeStreams<LazyStream>,
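// Value-to-index tables for each user-extensible kind, cloned from the
// current dictionary.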
unsigned_longs: LinearTable<u32>,
string_literals: LinearTable<Option<SharedString>>,
identifier_names: LinearTable<Option<IdentifierName>>,
property_keys: LinearTable<Option<PropertyKey>>,
list_lengths: LinearTable<Option<u32>>,
floats: LinearTable<Option<F64>>,
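/// Length-only writers used to measure how many bytes each kind
/// contributes, for statistics purposes.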
content_opus_lengths: PerUserExtensibleKind<opus::Writer<LengthWriter>>,
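/// Number of instances written for each kind, for statistics purposes.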
content_instances: PerUserExtensibleKind<Instances>,
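/// Destination path, used to derive per-stream dump paths when
/// `split_streams` is on.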
path: Option<std::path::PathBuf>,
}
impl Encoder {
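/// Create a new encoder, cloning the value tables of the current
/// dictionary in `options`.
///
/// If `path` is provided and `options.split_streams` is set, each stream
/// is additionally dumped to its own file under the `*.streams` directory.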
pub fn new(path: Option<&std::path::Path>, options: ::entropy::Options) -> Self {
let split_streams = options.split_streams;
let unsigned_longs = options.dictionaries.current().unsigned_longs().clone();
let string_literals = options.dictionaries.current().string_literals().clone();
let identifier_names = options.dictionaries.current().identifier_names().clone();
let property_keys = options.dictionaries.current().property_keys().clone();
let list_lengths = options.dictionaries.current().list_lengths().clone();
let floats = options.dictionaries.current().floats().clone();
Encoder {
writer: opus::Writer::new(Vec::with_capacity(INITIAL_BUFFER_SIZE_BYTES)),
dump_path: if split_streams {
path.map(|path| {
let mut buf = std::path::PathBuf::new();
buf.push(path);
buf.set_extension("streams");
buf.push("main.entropy");
buf
})
} else {
None
},
options,
content_opus_lengths: PerUserExtensibleKind::with(|_| {
opus::Writer::new(LengthWriter::new())
}),
content_streams: PerUserExtensibleKind::with(|_| Vec::new()),
prelude_streams: PreludeStreams::with(|name| {
let maybe_buf = match path {
Some(path) if split_streams => {
let mut buf = std::path::PathBuf::new();
buf.push(path);
buf.set_extension("streams");
buf.push(name);
buf.set_extension("prelude");
Some(buf)
}
_ => None,
};
LazyStream::new(maybe_buf)
}),
content_instances: PerUserExtensibleKind::with(|_| 0.into()),
path: path.map(std::path::Path::to_path_buf),
unsigned_longs,
string_literals,
identifier_names,
property_keys,
list_lengths,
floats,
}
}
}
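/// Emit a symbol to the main entropy stream.
///
/// Look up `$value` at `$path` in table `$table` of the current dictionary,
/// entropy-code its index with the associated probability distribution, and
/// record the probability used into `$table_of_statistics`.
///
/// Fail with `TokenWriterError::NotInDictionary` if the dictionary holds no
/// entry for this value at this path.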
macro_rules! emit_symbol_to_main_stream {
( $me: ident, $table: ident, $table_of_statistics: ident, $description: expr, $path: expr, $value: expr ) => {
{
use std::borrow::Borrow;
let path = $path.borrow();
let table = $me.options
.dictionaries
.current()
.$table();
let symbol =
table
.stats_by_node_value(path, &$value)
.ok_or_else(|| {
debug!(target: "entropy", "Couldn't find value {:?} at {:?} ({})",
$value, path, $description);
TokenWriterError::NotInDictionary(format!("{}: {:?} at {:?}", $description, $value, path))
})?;
let distribution = symbol.distribution
.as_ref()
.borrow();
$me.writer.symbol(symbol.index.into(), &distribution)
.map_err(TokenWriterError::WriteError)?;
let mut probability_stats = $me.options
.probability_stats
.borrow_mut();
let probability = Rational {
num: distribution.at_index(symbol.index.into()).unwrap().width() as usize,
den: distribution.width() as usize
};
probability_stats.$table_of_statistics.add_probability(probability);
Ok(())
}
}
}
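/// Emit a fixed-size value as an index into content stream `$out`.
///
/// On a dictionary miss, also write the value itself to prelude stream
/// `$out`, using method `$writer`.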
macro_rules! emit_simple_symbol_to_streams {
( $me: ident, $table: ident, $out: ident, $writer: ident, $value: expr, $description: expr ) => {
if let Fetch::Miss(_) = emit_symbol_to_content_stream!($me, $table, $out, $value, $description) {
$me.prelude_streams.$out.$writer(*$value)
.map_err(TokenWriterError::WriteError)?;
}
}
}
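/// Emit a string-like value as an index into content stream `$out`.
///
/// On a dictionary miss, also write the definition to the prelude: the byte
/// length (or `None`) to stream `$len`, the bytes themselves to stream `$out`.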
macro_rules! emit_string_symbol_to_streams {
( $me: ident, $table: ident, $out: ident, $len: ident, $value: expr, $description: expr ) => {
if let Fetch::Miss(_) = emit_symbol_to_content_stream!($me, $table, $out, $value, $description) {
match $value {
Some(string) => {
let bytes = string.as_str()
.as_bytes();
$me.prelude_streams.$len.write_maybe_varnum(Some(bytes.len() as u32))
.map_err(TokenWriterError::WriteError)?;
$me.prelude_streams.$out.write_all(bytes)
.map_err(TokenWriterError::WriteError)?;
}
None => {
$me.prelude_streams.$len.write_maybe_varnum(None)
.map_err(TokenWriterError::WriteError)?;
}
}
}
}
}
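/// Fetch the index of `$value` in table `$table`, append it to content
/// stream `$out` and bump the instance counter. Evaluates to the `Fetch`,
/// so that callers can detect dictionary misses.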
macro_rules! emit_symbol_to_content_stream {
( $me: ident, $table: ident, $out: ident, $value: expr, $description: expr ) => {
{
let value = $value;
let fetch = $me
.$table
.fetch_index(value);
debug!(target: "write", "Writing value {:?} with index {:?} to {}", value, fetch, $description);
let index = match fetch {
Fetch::Hit(index) => index,
Fetch::Miss(index) => index
};
$me.content_streams
.$out
.push(index);
$me.content_instances
.$out += Into::<Instances>::into(1);
fetch
}
}
}
impl Encoder {
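/// Flush a stream of table references to `out`: the window length followed
/// by one varnum per reference, compressed through a `LazyStream`.
///
/// Return the number of compressed bytes written; empty streams write
/// nothing and count as 0 bytes.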
fn flush_indices<T>(
maybe_path: &Option<std::path::PathBuf>,
name: &str,
vec: &[TableRef],
window_len: usize,
table: &LinearTable<T>,
out: &mut Vec<u8>,
) -> std::io::Result<Bytes>
where
T: std::hash::Hash + Eq + Clone + std::fmt::Debug,
{
debug!(target: "write", "Encoder::flush_indices {}, {} instances", name, vec.len());
if vec.is_empty() {
return Ok(Into::<Bytes>::into(0));
}
let mut lazy_stream = {
let maybe_dump_path = match maybe_path {
None => None,
Some(path) => {
let buf = path
.with_extension("streams")
.join(name)
.with_extension("content");
Some(buf)
}
};
LazyStream::new(maybe_dump_path)
};
lazy_stream.write_varnum(window_len as u32)?;
debug!(target: "write", "Flushing {} {:?} ({} shared, {} prelude)", name, table, table.shared_len(), table.prelude_len());
let mut state = TableRefStreamState::new(window_len, table);
for table_ref in vec {
let varnum = state.into_u32(*table_ref);
lazy_stream.write_varnum(varnum)?;
}
Self::flush_stream(name, lazy_stream, out)
}
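/// Flush a lazy stream to `out`, with the layout `[name]`, the Brotli
/// format marker, a varnum byte length, then the compressed bytes.
///
/// Streams that produced no data are skipped and count as 0 bytes.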
fn flush_stream(
name: &str,
mut stream: LazyStream,
out: &mut Vec<u8>,
) -> std::io::Result<Bytes> {
stream.flush()?;
let bytes_written = stream.bytes_written();
if let Some(data) = stream.done()? {
debug!(target: "write", "Encoder::flush_stream: {} contains {} compressed bytes ({} uncompressed bytes written)",
name,
data.len(),
bytes_written,
);
out.write_all(b"[")?;
out.write_all(name.as_bytes())?;
out.write_all(b"]")?;
out.write_all(FORMAT_BROTLI)?;
let len = data.len();
out.write_varnum(len as u32)?;
out.write_all(&data)?;
Ok(Into::<Bytes>::into(len))
} else {
Ok(Into::<Bytes>::into(0))
}
}
}
impl TokenWriter for Encoder {
type Data = Box<[u8]>;
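/// Finalize the encoding and return the compressed file.
///
/// Overall layout:
///
/// ```text
/// GLOBAL_HEADER_START
/// SECTION_PRELUDE ( [name] FORMAT_BROTLI varnum(len) data )*
/// SECTION_CONTENT ( [name] FORMAT_BROTLI varnum(len) data )*
/// SECTION_MAIN FORMAT_ENTROPY_0 entropy-coded bytes
/// ```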
fn done(self) -> Result<Self::Data, TokenWriterError> {
let mut data: Vec<u8> = Vec::with_capacity(INITIAL_BUFFER_SIZE_BYTES);
data.extend(GLOBAL_HEADER_START);
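// Prelude section: one stream of value definitions per kind, flushed in
// name order.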
data.extend(SECTION_PRELUDE);
for (name, stream) in self.prelude_streams.into_iter().sorted_by_key(|kv| kv.0) {
Self::flush_stream(name, stream, &mut data).map_err(TokenWriterError::WriteError)?;
}
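// Content section: one stream of table references per user-extensible kind.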
data.extend(SECTION_CONTENT);
let path_for_flush = if self.options.split_streams {
&self.path
} else {
&None
};
macro_rules! write_indices_with_window_len { ($(($ident: ident, $name: expr, $bname: expr )),*) => {
$(
let indices = &self.content_streams.$ident;
let window_len = self.options.content_window_len.$ident;
let bytes = Self::flush_indices(path_for_flush, $name, indices, window_len, &self.$ident, &mut data)
.map_err(TokenWriterError::WriteError)?;
self
.options
.content_lengths
.borrow_mut()
.$ident += bytes;
)*
} };
for_field_in_user_extensible!(write_indices_with_window_len);
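// Main section: the entropy-coded symbols.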
data.write_all(SECTION_MAIN)
.map_err(TokenWriterError::WriteError)?;
data.write_all(FORMAT_ENTROPY_0)
.map_err(TokenWriterError::WriteError)?;
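// Finalize the range coder and, if requested, dump the raw main stream to
// disk for inspection.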
let entropy = self.writer.done().map_err(TokenWriterError::WriteError)?;
if let Some(path) = self.dump_path {
fs::DirBuilder::new()
.recursive(true)
.create(path.parent().unwrap())
.map_err(TokenWriterError::WriteError)?;
fs::write(path, &entropy).map_err(TokenWriterError::WriteError)?;
}
data.write_all(&entropy)
.map_err(TokenWriterError::WriteError)?;
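// Fold the local statistics back into the shared options.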
*self.options.content_lengths.borrow_mut() += self.content_opus_lengths.into_statistics();
*self.options.content_instances.borrow_mut() += self.content_instances;
Ok(data.into())
}
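// Symbols entropy-coded to the main stream.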
fn bool_at(&mut self, value: Option<bool>, path: &Path) -> Result<(), TokenWriterError> {
emit_symbol_to_main_stream!(self, bool_by_path, bools, "bool_by_path", path, value)
}
fn string_enum_at(
&mut self,
value: &SharedString,
path: &Path,
) -> Result<(), TokenWriterError> {
emit_symbol_to_main_stream!(
self,
string_enum_by_path,
string_enums,
"string_enum_by_path",
path,
value
)
}
fn enter_tagged_tuple_at(
&mut self,
_node: &dyn Node,
tag: &InterfaceName,
_children: &[&FieldName],
path: &Path,
) -> Result<(), TokenWriterError> {
emit_symbol_to_main_stream!(
self,
interface_name_by_path,
interface_names,
"interface_name_by_path",
path,
tag
)
}
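// Values written as indices into the content streams.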
fn float_at(&mut self, value: Option<f64>, _path: &Path) -> Result<(), TokenWriterError> {
use bytes::float::WriteVarFloat;
emit_simple_symbol_to_streams!(
self,
floats,
floats,
write_maybe_varfloat2,
&value.map(F64::from),
"float_at"
);
Ok(())
}
fn unsigned_long_at(&mut self, value: u32, _path: &Path) -> Result<(), TokenWriterError> {
emit_simple_symbol_to_streams!(
self,
unsigned_longs,
unsigned_longs,
write_varnum,
&value,
"unsigned_long_at"
);
Ok(())
}
fn string_at(
&mut self,
value: Option<&SharedString>,
_path: &Path,
) -> Result<(), TokenWriterError> {
emit_string_symbol_to_streams!(
self,
string_literals,
string_literals,
string_literals_len,
&value.cloned(),
"string_at"
);
Ok(())
}
fn identifier_name_at(
&mut self,
value: Option<&IdentifierName>,
_path: &Path,
) -> Result<(), TokenWriterError> {
emit_string_symbol_to_streams!(
self,
identifier_names,
identifier_names,
identifier_names_len,
&value.cloned(),
"identifier_name_at"
);
Ok(())
}
fn property_key_at(
&mut self,
value: Option<&PropertyKey>,
_path: &Path,
) -> Result<(), TokenWriterError> {
emit_string_symbol_to_streams!(
self,
property_keys,
property_keys,
property_keys_len,
&value.cloned(),
"property_key_at"
);
Ok(())
}
fn enter_list_at(&mut self, len: usize, _path: &Path) -> Result<(), TokenWriterError> {
emit_simple_symbol_to_streams!(
self,
list_lengths,
list_lengths,
write_maybe_varnum,
&Some(len as u32),
"enter_list_at"
);
Ok(())
}
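// Scoped dictionary switching.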
fn enter_scoped_dictionary_at(
&mut self,
name: &SharedString,
_path: &Path,
) -> Result<(), TokenWriterError> {
self.options
.dictionaries
.enter_existing(name)
.map_err(|_| TokenWriterError::DictionarySwitchingError(name.clone()))?;
Ok(())
}
fn exit_scoped_dictionary_at(
&mut self,
name: &SharedString,
_path: &Path,
) -> Result<(), TokenWriterError> {
self.options.dictionaries.exit(name);
Ok(())
}
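// Offsets are not supported by this encoder.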
fn offset_at(&mut self, _path: &Path) -> Result<(), TokenWriterError> {
unimplemented!()
}
}