Make FileMap::{lines, multibyte_chars, non_narrow_chars} non-mutable.
This PR removes most of the interior mutability from `FileMap`, which should be beneficial, especially in a multithreaded setting. This is achieved by initializing the state in question when the filemap is constructed instead of during lexing. Hopefully this doesn't degrade performance.
cc @wesleywiser
version = "0.0.0"
dependencies = [
"arena 0.0.0",
+ "cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc_data_structures 0.0.0",
"scoped-tls 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"serialize 0.0.0",
src_hash.hash_stable(hcx, hasher);
// We only hash the relative position within this filemap
- lines.with_lock(|lines| {
- lines.len().hash_stable(hcx, hasher);
- for &line in lines.iter() {
- stable_byte_pos(line, start_pos).hash_stable(hcx, hasher);
- }
- });
+ lines.len().hash_stable(hcx, hasher);
+ for &line in lines.iter() {
+ stable_byte_pos(line, start_pos).hash_stable(hcx, hasher);
+ }
// We only hash the relative position within this filemap
- multibyte_chars.with_lock(|multibyte_chars| {
- multibyte_chars.len().hash_stable(hcx, hasher);
- for &char_pos in multibyte_chars.iter() {
- stable_multibyte_char(char_pos, start_pos).hash_stable(hcx, hasher);
- }
- });
+ multibyte_chars.len().hash_stable(hcx, hasher);
+ for &char_pos in multibyte_chars.iter() {
+ stable_multibyte_char(char_pos, start_pos).hash_stable(hcx, hasher);
+ }
- non_narrow_chars.with_lock(|non_narrow_chars| {
- non_narrow_chars.len().hash_stable(hcx, hasher);
- for &char_pos in non_narrow_chars.iter() {
- stable_non_narrow_char(char_pos, start_pos).hash_stable(hcx, hasher);
- }
- });
+ non_narrow_chars.len().hash_stable(hcx, hasher);
+ for &char_pos in non_narrow_chars.iter() {
+ stable_non_narrow_char(char_pos, start_pos).hash_stable(hcx, hasher);
+ }
}
}
let len = BytePos::decode(self)?;
let file_lo = self.file_index_to_file(file_lo_index);
- let lo = file_lo.lines.borrow()[line_lo - 1] + col_lo;
+ let lo = file_lo.lines[line_lo - 1] + col_lo;
let hi = lo + len;
let expn_info_tag = u8::decode(self)?;
src_hash,
start_pos,
end_pos,
- lines,
- multibyte_chars,
- non_narrow_chars,
+ mut lines,
+ mut multibyte_chars,
+ mut non_narrow_chars,
name_hash,
.. } = filemap_to_import;
// `CodeMap::new_imported_filemap()` will then translate those
// coordinates to their new global frame of reference when the
// offset of the FileMap is known.
- let mut lines = lines.into_inner();
for pos in &mut lines {
*pos = *pos - start_pos;
}
- let mut multibyte_chars = multibyte_chars.into_inner();
for mbc in &mut multibyte_chars {
mbc.pos = mbc.pos - start_pos;
}
- let mut non_narrow_chars = non_narrow_chars.into_inner();
for swc in &mut non_narrow_chars {
*swc = *swc - start_pos;
}
}
}
- /// Creates a new filemap without setting its line information. If you don't
- /// intend to set the line information yourself, you should use new_filemap_and_lines.
+ /// Creates a new filemap.
/// This does not ensure that only one FileMap exists per file name.
pub fn new_filemap(&self, filename: FileName, src: String) -> Lrc<FileMap> {
let start_pos = self.next_start_pos();
filemap
}
- /// Creates a new filemap and sets its line information.
- /// This does not ensure that only one FileMap exists per file name.
- pub fn new_filemap_and_lines(&self, filename: &Path, src: &str) -> Lrc<FileMap> {
- let fm = self.new_filemap(filename.to_owned().into(), src.to_owned());
- let mut byte_pos: u32 = fm.start_pos.0;
- for line in src.lines() {
- // register the start of this line
- fm.next_line(BytePos(byte_pos));
-
- // update byte_pos to include this line and the \n at the end
- byte_pos += line.len() as u32 + 1;
- }
- fm
- }
-
-
/// Allocates a new FileMap representing a source file from an external
/// crate. The source code of such an "imported filemap" is not available,
/// but we still know enough to generate accurate debuginfo location
external_src: Lock::new(ExternalSource::AbsentOk),
start_pos,
end_pos,
- lines: Lock::new(file_local_lines),
- multibyte_chars: Lock::new(file_local_multibyte_chars),
- non_narrow_chars: Lock::new(file_local_non_narrow_chars),
+ lines: file_local_lines,
+ multibyte_chars: file_local_multibyte_chars,
+ non_narrow_chars: file_local_non_narrow_chars,
name_hash,
});
match self.lookup_line(pos) {
Ok(FileMapAndLine { fm: f, line: a }) => {
let line = a + 1; // Line numbers start at 1
- let linebpos = (*f.lines.borrow())[a];
+ let linebpos = f.lines[a];
let linechpos = self.bytepos_to_file_charpos(linebpos);
let col = chpos - linechpos;
let col_display = {
- let non_narrow_chars = f.non_narrow_chars.borrow();
- let start_width_idx = non_narrow_chars
+ let start_width_idx = f
+ .non_narrow_chars
.binary_search_by_key(&linebpos, |x| x.pos())
.unwrap_or_else(|x| x);
- let end_width_idx = non_narrow_chars
+ let end_width_idx = f
+ .non_narrow_chars
.binary_search_by_key(&pos, |x| x.pos())
.unwrap_or_else(|x| x);
let special_chars = end_width_idx - start_width_idx;
- let non_narrow: usize =
- non_narrow_chars[start_width_idx..end_width_idx]
+ let non_narrow: usize = f
+ .non_narrow_chars[start_width_idx..end_width_idx]
.into_iter()
.map(|x| x.width())
.sum();
}
Err(f) => {
let col_display = {
- let non_narrow_chars = f.non_narrow_chars.borrow();
- let end_width_idx = non_narrow_chars
+ let end_width_idx = f
+ .non_narrow_chars
.binary_search_by_key(&pos, |x| x.pos())
.unwrap_or_else(|x| x);
- let non_narrow: usize =
- non_narrow_chars[0..end_width_idx]
+ let non_narrow: usize = f
+ .non_narrow_chars[0..end_width_idx]
.into_iter()
.map(|x| x.width())
.sum();
// The number of extra bytes due to multibyte chars in the FileMap
let mut total_extra_bytes = 0;
- for mbc in map.multibyte_chars.borrow().iter() {
+ for mbc in map.multibyte_chars.iter() {
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
if mbc.pos < bpos {
// every character is at least one byte, so we only
// count the actual extra bytes.
- total_extra_bytes += mbc.bytes - 1;
+ total_extra_bytes += mbc.bytes as u32 - 1;
// We should never see a byte position in the middle of a
// character
- assert!(bpos.to_usize() >= mbc.pos.to_usize() + mbc.bytes);
+ assert!(bpos.to_u32() >= mbc.pos.to_u32() + mbc.bytes as u32);
} else {
break;
}
}
- assert!(map.start_pos.to_usize() + total_extra_bytes <= bpos.to_usize());
- CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes)
+ assert!(map.start_pos.to_u32() + total_extra_bytes <= bpos.to_u32());
+ CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes as usize)
}
// Return the index of the filemap (in self.files) which contains pos.
#[cfg(test)]
mod tests {
use super::*;
- use std::borrow::Cow;
use rustc_data_structures::sync::Lrc;
- #[test]
- fn t1 () {
- let cm = CodeMap::new(FilePathMapping::empty());
- let fm = cm.new_filemap(PathBuf::from("blork.rs").into(),
- "first line.\nsecond line".to_string());
- fm.next_line(BytePos(0));
- // Test we can get lines with partial line info.
- assert_eq!(fm.get_line(0), Some(Cow::from("first line.")));
- // TESTING BROKEN BEHAVIOR: line break declared before actual line break.
- fm.next_line(BytePos(10));
- assert_eq!(fm.get_line(1), Some(Cow::from(".")));
- fm.next_line(BytePos(12));
- assert_eq!(fm.get_line(2), Some(Cow::from("second line")));
- }
-
- #[test]
- #[should_panic]
- fn t2 () {
- let cm = CodeMap::new(FilePathMapping::empty());
- let fm = cm.new_filemap(PathBuf::from("blork.rs").into(),
- "first line.\nsecond line".to_string());
- // TESTING *REALLY* BROKEN BEHAVIOR:
- fm.next_line(BytePos(0));
- fm.next_line(BytePos(10));
- fm.next_line(BytePos(2));
- }
-
fn init_code_map() -> CodeMap {
let cm = CodeMap::new(FilePathMapping::empty());
- let fm1 = cm.new_filemap(PathBuf::from("blork.rs").into(),
- "first line.\nsecond line".to_string());
- let fm2 = cm.new_filemap(PathBuf::from("empty.rs").into(),
- "".to_string());
- let fm3 = cm.new_filemap(PathBuf::from("blork2.rs").into(),
- "first line.\nsecond line".to_string());
-
- fm1.next_line(BytePos(0));
- fm1.next_line(BytePos(12));
- fm2.next_line(fm2.start_pos);
- fm3.next_line(fm3.start_pos);
- fm3.next_line(fm3.start_pos + BytePos(12));
-
+ cm.new_filemap(PathBuf::from("blork.rs").into(),
+ "first line.\nsecond line".to_string());
+ cm.new_filemap(PathBuf::from("empty.rs").into(),
+ "".to_string());
+ cm.new_filemap(PathBuf::from("blork2.rs").into(),
+ "first line.\nsecond line".to_string());
cm
}
fn init_code_map_mbc() -> CodeMap {
let cm = CodeMap::new(FilePathMapping::empty());
// € is a three byte utf8 char.
- let fm1 =
- cm.new_filemap(PathBuf::from("blork.rs").into(),
- "fir€st €€€€ line.\nsecond line".to_string());
- let fm2 = cm.new_filemap(PathBuf::from("blork2.rs").into(),
- "first line€€.\n€ second line".to_string());
-
- fm1.next_line(BytePos(0));
- fm1.next_line(BytePos(28));
- fm2.next_line(fm2.start_pos);
- fm2.next_line(fm2.start_pos + BytePos(20));
-
- fm1.record_multibyte_char(BytePos(3), 3);
- fm1.record_multibyte_char(BytePos(9), 3);
- fm1.record_multibyte_char(BytePos(12), 3);
- fm1.record_multibyte_char(BytePos(15), 3);
- fm1.record_multibyte_char(BytePos(18), 3);
- fm2.record_multibyte_char(fm2.start_pos + BytePos(10), 3);
- fm2.record_multibyte_char(fm2.start_pos + BytePos(13), 3);
- fm2.record_multibyte_char(fm2.start_pos + BytePos(18), 3);
-
+ cm.new_filemap(PathBuf::from("blork.rs").into(),
+ "fir€st €€€€ line.\nsecond line".to_string());
+ cm.new_filemap(PathBuf::from("blork2.rs").into(),
+ "first line€€.\n€ second line".to_string());
cm
}
let cm = CodeMap::new(FilePathMapping::empty());
let inputtext = "aaaaa\nbbbbBB\nCCC\nDDDDDddddd\neee\n";
let selection = " \n ~~\n~~~\n~~~~~ \n \n";
- cm.new_filemap_and_lines(Path::new("blork.rs"), inputtext);
+ cm.new_filemap(Path::new("blork.rs").to_owned().into(), inputtext.to_string());
let span = span_from_selection(inputtext, selection);
// check that we are extracting the text we thought we were extracting
let inputtext = "bbbb BB\ncc CCC\n";
let selection1 = " ~~\n \n";
let selection2 = " \n ~~~\n";
- cm.new_filemap_and_lines(Path::new("blork.rs"), inputtext);
+ cm.new_filemap(Path::new("blork.rs").to_owned().into(), inputtext.to_owned());
let span1 = span_from_selection(inputtext, selection1);
let span2 = span_from_selection(inputtext, selection2);
match String::from_utf8(buf) {
Ok(src) => {
+ let src_interned = Symbol::intern(&src);
+
// Add this input file to the code map to make it available as
// dependency information
- self.cx.codemap().new_filemap_and_lines(&filename, &src);
+ self.cx.codemap().new_filemap(filename.into(), src);
let include_info = vec![
dummy_spanned(ast::NestedMetaItemKind::MetaItem(
dummy_spanned(file)))),
dummy_spanned(ast::NestedMetaItemKind::MetaItem(
attr::mk_name_value_item_str(Ident::from_str("contents"),
- dummy_spanned(Symbol::intern(&src))))),
+ dummy_spanned(src_interned)))),
];
let include_ident = Ident::from_str("include");
};
match String::from_utf8(bytes) {
Ok(src) => {
+ let interned_src = Symbol::intern(&src);
+
// Add this input file to the code map to make it available as
// dependency information
- cx.codemap().new_filemap_and_lines(&file, &src);
+ cx.codemap().new_filemap(file.into(), src);
- base::MacEager::expr(cx.expr_str(sp, Symbol::intern(&src)))
+ base::MacEager::expr(cx.expr_str(sp, interned_src))
}
Err(_) => {
cx.span_err(sp,
Ok(..) => {
// Add this input file to the code map to make it available as
// dependency information, but don't enter its contents
- cx.codemap().new_filemap_and_lines(&file, "");
+ cx.codemap().new_filemap(file.into(), "".to_string());
base::MacEager::expr(cx.expr_lit(sp, ast::LitKind::ByteStr(Lrc::new(bytes))))
}
let mut lines: Vec<String> = Vec::new();
// Count the number of chars since the start of the line by rescanning.
- let mut src_index = rdr.src_index(rdr.filemap.line_begin_pos());
+ let mut src_index = rdr.src_index(rdr.filemap.line_begin_pos(rdr.pos));
let end_src_index = rdr.src_index(rdr.pos);
- assert!(src_index <= end_src_index);
+ assert!(src_index <= end_src_index,
+ "src_index={}, end_src_index={}, line_begin_pos={}",
+ src_index, end_src_index, rdr.filemap.line_begin_pos(rdr.pos).to_u32());
let mut n = 0;
while src_index < end_src_index {
let c = char_at(&rdr.src, src_index);
pub ch: Option<char>,
pub filemap: Lrc<syntax_pos::FileMap>,
/// Stop reading src at this index.
- end_src_index: usize,
- /// Whether to record new-lines and multibyte chars in filemap.
- /// This is only necessary the first time a filemap is lexed.
- /// If part of a filemap is being re-lexed, this should be set to false.
- save_new_lines_and_multibyte: bool,
+ pub end_src_index: usize,
// cached:
peek_tok: token::Token,
peek_span: Span,
ch: Some('\n'),
filemap,
end_src_index: src.len(),
- save_new_lines_and_multibyte: true,
// dummy values; not read
peek_tok: token::Eof,
peek_span: syntax_pos::DUMMY_SP,
let mut sr = StringReader::new_raw_internal(sess, begin.fm, None);
// Seek the lexer to the right byte range.
- sr.save_new_lines_and_multibyte = false;
sr.next_pos = span.lo();
sr.end_src_index = sr.src_index(span.hi());
let next_ch = char_at(&self.src, next_src_index);
let next_ch_len = next_ch.len_utf8();
- if self.ch.unwrap() == '\n' {
- if self.save_new_lines_and_multibyte {
- self.filemap.next_line(self.next_pos);
- }
- }
- if next_ch_len > 1 {
- if self.save_new_lines_and_multibyte {
- self.filemap.record_multibyte_char(self.next_pos, next_ch_len);
- }
- }
- self.filemap.record_width(self.next_pos, next_ch);
-
self.ch = Some(next_ch);
self.pos = self.next_pos;
self.next_pos = self.next_pos + Pos::from_usize(next_ch_len);
let output = Arc::new(Mutex::new(Vec::new()));
let code_map = Lrc::new(CodeMap::new(FilePathMapping::empty()));
- code_map.new_filemap_and_lines(Path::new("test.rs"), &file_text);
+ code_map.new_filemap(Path::new("test.rs").to_owned().into(), file_text.to_owned());
let primary_span = make_span(&file_text, &span_labels[0].start, &span_labels[0].end);
let mut msp = MultiSpan::from_span(primary_span);
arena = { path = "../libarena" }
scoped-tls = { version = "0.1.1", features = ["nightly"] }
unicode-width = "0.1.4"
+cfg-if = "0.1.2"
--- /dev/null
+// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use unicode_width::UnicodeWidthChar;
+use super::*;
+
+/// Find all newlines, multi-byte characters, and non-narrow characters in a
+/// FileMap.
+///
+/// This function will use an SSE2 enhanced implementation if hardware support
+/// is detected at runtime.
+pub fn analyze_filemap(
+ src: &str,
+ filemap_start_pos: BytePos)
+ -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>)
+{
+ let mut lines = vec![filemap_start_pos];
+ let mut multi_byte_chars = vec![];
+ let mut non_narrow_chars = vec![];
+
+ // Calls the right implementation, depending on hardware support available.
+ analyze_filemap_dispatch(src,
+ filemap_start_pos,
+ &mut lines,
+ &mut multi_byte_chars,
+ &mut non_narrow_chars);
+
+ // The code above optimistically registers a new line *after* each \n
+ // it encounters. If that point is already outside the filemap, remove
+ // it again.
+ if let Some(&last_line_start) = lines.last() {
+ let file_map_end = filemap_start_pos + BytePos::from_usize(src.len());
+ assert!(file_map_end >= last_line_start);
+ if last_line_start == file_map_end {
+ lines.pop();
+ }
+ }
+
+ (lines, multi_byte_chars, non_narrow_chars)
+}
+
+cfg_if! {
+ if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
+ not(stage0)))] {
+ fn analyze_filemap_dispatch(src: &str,
+ filemap_start_pos: BytePos,
+ lines: &mut Vec<BytePos>,
+ multi_byte_chars: &mut Vec<MultiByteChar>,
+ non_narrow_chars: &mut Vec<NonNarrowChar>) {
+ if is_x86_feature_detected!("sse2") {
+ unsafe {
+ analyze_filemap_sse2(src,
+ filemap_start_pos,
+ lines,
+ multi_byte_chars,
+ non_narrow_chars);
+ }
+ } else {
+ analyze_filemap_generic(src,
+ src.len(),
+ filemap_start_pos,
+ lines,
+ multi_byte_chars,
+ non_narrow_chars);
+
+ }
+ }
+
+ /// Checks 16-byte chunks of text at a time. If a chunk contains
+ /// something other than printable ASCII characters and newlines, the
+ /// function falls back to the generic implementation for that chunk.
+ /// Otherwise it uses SSE2 intrinsics to quickly find all newlines.
+ #[target_feature(enable = "sse2")]
+ unsafe fn analyze_filemap_sse2(src: &str,
+ output_offset: BytePos,
+ lines: &mut Vec<BytePos>,
+ multi_byte_chars: &mut Vec<MultiByteChar>,
+ non_narrow_chars: &mut Vec<NonNarrowChar>) {
+ #[cfg(target_arch = "x86")]
+ use std::arch::x86::*;
+ #[cfg(target_arch = "x86_64")]
+ use std::arch::x86_64::*;
+
+ const CHUNK_SIZE: usize = 16;
+
+ let src_bytes = src.as_bytes();
+
+ let chunk_count = src.len() / CHUNK_SIZE;
+
+ // This variable keeps track of where we should start decoding a
+ // chunk. If a multi-byte character spans across chunk boundaries,
+ // we need to skip that part in the next chunk because we already
+ // handled it.
+ let mut intra_chunk_offset = 0;
+
+ for chunk_index in 0 .. chunk_count {
+ let ptr = src_bytes.as_ptr() as *const __m128i;
+ // We don't know if the pointer is aligned to 16 bytes, so we
+ // use `loadu`, which supports unaligned loading.
+ let chunk = _mm_loadu_si128(ptr.offset(chunk_index as isize));
+
+ // For each byte in the chunk, check whether its value is < 0 when
+ // interpreted as a signed integer, which indicates that it is part
+ // of a multi-byte UTF-8 char.
+ let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
+ // Create a bit mask from the comparison results.
+ let multibyte_mask = _mm_movemask_epi8(multibyte_test);
+
+ // If the bit mask is all zero, we only have ASCII chars here:
+ if multibyte_mask == 0 {
+ assert!(intra_chunk_offset == 0);
+
+ // Check if there are any control characters in the chunk. All
+ // control characters that we can encounter at this point have a
+ // byte value less than 32 or ...
+ let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32));
+ let control_char_mask0 = _mm_movemask_epi8(control_char_test0);
+
+ // ... it's the ASCII 'DEL' character with a value of 127.
+ let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127));
+ let control_char_mask1 = _mm_movemask_epi8(control_char_test1);
+
+ let control_char_mask = control_char_mask0 | control_char_mask1;
+
+ if control_char_mask != 0 {
+ // Check for newlines in the chunk
+ let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
+ let newlines_mask = _mm_movemask_epi8(newlines_test);
+
+ if control_char_mask == newlines_mask {
+ // All control characters are newlines, record them
+ let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
+ let output_offset = output_offset +
+ BytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
+
+ loop {
+ let index = newlines_mask.trailing_zeros();
+
+ if index >= CHUNK_SIZE as u32 {
+ // We have arrived at the end of the chunk.
+ break
+ }
+
+ lines.push(BytePos(index) + output_offset);
+
+ // Clear the bit, so we can find the next one.
+ newlines_mask &= (!1) << index;
+ }
+
+ // We are done for this chunk. All control characters were
+ // newlines and we took care of those.
+ continue
+ } else {
+ // Some of the control characters are not newlines,
+ // fall through to the slow path below.
+ }
+ } else {
+ // No control characters, nothing to record for this chunk
+ continue
+ }
+ }
+
+ // The slow path.
+ // The chunk contains multi-byte chars or control chars other than
+ // newlines; fall back to generic decoding.
+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
+ intra_chunk_offset = analyze_filemap_generic(
+ &src[scan_start .. ],
+ CHUNK_SIZE - intra_chunk_offset,
+ BytePos::from_usize(scan_start) + output_offset,
+ lines,
+ multi_byte_chars,
+ non_narrow_chars
+ );
+ }
+
+ // There might still be a tail left to analyze
+ let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
+ if tail_start < src.len() {
+ analyze_filemap_generic(&src[tail_start as usize ..],
+ src.len() - tail_start,
+ output_offset + BytePos::from_usize(tail_start),
+ lines,
+ multi_byte_chars,
+ non_narrow_chars);
+ }
+ }
+ } else {
+
+ // The target (or compiler version) does not support SSE2 ...
+ fn analyze_filemap_dispatch(src: &str,
+ filemap_start_pos: BytePos,
+ lines: &mut Vec<BytePos>,
+ multi_byte_chars: &mut Vec<MultiByteChar>,
+ non_narrow_chars: &mut Vec<NonNarrowChar>) {
+ analyze_filemap_generic(src,
+ src.len(),
+ filemap_start_pos,
+ lines,
+ multi_byte_chars,
+ non_narrow_chars);
+ }
+ }
+}
+
+// `scan_len` determines the number of bytes in `src` to scan. Note that the
+// function can read past `scan_len` if a multi-byte character starts within the
+// range but extends past it. The overflow (the number of bytes read past
+// `scan_len`) is returned by the function.
+fn analyze_filemap_generic(src: &str,
+ scan_len: usize,
+ output_offset: BytePos,
+ lines: &mut Vec<BytePos>,
+ multi_byte_chars: &mut Vec<MultiByteChar>,
+ non_narrow_chars: &mut Vec<NonNarrowChar>)
+ -> usize
+{
+ assert!(src.len() >= scan_len);
+ let mut i = 0;
+ let src_bytes = src.as_bytes();
+
+ while i < scan_len {
+ let byte = unsafe {
+ // We verified that i < scan_len <= src.len()
+ *src_bytes.get_unchecked(i as usize)
+ };
+
+ // How much to advance in order to get to the next UTF-8 char in the
+ // string.
+ let mut char_len = 1;
+
+ if byte < 32 {
+ // This is an ASCII control character, it could be one of the cases
+ // that are interesting to us.
+
+ let pos = BytePos::from_usize(i) + output_offset;
+
+ match byte {
+ b'\n' => {
+ lines.push(pos + BytePos(1));
+ }
+ b'\t' => {
+ non_narrow_chars.push(NonNarrowChar::Tab(pos));
+ }
+ _ => {
+ non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
+ }
+ }
+ } else if byte >= 127 {
+ // The slow path:
+ // This is either the ASCII control character "DEL" or the beginning of
+ // a multibyte char. Just decode to `char`.
+ let c = (&src[i..]).chars().next().unwrap();
+ char_len = c.len_utf8();
+
+ let pos = BytePos::from_usize(i) + output_offset;
+
+ if char_len > 1 {
+ assert!(char_len >=2 && char_len <= 4);
+ let mbc = MultiByteChar {
+ pos,
+ bytes: char_len as u8,
+ };
+ multi_byte_chars.push(mbc);
+ }
+
+ // Assume control characters are zero width.
+ // FIXME: How can we decide between `width` and `width_cjk`?
+ let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
+
+ if char_width != 1 {
+ non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
+ }
+ }
+
+ i += char_len;
+ }
+
+ i - scan_len
+}
+
+
+
+macro_rules! test {
+ (case: $test_name:ident,
+ text: $text:expr,
+ filemap_start_pos: $filemap_start_pos:expr,
+ lines: $lines:expr,
+ multi_byte_chars: $multi_byte_chars:expr,
+ non_narrow_chars: $non_narrow_chars:expr,) => (
+
+ #[test]
+ fn $test_name() {
+
+ let (lines, multi_byte_chars, non_narrow_chars) =
+ analyze_filemap($text, BytePos($filemap_start_pos));
+
+ let expected_lines: Vec<BytePos> = $lines
+ .into_iter()
+ .map(|pos| BytePos(pos))
+ .collect();
+
+ assert_eq!(lines, expected_lines);
+
+ let expected_mbcs: Vec<MultiByteChar> = $multi_byte_chars
+ .into_iter()
+ .map(|(pos, bytes)| MultiByteChar {
+ pos: BytePos(pos),
+ bytes,
+ })
+ .collect();
+
+ assert_eq!(multi_byte_chars, expected_mbcs);
+
+ let expected_nncs: Vec<NonNarrowChar> = $non_narrow_chars
+ .into_iter()
+ .map(|(pos, width)| {
+ NonNarrowChar::new(BytePos(pos), width)
+ })
+ .collect();
+
+ assert_eq!(non_narrow_chars, expected_nncs);
+ })
+}
+
+test!(
+ case: empty_text,
+ text: "",
+ filemap_start_pos: 0,
+ lines: vec![],
+ multi_byte_chars: vec![],
+ non_narrow_chars: vec![],
+);
+
+test!(
+ case: newlines_short,
+ text: "a\nc",
+ filemap_start_pos: 0,
+ lines: vec![0, 2],
+ multi_byte_chars: vec![],
+ non_narrow_chars: vec![],
+);
+
+test!(
+ case: newlines_long,
+ text: "012345678\nabcdef012345678\na",
+ filemap_start_pos: 0,
+ lines: vec![0, 10, 26],
+ multi_byte_chars: vec![],
+ non_narrow_chars: vec![],
+);
+
+test!(
+ case: newline_and_multi_byte_char_in_same_chunk,
+ text: "01234β789\nbcdef0123456789abcdef",
+ filemap_start_pos: 0,
+ lines: vec![0, 11],
+ multi_byte_chars: vec![(5, 2)],
+ non_narrow_chars: vec![],
+);
+
+test!(
+ case: newline_and_control_char_in_same_chunk,
+ text: "01234\u{07}6789\nbcdef0123456789abcdef",
+ filemap_start_pos: 0,
+ lines: vec![0, 11],
+ multi_byte_chars: vec![],
+ non_narrow_chars: vec![(5, 0)],
+);
+
+test!(
+ case: multi_byte_char_short,
+ text: "aβc",
+ filemap_start_pos: 0,
+ lines: vec![0],
+ multi_byte_chars: vec![(1, 2)],
+ non_narrow_chars: vec![],
+);
+
+test!(
+ case: multi_byte_char_long,
+ text: "0123456789abcΔf012345β",
+ filemap_start_pos: 0,
+ lines: vec![0],
+ multi_byte_chars: vec![(13, 2), (22, 2)],
+ non_narrow_chars: vec![],
+);
+
+test!(
+ case: multi_byte_char_across_chunk_boundary,
+ text: "0123456789abcdeΔ123456789abcdef01234",
+ filemap_start_pos: 0,
+ lines: vec![0],
+ multi_byte_chars: vec![(15, 2)],
+ non_narrow_chars: vec![],
+);
+
+test!(
+ case: multi_byte_char_across_chunk_boundary_tail,
+ text: "0123456789abcdeΔ....",
+ filemap_start_pos: 0,
+ lines: vec![0],
+ multi_byte_chars: vec![(15, 2)],
+ non_narrow_chars: vec![],
+);
+
+test!(
+ case: non_narrow_short,
+ text: "0\t2",
+ filemap_start_pos: 0,
+ lines: vec![0],
+ multi_byte_chars: vec![],
+ non_narrow_chars: vec![(1, 4)],
+);
+
+test!(
+ case: non_narrow_long,
+ text: "01\t3456789abcdef01234567\u{07}9",
+ filemap_start_pos: 0,
+ lines: vec![0],
+ multi_byte_chars: vec![],
+ non_narrow_chars: vec![(2, 4), (24, 0)],
+);
+
+test!(
+ case: output_offset_all,
+ text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf",
+ filemap_start_pos: 1000,
+ lines: vec![0 + 1000, 7 + 1000, 27 + 1000],
+ multi_byte_chars: vec![(13 + 1000, 2), (29 + 1000, 2)],
+ non_narrow_chars: vec![(2 + 1000, 4), (24 + 1000, 0)],
+);
#![feature(optin_builtin_traits)]
#![allow(unused_attributes)]
#![feature(specialization)]
+#![feature(stdsimd)]
use std::borrow::Cow;
use std::cell::Cell;
extern crate serialize;
extern crate serialize as rustc_serialize; // used by deriving
+#[macro_use]
+extern crate cfg_if;
+
extern crate unicode_width;
pub mod edition;
pub mod symbol;
+mod analyze_filemap;
+
pub struct Globals {
symbol_interner: Lock<symbol::Interner>,
span_interner: Lock<span_encoding::SpanInterner>,
pub const NO_EXPANSION: SyntaxContext = SyntaxContext::empty();
/// Identifies an offset of a multi-byte character in a FileMap
-#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq)]
+#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq, Debug)]
pub struct MultiByteChar {
/// The absolute offset of the character in the CodeMap
pub pos: BytePos,
/// The number of bytes, >=2
- pub bytes: usize,
+ pub bytes: u8,
}
/// Identifies an offset of a non-narrow character in a FileMap
-#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq)]
+#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq, Debug)]
pub enum NonNarrowChar {
/// Represents a zero-width character
ZeroWidth(BytePos),
/// The end position of this source in the CodeMap
pub end_pos: BytePos,
/// Locations of line beginnings in the source code
- pub lines: Lock<Vec<BytePos>>,
+ pub lines: Vec<BytePos>,
/// Locations of multi-byte characters in the source code
- pub multibyte_chars: Lock<Vec<MultiByteChar>>,
+ pub multibyte_chars: Vec<MultiByteChar>,
/// Width of characters that are not narrow in the source code
- pub non_narrow_chars: Lock<Vec<NonNarrowChar>>,
+ pub non_narrow_chars: Vec<NonNarrowChar>,
/// A hash of the filename, used for speeding up the incr. comp. hashing.
pub name_hash: u128,
}
s.emit_struct_field("start_pos", 4, |s| self.start_pos.encode(s))?;
s.emit_struct_field("end_pos", 5, |s| self.end_pos.encode(s))?;
s.emit_struct_field("lines", 6, |s| {
- let lines = self.lines.borrow();
+ let lines = &self.lines[..];
// store the length
s.emit_u32(lines.len() as u32)?;
Ok(())
})?;
s.emit_struct_field("multibyte_chars", 7, |s| {
- (*self.multibyte_chars.borrow()).encode(s)
+ self.multibyte_chars.encode(s)
})?;
s.emit_struct_field("non_narrow_chars", 8, |s| {
- (*self.non_narrow_chars.borrow()).encode(s)
+ self.non_narrow_chars.encode(s)
})?;
s.emit_struct_field("name_hash", 9, |s| {
self.name_hash.encode(s)
src: None,
src_hash,
external_src: Lock::new(ExternalSource::AbsentOk),
- lines: Lock::new(lines),
- multibyte_chars: Lock::new(multibyte_chars),
- non_narrow_chars: Lock::new(non_narrow_chars),
+ lines,
+ multibyte_chars,
+ non_narrow_chars,
name_hash,
})
})
};
let end_pos = start_pos.to_usize() + src.len();
+ let (lines, multibyte_chars, non_narrow_chars) =
+ analyze_filemap::analyze_filemap(&src[..], start_pos);
+
FileMap {
name,
name_was_remapped,
external_src: Lock::new(ExternalSource::Unneeded),
start_pos,
end_pos: Pos::from_usize(end_pos),
- lines: Lock::new(Vec::new()),
- multibyte_chars: Lock::new(Vec::new()),
- non_narrow_chars: Lock::new(Vec::new()),
+ lines,
+ multibyte_chars,
+ non_narrow_chars,
name_hash,
}
}
- /// EFFECT: register a start-of-line offset in the
- /// table of line-beginnings.
- /// UNCHECKED INVARIANT: these offsets must be added in the right
- /// order and must be in the right places; there is shared knowledge
- /// about what ends a line between this file and parse.rs
- /// WARNING: pos param here is the offset relative to start of CodeMap,
- /// and CodeMap will append a newline when adding a filemap without a newline at the end,
- /// so the safe way to call this is with value calculated as
- /// filemap.start_pos + newline_offset_relative_to_the_start_of_filemap.
- pub fn next_line(&self, pos: BytePos) {
- // the new charpos must be > the last one (or it's the first one).
- let mut lines = self.lines.borrow_mut();
- let line_len = lines.len();
- assert!(line_len == 0 || ((*lines)[line_len - 1] < pos));
- lines.push(pos);
- }
-
/// Return the BytePos of the beginning of the current line.
- pub fn line_begin_pos(&self) -> BytePos {
- let lines = self.lines.borrow();
- match lines.last() {
- Some(&line_pos) => line_pos,
- None => self.start_pos,
- }
+ pub fn line_begin_pos(&self, pos: BytePos) -> BytePos {
+ let line_index = self.lookup_line(pos).unwrap();
+ self.lines[line_index]
}
/// Add externally loaded source.
}
let begin = {
- let lines = self.lines.borrow();
- let line = if let Some(line) = lines.get(line_number) {
+ let line = if let Some(line) = self.lines.get(line_number) {
line
} else {
return None;
}
}
- pub fn record_multibyte_char(&self, pos: BytePos, bytes: usize) {
- assert!(bytes >=2 && bytes <= 4);
- let mbc = MultiByteChar {
- pos,
- bytes,
- };
- self.multibyte_chars.borrow_mut().push(mbc);
- }
-
- #[inline]
- pub fn record_width(&self, pos: BytePos, ch: char) {
- let width = match ch {
- '\t' =>
- // Tabs will consume 4 columns.
- 4,
- '\n' =>
- // Make newlines take one column so that displayed spans can point them.
- 1,
- ch =>
- // Assume control characters are zero width.
- // FIXME: How can we decide between `width` and `width_cjk`?
- unicode_width::UnicodeWidthChar::width(ch).unwrap_or(0),
- };
- // Only record non-narrow characters.
- if width != 1 {
- self.non_narrow_chars.borrow_mut().push(NonNarrowChar::new(pos, width));
- }
- }
-
pub fn is_real_file(&self) -> bool {
self.name.is_real()
}
self.end_pos.0 - self.start_pos.0
}
pub fn count_lines(&self) -> usize {
- self.lines.borrow().len()
+ self.lines.len()
}
/// Find the line containing the given position. The return value is the
/// index into the `lines` array of this FileMap, not the 1-based line
/// number. If the filemap is empty or the position is located before the
/// first line, None is returned.
pub fn lookup_line(&self, pos: BytePos) -> Option<usize> {
- let lines = self.lines.borrow();
- if lines.len() == 0 {
+ if self.lines.len() == 0 {
return None;
}
- let line_index = lookup_line(&lines[..], pos);
- assert!(line_index < lines.len() as isize);
+ let line_index = lookup_line(&self.lines[..], pos);
+ assert!(line_index < self.lines.len() as isize);
if line_index >= 0 {
Some(line_index as usize)
} else {
return (self.start_pos, self.end_pos);
}
- let lines = self.lines.borrow();
- assert!(line_index < lines.len());
- if line_index == (lines.len() - 1) {
- (lines[line_index], self.end_pos)
+ assert!(line_index < self.lines.len());
+ if line_index == (self.lines.len() - 1) {
+ (self.lines[line_index], self.end_pos)
} else {
- (lines[line_index], lines[line_index + 1])
+ (self.lines[line_index], self.lines[line_index + 1])
}
}
pub trait Pos {
fn from_usize(n: usize) -> Self;
fn to_usize(&self) -> usize;
+ fn from_u32(n: u32) -> Self;
+ fn to_u32(&self) -> u32;
}
/// A byte offset. Keep this small (currently 32-bits), as AST contains
fn from_usize(n: usize) -> BytePos { BytePos(n as u32) }
#[inline(always)]
- fn to_usize(&self) -> usize { let BytePos(n) = *self; n as usize }
+ fn to_usize(&self) -> usize { self.0 as usize }
+
+ #[inline(always)]
+ fn from_u32(n: u32) -> BytePos { BytePos(n) }
+
+ #[inline(always)]
+ fn to_u32(&self) -> u32 { self.0 }
}
impl Add for BytePos {
fn from_usize(n: usize) -> CharPos { CharPos(n) }
#[inline(always)]
- fn to_usize(&self) -> usize { let CharPos(n) = *self; n }
+ fn to_usize(&self) -> usize { self.0 }
+
+ #[inline(always)]
+ fn from_u32(n: u32) -> CharPos { CharPos(n as usize) }
+
+ #[inline(always)]
+ fn to_u32(&self) -> u32 { self.0 as u32}
}
impl Add for CharPos {