parser/htmlparser/src/nsScannerString.cpp

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:0fedaa662eee
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim:set ts=2 sw=2 sts=2 et cindent: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include <stdlib.h>
8 #include "nsScannerString.h"
9
10
11 /**
12 * nsScannerBufferList
13 */
14
15 #define MAX_CAPACITY ((UINT32_MAX / sizeof(char16_t)) - \
16 (sizeof(Buffer) + sizeof(char16_t)))
17
18 nsScannerBufferList::Buffer*
19 nsScannerBufferList::AllocBufferFromString( const nsAString& aString )
20 {
21 uint32_t len = aString.Length();
22 Buffer* buf = AllocBuffer(len);
23
24 if (buf)
25 {
26 nsAString::const_iterator source;
27 aString.BeginReading(source);
28 nsCharTraits<char16_t>::copy(buf->DataStart(), source.get(), len);
29 }
30 return buf;
31 }
32
33 nsScannerBufferList::Buffer*
34 nsScannerBufferList::AllocBuffer( uint32_t capacity )
35 {
36 if (capacity > MAX_CAPACITY)
37 return nullptr;
38
39 void* ptr = malloc(sizeof(Buffer) + (capacity + 1) * sizeof(char16_t));
40 if (!ptr)
41 return nullptr;
42
43 Buffer* buf = new (ptr) Buffer();
44
45 buf->mUsageCount = 0;
46 buf->mDataEnd = buf->DataStart() + capacity;
47
48 // XXX null terminate. this shouldn't be required, but we do it because
49 // nsScanner erroneously thinks it can dereference DataEnd :-(
50 *buf->mDataEnd = char16_t(0);
51 return buf;
52 }
53
54 void
55 nsScannerBufferList::ReleaseAll()
56 {
57 while (!mBuffers.isEmpty())
58 {
59 Buffer* node = mBuffers.popFirst();
60 //printf(">>> freeing buffer @%p\n", node);
61 free(node);
62 }
63 }
64
65 void
66 nsScannerBufferList::SplitBuffer( const Position& pos )
67 {
68 // splitting to the right keeps the work string and any extant token
69 // pointing to and holding a reference count on the same buffer.
70
71 Buffer* bufferToSplit = pos.mBuffer;
72 NS_ASSERTION(bufferToSplit, "null pointer");
73
74 uint32_t splitOffset = pos.mPosition - bufferToSplit->DataStart();
75 NS_ASSERTION(pos.mPosition >= bufferToSplit->DataStart() &&
76 splitOffset <= bufferToSplit->DataLength(),
77 "split offset is outside buffer");
78
79 uint32_t len = bufferToSplit->DataLength() - splitOffset;
80 Buffer* new_buffer = AllocBuffer(len);
81 if (new_buffer)
82 {
83 nsCharTraits<char16_t>::copy(new_buffer->DataStart(),
84 bufferToSplit->DataStart() + splitOffset,
85 len);
86 InsertAfter(new_buffer, bufferToSplit);
87 bufferToSplit->SetDataLength(splitOffset);
88 }
89 }
90
91 void
92 nsScannerBufferList::DiscardUnreferencedPrefix( Buffer* aBuf )
93 {
94 if (aBuf == Head())
95 {
96 while (!mBuffers.isEmpty() && !Head()->IsInUse())
97 {
98 Buffer* buffer = Head();
99 buffer->remove();
100 free(buffer);
101 }
102 }
103 }
104
105 size_t
106 nsScannerBufferList::Position::Distance( const Position& aStart, const Position& aEnd )
107 {
108 size_t result = 0;
109 if (aStart.mBuffer == aEnd.mBuffer)
110 {
111 result = aEnd.mPosition - aStart.mPosition;
112 }
113 else
114 {
115 result = aStart.mBuffer->DataEnd() - aStart.mPosition;
116 for (Buffer* b = aStart.mBuffer->Next(); b != aEnd.mBuffer; b = b->Next())
117 result += b->DataLength();
118 result += aEnd.mPosition - aEnd.mBuffer->DataStart();
119 }
120 return result;
121 }
122
123
124 /**
125 * nsScannerSubstring
126 */
127
128 nsScannerSubstring::nsScannerSubstring()
129 : mStart(nullptr, nullptr)
130 , mEnd(nullptr, nullptr)
131 , mBufferList(nullptr)
132 , mLength(0)
133 , mIsDirty(true)
134 {
135 }
136
137 nsScannerSubstring::nsScannerSubstring( const nsAString& s )
138 : mBufferList(nullptr)
139 , mIsDirty(true)
140 {
141 Rebind(s);
142 }
143
144 nsScannerSubstring::~nsScannerSubstring()
145 {
146 release_ownership_of_buffer_list();
147 }
148
149 int32_t
150 nsScannerSubstring::CountChar( char16_t c ) const
151 {
152 /*
153 re-write this to use a counting sink
154 */
155
156 size_type result = 0;
157 size_type lengthToExamine = Length();
158
159 nsScannerIterator iter;
160 for ( BeginReading(iter); ; )
161 {
162 int32_t lengthToExamineInThisFragment = iter.size_forward();
163 const char16_t* fromBegin = iter.get();
164 result += size_type(NS_COUNT(fromBegin, fromBegin+lengthToExamineInThisFragment, c));
165 if ( !(lengthToExamine -= lengthToExamineInThisFragment) )
166 return result;
167 iter.advance(lengthToExamineInThisFragment);
168 }
169 // never reached; quiets warnings
170 return 0;
171 }
172
173 void
174 nsScannerSubstring::Rebind( const nsScannerSubstring& aString,
175 const nsScannerIterator& aStart,
176 const nsScannerIterator& aEnd )
177 {
178 // allow for the case where &aString == this
179
180 aString.acquire_ownership_of_buffer_list();
181 release_ownership_of_buffer_list();
182
183 mStart = aStart;
184 mEnd = aEnd;
185 mBufferList = aString.mBufferList;
186 mLength = Distance(aStart, aEnd);
187 mIsDirty = true;
188 }
189
190 void
191 nsScannerSubstring::Rebind( const nsAString& aString )
192 {
193 release_ownership_of_buffer_list();
194
195 mBufferList = new nsScannerBufferList(AllocBufferFromString(aString));
196 mIsDirty = true;
197
198 init_range_from_buffer_list();
199 acquire_ownership_of_buffer_list();
200 }
201
202 const nsSubstring&
203 nsScannerSubstring::AsString() const
204 {
205 if (mIsDirty)
206 {
207 nsScannerSubstring* mutable_this = const_cast<nsScannerSubstring*>(this);
208
209 if (mStart.mBuffer == mEnd.mBuffer) {
210 // We only have a single fragment to deal with, so just return it
211 // as a substring.
212 mutable_this->mFlattenedRep.Rebind(mStart.mPosition, mEnd.mPosition);
213 } else {
214 // Otherwise, we need to copy the data into a flattened buffer.
215 nsScannerIterator start, end;
216 CopyUnicodeTo(BeginReading(start), EndReading(end), mutable_this->mFlattenedRep);
217 }
218
219 mutable_this->mIsDirty = false;
220 }
221
222 return mFlattenedRep;
223 }
224
225 nsScannerIterator&
226 nsScannerSubstring::BeginReading( nsScannerIterator& iter ) const
227 {
228 iter.mOwner = this;
229
230 iter.mFragment.mBuffer = mStart.mBuffer;
231 iter.mFragment.mFragmentStart = mStart.mPosition;
232 if (mStart.mBuffer == mEnd.mBuffer)
233 iter.mFragment.mFragmentEnd = mEnd.mPosition;
234 else
235 iter.mFragment.mFragmentEnd = mStart.mBuffer->DataEnd();
236
237 iter.mPosition = mStart.mPosition;
238 iter.normalize_forward();
239 return iter;
240 }
241
242 nsScannerIterator&
243 nsScannerSubstring::EndReading( nsScannerIterator& iter ) const
244 {
245 iter.mOwner = this;
246
247 iter.mFragment.mBuffer = mEnd.mBuffer;
248 iter.mFragment.mFragmentEnd = mEnd.mPosition;
249 if (mStart.mBuffer == mEnd.mBuffer)
250 iter.mFragment.mFragmentStart = mStart.mPosition;
251 else
252 iter.mFragment.mFragmentStart = mEnd.mBuffer->DataStart();
253
254 iter.mPosition = mEnd.mPosition;
255 // must not |normalize_backward| as that would likely invalidate tests like |while ( first != last )|
256 return iter;
257 }
258
259 bool
260 nsScannerSubstring::GetNextFragment( nsScannerFragment& frag ) const
261 {
262 // check to see if we are at the end of the buffer list
263 if (frag.mBuffer == mEnd.mBuffer)
264 return false;
265
266 frag.mBuffer = frag.mBuffer->getNext();
267
268 if (frag.mBuffer == mStart.mBuffer)
269 frag.mFragmentStart = mStart.mPosition;
270 else
271 frag.mFragmentStart = frag.mBuffer->DataStart();
272
273 if (frag.mBuffer == mEnd.mBuffer)
274 frag.mFragmentEnd = mEnd.mPosition;
275 else
276 frag.mFragmentEnd = frag.mBuffer->DataEnd();
277
278 return true;
279 }
280
281 bool
282 nsScannerSubstring::GetPrevFragment( nsScannerFragment& frag ) const
283 {
284 // check to see if we are at the beginning of the buffer list
285 if (frag.mBuffer == mStart.mBuffer)
286 return false;
287
288 frag.mBuffer = frag.mBuffer->getPrevious();
289
290 if (frag.mBuffer == mStart.mBuffer)
291 frag.mFragmentStart = mStart.mPosition;
292 else
293 frag.mFragmentStart = frag.mBuffer->DataStart();
294
295 if (frag.mBuffer == mEnd.mBuffer)
296 frag.mFragmentEnd = mEnd.mPosition;
297 else
298 frag.mFragmentEnd = frag.mBuffer->DataEnd();
299
300 return true;
301 }
302
303
304 /**
305 * nsScannerString
306 */
307
308 nsScannerString::nsScannerString( Buffer* aBuf )
309 {
310 mBufferList = new nsScannerBufferList(aBuf);
311
312 init_range_from_buffer_list();
313 acquire_ownership_of_buffer_list();
314 }
315
316 void
317 nsScannerString::AppendBuffer( Buffer* aBuf )
318 {
319 mBufferList->Append(aBuf);
320 mLength += aBuf->DataLength();
321
322 mEnd.mBuffer = aBuf;
323 mEnd.mPosition = aBuf->DataEnd();
324
325 mIsDirty = true;
326 }
327
328 void
329 nsScannerString::DiscardPrefix( const nsScannerIterator& aIter )
330 {
331 Position old_start(mStart);
332 mStart = aIter;
333 mLength -= Position::Distance(old_start, mStart);
334
335 mStart.mBuffer->IncrementUsageCount();
336 old_start.mBuffer->DecrementUsageCount();
337
338 mBufferList->DiscardUnreferencedPrefix(old_start.mBuffer);
339
340 mIsDirty = true;
341 }
342
343 void
344 nsScannerString::UngetReadable( const nsAString& aReadable, const nsScannerIterator& aInsertPoint )
345 /*
346 * Warning: this routine manipulates the shared buffer list in an unexpected way.
347 * The original design did not really allow for insertions, but this call promises
348 * that if called for a point after the end of all extant token strings, that no token string
349 * or the work string will be invalidated.
350 *
351 * This routine is protected because it is the responsibility of the derived class to keep those promises.
352 */
353 {
354 Position insertPos(aInsertPoint);
355
356 mBufferList->SplitBuffer(insertPos);
357 // splitting to the right keeps the work string and any extant token pointing to and
358 // holding a reference count on the same buffer
359
360 Buffer* new_buffer = AllocBufferFromString(aReadable);
361 // make a new buffer with all the data to insert...
362 // BULLSHIT ALERT: we may have empty space to re-use in the split buffer, measure the cost
363 // of this and decide if we should do the work to fill it
364
365 Buffer* buffer_to_split = insertPos.mBuffer;
366 mBufferList->InsertAfter(new_buffer, buffer_to_split);
367 mLength += aReadable.Length();
368
369 mEnd.mBuffer = mBufferList->Tail();
370 mEnd.mPosition = mEnd.mBuffer->DataEnd();
371
372 mIsDirty = true;
373 }
374
375 void
376 nsScannerString::ReplaceCharacter(nsScannerIterator& aPosition, char16_t aChar)
377 {
378 // XXX Casting a const to non-const. Unless the base class
379 // provides support for writing iterators, this is the best
380 // that can be done.
381 char16_t* pos = const_cast<char16_t*>(aPosition.get());
382 *pos = aChar;
383
384 mIsDirty = true;
385 }
386
387
388 /**
389 * nsScannerSharedSubstring
390 */
391
392 void
393 nsScannerSharedSubstring::Rebind(const nsScannerIterator &aStart,
394 const nsScannerIterator &aEnd)
395 {
396 // If the start and end positions are inside the same buffer, we must
397 // acquire ownership of the buffer. If not, we can optimize by not holding
398 // onto it.
399
400 Buffer *buffer = const_cast<Buffer*>(aStart.buffer());
401 bool sameBuffer = buffer == aEnd.buffer();
402
403 nsScannerBufferList *bufferList;
404
405 if (sameBuffer) {
406 bufferList = aStart.mOwner->mBufferList;
407 bufferList->AddRef();
408 buffer->IncrementUsageCount();
409 }
410
411 if (mBufferList)
412 ReleaseBuffer();
413
414 if (sameBuffer) {
415 mBuffer = buffer;
416 mBufferList = bufferList;
417 mString.Rebind(aStart.mPosition, aEnd.mPosition);
418 } else {
419 mBuffer = nullptr;
420 mBufferList = nullptr;
421 CopyUnicodeTo(aStart, aEnd, mString);
422 }
423 }
424
425 void
426 nsScannerSharedSubstring::ReleaseBuffer()
427 {
428 NS_ASSERTION(mBufferList, "Should only be called with non-null mBufferList");
429 mBuffer->DecrementUsageCount();
430 mBufferList->DiscardUnreferencedPrefix(mBuffer);
431 mBufferList->Release();
432 }
433
434 void
435 nsScannerSharedSubstring::MakeMutable()
436 {
437 nsString temp(mString); // this will force a copy of the data
438 mString.Assign(temp); // mString will now share the just-allocated buffer
439
440 ReleaseBuffer();
441
442 mBuffer = nullptr;
443 mBufferList = nullptr;
444 }
445
446 /**
447 * utils -- based on code from nsReadableUtils.cpp
448 */
449
450 // private helper function
451 static inline
452 nsAString::iterator&
453 copy_multifragment_string( nsScannerIterator& first, const nsScannerIterator& last, nsAString::iterator& result )
454 {
455 typedef nsCharSourceTraits<nsScannerIterator> source_traits;
456 typedef nsCharSinkTraits<nsAString::iterator> sink_traits;
457
458 while ( first != last )
459 {
460 uint32_t distance = source_traits::readable_distance(first, last);
461 sink_traits::write(result, source_traits::read(first), distance);
462 NS_ASSERTION(distance > 0, "|copy_multifragment_string| will never terminate");
463 source_traits::advance(first, distance);
464 }
465
466 return result;
467 }
468
469 void
470 CopyUnicodeTo( const nsScannerIterator& aSrcStart,
471 const nsScannerIterator& aSrcEnd,
472 nsAString& aDest )
473 {
474 nsAString::iterator writer;
475 if (!aDest.SetLength(Distance(aSrcStart, aSrcEnd), mozilla::fallible_t())) {
476 aDest.Truncate();
477 return; // out of memory
478 }
479 aDest.BeginWriting(writer);
480 nsScannerIterator fromBegin(aSrcStart);
481
482 copy_multifragment_string(fromBegin, aSrcEnd, writer);
483 }
484
485 void
486 AppendUnicodeTo( const nsScannerIterator& aSrcStart,
487 const nsScannerIterator& aSrcEnd,
488 nsScannerSharedSubstring& aDest )
489 {
490 // Check whether we can just create a dependent string.
491 if (aDest.str().IsEmpty()) {
492 // We can just make |aDest| point to the buffer.
493 // This will take care of copying if the buffer spans fragments.
494 aDest.Rebind(aSrcStart, aSrcEnd);
495 } else {
496 // The dest string is not empty, so it can't be a dependent substring.
497 AppendUnicodeTo(aSrcStart, aSrcEnd, aDest.writable());
498 }
499 }
500
501 void
502 AppendUnicodeTo( const nsScannerIterator& aSrcStart,
503 const nsScannerIterator& aSrcEnd,
504 nsAString& aDest )
505 {
506 nsAString::iterator writer;
507 uint32_t oldLength = aDest.Length();
508 if (!aDest.SetLength(oldLength + Distance(aSrcStart, aSrcEnd), mozilla::fallible_t()))
509 return; // out of memory
510 aDest.BeginWriting(writer).advance(oldLength);
511 nsScannerIterator fromBegin(aSrcStart);
512
513 copy_multifragment_string(fromBegin, aSrcEnd, writer);
514 }
515
516 bool
517 FindCharInReadable( char16_t aChar,
518 nsScannerIterator& aSearchStart,
519 const nsScannerIterator& aSearchEnd )
520 {
521 while ( aSearchStart != aSearchEnd )
522 {
523 int32_t fragmentLength;
524 if ( SameFragment(aSearchStart, aSearchEnd) )
525 fragmentLength = aSearchEnd.get() - aSearchStart.get();
526 else
527 fragmentLength = aSearchStart.size_forward();
528
529 const char16_t* charFoundAt = nsCharTraits<char16_t>::find(aSearchStart.get(), fragmentLength, aChar);
530 if ( charFoundAt ) {
531 aSearchStart.advance( charFoundAt - aSearchStart.get() );
532 return true;
533 }
534
535 aSearchStart.advance(fragmentLength);
536 }
537
538 return false;
539 }
540
541 bool
542 FindInReadable( const nsAString& aPattern,
543 nsScannerIterator& aSearchStart,
544 nsScannerIterator& aSearchEnd,
545 const nsStringComparator& compare )
546 {
547 bool found_it = false;
548
549 // only bother searching at all if we're given a non-empty range to search
550 if ( aSearchStart != aSearchEnd )
551 {
552 nsAString::const_iterator aPatternStart, aPatternEnd;
553 aPattern.BeginReading(aPatternStart);
554 aPattern.EndReading(aPatternEnd);
555
556 // outer loop keeps searching till we find it or run out of string to search
557 while ( !found_it )
558 {
559 // fast inner loop (that's what it's called, not what it is) looks for a potential match
560 while ( aSearchStart != aSearchEnd &&
561 compare(aPatternStart.get(), aSearchStart.get(), 1, 1) )
562 ++aSearchStart;
563
564 // if we broke out of the `fast' loop because we're out of string ... we're done: no match
565 if ( aSearchStart == aSearchEnd )
566 break;
567
568 // otherwise, we're at a potential match, let's see if we really hit one
569 nsAString::const_iterator testPattern(aPatternStart);
570 nsScannerIterator testSearch(aSearchStart);
571
572 // slow inner loop verifies the potential match (found by the `fast' loop) at the current position
573 for(;;)
574 {
575 // we already compared the first character in the outer loop,
576 // so we'll advance before the next comparison
577 ++testPattern;
578 ++testSearch;
579
580 // if we verified all the way to the end of the pattern, then we found it!
581 if ( testPattern == aPatternEnd )
582 {
583 found_it = true;
584 aSearchEnd = testSearch; // return the exact found range through the parameters
585 break;
586 }
587
588 // if we got to end of the string we're searching before we hit the end of the
589 // pattern, we'll never find what we're looking for
590 if ( testSearch == aSearchEnd )
591 {
592 aSearchStart = aSearchEnd;
593 break;
594 }
595
596 // else if we mismatched ... it's time to advance to the next search position
597 // and get back into the `fast' loop
598 if ( compare(testPattern.get(), testSearch.get(), 1, 1) )
599 {
600 ++aSearchStart;
601 break;
602 }
603 }
604 }
605 }
606
607 return found_it;
608 }
609
610 /**
611 * This implementation is simple, but does too much work.
612 * It searches the entire string from left to right, and returns the last match found, if any.
613 * This implementation will be replaced when I get |reverse_iterator|s working.
614 */
615 bool
616 RFindInReadable( const nsAString& aPattern,
617 nsScannerIterator& aSearchStart,
618 nsScannerIterator& aSearchEnd,
619 const nsStringComparator& aComparator )
620 {
621 bool found_it = false;
622
623 nsScannerIterator savedSearchEnd(aSearchEnd);
624 nsScannerIterator searchStart(aSearchStart), searchEnd(aSearchEnd);
625
626 while ( searchStart != searchEnd )
627 {
628 if ( FindInReadable(aPattern, searchStart, searchEnd, aComparator) )
629 {
630 found_it = true;
631
632 // this is the best match so far, so remember it
633 aSearchStart = searchStart;
634 aSearchEnd = searchEnd;
635
636 // ...and get ready to search some more
637 // (it's tempting to set |searchStart=searchEnd| ... but that misses overlapping patterns)
638 ++searchStart;
639 searchEnd = savedSearchEnd;
640 }
641 }
642
643 // if we never found it, return an empty range
644 if ( !found_it )
645 aSearchStart = aSearchEnd;
646
647 return found_it;
648 }

mercurial