head	1.1;
branch	1.1.1;
access;
symbols
	netbsd-11-0-RC5:1.1.1.5
	netbsd-11-0-RC4:1.1.1.5
	netbsd-11-0-RC3:1.1.1.5
	netbsd-11-0-RC2:1.1.1.5
	netbsd-11-0-RC1:1.1.1.5
	gcc-14-3-0:1.1.1.5
	perseant-exfatfs-base-20250801:1.1.1.5
	netbsd-11:1.1.1.5.0.10
	netbsd-11-base:1.1.1.5
	gcc-12-5-0:1.1.1.5
	netbsd-10-1-RELEASE:1.1.1.5
	perseant-exfatfs-base-20240630:1.1.1.5
	gcc-12-4-0:1.1.1.5
	perseant-exfatfs:1.1.1.5.0.8
	perseant-exfatfs-base:1.1.1.5
	netbsd-8-3-RELEASE:1.1.1.2
	netbsd-9-4-RELEASE:1.1.1.4
	netbsd-10-0-RELEASE:1.1.1.5
	netbsd-10-0-RC6:1.1.1.5
	netbsd-10-0-RC5:1.1.1.5
	netbsd-10-0-RC4:1.1.1.5
	netbsd-10-0-RC3:1.1.1.5
	netbsd-10-0-RC2:1.1.1.5
	netbsd-10-0-RC1:1.1.1.5
	gcc-12-3-0:1.1.1.5
	gcc-10-5-0:1.1.1.5
	netbsd-10:1.1.1.5.0.6
	netbsd-10-base:1.1.1.5
	netbsd-9-3-RELEASE:1.1.1.4
	gcc-10-4-0:1.1.1.5
	cjep_sun2x-base1:1.1.1.5
	cjep_sun2x:1.1.1.5.0.4
	cjep_sun2x-base:1.1.1.5
	cjep_staticlib_x-base1:1.1.1.5
	netbsd-9-2-RELEASE:1.1.1.4
	cjep_staticlib_x:1.1.1.5.0.2
	cjep_staticlib_x-base:1.1.1.5
	gcc-10-3-0:1.1.1.5
	netbsd-9-1-RELEASE:1.1.1.4
	gcc-9-3-0:1.1.1.5
	gcc-7-5-0:1.1.1.4
	phil-wifi-20200421:1.1.1.4
	phil-wifi-20200411:1.1.1.4
	is-mlppp:1.1.1.4.0.4
	is-mlppp-base:1.1.1.4
	phil-wifi-20200406:1.1.1.4
	netbsd-8-2-RELEASE:1.1.1.2
	gcc-8-4-0:1.1.1.4
	netbsd-9-0-RELEASE:1.1.1.4
	netbsd-9-0-RC2:1.1.1.4
	netbsd-9-0-RC1:1.1.1.4
	phil-wifi-20191119:1.1.1.4
	gcc-8-3-0:1.1.1.4
	netbsd-9:1.1.1.4.0.2
	netbsd-9-base:1.1.1.4
	phil-wifi-20190609:1.1.1.4
	netbsd-8-1-RELEASE:1.1.1.2
	netbsd-8-1-RC1:1.1.1.2
	pgoyette-compat-merge-20190127:1.1.1.2.14.2
	pgoyette-compat-20190127:1.1.1.4
	gcc-7-4-0:1.1.1.4
	pgoyette-compat-20190118:1.1.1.3
	pgoyette-compat-1226:1.1.1.3
	pgoyette-compat-1126:1.1.1.3
	gcc-6-5-0:1.1.1.3
	pgoyette-compat-1020:1.1.1.2
	pgoyette-compat-0930:1.1.1.2
	pgoyette-compat-0906:1.1.1.2
	netbsd-7-2-RELEASE:1.1.1.1
	pgoyette-compat-0728:1.1.1.2
	netbsd-8-0-RELEASE:1.1.1.2
	phil-wifi:1.1.1.2.0.16
	phil-wifi-base:1.1.1.2
	pgoyette-compat-0625:1.1.1.2
	netbsd-8-0-RC2:1.1.1.2
	pgoyette-compat-0521:1.1.1.2
	pgoyette-compat-0502:1.1.1.2
	pgoyette-compat-0422:1.1.1.2
	netbsd-8-0-RC1:1.1.1.2
	pgoyette-compat-0415:1.1.1.2
	pgoyette-compat-0407:1.1.1.2
	pgoyette-compat-0330:1.1.1.2
	pgoyette-compat-0322:1.1.1.2
	pgoyette-compat-0315:1.1.1.2
	netbsd-7-1-2-RELEASE:1.1.1.1
	pgoyette-compat:1.1.1.2.0.14
	pgoyette-compat-base:1.1.1.2
	gcc-6-4-0:1.1.1.2
	netbsd-7-1-1-RELEASE:1.1.1.1
	gcc-5-5-0:1.1.1.2
	matt-nb8-mediatek:1.1.1.2.0.12
	matt-nb8-mediatek-base:1.1.1.2
	perseant-stdc-iso10646:1.1.1.2.0.10
	perseant-stdc-iso10646-base:1.1.1.2
	netbsd-8:1.1.1.2.0.8
	netbsd-8-base:1.1.1.2
	prg-localcount2-base3:1.1.1.2
	prg-localcount2-base2:1.1.1.2
	prg-localcount2-base1:1.1.1.2
	prg-localcount2:1.1.1.2.0.6
	prg-localcount2-base:1.1.1.2
	pgoyette-localcount-20170426:1.1.1.2
	bouyer-socketcan-base1:1.1.1.2
	pgoyette-localcount-20170320:1.1.1.2
	netbsd-7-1:1.1.1.1.0.14
	netbsd-7-1-RELEASE:1.1.1.1
	netbsd-7-1-RC2:1.1.1.1
	netbsd-7-nhusb-base-20170116:1.1.1.1
	bouyer-socketcan:1.1.1.2.0.4
	bouyer-socketcan-base:1.1.1.2
	pgoyette-localcount-20170107:1.1.1.2
	netbsd-7-1-RC1:1.1.1.1
	pgoyette-localcount-20161104:1.1.1.2
	netbsd-7-0-2-RELEASE:1.1.1.1
	localcount-20160914:1.1.1.2
	netbsd-7-nhusb:1.1.1.1.0.12
	netbsd-7-nhusb-base:1.1.1.1
	pgoyette-localcount-20160806:1.1.1.2
	pgoyette-localcount-20160726:1.1.1.2
	pgoyette-localcount:1.1.1.2.0.2
	pgoyette-localcount-base:1.1.1.2
	gcc-5-4-0:1.1.1.2
	netbsd-7-0-1-RELEASE:1.1.1.1
	gcc-5-3-0:1.1.1.2
	netbsd-7-0:1.1.1.1.0.10
	netbsd-7-0-RELEASE:1.1.1.1
	gcc-4-8-5-pre-gcc-old-import:1.1.1.1
	netbsd-7-0-RC3:1.1.1.1
	netbsd-7-0-RC2:1.1.1.1
	post-gcc-4-8-5-merge:1.1.1.1
	gcc-4-8-5:1.1.1.1
	netbsd-7-0-RC1:1.1.1.1
	gcc-4-8-4:1.1.1.1
	gcc-4-8-20141009:1.1.1.1
	tls-maxphys-base:1.1.1.1
	tls-maxphys:1.1.1.1.0.8
	netbsd-7:1.1.1.1.0.6
	netbsd-7-base:1.1.1.1
	gcc-4-8-3:1.1.1.1
	yamt-pagecache:1.1.1.1.0.4
	yamt-pagecache-base9:1.1.1.1
	tls-earlyentropy:1.1.1.1.0.2
	tls-earlyentropy-base:1.1.1.1
	riastradh-xf86-video-intel-2-7-1-pre-2-21-15:1.1.1.1
	riastradh-drm2-base3:1.1.1.1
	gcc-4-8-3-pre-r208254:1.1.1.1
	gcc-4-8-3-pre-r206687:1.1.1.1
	FSF:1.1.1;
locks; strict;
comment	@# @;


1.1
date	2014.03.01.08.41.30;	author mrg;	state Exp;
branches
	1.1.1.1;
next	;
commitid	TtaB91QNTknAoYqx;

1.1.1.1
date	2014.03.01.08.41.30;	author mrg;	state Exp;
branches
	1.1.1.1.4.1
	1.1.1.1.8.1;
next	1.1.1.2;
commitid	TtaB91QNTknAoYqx;

1.1.1.2
date	2016.01.24.06.05.43;	author mrg;	state Exp;
branches
	1.1.1.2.14.1
	1.1.1.2.16.1;
next	1.1.1.3;
commitid	uWWfbLp08zOK79Sy;

1.1.1.3
date	2018.11.04.00.12.37;	author mrg;	state Exp;
branches;
next	1.1.1.4;
commitid	bulspy67pMB6EyYA;

1.1.1.4
date	2019.01.19.10.14.11;	author mrg;	state Exp;
branches;
next	1.1.1.5;
commitid	VQ8OwWIg5RS9kn8B;

1.1.1.5
date	2020.09.05.07.52.18;	author mrg;	state Exp;
branches;
next	;
commitid	ZRYA7IOuwfMjAPmC;

1.1.1.1.4.1
date	2014.03.01.08.41.30;	author yamt;	state dead;
branches;
next	1.1.1.1.4.2;
commitid	DX8bafDLmqEbpyBx;

1.1.1.1.4.2
date	2014.05.22.16.37.45;	author yamt;	state Exp;
branches;
next	;
commitid	DX8bafDLmqEbpyBx;

1.1.1.1.8.1
date	2014.03.01.08.41.30;	author tls;	state dead;
branches;
next	1.1.1.1.8.2;
commitid	jTnpym9Qu0o4R1Nx;

1.1.1.1.8.2
date	2014.08.19.23.54.46;	author tls;	state Exp;
branches;
next	;
commitid	jTnpym9Qu0o4R1Nx;

1.1.1.2.14.1
date	2018.11.26.01.50.57;	author pgoyette;	state Exp;
branches;
next	1.1.1.2.14.2;
commitid	Zj4q5SspGdKXto1B;

1.1.1.2.14.2
date	2019.01.26.21.59.32;	author pgoyette;	state Exp;
branches;
next	;
commitid	JKpcmvSjdT25dl9B;

1.1.1.2.16.1
date	2019.06.10.21.54.49;	author christos;	state Exp;
branches;
next	;
commitid	jtc8rnCzWiEEHGqB;


desc
@@


1.1
log
@Initial revision
@
text
@<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>Diagnostics</title><meta name="generator" content="DocBook XSL-NS Stylesheets V1.78.1" /><meta name="keywords" content="C++, library, profile" /><meta name="keywords" content="ISO C++, library" /><meta name="keywords" content="ISO C++, runtime, library" /><link rel="home" href="../index.html" title="The GNU C++ Library" /><link rel="up" href="profile_mode.html" title="Chapter 19. Profile Mode" /><link rel="prev" href="profile_mode_devel.html" title="Developer Information" /><link rel="next" href="mt_allocator.html" title="Chapter 20. The mt_allocator" /></head><body><div class="navheader"><table width="100%" summary="Navigation header"><tr><th colspan="3" align="center">Diagnostics</th></tr><tr><td width="20%" align="left"><a accesskey="p" href="profile_mode_devel.html">Prev</a> </td><th width="60%" align="center">Chapter 19. Profile Mode</th><td width="20%" align="right"> <a accesskey="n" href="mt_allocator.html">Next</a></td></tr></table><hr /></div><div class="section"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a id="manual.ext.profile_mode.diagnostics"></a>Diagnostics</h2></div></div></div><p>
  The table below presents all the diagnostics we intend to implement.
  Each diagnostic has a corresponding compile time switch
  <code class="code">-D_GLIBCXX_PROFILE_&lt;diagnostic&gt;</code>.
  Groups of related diagnostics can be turned on with a single switch.
  For instance, <code class="code">-D_GLIBCXX_PROFILE_LOCALITY</code> is equivalent to
  <code class="code">-D_GLIBCXX_PROFILE_SOFTWARE_PREFETCH
  -D_GLIBCXX_PROFILE_RBTREE_LOCALITY</code>.
  </p><p>
  The benefit, cost, expected frequency and accuracy of each diagnostic
  was given a grade from 1 to 10, where 10 is highest.
  A high benefit means that, if the diagnostic is accurate, the expected
  performance improvement is high.
  A high cost means that turning this diagnostic on leads to high slowdown.
  A high frequency means that we expect this to occur relatively often.
  A high accuracy means that the diagnostic is unlikely to be wrong.
  These grades are not perfect.  They are just meant to guide users with
  specific needs or time budgets.
  </p><div class="table"><a id="idm269998442544"></a><p class="title"><strong>Table 19.2. Profile Diagnostics</strong></p><div class="table-contents"><table summary="Profile Diagnostics" border="1"><colgroup><col align="left" class="c1" /><col align="left" class="c2" /><col align="left" class="c3" /><col align="left" class="c4" /><col align="left" class="c5" /><col align="left" class="c6" /><col align="left" class="c7" /></colgroup><thead><tr><th align="left">Group</th><th align="left">Flag</th><th align="left">Benefit</th><th align="left">Cost</th><th align="left">Freq.</th><th align="left">Implemented</th><td class="auto-generated"> </td></tr></thead><tbody><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.containers" title="Containers">
    CONTAINERS</a></td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.hashtable_too_small" title="Hashtable Too Small">
    HASHTABLE_TOO_SMALL</a></td><td align="left">10</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.hashtable_too_large" title="Hashtable Too Large">
    HASHTABLE_TOO_LARGE</a></td><td align="left">5</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.inefficient_hash" title="Inefficient Hash">
    INEFFICIENT_HASH</a></td><td align="left">7</td><td align="left">3</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_too_small" title="Vector Too Small">
    VECTOR_TOO_SMALL</a></td><td align="left">8</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_too_large" title="Vector Too Large">
    VECTOR_TOO_LARGE</a></td><td align="left">5</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_to_hashtable" title="Vector to Hashtable">
    VECTOR_TO_HASHTABLE</a></td><td align="left">7</td><td align="left">7</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.hashtable_to_vector" title="Hashtable to Vector">
    HASHTABLE_TO_VECTOR</a></td><td align="left">7</td><td align="left">7</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_to_list" title="Vector to List">
    VECTOR_TO_LIST</a></td><td align="left">8</td><td align="left">5</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.list_to_vector" title="List to Vector">
    LIST_TO_VECTOR</a></td><td align="left">10</td><td align="left">5</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.assoc_ord_to_unord" title="Ordered to Unordered Associative Container">
    ORDERED_TO_UNORDERED</a></td><td align="left">10</td><td align="left">5</td><td align="left"> </td><td align="left">10</td><td align="left">only map/unordered_map</td></tr><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.algorithms" title="Algorithms">
    ALGORITHMS</a></td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.algorithms.sort" title="Sort Algorithm Performance">
    SORT</a></td><td align="left">7</td><td align="left">8</td><td align="left"> </td><td align="left">7</td><td align="left">no</td></tr><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.locality" title="Data Locality">
    LOCALITY</a></td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.locality.sw_prefetch" title="Need Software Prefetch">
    SOFTWARE_PREFETCH</a></td><td align="left">8</td><td align="left">8</td><td align="left"> </td><td align="left">5</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.locality.linked" title="Linked Structure Locality">
    RBTREE_LOCALITY</a></td><td align="left">4</td><td align="left">8</td><td align="left"> </td><td align="left">5</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.mthread.false_share" title="False Sharing">
    FALSE_SHARING</a></td><td align="left">8</td><td align="left">10</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr></tbody></table></div></div><br class="table-break" /><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.template"></a>Diagnostic Template</h3></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_&lt;diagnostic&gt;</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>  What problem will it diagnose?
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>.
  What is the fundamental reason why this is a problem</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>
  Percentage reduction in execution time.  When reduction is more than
  a constant factor, describe the reduction rate formula.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  What would the advise look like?</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  What stdlibc++ components need to be instrumented?</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  How do we decide when to issue the advice?</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  How do we measure benefits?  Math goes here.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
program code
...
advice sample
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.containers"></a>Containers</h3></div></div></div><p>
<span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_CONTAINERS</code>.
</p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.hashtable_too_small"></a>Hashtable Too Small</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_HASHTABLE_TOO_SMALL</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect hashtables with many
  rehash operations, small construction size and large destruction size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> Rehash is very expensive.
  Read content, follow chains within bucket, evaluate hash function, place at
  new location in different order.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span> 36%.
  Code similar to example below.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">unordered_set, unordered_map</code> constructor, destructor, rehash.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">unordered_[multi]set|map</code>,
  record initial size and call context of the constructor.
  Record size increase, if any, after each relevant operation such as insert.
  Record the estimated rehash cost.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of individual rehash operations * cost per rehash.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 unordered_set&lt;int&gt; us;
2 for (int k = 0; k &lt; 1000000; ++k) {
3   us.insert(k);
4 }

foo.cc:1: advice: Changing initial unordered_set size from 10 to 1000000 saves 1025530 rehash operations.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.hashtable_too_large"></a>Hashtable Too Large</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_HASHTABLE_TOO_LARGE</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect hashtables which are
  never filled up because fewer elements than reserved are ever
  inserted.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> Save memory, which
  is good in itself and may also improve memory reference performance through
  fewer cache and TLB misses.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span> unknown.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">unordered_set, unordered_map</code> constructor, destructor, rehash.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">unordered_[multi]set|map</code>,
  record initial size and call context of the constructor, and correlate it
  with its size at destruction time.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of iteration operations + memory saved.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 vector&lt;unordered_set&lt;int&gt;&gt; v(100000, unordered_set&lt;int&gt;(100)) ;
2 for (int k = 0; k &lt; 100000; ++k) {
3   for (int j = 0; j &lt; 10; ++j) {
4     v[k].insert(k + j);
5  }
6 }

foo.cc:1: advice: Changing initial unordered_set size from 100 to 10 saves N
bytes of memory and M iteration steps.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.inefficient_hash"></a>Inefficient Hash</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_INEFFICIENT_HASH</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect hashtables with polarized
  distribution.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> A non-uniform
  distribution may lead to long chains, thus possibly increasing complexity
  by a factor up to the number of elements.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span> factor up
   to container size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Change hash function
  for container built at site S.  Distribution score = N.  Access score = S.
  Longest chain = C, in bucket B.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">unordered_set, unordered_map</code> constructor, destructor, [],
  insert, iterator.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Count the exact number of link traversals.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total number of links traversed.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
class dumb_hash {
 public:
  size_t operator() (int i) const { return 0; }
};
...
  unordered_set&lt;int, dumb_hash&gt; hs;
  ...
  for (int i = 0; i &lt; COUNT; ++i) {
    hs.find(i);
  }
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_too_small"></a>Vector Too Small</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TOO_SMALL</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>Detect vectors with many
  resize operations, small construction size and large destruction size..
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>Resizing can be expensive.
  Copying large amounts of data takes time.  Resizing many small vectors may
  have allocation overhead and affect locality.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record initial size and call context of the constructor.
  Record size increase, if any, after each relevant operation such as
  <code class="code">push_back</code>.  Record the estimated resize cost.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total number of words copied * time to copy a word.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 vector&lt;int&gt; v;
2 for (int k = 0; k &lt; 1000000; ++k) {
3   v.push_back(k);
4 }

foo.cc:1: advice: Changing initial vector size from 10 to 1000000 saves
copying 4000000 bytes and 20 memory allocations and deallocations.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_too_large"></a>Vector Too Large</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TOO_LARGE</code>
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>Detect vectors which are
  never filled up because fewer elements than reserved are ever
  inserted.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>Save memory, which
  is good in itself and may also improve memory reference performance through
  fewer cache and TLB misses.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record initial size and call context of the constructor, and correlate it
  with its size at destruction time.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total amount of memory saved.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 vector&lt;vector&lt;int&gt;&gt; v(100000, vector&lt;int&gt;(100)) ;
2 for (int k = 0; k &lt; 100000; ++k) {
3   for (int j = 0; j &lt; 10; ++j) {
4     v[k].insert(k + j);
5  }
6 }

foo.cc:1: advice: Changing initial vector size from 100 to 10 saves N
bytes of memory and may reduce the number of cache and TLB misses.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_to_hashtable"></a>Vector to Hashtable</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TO_HASHTABLE</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect uses of
  <code class="code">vector</code> that can be substituted with <code class="code">unordered_set</code>
  to reduce execution time.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Linear search in a vector is very expensive, whereas searching in a hashtable
  is very quick.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>factor up
   to container size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace
  <code class="code">vector</code> with <code class="code">unordered_set</code> at site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record call context of the constructor.  Issue the advice only if the
  only methods called on this <code class="code">vector</code> are <code class="code">push_back</code>,
  <code class="code">insert</code> and <code class="code">find</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Cost(vector::push_back) + cost(vector::insert) + cost(find, vector) -
  cost(unordered_set::insert) + cost(unordered_set::find).
  </p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  vector&lt;int&gt; v;
...
2  for (int i = 0; i &lt; 1000; ++i) {
3    find(v.begin(), v.end(), i);
4  }

foo.cc:1: advice: Changing "vector" to "unordered_set" will save about 500,000
comparisons.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.hashtable_to_vector"></a>Hashtable to Vector</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_HASHTABLE_TO_VECTOR</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect uses of
  <code class="code">unordered_set</code> that can be substituted with <code class="code">vector</code>
  to reduce execution time.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Hashtable iterator is slower than vector iterator.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>95%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace
  <code class="code">unordered_set</code> with <code class="code">vector</code> at site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">unordered_set</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">unordered_set</code>,
  record call context of the constructor.  Issue the advice only if the
  number of <code class="code">find</code>, <code class="code">insert</code> and <code class="code">[]</code>
  operations on this <code class="code">unordered_set</code> are small relative to the
  number of elements, and methods <code class="code">begin</code> or <code class="code">end</code>
  are invoked (suggesting iteration).</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of .</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  unordered_set&lt;int&gt; us;
...
2  int s = 0;
3  for (unordered_set&lt;int&gt;::iterator it = us.begin(); it != us.end(); ++it) {
4    s += *it;
5  }

foo.cc:1: advice: Changing "unordered_set" to "vector" will save about N
indirections and may achieve better data locality.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_to_list"></a>Vector to List</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TO_LIST</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect cases where
  <code class="code">vector</code> could be substituted with <code class="code">list</code> for
  better performance.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Inserting in the middle of a vector is expensive compared to inserting in a
  list.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>factor up to
   container size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace vector with list
  at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record the call context of the constructor.  Record the overhead of each
  <code class="code">insert</code> operation based on current size and insert position.
  Report instance with high insertion overhead.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  (Sum(cost(vector::method)) - Sum(cost(list::method)), for
  method in [push_back, insert, erase])
  + (Cost(iterate vector) - Cost(iterate list))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  vector&lt;int&gt; v;
2  for (int i = 0; i &lt; 10000; ++i) {
3    v.insert(v.begin(), i);
4  }

foo.cc:1: advice: Changing "vector" to "list" will save about 5,000,000
operations.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.list_to_vector"></a>List to Vector</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_LIST_TO_VECTOR</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect cases where
  <code class="code">list</code> could be substituted with <code class="code">vector</code> for
  better performance.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Iterating through a vector is faster than through a list.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>64%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace list with vector
  at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice if there are no <code class="code">insert</code> operations.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
    (Sum(cost(vector::method)) - Sum(cost(list::method)), for
  method in [push_back, insert, erase])
  + (Cost(iterate vector) - Cost(iterate list))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  list&lt;int&gt; l;
...
2  int sum = 0;
3  for (list&lt;int&gt;::iterator it = l.begin(); it != l.end(); ++it) {
4    sum += *it;
5  }

foo.cc:1: advice: Changing "list" to "vector" will save about 1000000 indirect
memory references.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.list_to_slist"></a>List to Forward List (Slist)</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_LIST_TO_SLIST</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect cases where
  <code class="code">list</code> could be substituted with <code class="code">forward_list</code> for
  better performance.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  The memory footprint of a forward_list is smaller than that of a list.
  This has beneficial effects on memory subsystem, e.g., fewer cache misses.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>40%.
  Note that the reduction is only noticeable if the size of the forward_list
  node is in fact larger than that of the list node.  For memory allocators
  with size classes, you will only notice an effect when the two node sizes
  belong to different allocator size classes.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace list with
  forward_list at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">list</code>
  operations and iteration methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice if there are no <code class="code">backwards</code> traversals
  or insertion before a given node.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Always true.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  list&lt;int&gt; l;
...
2  int sum = 0;
3  for (list&lt;int&gt;::iterator it = l.begin(); it != l.end(); ++it) {
4    sum += *it;
5  }

foo.cc:1: advice: Change "list" to "forward_list".
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.assoc_ord_to_unord"></a>Ordered to Unordered Associative Container</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_ORDERED_TO_UNORDERED</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>  Detect cases where ordered
  associative containers can be replaced with unordered ones.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Insert and search are quicker in a hashtable than in
  a red-black tree.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>52%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Replace set with unordered_set at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">set</code>, <code class="code">multiset</code>, <code class="code">map</code>,
  <code class="code">multimap</code> methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice only if we are not using operator <code class="code">++</code> on any
  iterator on a particular <code class="code">[multi]set|map</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  (Sum(cost(hashtable::method)) - Sum(cost(rbtree::method)), for
  method in [insert, erase, find])
  + (Cost(iterate hashtable) - Cost(iterate rbtree))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  set&lt;int&gt; s;
2  for (int i = 0; i &lt; 100000; ++i) {
3    s.insert(i);
4  }
5  int sum = 0;
6  for (int i = 0; i &lt; 100000; ++i) {
7    sum += *s.find(i);
8  }
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.algorithms"></a>Algorithms</h3></div></div></div><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_ALGORITHMS</code>.
  </p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.algorithms.sort"></a>Sort Algorithm Performance</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_SORT</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Give measure of sort algorithm
  performance based on actual input.  For instance, advise Radix Sort over
  Quick Sort for a particular call context.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  See papers:
  <a class="link" href="https://dl.acm.org/citation.cfm?doid=1065944.1065981" target="_top">
  A framework for adaptive algorithm selection in STAPL</a> and
  <a class="link" href="http://ieeexplore.ieee.org/search/wrapper.jsp?arnumber=4228227" target="_top">
  Optimizing Sorting with Machine Learning Algorithms</a>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>60%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Change sort algorithm
  at site S from X Sort to Y Sort.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> <code class="code">sort</code>
  algorithm.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice if the cost model tells us that another sort algorithm
  would do better on this input.  Requires us to know what algorithm we
  are using in our sort implementation in release mode.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Runtime(algo) for algo in [radix, quick, merge, ...]</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.locality"></a>Data Locality</h3></div></div></div><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_LOCALITY</code>.
  </p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.locality.sw_prefetch"></a>Need Software Prefetch</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_SOFTWARE_PREFETCH</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Discover sequences of indirect
  memory accesses that are not regular, thus cannot be predicted by
  hardware prefetchers.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Indirect references are hard to predict and are very expensive when they
  miss in caches.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>25%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Insert prefetch
  instruction.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Vector iterator and
  access operator [].
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  First, get cache line size and page size from system.
  Then record iterator dereference sequences for which the value is a pointer.
  For each sequence within a container, issue a warning if successive pointer
  addresses are not within cache lines and do not form a linear pattern
  (otherwise they may be prefetched by hardware).
  If they also step across page boundaries, make the warning stronger.
  </p><p>The same analysis applies to containers other than vector.
  However, we cannot give the same advice for linked structures, such as list,
  as there is no random access to the n-th element.  The user may still be
  able to benefit from this information, for instance by employing frays (user
  level light weight threads) to hide the latency of chasing pointers.
  </p><p>
  This analysis is a little oversimplified.  A better cost model could be
  created by understanding the capability of the hardware prefetcher.
  This model could be trained automatically by running a set of synthetic
  cases.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total distance between pointer values of successive elements in vectors
  of pointers.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 int zero = 0;
2 vector&lt;int*&gt; v(10000000, &amp;zero);
3 for (int k = 0; k &lt; 10000000; ++k) {
4   v[random() % 10000000] = new int(k);
5 }
6 for (int j = 0; j &lt; 10000000; ++j) {
7   count += (*v[j] == 0 ? 0 : 1);
8 }

foo.cc:7: advice: Insert prefetch instruction.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.locality.linked"></a>Linked Structure Locality</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_RBTREE_LOCALITY</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Give measure of locality of
  objects stored in linked structures (lists, red-black trees and hashtables)
  with respect to their actual traversal patterns.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>Allocation can be tuned
  to a specific traversal pattern, to result in better data locality.
  See paper:
  <a class="link" href="http://www.springerlink.com/content/8085744l00x72662/" target="_top">
  Custom Memory Allocation for Free</a>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>30%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  High scatter score N for container built at site S.
  Consider changing allocation sequence or choosing a structure conscious
  allocator.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Methods of all
  containers using linked structures.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  First, get cache line size and page size from system.
  Then record the number of successive elements that are on different line
  or page, for each traversal method such as <code class="code">find</code>.  Give advice
  only if the ratio between this number and the number of total node hops
  is above a threshold.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Sum(same_cache_line(this,previous))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
 1  set&lt;int&gt; s;
 2  for (int i = 0; i &lt; 10000000; ++i) {
 3    s.insert(i);
 4  }
 5  set&lt;int&gt; s1, s2;
 6  for (int i = 0; i &lt; 10000000; ++i) {
 7    s1.insert(i);
 8    s2.insert(i);
 9  }
...
      // Fast, better locality.
10    for (set&lt;int&gt;::iterator it = s.begin(); it != s.end(); ++it) {
11      sum += *it;
12    }
      // Slow, elements are further apart.
13    for (set&lt;int&gt;::iterator it = s1.begin(); it != s1.end(); ++it) {
14      sum += *it;
15    }

foo.cc:5: advice: High scatter score NNN for set built here.  Consider changing
the allocation sequence or switching to a structure conscious allocator.
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.mthread"></a>Multithreaded Data Access</h3></div></div></div><p>
  The diagnostics in this group are not meant to be implemented short term.
  They require compiler support to know when container elements are written
  to.  Instrumentation can only tell us when elements are referenced.
  </p><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_MULTITHREADED</code>.
  </p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.mthread.ddtest"></a>Data Dependence Violations at Container Level</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_DDTEST</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect container elements
  that are referenced from multiple threads in the parallel region or
  across parallel regions.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Sharing data between threads requires communication and perhaps locking,
  which may be expensive.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>?%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Change data
  distribution or parallel algorithm.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Container access methods
  and iterators.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Keep a shadow for each container.  Record iterator dereferences and
  container member accesses.  Issue advice for elements referenced by
  multiple threads.
  See paper: <a class="link" href="https://dl.acm.org/citation.cfm?id=207110.207148" target="_top">
  The LRPD test: speculative run-time parallelization of loops with
  privatization and reduction parallelization</a>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of accesses to elements referenced from multiple threads
  </p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.mthread.false_share"></a>False Sharing</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_FALSE_SHARING</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect elements in the
  same container which share a cache line, are written by at least one
  thread, and accessed by different threads.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> Under these assumptions,
  cache protocols require
  communication to invalidate lines, which may be expensive.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>68%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Reorganize container
  or use padding to avoid false sharing.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Container access methods
  and iterators.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  First, get the cache line size.
  For each shared container, record all the associated iterator dereferences
  and member access methods with the thread id.  Compare the address lists
  across threads to detect references in two different threads to the same
  cache line.  Issue a warning only if the ratio to total references is
  significant.  Do the same for iterator dereference values if they are
  pointers.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of accesses to same cache line from different threads.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1     vector&lt;int&gt; v(2, 0);
2 #pragma omp parallel for shared(v, SIZE) schedule(static, 1)
3     for (i = 0; i &lt; SIZE; ++i) {
4       v[i % 2] += i;
5     }

OMP_NUM_THREADS=2 ./a.out
foo.cc:1: advice: Change container structure or padding to avoid false
sharing in multithreaded access at foo.cc:4.  Detected N shared cache lines.
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.statistics"></a>Statistics</h3></div></div></div><p>
<span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_STATISTICS</code>.
</p><p>
  In some cases the cost model may not tell us anything because the costs
  appear to offset the benefits.  Consider the choice between a vector and
  a list.  When there are both inserts and iteration, an automatic advice
  may not be issued.  However, the programmer may still be able to make use
  of this information in a different way.
</p><p>
  This diagnostic will not issue any advice, but it will print statistics for
  each container construction site.  The statistics will contain the cost
  of each operation actually performed on the container.
</p></div></div><div class="navfooter"><hr /><table width="100%" summary="Navigation footer"><tr><td width="40%" align="left"><a accesskey="p" href="profile_mode_devel.html">Prev</a> </td><td width="20%" align="center"><a accesskey="u" href="profile_mode.html">Up</a></td><td width="40%" align="right"> <a accesskey="n" href="mt_allocator.html">Next</a></td></tr><tr><td width="40%" align="left" valign="top">Developer Information </td><td width="20%" align="center"><a accesskey="h" href="../index.html">Home</a></td><td width="40%" align="right" valign="top"> Chapter 20. The mt_allocator</td></tr></table></div></body></html>@


1.1.1.1
log
@import GCC 4.8 branch at r206687.

highlights from: http://gcc.gnu.org/gcc-4.6/changes.html

   GCC now has stricter checks for invalid command-line options
   New -Wunused-but-set-variable and -Wunused-but-set-parameter
      warnings
   Many platforms have been obsoleted
   Link-time optimization improvements
   A new switch -fstack-usage has been added
   A new function attribute leaf was introduced
   A new warning, enabled by -Wdouble-promotion
   Support for selectively enabling and disabling warnings via
      #pragma GCC diagnostic has been added
   There is now experimental support for some features from the
      upcoming C1X revision of the ISO C standard
   Improved experimental support for the upcoming C++0x ISO C++
      standard
   G++ now issues clearer diagnostics in several cases
   Updates for ARM, x86, MIPS, PPC/PPC64, SPARC
   Darwin, FreeBSD, Solaris 2, MinGW and Cygwin now all support
      __float128 on 32-bit and 64-bit x86 targets. [*1]

highlights from: http://gcc.gnu.org/gcc-4.7/changes.html

   The -fconserve-space flag has been deprecated
   Support for a new parameter --param case-values-threshold=n
      was added
   Interprocedural and Link-time optimization improvements
   A new built-in, __builtin_assume_aligned, has been added
   A new warning option -Wunused-local-typedefs was added
   A new experimental command-line option -ftrack-macro-expansion
      was added
   Support for atomic operations specifying the C++11/C11 memory
      model has been added
   There is support for some more features from the C11 revision
      of the ISO C standard
   Improved experimental support for the new ISO C++ standard,
      C++11
   Updates for ARM, x86, MIPS, PPC/PPC64, SH, SPARC, TILE*
   A new option (-grecord-gcc-switches) was added

highlights from: http://gcc.gnu.org/gcc-4.8/changes.html

   GCC now uses C++ as its implementation language.  This means
      that to build GCC from sources, you will need a C++
      compiler that understands C++ 2003
   DWARF4 is now the default when generating DWARF debug
      information
   A new general optimization level, -Og, has been introduced
   A new option -ftree-partial-pre was added
   The option -fconserve-space has been removed
   The command-line options -fipa-struct-reorg and
      -fipa-matrix-reorg have been removed
   Interprocedural and Link-time optimization improvements
   AddressSanitizer, a fast memory error detector, has been
      added  [*2]
   A new -Wsizeof-pointer-memaccess warning has been added
   G++ now supports a -std=c++1y option for experimentation
      with features proposed for the next revision of the
      standard, expected around 2014
   Improved experimental support for the new ISO C++ standard,
      C++11
   A new port has been added to support AArch64
   Updates for ARM, x86, MIPS, PPC/PPC64, SH, SPARC, TILE*


[*1] we should support this too!
[*2] we should look into this.
     https://code.google.com/p/address-sanitizer/
@
text
@@


1.1.1.2
log
@import GCC 5.3.0.  see these urls for details which are too large to
include here:

	http://gcc.gnu.org/gcc-4.9/changes.html
	http://gcc.gnu.org/gcc-5/changes.html

(note that GCC 5.x is a release stream like GCC 4.9.x, 4.8.x, etc.)


the main issues we will have are:

The default mode for C is now -std=gnu11 instead of -std=gnu89.

ARM:
The deprecated option -mwords-little-endian has been removed.
The options -mapcs, -mapcs-frame, -mtpcs-frame and -mtpcs-leaf-frame
 which are only applicable to the old ABI have been deprecated.

MIPS:
The o32 ABI has been modified and extended. The o32 64-bit
 floating-point register support is now obsolete and has been removed.
 It has been replaced by three ABI extensions FPXX, FP64A, and FP64.
 The meaning of the -mfp64 command-line option has changed. It is now
 used to enable the FP64A and FP64 ABI extensions.
@
text
@d20 1
a20 1
  </p><div class="table"><a id="table.profile_diagnostics"></a><p class="title"><strong>Table 19.2. Profile Diagnostics</strong></p><div class="table-contents"><table summary="Profile Diagnostics" border="1"><colgroup><col align="left" class="c1" /><col align="left" class="c2" /><col align="left" class="c3" /><col align="left" class="c4" /><col align="left" class="c5" /><col align="left" class="c6" /><col align="left" class="c7" /></colgroup><thead><tr><th align="left">Group</th><th align="left">Flag</th><th align="left">Benefit</th><th align="left">Cost</th><th align="left">Freq.</th><th align="left">Implemented</th><td class="auto-generated"> </td></tr></thead><tbody><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.containers" title="Containers">
@


1.1.1.2.16.1
log
@Sync with HEAD
@
text
@d2 1
a2 1
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>Diagnostics</title><meta name="generator" content="DocBook XSL Stylesheets Vsnapshot" /><meta name="keywords" content="C++, library, profile" /><meta name="keywords" content="ISO C++, library" /><meta name="keywords" content="ISO C++, runtime, library" /><link rel="home" href="../index.html" title="The GNU C++ Library" /><link rel="up" href="profile_mode.html" title="Chapter 19. Profile Mode" /><link rel="prev" href="profile_mode_devel.html" title="Developer Information" /><link rel="next" href="mt_allocator.html" title="Chapter 20. The mt_allocator" /></head><body><div class="navheader"><table width="100%" summary="Navigation header"><tr><th colspan="3" align="center">Diagnostics</th></tr><tr><td width="20%" align="left"><a accesskey="p" href="profile_mode_devel.html">Prev</a> </td><th width="60%" align="center">Chapter 19. Profile Mode</th><td width="20%" align="right"> <a accesskey="n" href="mt_allocator.html">Next</a></td></tr></table><hr /></div><div class="section"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a id="manual.ext.profile_mode.diagnostics"></a>Diagnostics</h2></div></div></div><p>
d20 1
a20 1
  </p><div class="table"><a id="table.profile_diagnostics"></a><p class="title"><strong>Table 19.2. Profile Diagnostics</strong></p><div class="table-contents"><table class="table" summary="Profile Diagnostics" border="1"><colgroup><col align="left" class="c1" /><col align="left" class="c2" /><col align="left" class="c3" /><col align="left" class="c4" /><col align="left" class="c5" /><col align="left" class="c6" /><col align="left" class="c7" /></colgroup><thead><tr><th align="left">Group</th><th align="left">Flag</th><th align="left">Benefit</th><th align="left">Cost</th><th align="left">Freq.</th><th align="left">Implemented</th><td class="auto-generated"> </td></tr></thead><tbody><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.containers" title="Containers">
d379 1
a379 1
  <a class="link" href="http://ieeexplore.ieee.org/document/4228227/" target="_top">
d444 2
a445 2
  <a class="link" href="https://parasol.tamu.edu/publications/download.php?file_id=570" target="_top">
  Custom Memory Allocation for Free</a> by Jula and Rauchwerger.
@


1.1.1.2.14.1
log
@Sync with HEAD, resolve a couple of conflicts
@
text
@d2 1
a2 1
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>Diagnostics</title><meta name="generator" content="DocBook XSL Stylesheets Vsnapshot" /><meta name="keywords" content="C++, library, profile" /><meta name="keywords" content="ISO C++, library" /><meta name="keywords" content="ISO C++, runtime, library" /><link rel="home" href="../index.html" title="The GNU C++ Library" /><link rel="up" href="profile_mode.html" title="Chapter 19. Profile Mode" /><link rel="prev" href="profile_mode_devel.html" title="Developer Information" /><link rel="next" href="mt_allocator.html" title="Chapter 20. The mt_allocator" /></head><body><div class="navheader"><table width="100%" summary="Navigation header"><tr><th colspan="3" align="center">Diagnostics</th></tr><tr><td width="20%" align="left"><a accesskey="p" href="profile_mode_devel.html">Prev</a> </td><th width="60%" align="center">Chapter 19. Profile Mode</th><td width="20%" align="right"> <a accesskey="n" href="mt_allocator.html">Next</a></td></tr></table><hr /></div><div class="section"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a id="manual.ext.profile_mode.diagnostics"></a>Diagnostics</h2></div></div></div><p>
d20 1
a20 1
  </p><div class="table"><a id="table.profile_diagnostics"></a><p class="title"><strong>Table 19.2. Profile Diagnostics</strong></p><div class="table-contents"><table class="table" summary="Profile Diagnostics" border="1"><colgroup><col align="left" class="c1" /><col align="left" class="c2" /><col align="left" class="c3" /><col align="left" class="c4" /><col align="left" class="c5" /><col align="left" class="c6" /><col align="left" class="c7" /></colgroup><thead><tr><th align="left">Group</th><th align="left">Flag</th><th align="left">Benefit</th><th align="left">Cost</th><th align="left">Freq.</th><th align="left">Implemented</th><td class="auto-generated"> </td></tr></thead><tbody><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.containers" title="Containers">
@


1.1.1.2.14.2
log
@Sync with HEAD
@
text
@d379 1
a379 1
  <a class="link" href="http://ieeexplore.ieee.org/document/4228227/" target="_top">
d444 2
a445 2
  <a class="link" href="https://parasol.tamu.edu/publications/download.php?file_id=570" target="_top">
  Custom Memory Allocation for Free</a> by Jula and Rauchwerger.
@


1.1.1.3
log
@import GCC 6.5.0.  this is largely a maint release with no
particularly features listed here:

   http://gcc.gnu.org/gcc-6/changes.html

this fixes over 250 PRs in the GCC bugzilla:

   https://gcc.gnu.org/bugzilla/buglist.cgi?bug_status=RESOLVED&resolution=FIXED&target_milestone=6.5
@
text
@d2 1
a2 1
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>Diagnostics</title><meta name="generator" content="DocBook XSL Stylesheets Vsnapshot" /><meta name="keywords" content="C++, library, profile" /><meta name="keywords" content="ISO C++, library" /><meta name="keywords" content="ISO C++, runtime, library" /><link rel="home" href="../index.html" title="The GNU C++ Library" /><link rel="up" href="profile_mode.html" title="Chapter 19. Profile Mode" /><link rel="prev" href="profile_mode_devel.html" title="Developer Information" /><link rel="next" href="mt_allocator.html" title="Chapter 20. The mt_allocator" /></head><body><div class="navheader"><table width="100%" summary="Navigation header"><tr><th colspan="3" align="center">Diagnostics</th></tr><tr><td width="20%" align="left"><a accesskey="p" href="profile_mode_devel.html">Prev</a> </td><th width="60%" align="center">Chapter 19. Profile Mode</th><td width="20%" align="right"> <a accesskey="n" href="mt_allocator.html">Next</a></td></tr></table><hr /></div><div class="section"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a id="manual.ext.profile_mode.diagnostics"></a>Diagnostics</h2></div></div></div><p>
d20 1
a20 1
  </p><div class="table"><a id="table.profile_diagnostics"></a><p class="title"><strong>Table 19.2. Profile Diagnostics</strong></p><div class="table-contents"><table class="table" summary="Profile Diagnostics" border="1"><colgroup><col align="left" class="c1" /><col align="left" class="c2" /><col align="left" class="c3" /><col align="left" class="c4" /><col align="left" class="c5" /><col align="left" class="c6" /><col align="left" class="c7" /></colgroup><thead><tr><th align="left">Group</th><th align="left">Flag</th><th align="left">Benefit</th><th align="left">Cost</th><th align="left">Freq.</th><th align="left">Implemented</th><td class="auto-generated"> </td></tr></thead><tbody><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.containers" title="Containers">
@


1.1.1.4
log
@import GCC 7.4.0.  main changes include:

The non-standard C++0x type traits has_trivial_default_constructor,
has_trivial_copy_constructor and has_trivial_copy_assign have been
removed.

On ARM targets (arm*-*-*), a bug introduced in GCC 5 that affects
conformance to the procedure call standard (AAPCS) has been fixed.

Many optimiser improvements

DWARF-5 support.

Many new and enhanced warnings.

Warnings about format strings now underline the pertinent part of
the string, and can offer suggested fixes.

Several new warnings related to buffer overflows and buffer
truncation.

New __builtin_add_overflow_p, __builtin_sub_overflow_p,
__builtin_mul_overflow_p built-ins added that test for overflow.

The C++ front end has experimental support for all of the current
C++17 draft.

The -fverbose-asm option has been expanded to prints comments
showing the source lines that correspond to the assembly.

The gcc and g++ driver programs will now provide suggestions for
misspelled arguments to command-line options.


AArch64 specific:

GCC has been updated to the latest revision of the procedure call
standard (AAPCS64) to provide support for parameter passing when
data types have been over-aligned.

The ARMv8.2-A and ARMv8.3-A architecture are now supported.

ARM specific:

Support for the ARMv5 and ARMv5E architectures has been
deprecated (which have no known implementations).

A new command-line option -mpure-code has been added. It does not
allow constant data to be placed in code sections.

x86 specific:

Support for the AVX-512 4FMAPS, 4VNNIW, VPOPCNTDQ and Software
Guard Extensions (SGX) ISA extensions has been added.

PPC specific:

GCC now diagnoses inline assembly that clobbers register r2.

RISC-V specific:

Support for the RISC-V instruction set has been added.

SH specific:

Support for SH5/SH64 has been removed.

Support for SH2A has been enhanced.
@
text
@d379 1
a379 1
  <a class="link" href="http://ieeexplore.ieee.org/document/4228227/" target="_top">
d444 2
a445 2
  <a class="link" href="https://parasol.tamu.edu/publications/download.php?file_id=570" target="_top">
  Custom Memory Allocation for Free</a> by Jula and Rauchwerger.
@


1.1.1.5
log
@initial import of GCC 9.3.0.  changes include:

- live patching support
- shell completion help
- generally better diagnostic output (less verbose/more useful)
- diagnostics and optimisation choices can be emitted in json
- asan memory usage reduction
- many general, and specific to switch, inter-procedure,
  profile and link-time optimisations.  from the release notes:
  "Overall compile time of Firefox 66 and LibreOffice 6.2.3 on
  an 8-core machine was reduced by about 5% compared to GCC 8.3"
- OpenMP 5.0 support
- better spell-guesser
- partial experimental support for c2x and c++2a
- c++17 is no longer experimental
- arm AAPCS GCC 6-8 structure passing bug fixed, may cause
  incompatibility (restored compat with GCC 5 and earlier.)
- openrisc support
@
text
@d379 1
a379 1
  <a class="link" href="https://ieeexplore.ieee.org/document/4228227/" target="_top">
@


1.1.1.1.8.1
log
@file profile_mode_diagnostics.html was added on branch tls-maxphys on 2014-08-19 23:54:46 +0000
@
text
@d1 557
@


1.1.1.1.8.2
log
@Rebase to HEAD as of a few days ago.
@
text
@a0 557
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>Diagnostics</title><meta name="generator" content="DocBook XSL-NS Stylesheets V1.78.1" /><meta name="keywords" content="C++, library, profile" /><meta name="keywords" content="ISO C++, library" /><meta name="keywords" content="ISO C++, runtime, library" /><link rel="home" href="../index.html" title="The GNU C++ Library" /><link rel="up" href="profile_mode.html" title="Chapter 19. Profile Mode" /><link rel="prev" href="profile_mode_devel.html" title="Developer Information" /><link rel="next" href="mt_allocator.html" title="Chapter 20. The mt_allocator" /></head><body><div class="navheader"><table width="100%" summary="Navigation header"><tr><th colspan="3" align="center">Diagnostics</th></tr><tr><td width="20%" align="left"><a accesskey="p" href="profile_mode_devel.html">Prev</a> </td><th width="60%" align="center">Chapter 19. Profile Mode</th><td width="20%" align="right"> <a accesskey="n" href="mt_allocator.html">Next</a></td></tr></table><hr /></div><div class="section"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a id="manual.ext.profile_mode.diagnostics"></a>Diagnostics</h2></div></div></div><p>
  The table below presents all the diagnostics we intend to implement.
  Each diagnostic has a corresponding compile time switch
  <code class="code">-D_GLIBCXX_PROFILE_&lt;diagnostic&gt;</code>.
  Groups of related diagnostics can be turned on with a single switch.
  For instance, <code class="code">-D_GLIBCXX_PROFILE_LOCALITY</code> is equivalent to
  <code class="code">-D_GLIBCXX_PROFILE_SOFTWARE_PREFETCH
  -D_GLIBCXX_PROFILE_RBTREE_LOCALITY</code>.
  </p><p>
  The benefit, cost, expected frequency and accuracy of each diagnostic
  was given a grade from 1 to 10, where 10 is highest.
  A high benefit means that, if the diagnostic is accurate, the expected
  performance improvement is high.
  A high cost means that turning this diagnostic on leads to high slowdown.
  A high frequency means that we expect this to occur relatively often.
  A high accuracy means that the diagnostic is unlikely to be wrong.
  These grades are not perfect.  They are just meant to guide users with
  specific needs or time budgets.
  </p><div class="table"><a id="idm269998442544"></a><p class="title"><strong>Table 19.2. Profile Diagnostics</strong></p><div class="table-contents"><table summary="Profile Diagnostics" border="1"><colgroup><col align="left" class="c1" /><col align="left" class="c2" /><col align="left" class="c3" /><col align="left" class="c4" /><col align="left" class="c5" /><col align="left" class="c6" /><col align="left" class="c7" /></colgroup><thead><tr><th align="left">Group</th><th align="left">Flag</th><th align="left">Benefit</th><th align="left">Cost</th><th align="left">Freq.</th><th align="left">Implemented</th><td class="auto-generated"> </td></tr></thead><tbody><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.containers" title="Containers">
    CONTAINERS</a></td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.hashtable_too_small" title="Hashtable Too Small">
    HASHTABLE_TOO_SMALL</a></td><td align="left">10</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.hashtable_too_large" title="Hashtable Too Large">
    HASHTABLE_TOO_LARGE</a></td><td align="left">5</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.inefficient_hash" title="Inefficient Hash">
    INEFFICIENT_HASH</a></td><td align="left">7</td><td align="left">3</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_too_small" title="Vector Too Small">
    VECTOR_TOO_SMALL</a></td><td align="left">8</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_too_large" title="Vector Too Large">
    VECTOR_TOO_LARGE</a></td><td align="left">5</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_to_hashtable" title="Vector to Hashtable">
    VECTOR_TO_HASHTABLE</a></td><td align="left">7</td><td align="left">7</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.hashtable_to_vector" title="Hashtable to Vector">
    HASHTABLE_TO_VECTOR</a></td><td align="left">7</td><td align="left">7</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_to_list" title="Vector to List">
    VECTOR_TO_LIST</a></td><td align="left">8</td><td align="left">5</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.list_to_vector" title="List to Vector">
    LIST_TO_VECTOR</a></td><td align="left">10</td><td align="left">5</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.assoc_ord_to_unord" title="Ordered to Unordered Associative Container">
    ORDERED_TO_UNORDERED</a></td><td align="left">10</td><td align="left">5</td><td align="left"> </td><td align="left">10</td><td align="left">only map/unordered_map</td></tr><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.algorithms" title="Algorithms">
    ALGORITHMS</a></td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.algorithms.sort" title="Sort Algorithm Performance">
    SORT</a></td><td align="left">7</td><td align="left">8</td><td align="left"> </td><td align="left">7</td><td align="left">no</td></tr><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.locality" title="Data Locality">
    LOCALITY</a></td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.locality.sw_prefetch" title="Need Software Prefetch">
    SOFTWARE_PREFETCH</a></td><td align="left">8</td><td align="left">8</td><td align="left"> </td><td align="left">5</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.locality.linked" title="Linked Structure Locality">
    RBTREE_LOCALITY</a></td><td align="left">4</td><td align="left">8</td><td align="left"> </td><td align="left">5</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.mthread.false_share" title="False Sharing">
    FALSE_SHARING</a></td><td align="left">8</td><td align="left">10</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr></tbody></table></div></div><br class="table-break" /><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.template"></a>Diagnostic Template</h3></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_&lt;diagnostic&gt;</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>  What problem will it diagnose?
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>.
  What is the fundamental reason why this is a problem</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>
  Percentage reduction in execution time.  When reduction is more than
  a constant factor, describe the reduction rate formula.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  What would the advise look like?</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  What stdlibc++ components need to be instrumented?</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  How do we decide when to issue the advice?</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  How do we measure benefits?  Math goes here.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
program code
...
advice sample
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.containers"></a>Containers</h3></div></div></div><p>
<span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_CONTAINERS</code>.
</p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.hashtable_too_small"></a>Hashtable Too Small</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_HASHTABLE_TOO_SMALL</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect hashtables with many
  rehash operations, small construction size and large destruction size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> Rehash is very expensive.
  Read content, follow chains within bucket, evaluate hash function, place at
  new location in different order.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span> 36%.
  Code similar to example below.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">unordered_set, unordered_map</code> constructor, destructor, rehash.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">unordered_[multi]set|map</code>,
  record initial size and call context of the constructor.
  Record size increase, if any, after each relevant operation such as insert.
  Record the estimated rehash cost.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of individual rehash operations * cost per rehash.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 unordered_set&lt;int&gt; us;
2 for (int k = 0; k &lt; 1000000; ++k) {
3   us.insert(k);
4 }

foo.cc:1: advice: Changing initial unordered_set size from 10 to 1000000 saves 1025530 rehash operations.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.hashtable_too_large"></a>Hashtable Too Large</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_HASHTABLE_TOO_LARGE</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect hashtables which are
  never filled up because fewer elements than reserved are ever
  inserted.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> Save memory, which
  is good in itself and may also improve memory reference performance through
  fewer cache and TLB misses.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span> unknown.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">unordered_set, unordered_map</code> constructor, destructor, rehash.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">unordered_[multi]set|map</code>,
  record initial size and call context of the constructor, and correlate it
  with its size at destruction time.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of iteration operations + memory saved.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 vector&lt;unordered_set&lt;int&gt;&gt; v(100000, unordered_set&lt;int&gt;(100)) ;
2 for (int k = 0; k &lt; 100000; ++k) {
3   for (int j = 0; j &lt; 10; ++j) {
4     v[k].insert(k + j);
5  }
6 }

foo.cc:1: advice: Changing initial unordered_set size from 100 to 10 saves N
bytes of memory and M iteration steps.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.inefficient_hash"></a>Inefficient Hash</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_INEFFICIENT_HASH</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect hashtables with polarized
  distribution.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> A non-uniform
  distribution may lead to long chains, thus possibly increasing complexity
  by a factor up to the number of elements.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span> factor up
   to container size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Change hash function
  for container built at site S.  Distribution score = N.  Access score = S.
  Longest chain = C, in bucket B.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">unordered_set, unordered_map</code> constructor, destructor, [],
  insert, iterator.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Count the exact number of link traversals.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total number of links traversed.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
class dumb_hash {
 public:
  size_t operator() (int i) const { return 0; }
};
...
  unordered_set&lt;int, dumb_hash&gt; hs;
  ...
  for (int i = 0; i &lt; COUNT; ++i) {
    hs.find(i);
  }
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_too_small"></a>Vector Too Small</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TOO_SMALL</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>Detect vectors with many
  resize operations, small construction size and large destruction size..
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>Resizing can be expensive.
  Copying large amounts of data takes time.  Resizing many small vectors may
  have allocation overhead and affect locality.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record initial size and call context of the constructor.
  Record size increase, if any, after each relevant operation such as
  <code class="code">push_back</code>.  Record the estimated resize cost.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total number of words copied * time to copy a word.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 vector&lt;int&gt; v;
2 for (int k = 0; k &lt; 1000000; ++k) {
3   v.push_back(k);
4 }

foo.cc:1: advice: Changing initial vector size from 10 to 1000000 saves
copying 4000000 bytes and 20 memory allocations and deallocations.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_too_large"></a>Vector Too Large</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TOO_LARGE</code>
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>Detect vectors which are
  never filled up because fewer elements than reserved are ever
  inserted.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>Save memory, which
  is good in itself and may also improve memory reference performance through
  fewer cache and TLB misses.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record initial size and call context of the constructor, and correlate it
  with its size at destruction time.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total amount of memory saved.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 vector&lt;vector&lt;int&gt;&gt; v(100000, vector&lt;int&gt;(100)) ;
2 for (int k = 0; k &lt; 100000; ++k) {
3   for (int j = 0; j &lt; 10; ++j) {
4     v[k].insert(k + j);
5  }
6 }

foo.cc:1: advice: Changing initial vector size from 100 to 10 saves N
bytes of memory and may reduce the number of cache and TLB misses.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_to_hashtable"></a>Vector to Hashtable</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TO_HASHTABLE</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect uses of
  <code class="code">vector</code> that can be substituted with <code class="code">unordered_set</code>
  to reduce execution time.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Linear search in a vector is very expensive, whereas searching in a hashtable
  is very quick.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>factor up
   to container size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace
  <code class="code">vector</code> with <code class="code">unordered_set</code> at site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record call context of the constructor.  Issue the advice only if the
  only methods called on this <code class="code">vector</code> are <code class="code">push_back</code>,
  <code class="code">insert</code> and <code class="code">find</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Cost(vector::push_back) + cost(vector::insert) + cost(find, vector) -
  cost(unordered_set::insert) + cost(unordered_set::find).
  </p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  vector&lt;int&gt; v;
...
2  for (int i = 0; i &lt; 1000; ++i) {
3    find(v.begin(), v.end(), i);
4  }

foo.cc:1: advice: Changing "vector" to "unordered_set" will save about 500,000
comparisons.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.hashtable_to_vector"></a>Hashtable to Vector</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_HASHTABLE_TO_VECTOR</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect uses of
  <code class="code">unordered_set</code> that can be substituted with <code class="code">vector</code>
  to reduce execution time.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Hashtable iterator is slower than vector iterator.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>95%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace
  <code class="code">unordered_set</code> with <code class="code">vector</code> at site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">unordered_set</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">unordered_set</code>,
  record call context of the constructor.  Issue the advice only if the
  number of <code class="code">find</code>, <code class="code">insert</code> and <code class="code">[]</code>
  operations on this <code class="code">unordered_set</code> are small relative to the
  number of elements, and methods <code class="code">begin</code> or <code class="code">end</code>
  are invoked (suggesting iteration).</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of .</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  unordered_set&lt;int&gt; us;
...
2  int s = 0;
3  for (unordered_set&lt;int&gt;::iterator it = us.begin(); it != us.end(); ++it) {
4    s += *it;
5  }

foo.cc:1: advice: Changing "unordered_set" to "vector" will save about N
indirections and may achieve better data locality.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_to_list"></a>Vector to List</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TO_LIST</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect cases where
  <code class="code">vector</code> could be substituted with <code class="code">list</code> for
  better performance.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Inserting in the middle of a vector is expensive compared to inserting in a
  list.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>factor up to
   container size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace vector with list
  at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record the call context of the constructor.  Record the overhead of each
  <code class="code">insert</code> operation based on current size and insert position.
  Report instance with high insertion overhead.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  (Sum(cost(vector::method)) - Sum(cost(list::method)), for
  method in [push_back, insert, erase])
  + (Cost(iterate vector) - Cost(iterate list))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  vector&lt;int&gt; v;
2  for (int i = 0; i &lt; 10000; ++i) {
3    v.insert(v.begin(), i);
4  }

foo.cc:1: advice: Changing "vector" to "list" will save about 5,000,000
operations.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.list_to_vector"></a>List to Vector</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_LIST_TO_VECTOR</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect cases where
  <code class="code">list</code> could be substituted with <code class="code">vector</code> for
  better performance.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Iterating through a vector is faster than through a list.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>64%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace list with vector
  at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice if there are no <code class="code">insert</code> operations.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
    (Sum(cost(vector::method)) - Sum(cost(list::method)), for
  method in [push_back, insert, erase])
  + (Cost(iterate vector) - Cost(iterate list))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  list&lt;int&gt; l;
...
2  int sum = 0;
3  for (list&lt;int&gt;::iterator it = l.begin(); it != l.end(); ++it) {
4    sum += *it;
5  }

foo.cc:1: advice: Changing "list" to "vector" will save about 1000000 indirect
memory references.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.list_to_slist"></a>List to Forward List (Slist)</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_LIST_TO_SLIST</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect cases where
  <code class="code">list</code> could be substituted with <code class="code">forward_list</code> for
  better performance.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  The memory footprint of a forward_list is smaller than that of a list.
  This has beneficial effects on memory subsystem, e.g., fewer cache misses.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>40%.
  Note that the reduction is only noticeable if the size of the forward_list
  node is in fact larger than that of the list node.  For memory allocators
  with size classes, you will only notice an effect when the two node sizes
  belong to different allocator size classes.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace list with
  forward_list at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">list</code>
  operations and iteration methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice if there are no <code class="code">backwards</code> traversals
  or insertion before a given node.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Always true.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  list&lt;int&gt; l;
...
2  int sum = 0;
3  for (list&lt;int&gt;::iterator it = l.begin(); it != l.end(); ++it) {
4    sum += *it;
5  }

foo.cc:1: advice: Change "list" to "forward_list".
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.assoc_ord_to_unord"></a>Ordered to Unordered Associative Container</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_ORDERED_TO_UNORDERED</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>  Detect cases where ordered
  associative containers can be replaced with unordered ones.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Insert and search are quicker in a hashtable than in
  a red-black tree.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>52%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Replace set with unordered_set at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">set</code>, <code class="code">multiset</code>, <code class="code">map</code>,
  <code class="code">multimap</code> methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice only if we are not using operator <code class="code">++</code> on any
  iterator on a particular <code class="code">[multi]set|map</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  (Sum(cost(hashtable::method)) - Sum(cost(rbtree::method)), for
  method in [insert, erase, find])
  + (Cost(iterate hashtable) - Cost(iterate rbtree))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  set&lt;int&gt; s;
2  for (int i = 0; i &lt; 100000; ++i) {
3    s.insert(i);
4  }
5  int sum = 0;
6  for (int i = 0; i &lt; 100000; ++i) {
7    sum += *s.find(i);
8  }
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.algorithms"></a>Algorithms</h3></div></div></div><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_ALGORITHMS</code>.
  </p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.algorithms.sort"></a>Sort Algorithm Performance</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_SORT</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Give measure of sort algorithm
  performance based on actual input.  For instance, advise Radix Sort over
  Quick Sort for a particular call context.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  See papers:
  <a class="link" href="https://dl.acm.org/citation.cfm?doid=1065944.1065981" target="_top">
  A framework for adaptive algorithm selection in STAPL</a> and
  <a class="link" href="http://ieeexplore.ieee.org/search/wrapper.jsp?arnumber=4228227" target="_top">
  Optimizing Sorting with Machine Learning Algorithms</a>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>60%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Change sort algorithm
  at site S from X Sort to Y Sort.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> <code class="code">sort</code>
  algorithm.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice if the cost model tells us that another sort algorithm
  would do better on this input.  Requires us to know what algorithm we
  are using in our sort implementation in release mode.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Runtime(algo) for algo in [radix, quick, merge, ...]</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.locality"></a>Data Locality</h3></div></div></div><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_LOCALITY</code>.
  </p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.locality.sw_prefetch"></a>Need Software Prefetch</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_SOFTWARE_PREFETCH</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Discover sequences of indirect
  memory accesses that are not regular, thus cannot be predicted by
  hardware prefetchers.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Indirect references are hard to predict and are very expensive when they
  miss in caches.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>25%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Insert prefetch
  instruction.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Vector iterator and
  access operator [].
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  First, get cache line size and page size from system.
  Then record iterator dereference sequences for which the value is a pointer.
  For each sequence within a container, issue a warning if successive pointer
  addresses are not within cache lines and do not form a linear pattern
  (otherwise they may be prefetched by hardware).
  If they also step across page boundaries, make the warning stronger.
  </p><p>The same analysis applies to containers other than vector.
  However, we cannot give the same advice for linked structures, such as list,
  as there is no random access to the n-th element.  The user may still be
  able to benefit from this information, for instance by employing frays (user
  level light weight threads) to hide the latency of chasing pointers.
  </p><p>
  This analysis is a little oversimplified.  A better cost model could be
  created by understanding the capability of the hardware prefetcher.
  This model could be trained automatically by running a set of synthetic
  cases.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total distance between pointer values of successive elements in vectors
  of pointers.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 int zero = 0;
2 vector&lt;int*&gt; v(10000000, &amp;zero);
3 for (int k = 0; k &lt; 10000000; ++k) {
4   v[random() % 10000000] = new int(k);
5 }
6 for (int j = 0; j &lt; 10000000; ++j) {
7   count += (*v[j] == 0 ? 0 : 1);
8 }

foo.cc:7: advice: Insert prefetch instruction.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.locality.linked"></a>Linked Structure Locality</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_RBTREE_LOCALITY</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Give measure of locality of
  objects stored in linked structures (lists, red-black trees and hashtables)
  with respect to their actual traversal patterns.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>Allocation can be tuned
  to a specific traversal pattern, to result in better data locality.
  See paper:
  <a class="link" href="http://www.springerlink.com/content/8085744l00x72662/" target="_top">
  Custom Memory Allocation for Free</a>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>30%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  High scatter score N for container built at site S.
  Consider changing allocation sequence or choosing a structure conscious
  allocator.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Methods of all
  containers using linked structures.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  First, get cache line size and page size from system.
  Then record the number of successive elements that are on different line
  or page, for each traversal method such as <code class="code">find</code>.  Give advice
  only if the ratio between this number and the number of total node hops
  is above a threshold.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Sum(same_cache_line(this,previous))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
 1  set&lt;int&gt; s;
 2  for (int i = 0; i &lt; 10000000; ++i) {
 3    s.insert(i);
 4  }
 5  set&lt;int&gt; s1, s2;
 6  for (int i = 0; i &lt; 10000000; ++i) {
 7    s1.insert(i);
 8    s2.insert(i);
 9  }
...
      // Fast, better locality.
10    for (set&lt;int&gt;::iterator it = s.begin(); it != s.end(); ++it) {
11      sum += *it;
12    }
      // Slow, elements are further apart.
13    for (set&lt;int&gt;::iterator it = s1.begin(); it != s1.end(); ++it) {
14      sum += *it;
15    }

foo.cc:5: advice: High scatter score NNN for set built here.  Consider changing
the allocation sequence or switching to a structure conscious allocator.
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.mthread"></a>Multithreaded Data Access</h3></div></div></div><p>
  The diagnostics in this group are not meant to be implemented short term.
  They require compiler support to know when container elements are written
  to.  Instrumentation can only tell us when elements are referenced.
  </p><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_MULTITHREADED</code>.
  </p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.mthread.ddtest"></a>Data Dependence Violations at Container Level</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_DDTEST</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect container elements
  that are referenced from multiple threads in the parallel region or
  across parallel regions.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Sharing data between threads requires communication and perhaps locking,
  which may be expensive.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>?%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Change data
  distribution or parallel algorithm.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Container access methods
  and iterators.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Keep a shadow for each container.  Record iterator dereferences and
  container member accesses.  Issue advice for elements referenced by
  multiple threads.
  See paper: <a class="link" href="https://dl.acm.org/citation.cfm?id=207110.207148" target="_top">
  The LRPD test: speculative run-time parallelization of loops with
  privatization and reduction parallelization</a>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of accesses to elements referenced from multiple threads
  </p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.mthread.false_share"></a>False Sharing</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_FALSE_SHARING</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect elements in the
  same container which share a cache line, are written by at least one
  thread, and accessed by different threads.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> Under these assumptions,
  cache protocols require
  communication to invalidate lines, which may be expensive.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>68%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Reorganize container
  or use padding to avoid false sharing.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Container access methods
  and iterators.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  First, get the cache line size.
  For each shared container, record all the associated iterator dereferences
  and member access methods with the thread id.  Compare the address lists
  across threads to detect references in two different threads to the same
  cache line.  Issue a warning only if the ratio to total references is
  significant.  Do the same for iterator dereference values if they are
  pointers.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of accesses to same cache line from different threads.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1     vector&lt;int&gt; v(2, 0);
2 #pragma omp parallel for shared(v, SIZE) schedule(static, 1)
3     for (i = 0; i &lt; SIZE; ++i) {
4       v[i % 2] += i;
5     }

OMP_NUM_THREADS=2 ./a.out
foo.cc:1: advice: Change container structure or padding to avoid false
sharing in multithreaded access at foo.cc:4.  Detected N shared cache lines.
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.statistics"></a>Statistics</h3></div></div></div><p>
<span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_STATISTICS</code>.
</p><p>
  In some cases the cost model may not tell us anything because the costs
  appear to offset the benefits.  Consider the choice between a vector and
  a list.  When there are both inserts and iteration, an automatic advice
  may not be issued.  However, the programmer may still be able to make use
  of this information in a different way.
</p><p>
  This diagnostic will not issue any advice, but it will print statistics for
  each container construction site.  The statistics will contain the cost
  of each operation actually performed on the container.
</p></div></div><div class="navfooter"><hr /><table width="100%" summary="Navigation footer"><tr><td width="40%" align="left"><a accesskey="p" href="profile_mode_devel.html">Prev</a> </td><td width="20%" align="center"><a accesskey="u" href="profile_mode.html">Up</a></td><td width="40%" align="right"> <a accesskey="n" href="mt_allocator.html">Next</a></td></tr><tr><td width="40%" align="left" valign="top">Developer Information </td><td width="20%" align="center"><a accesskey="h" href="../index.html">Home</a></td><td width="40%" align="right" valign="top"> Chapter 20. The mt_allocator</td></tr></table></div></body></html>@


1.1.1.1.4.1
log
@file profile_mode_diagnostics.html was added on branch yamt-pagecache on 2014-05-22 16:37:45 +0000
@
text
@d1 557
@


1.1.1.1.4.2
log
@sync with head.

for a reference, the tree before this commit was tagged
as yamt-pagecache-tag8.

this commit was splitted into small chunks to avoid
a limitation of cvs.  ("Protocol error: too many arguments")
@
text
@a0 557
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>Diagnostics</title><meta name="generator" content="DocBook XSL-NS Stylesheets V1.78.1" /><meta name="keywords" content="C++, library, profile" /><meta name="keywords" content="ISO C++, library" /><meta name="keywords" content="ISO C++, runtime, library" /><link rel="home" href="../index.html" title="The GNU C++ Library" /><link rel="up" href="profile_mode.html" title="Chapter 19. Profile Mode" /><link rel="prev" href="profile_mode_devel.html" title="Developer Information" /><link rel="next" href="mt_allocator.html" title="Chapter 20. The mt_allocator" /></head><body><div class="navheader"><table width="100%" summary="Navigation header"><tr><th colspan="3" align="center">Diagnostics</th></tr><tr><td width="20%" align="left"><a accesskey="p" href="profile_mode_devel.html">Prev</a> </td><th width="60%" align="center">Chapter 19. Profile Mode</th><td width="20%" align="right"> <a accesskey="n" href="mt_allocator.html">Next</a></td></tr></table><hr /></div><div class="section"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a id="manual.ext.profile_mode.diagnostics"></a>Diagnostics</h2></div></div></div><p>
  The table below presents all the diagnostics we intend to implement.
  Each diagnostic has a corresponding compile time switch
  <code class="code">-D_GLIBCXX_PROFILE_&lt;diagnostic&gt;</code>.
  Groups of related diagnostics can be turned on with a single switch.
  For instance, <code class="code">-D_GLIBCXX_PROFILE_LOCALITY</code> is equivalent to
  <code class="code">-D_GLIBCXX_PROFILE_SOFTWARE_PREFETCH
  -D_GLIBCXX_PROFILE_RBTREE_LOCALITY</code>.
  </p><p>
  The benefit, cost, expected frequency and accuracy of each diagnostic
  was given a grade from 1 to 10, where 10 is highest.
  A high benefit means that, if the diagnostic is accurate, the expected
  performance improvement is high.
  A high cost means that turning this diagnostic on leads to high slowdown.
  A high frequency means that we expect this to occur relatively often.
  A high accuracy means that the diagnostic is unlikely to be wrong.
  These grades are not perfect.  They are just meant to guide users with
  specific needs or time budgets.
  </p><div class="table"><a id="idm269998442544"></a><p class="title"><strong>Table 19.2. Profile Diagnostics</strong></p><div class="table-contents"><table summary="Profile Diagnostics" border="1"><colgroup><col align="left" class="c1" /><col align="left" class="c2" /><col align="left" class="c3" /><col align="left" class="c4" /><col align="left" class="c5" /><col align="left" class="c6" /><col align="left" class="c7" /></colgroup><thead><tr><th align="left">Group</th><th align="left">Flag</th><th align="left">Benefit</th><th align="left">Cost</th><th align="left">Freq.</th><th align="left">Implemented</th><td class="auto-generated"> </td></tr></thead><tbody><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.containers" title="Containers">
    CONTAINERS</a></td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.hashtable_too_small" title="Hashtable Too Small">
    HASHTABLE_TOO_SMALL</a></td><td align="left">10</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.hashtable_too_large" title="Hashtable Too Large">
    HASHTABLE_TOO_LARGE</a></td><td align="left">5</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.inefficient_hash" title="Inefficient Hash">
    INEFFICIENT_HASH</a></td><td align="left">7</td><td align="left">3</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_too_small" title="Vector Too Small">
    VECTOR_TOO_SMALL</a></td><td align="left">8</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_too_large" title="Vector Too Large">
    VECTOR_TOO_LARGE</a></td><td align="left">5</td><td align="left">1</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_to_hashtable" title="Vector to Hashtable">
    VECTOR_TO_HASHTABLE</a></td><td align="left">7</td><td align="left">7</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.hashtable_to_vector" title="Hashtable to Vector">
    HASHTABLE_TO_VECTOR</a></td><td align="left">7</td><td align="left">7</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.vector_to_list" title="Vector to List">
    VECTOR_TO_LIST</a></td><td align="left">8</td><td align="left">5</td><td align="left"> </td><td align="left">10</td><td align="left">yes</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.list_to_vector" title="List to Vector">
    LIST_TO_VECTOR</a></td><td align="left">10</td><td align="left">5</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.assoc_ord_to_unord" title="Ordered to Unordered Associative Container">
    ORDERED_TO_UNORDERED</a></td><td align="left">10</td><td align="left">5</td><td align="left"> </td><td align="left">10</td><td align="left">only map/unordered_map</td></tr><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.algorithms" title="Algorithms">
    ALGORITHMS</a></td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.algorithms.sort" title="Sort Algorithm Performance">
    SORT</a></td><td align="left">7</td><td align="left">8</td><td align="left"> </td><td align="left">7</td><td align="left">no</td></tr><tr><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.locality" title="Data Locality">
    LOCALITY</a></td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.locality.sw_prefetch" title="Need Software Prefetch">
    SOFTWARE_PREFETCH</a></td><td align="left">8</td><td align="left">8</td><td align="left"> </td><td align="left">5</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.locality.linked" title="Linked Structure Locality">
    RBTREE_LOCALITY</a></td><td align="left">4</td><td align="left">8</td><td align="left"> </td><td align="left">5</td><td align="left">no</td></tr><tr><td align="left"> </td><td align="left"><a class="link" href="profile_mode_diagnostics.html#manual.ext.profile_mode.analysis.mthread.false_share" title="False Sharing">
    FALSE_SHARING</a></td><td align="left">8</td><td align="left">10</td><td align="left"> </td><td align="left">10</td><td align="left">no</td></tr></tbody></table></div></div><br class="table-break" /><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.template"></a>Diagnostic Template</h3></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_&lt;diagnostic&gt;</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>  What problem will it diagnose?
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>.
  What is the fundamental reason why this is a problem</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>
  Percentage reduction in execution time.  When reduction is more than
  a constant factor, describe the reduction rate formula.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  What would the advise look like?</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  What stdlibc++ components need to be instrumented?</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  How do we decide when to issue the advice?</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  How do we measure benefits?  Math goes here.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
program code
...
advice sample
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.containers"></a>Containers</h3></div></div></div><p>
<span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_CONTAINERS</code>.
</p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.hashtable_too_small"></a>Hashtable Too Small</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_HASHTABLE_TOO_SMALL</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect hashtables with many
  rehash operations, small construction size and large destruction size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> Rehash is very expensive.
  Read content, follow chains within bucket, evaluate hash function, place at
  new location in different order.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span> 36%.
  Code similar to example below.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">unordered_set, unordered_map</code> constructor, destructor, rehash.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">unordered_[multi]set|map</code>,
  record initial size and call context of the constructor.
  Record size increase, if any, after each relevant operation such as insert.
  Record the estimated rehash cost.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of individual rehash operations * cost per rehash.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 unordered_set&lt;int&gt; us;
2 for (int k = 0; k &lt; 1000000; ++k) {
3   us.insert(k);
4 }

foo.cc:1: advice: Changing initial unordered_set size from 10 to 1000000 saves 1025530 rehash operations.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.hashtable_too_large"></a>Hashtable Too Large</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_HASHTABLE_TOO_LARGE</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect hashtables which are
  never filled up because fewer elements than reserved are ever
  inserted.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> Save memory, which
  is good in itself and may also improve memory reference performance through
  fewer cache and TLB misses.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span> unknown.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">unordered_set, unordered_map</code> constructor, destructor, rehash.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">unordered_[multi]set|map</code>,
  record initial size and call context of the constructor, and correlate it
  with its size at destruction time.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of iteration operations + memory saved.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 vector&lt;unordered_set&lt;int&gt;&gt; v(100000, unordered_set&lt;int&gt;(100)) ;
2 for (int k = 0; k &lt; 100000; ++k) {
3   for (int j = 0; j &lt; 10; ++j) {
4     v[k].insert(k + j);
5  }
6 }

foo.cc:1: advice: Changing initial unordered_set size from 100 to 10 saves N
bytes of memory and M iteration steps.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.inefficient_hash"></a>Inefficient Hash</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_INEFFICIENT_HASH</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect hashtables with polarized
  distribution.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> A non-uniform
  distribution may lead to long chains, thus possibly increasing complexity
  by a factor up to the number of elements.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span> factor up
   to container size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Change hash function
  for container built at site S.  Distribution score = N.  Access score = S.
  Longest chain = C, in bucket B.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">unordered_set, unordered_map</code> constructor, destructor, [],
  insert, iterator.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Count the exact number of link traversals.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total number of links traversed.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
class dumb_hash {
 public:
  size_t operator() (int i) const { return 0; }
};
...
  unordered_set&lt;int, dumb_hash&gt; hs;
  ...
  for (int i = 0; i &lt; COUNT; ++i) {
    hs.find(i);
  }
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_too_small"></a>Vector Too Small</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TOO_SMALL</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>Detect vectors with many
  resize operations, small construction size and large destruction size..
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>Resizing can be expensive.
  Copying large amounts of data takes time.  Resizing many small vectors may
  have allocation overhead and affect locality.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record initial size and call context of the constructor.
  Record size increase, if any, after each relevant operation such as
  <code class="code">push_back</code>.  Record the estimated resize cost.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total number of words copied * time to copy a word.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 vector&lt;int&gt; v;
2 for (int k = 0; k &lt; 1000000; ++k) {
3   v.push_back(k);
4 }

foo.cc:1: advice: Changing initial vector size from 10 to 1000000 saves
copying 4000000 bytes and 20 memory allocations and deallocations.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_too_large"></a>Vector Too Large</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TOO_LARGE</code>
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>Detect vectors which are
  never filled up because fewer elements than reserved are ever
  inserted.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>Save memory, which
  is good in itself and may also improve memory reference performance through
  fewer cache and TLB misses.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Set initial size to N at construction site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record initial size and call context of the constructor, and correlate it
  with its size at destruction time.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total amount of memory saved.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 vector&lt;vector&lt;int&gt;&gt; v(100000, vector&lt;int&gt;(100)) ;
2 for (int k = 0; k &lt; 100000; ++k) {
3   for (int j = 0; j &lt; 10; ++j) {
4     v[k].insert(k + j);
5  }
6 }

foo.cc:1: advice: Changing initial vector size from 100 to 10 saves N
bytes of memory and may reduce the number of cache and TLB misses.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_to_hashtable"></a>Vector to Hashtable</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TO_HASHTABLE</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect uses of
  <code class="code">vector</code> that can be substituted with <code class="code">unordered_set</code>
  to reduce execution time.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Linear search in a vector is very expensive, whereas searching in a hashtable
  is very quick.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>factor up
   to container size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace
  <code class="code">vector</code> with <code class="code">unordered_set</code> at site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record call context of the constructor.  Issue the advice only if the
  only methods called on this <code class="code">vector</code> are <code class="code">push_back</code>,
  <code class="code">insert</code> and <code class="code">find</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Cost(vector::push_back) + cost(vector::insert) + cost(find, vector) -
  cost(unordered_set::insert) + cost(unordered_set::find).
  </p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  vector&lt;int&gt; v;
...
2  for (int i = 0; i &lt; 1000; ++i) {
3    find(v.begin(), v.end(), i);
4  }

foo.cc:1: advice: Changing "vector" to "unordered_set" will save about 500,000
comparisons.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.hashtable_to_vector"></a>Hashtable to Vector</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_HASHTABLE_TO_VECTOR</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect uses of
  <code class="code">unordered_set</code> that can be substituted with <code class="code">vector</code>
  to reduce execution time.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Hashtable iterator is slower than vector iterator.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>95%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace
  <code class="code">unordered_set</code> with <code class="code">vector</code> at site S.
  </p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">unordered_set</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">unordered_set</code>,
  record call context of the constructor.  Issue the advice only if the
  number of <code class="code">find</code>, <code class="code">insert</code> and <code class="code">[]</code>
  operations on this <code class="code">unordered_set</code> are small relative to the
  number of elements, and methods <code class="code">begin</code> or <code class="code">end</code>
  are invoked (suggesting iteration).</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of .</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  unordered_set&lt;int&gt; us;
...
2  int s = 0;
3  for (unordered_set&lt;int&gt;::iterator it = us.begin(); it != us.end(); ++it) {
4    s += *it;
5  }

foo.cc:1: advice: Changing "unordered_set" to "vector" will save about N
indirections and may achieve better data locality.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.vector_to_list"></a>Vector to List</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_VECTOR_TO_LIST</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect cases where
  <code class="code">vector</code> could be substituted with <code class="code">list</code> for
  better performance.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Inserting in the middle of a vector is expensive compared to inserting in a
  list.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>factor up to
   container size.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace vector with list
  at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  For each dynamic instance of <code class="code">vector</code>,
  record the call context of the constructor.  Record the overhead of each
  <code class="code">insert</code> operation based on current size and insert position.
  Report instance with high insertion overhead.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  (Sum(cost(vector::method)) - Sum(cost(list::method)), for
  method in [push_back, insert, erase])
  + (Cost(iterate vector) - Cost(iterate list))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  vector&lt;int&gt; v;
2  for (int i = 0; i &lt; 10000; ++i) {
3    v.insert(v.begin(), i);
4  }

foo.cc:1: advice: Changing "vector" to "list" will save about 5,000,000
operations.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.list_to_vector"></a>List to Vector</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_LIST_TO_VECTOR</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect cases where
  <code class="code">list</code> could be substituted with <code class="code">vector</code> for
  better performance.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Iterating through a vector is faster than through a list.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>64%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace list with vector
  at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">vector</code>
  operations and access methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice if there are no <code class="code">insert</code> operations.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
    (Sum(cost(vector::method)) - Sum(cost(list::method)), for
  method in [push_back, insert, erase])
  + (Cost(iterate vector) - Cost(iterate list))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  list&lt;int&gt; l;
...
2  int sum = 0;
3  for (list&lt;int&gt;::iterator it = l.begin(); it != l.end(); ++it) {
4    sum += *it;
5  }

foo.cc:1: advice: Changing "list" to "vector" will save about 1000000 indirect
memory references.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.list_to_slist"></a>List to Forward List (Slist)</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_LIST_TO_SLIST</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect cases where
  <code class="code">list</code> could be substituted with <code class="code">forward_list</code> for
  better performance.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  The memory footprint of a forward_list is smaller than that of a list.
  This has beneficial effects on memory subsystem, e.g., fewer cache misses.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>40%.
  Note that the reduction is only noticeable if the size of the forward_list
  node is in fact larger than that of the list node.  For memory allocators
  with size classes, you will only notice an effect when the two node sizes
  belong to different allocator size classes.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>Replace list with
  forward_list at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span><code class="code">list</code>
  operations and iteration methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice if there are no <code class="code">backwards</code> traversals
  or insertion before a given node.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Always true.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  list&lt;int&gt; l;
...
2  int sum = 0;
3  for (list&lt;int&gt;::iterator it = l.begin(); it != l.end(); ++it) {
4    sum += *it;
5  }

foo.cc:1: advice: Change "list" to "forward_list".
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.assoc_ord_to_unord"></a>Ordered to Unordered Associative Container</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_ORDERED_TO_UNORDERED</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span>  Detect cases where ordered
  associative containers can be replaced with unordered ones.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Insert and search are quicker in a hashtable than in
  a red-black tree.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>52%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  Replace set with unordered_set at site S.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span>
  <code class="code">set</code>, <code class="code">multiset</code>, <code class="code">map</code>,
  <code class="code">multimap</code> methods.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice only if we are not using operator <code class="code">++</code> on any
  iterator on a particular <code class="code">[multi]set|map</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  (Sum(cost(hashtable::method)) - Sum(cost(rbtree::method)), for
  method in [insert, erase, find])
  + (Cost(iterate hashtable) - Cost(iterate rbtree))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1  set&lt;int&gt; s;
2  for (int i = 0; i &lt; 100000; ++i) {
3    s.insert(i);
4  }
5  int sum = 0;
6  for (int i = 0; i &lt; 100000; ++i) {
7    sum += *s.find(i);
8  }
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.algorithms"></a>Algorithms</h3></div></div></div><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_ALGORITHMS</code>.
  </p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.algorithms.sort"></a>Sort Algorithm Performance</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_SORT</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Give measure of sort algorithm
  performance based on actual input.  For instance, advise Radix Sort over
  Quick Sort for a particular call context.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  See papers:
  <a class="link" href="https://dl.acm.org/citation.cfm?doid=1065944.1065981" target="_top">
  A framework for adaptive algorithm selection in STAPL</a> and
  <a class="link" href="http://ieeexplore.ieee.org/search/wrapper.jsp?arnumber=4228227" target="_top">
  Optimizing Sorting with Machine Learning Algorithms</a>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>60%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Change sort algorithm
  at site S from X Sort to Y Sort.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> <code class="code">sort</code>
  algorithm.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Issue the advice if the cost model tells us that another sort algorithm
  would do better on this input.  Requires us to know what algorithm we
  are using in our sort implementation in release mode.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Runtime(algo) for algo in [radix, quick, merge, ...]</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.locality"></a>Data Locality</h3></div></div></div><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_LOCALITY</code>.
  </p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.locality.sw_prefetch"></a>Need Software Prefetch</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_SOFTWARE_PREFETCH</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Discover sequences of indirect
  memory accesses that are not regular, thus cannot be predicted by
  hardware prefetchers.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Indirect references are hard to predict and are very expensive when they
  miss in caches.</p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>25%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Insert prefetch
  instruction.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Vector iterator and
  access operator [].
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  First, get cache line size and page size from system.
  Then record iterator dereference sequences for which the value is a pointer.
  For each sequence within a container, issue a warning if successive pointer
  addresses are not within cache lines and do not form a linear pattern
  (otherwise they may be prefetched by hardware).
  If they also step across page boundaries, make the warning stronger.
  </p><p>The same analysis applies to containers other than vector.
  However, we cannot give the same advice for linked structures, such as list,
  as there is no random access to the n-th element.  The user may still be
  able to benefit from this information, for instance by employing frays (user
  level light weight threads) to hide the latency of chasing pointers.
  </p><p>
  This analysis is a little oversimplified.  A better cost model could be
  created by understanding the capability of the hardware prefetcher.
  This model could be trained automatically by running a set of synthetic
  cases.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Total distance between pointer values of successive elements in vectors
  of pointers.</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1 int zero = 0;
2 vector&lt;int*&gt; v(10000000, &amp;zero);
3 for (int k = 0; k &lt; 10000000; ++k) {
4   v[random() % 10000000] = new int(k);
5 }
6 for (int j = 0; j &lt; 10000000; ++j) {
7   count += (*v[j] == 0 ? 0 : 1);
8 }

foo.cc:7: advice: Insert prefetch instruction.
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.locality.linked"></a>Linked Structure Locality</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_RBTREE_LOCALITY</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Give measure of locality of
  objects stored in linked structures (lists, red-black trees and hashtables)
  with respect to their actual traversal patterns.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>Allocation can be tuned
  to a specific traversal pattern, to result in better data locality.
  See paper:
  <a class="link" href="http://www.springerlink.com/content/8085744l00x72662/" target="_top">
  Custom Memory Allocation for Free</a>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>30%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span>
  High scatter score N for container built at site S.
  Consider changing allocation sequence or choosing a structure conscious
  allocator.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Methods of all
  containers using linked structures.</p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  First, get cache line size and page size from system.
  Then record the number of successive elements that are on different line
  or page, for each traversal method such as <code class="code">find</code>.  Give advice
  only if the ratio between this number and the number of total node hops
  is above a threshold.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Sum(same_cache_line(this,previous))</p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
 1  set&lt;int&gt; s;
 2  for (int i = 0; i &lt; 10000000; ++i) {
 3    s.insert(i);
 4  }
 5  set&lt;int&gt; s1, s2;
 6  for (int i = 0; i &lt; 10000000; ++i) {
 7    s1.insert(i);
 8    s2.insert(i);
 9  }
...
      // Fast, better locality.
10    for (set&lt;int&gt;::iterator it = s.begin(); it != s.end(); ++it) {
11      sum += *it;
12    }
      // Slow, elements are further apart.
13    for (set&lt;int&gt;::iterator it = s1.begin(); it != s1.end(); ++it) {
14      sum += *it;
15    }

foo.cc:5: advice: High scatter score NNN for set built here.  Consider changing
the allocation sequence or switching to a structure conscious allocator.
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.mthread"></a>Multithreaded Data Access</h3></div></div></div><p>
  The diagnostics in this group are not meant to be implemented short term.
  They require compiler support to know when container elements are written
  to.  Instrumentation can only tell us when elements are referenced.
  </p><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_MULTITHREADED</code>.
  </p><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.mthread.ddtest"></a>Data Dependence Violations at Container Level</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_DDTEST</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect container elements
  that are referenced from multiple threads in the parallel region or
  across parallel regions.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span>
  Sharing data between threads requires communication and perhaps locking,
  which may be expensive.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>?%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Change data
  distribution or parallel algorithm.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Container access methods
  and iterators.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  Keep a shadow for each container.  Record iterator dereferences and
  container member accesses.  Issue advice for elements referenced by
  multiple threads.
  See paper: <a class="link" href="https://dl.acm.org/citation.cfm?id=207110.207148" target="_top">
  The LRPD test: speculative run-time parallelization of loops with
  privatization and reduction parallelization</a>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of accesses to elements referenced from multiple threads
  </p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
</pre><p>
</p></li></ul></div></div><div class="section"><div class="titlepage"><div><div><h4 class="title"><a id="manual.ext.profile_mode.analysis.mthread.false_share"></a>False Sharing</h4></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p><span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_FALSE_SHARING</code>.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Goal:</em></span> Detect elements in the
  same container which share a cache line, are written by at least one
  thread, and accessed by different threads.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Fundamentals:</em></span> Under these assumptions,
  cache protocols require
  communication to invalidate lines, which may be expensive.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Sample runtime reduction:</em></span>68%.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Recommendation:</em></span> Reorganize container
  or use padding to avoid false sharing.</p></li><li class="listitem"><p><span class="emphasis"><em>To instrument:</em></span> Container access methods
  and iterators.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Analysis:</em></span>
  First, get the cache line size.
  For each shared container, record all the associated iterator dereferences
  and member access methods with the thread id.  Compare the address lists
  across threads to detect references in two different threads to the same
  cache line.  Issue a warning only if the ratio to total references is
  significant.  Do the same for iterator dereference values if they are
  pointers.</p></li><li class="listitem"><p><span class="emphasis"><em>Cost model:</em></span>
  Number of accesses to same cache line from different threads.
  </p></li><li class="listitem"><p><span class="emphasis"><em>Example:</em></span>
</p><pre class="programlisting">
1     vector&lt;int&gt; v(2, 0);
2 #pragma omp parallel for shared(v, SIZE) schedule(static, 1)
3     for (i = 0; i &lt; SIZE; ++i) {
4       v[i % 2] += i;
5     }

OMP_NUM_THREADS=2 ./a.out
foo.cc:1: advice: Change container structure or padding to avoid false
sharing in multithreaded access at foo.cc:4.  Detected N shared cache lines.
</pre><p>
</p></li></ul></div></div></div><div class="section"><div class="titlepage"><div><div><h3 class="title"><a id="manual.ext.profile_mode.analysis.statistics"></a>Statistics</h3></div></div></div><p>
<span class="emphasis"><em>Switch:</em></span>
  <code class="code">_GLIBCXX_PROFILE_STATISTICS</code>.
</p><p>
  In some cases the cost model may not tell us anything because the costs
  appear to offset the benefits.  Consider the choice between a vector and
  a list.  When there are both inserts and iteration, an automatic advice
  may not be issued.  However, the programmer may still be able to make use
  of this information in a different way.
</p><p>
  This diagnostic will not issue any advice, but it will print statistics for
  each container construction site.  The statistics will contain the cost
  of each operation actually performed on the container.
</p></div></div><div class="navfooter"><hr /><table width="100%" summary="Navigation footer"><tr><td width="40%" align="left"><a accesskey="p" href="profile_mode_devel.html">Prev</a> </td><td width="20%" align="center"><a accesskey="u" href="profile_mode.html">Up</a></td><td width="40%" align="right"> <a accesskey="n" href="mt_allocator.html">Next</a></td></tr><tr><td width="40%" align="left" valign="top">Developer Information </td><td width="20%" align="center"><a accesskey="h" href="../index.html">Home</a></td><td width="40%" align="right" valign="top"> Chapter 20. The mt_allocator</td></tr></table></div></body></html>@