@STRING{jan = "January"} @STRING{feb = "February"} @STRING{mar = "March"} @STRING{apr = "April"} @STRING{may = "May"} @STRING{jun = "June"} @STRING{jul = "July"} @STRING{aug = "August"} @STRING{sep = "September"} @STRING{oct = "October"} @STRING{nov = "November"} @STRING{dec = "December"} @Article{memsys-sg, author = {Lavin, Patrick and Young, Jeffrey and Vuduc, Richard and Riedy, Jason and Vose, Aaron and Ernst, Daniel}, title = {Evaluating Gather and Scatter Performance on CPUs and GPUs}, journal = {The International Symposium on Memory Systems (MEMSYS)}, address = {Washington, DC}, year = 2020, month = {Sep}, doi = {10.1145/3422575.3422794}, url = {http://dx.doi.org/10.1145/3422575.3422794}, publisher = {ACM} } @InProceedings{rg-icrc-2019, author = {Jeffrey Young and Jason Riedy and Tom Conte and Vivek Sarkar and Prasanth Chatarasi and Srisehan Srikanth }, title = {Experimental Insights from the {Rogues} {Gallery} Testbed}, booktitle = {IEEE International Conference on Rebooting Computing (ICRC19)}, year = 2019, month = nov, address = {San Mateo, CA}, doi = {10.1109/ICRC.2019.8914707}, } @InProceedings{hpec19-yin, author = {Chunxing Yin and Jason Riedy}, title = {Concurrent {Katz} Centrality for Streaming Graphs}, booktitle = {The IEEE High Performance Extreme Computing Conference (HPEC)}, year = 2019, month = sep, address = {Waltham, MA}, doi = {10.1109/HPEC.2019.8916572}, keywords = {hpda, graph analysis, parallel algorithm}, } @InProceedings{pearc19-rogues, author = {Will Powell and Jason Riedy and Jeffrey S. Young and Tom Conte}, title = {Wrangling {Rogues}: A Case Study on Managing Experimental Post-{Moore} Architectures}, booktitle = {Practice and Experience in Advanced Research Computing ({PEARC} '19)}, year = 2019, month = jul, address = {Chicago, IL}, doi = {10.1145/3332186.3332223}, } @InProceedings{arith18-ejr, author = {Jason Riedy and James Demmel}, title = {Augmented Arithmetic Operations Proposed for {IEEE}-754 2018}, booktitle = {25th {IEEE} Symposium on Computer Arithmetic ({ARITH} 25)}, year = 2018, dom = 26, month = jun, doi = {10.1109/ARITH.2018.8464813}, projtag = {ieee754, lapack, grateful, xscala}, keywords = {floating point, ieee754}, ejr-proj = {floating-point}, ejr-grant = {xscala, grateful}, } @inproceedings{mlg2018_23, title = {A New Algorithmic Model for Graph Analysis of Streaming Data}, author = {Chunxing Yin and Jason Riedy and David A. Bader}, booktitle = {Proceedings of the 14th International Workshop on Mining and Learning with Graphs ({MLG})}, month = may, year = 2018, url = {http://www.mlgworkshop.org/2018/papers/MLG2018_paper_23.pdf}, keywords = {hpda, graph analysis, streaming data}, ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch}, ejr-grant = {hpda, grateful}, } @InProceedings{ashes2018-ejr, author = {Eric Hein and Tom Conte and Jeffrey S. Young and Srinivas Eswar and Jiajia Li and Patrick Lavin and Richard Vuduc and Jason Riedy}, title = {An Initial Characterization of the {Emu} {Chick}}, booktitle = {The Eighth International Workshop on Accelerators and Hybrid Exascale Systems ({AsHES})}, role = {proceedings}, OPTtags = {parallel; graph; streaming; community detection}, year = 2018, month = may, doi = {10.1109/IPDPSW.2018.00097}, pages = {579--588}, isbn = 9781538655559, abstract = {The Emu Chick is a prototype system designed around the concept of migratory memory-side processing. Rather than transferring large amounts of data across power-hungry, high-latency interconnects, the Emu Chick moves lightweight thread contexts to near-memory cores before the beginning of each memory read. The current prototype hardware uses FPGAs to implement cache-less "Gossamer" cores for doing computational work and a stationary core to run basic operating system functions and migrate threads between nodes. In this initial characterization of the Emu Chick, we study the memory bandwidth characteristics of the system through benchmarks like STREAM, pointer chasing, and sparse matrix vector multiply. We compare the Emu Chick hardware to architectural simulation and Intel Xeon-based platforms. While it is difficult to accurately compare prototype hardware with existing systems, our initial evaluation demonstrates that the Emu Chick uses available memory bandwidth more efficiently than a more traditional, cache-based architecture. Moreover, the Emu Chick provides stable, predictable performance with 80\% bandwidth utilization on a random-access pointer chasing benchmark with weak locality.}, keywords = {Instruction sets;Bandwidth;Computer architecture;Benchmark testing;Hardware;Prototypes;Kernel;benchmarking;streaming graphs;computer architecture;sparse tensors;emu}, projtag = {memory-centric, crnch-rg}, keywords = {hpda, memory-centric, novel architectures}, ejr-proj = {high-performance-data-analysis, novel-arch}, ejr-grant = {hpda, iarpa-emu}, } @InProceedings{siamns17-ejr, author = {E. Jason Riedy and Chunxing Yin and David A. Bader}, title = {A New Algorithm Model for Massive-Scale Streaming Graph Analysis}, booktitle = {SIAM Workshop on Network Science}, year = 2017, month = jul, dom = 14, address = {Pittsburgh, PA}, url = {https://www.slideshare.net/jasonriedy/a-new-algorithm-model-for-massivescale-streaming-graph-analysis}, projtag = {hpda, memory-centric, grateful, crnch-rg}, keywords = {hpda, graph analysis, streaming data}, ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch}, ejr-grant = {hpda, grateful}, } @InProceedings{pmma16-fpaddre, author = {{Dukhan}, Marat and {Vuduc}, Richard and {Riedy}, Jason}, title = {Wanted: Floating-Point Add Round-off Error Instruction}, booktitle = {The 2nd International Workshop on Performance Modeling: Methods and Applications ({PMMA16})}, year = 2016, month = jun, dom = 23, address = {Frankfurt, Germany}, note = {(Workshop with ISC High Performance)}, abstract = {We propose a new instruction (FPADDRE) that computes the round-off error in floating-point addition. We explain how this instruction benefits high-precision arithmetic operations in applications where double precision is not sufficient. Performance estimates on Intel Haswell, Intel Skylake, and AMD Steamroller processors, as well as Intel Knights Corner co-processor, demonstrate that such an instruction would improve the latency of double-double addition by up to 55\% and increase double-double addition throughput by up to 103\%, with smaller, but non-negligible benefits for double-double multiplication. The new instruction delivers up to 2x speedups on three benchmarks that use high-precision floating-point arithmetic: double-double matrix-matrix multiplication, compensated dot product, and polynomial evaluation via the compensated Horner scheme.}, url = {https://blogs.fau.de/hager/files/2016/06/pmma2016-slides_Dukhan.pdf}, eprint = {arXiv:1603.00491}, OPTprimaryClass = "cs.NA", projtag = {ieee754, grateful, lapack, xscala}, keywords = {floating point, ieee754}, ejr-proj = {floating-point}, ejr-grant = {xscala, grateful}, } @InProceedings{gabb16-pr, author = {Jason Riedy}, title = {Updating {PageRank} for Streaming Graphs}, booktitle = {{Graph} {Algorithms} {Building} {Blocks} ({GABB} 2016)}, year = 2016, dom = 23, month = may, address = {Chicago, IL}, note = {(Workshop with IPDPS 2016)}, OPTtags = {parallel; graph; streaming; pagerank}, abstract = {Incremental graph algorithms can respond quickly to small changes in massive graphs by updating rather than recomputing analysis metrics. Here we use the linear system formulation of PageRank and ideas from iterative refinement to compute the update to a PageRank vector accurately and quickly. The core idea is to express the residual of the original solution with respect to the updated matrix representing the graph. The update to the residual is sparse. Solving for the solution update with a straight-forward iterative method spreads the change outward from the change locations but converges before traversing the entire graph. We achieve speed-ups of 2$\times$ to over 40$\times$ relative to a restarted, highly parallel PageRank iteration for small, low-latency batches of edge insertions. These cases traverse 2$\times$ to nearly 10\,000$\times$ fewer edges than the restarted PageRank iteration. This provides an interesting test case for the ongoing GraphBLAS effort: Can the APIs support our incremental algorithms cleanly and efficiently?}, file = {material/streaming-pagerank-gabb2016.pdf}, projtag = {grateful, hpda, xscala}, keywords = {hpda, graph analysis, streaming data, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch}, ejr-grant = {hpda, grateful, xscala}, } @InProceedings{caa16, author = {David Bader and Aleksandra Michalewicz and Oded Green and Jessie Birkett-Rees and Jason Riedy and James Fairbanks and Anita Zakrzewska}, title = {Semantic database applications at the {Samtavro} {Cemetery}, {Georgia}}, booktitle = {The 44th Computer Applications and Quantitative Methods in Archaeology Conference ({CAA})}, year = 2016, month = mar, dom = 30, address = {Oslo, Norway}, abstract = {In 2013 a paper was offered to the CAA concerning archaeological legacy data and semantic database applications, with some preliminary results for a study conducted into the Samtavro cemetery, situated in the South Caucasus in the modern republic of Georgia. The present paper presents further research outcomes of data mining the Samtavro material. Over four thousand graves were excavated at this site, used most intensively during the Late Bronze and Iron Ages, and later in the Roman and Late Antique periods. The current project focuses on the latter period—and the legacy of Soviet and post-Soviet excavations—in a collaborative effort between computer scientists based at the Georgia Institute of Technology, USA, and archaeologists at the University of Melbourne and Monash University, Australia. Data for 1075 tombs, 1249 individuals, and 5842 grave accoutrements were collected across 74 data fields, resulting in the identification of 9 tomb types, 37 artefact types and 320 artefact subtypes. Methods tested against the Samtavro material culture included the application of clustering techniques to understand associations of related items based on patterns of co-occurrence, using traditional data mining (hierarchical link clustering) and spectral graph theory—focusing on tomb types in relation to artefact types. The other method calculated the probability of each event occurring and comparing this to what we would expect if these were truly random—focusing on artefact types in relation to biological sex and age brackets. In some instances, our work confirmed previously established relationships, but it likewise revealed new results concerning particular entities. The project demonstrates that although sites for which comprehensive archival records exist can benefit from these types of approaches, often the greatest limitation in taking a ‘big data’ approach is the relative scarcity of archaeological data.}, projtag = {xscala}, keywords = {graph analysis, archaeology}, ejr-proj = {graph-analysis}, ejr-grant = {xscala}, } @InProceedings{hpec15, author = {Adam McLaughlin and Jason Riedy and David A. Bader}, title = {An Energy-Efficient Abstraction for Simultaneous Breadth-First Searches}, ejr-withauthor ={Adam McLaughlin and David A. Bader}, OPTtags = {parallel; graph; energy}, booktitle = {The IEEE High Performance Extreme Computing Conference (HPEC)}, year = 2015, month = sep, address = {Waltham, MA}, dom = 17, role = {proceedings}, abstract = {Optimized GPU kernels are sufficiently complicated to write that they often are specialized to specific input data, target architectures, or applications. This paper presents a multi-search abstraction for computing multiple breadth-first searches in parallel and demonstrates a high-performance, general implementation. Our abstraction removes the burden of orchestrating graph traversal from the user while providing high performance and low energy usage, an often overlooked component of algorithm design. Energy consumption has become a first-class hardware design constraint for both massive and embedded computing platforms. Our abstraction can be applied to such problems as the all-pairs shortest-path problem, community detection, reachability querying, and others. To map graph traversal efficiently to NVIDIA GPUs, our hybrid implementation chooses between processing active vertices with a single thread or an entire warp based on vertex outdegree. For a set of twelve varied graphs, the implementation of our abstraction saves 42\% time and 62\% energy on average compared to representative implementations of specific applications from existing literature.}, file = {material/multi_search_energy.pdf}, projtag = {xscala, grateful, hpda}, keywords = {hpda, graph analysis, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {grateful, xscala}, } @InProceedings{bc-hpec14, author = {Adam McLaughlin and Jason Riedy and David A. Bader}, ejr-withauthor ={Adam McLaughlin and David A. Bader}, title = {Optimizing Energy Consumption and Parallel Performance for Betweenness Centrality using {GPU}s}, OPTtags = {parallel; graph; energy}, booktitle = {The IEEE High Performance Extreme Computing Conference (HPEC)}, year = 2014, month = sep, address = {Waltham, MA}, note = {``Rising Stars'' section}, dom = 11, role = {proceedings}, doi = {10.1109/HPEC.2014.7040980}, file = {material/Optimizing_BC_HPEC14.pdf}, abstract = {Applications of high-performance graph analysis range from computational biology to network security and even transportation. These applications often consider graphs under rapid change and are moving beyond HPC platforms into energy-constrained embedded systems. This paper optimizes one successful and demanding analysis kernel, betweenness centrality, for NVIDIA GPU accelerators in both environments. Our algorithm for static analysis is capable of exceeding 2 million traversed edges per second per watt (MTEPS/W). Optimizing the parallel algorithm and treating the dynamic problem directly achieves a 6.39$\times$ average speed-up and 84\% average reduction in energy consumption.}, projtag = {xscala, grateful, hpda}, keywords = {hpda, graph analysis, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {grateful, xscala}, } @InProceedings{mtaap13, file = {material/mtaap13-streaming-community-monitoring.pdf}, author = {E. Jason Riedy and David A. Bader}, ejr-withauthor ={David A. Bader}, title = {Multithreaded Community Monitoring for Massive Streaming Graph Data}, booktitle = {7th Workshop on Multithreaded Architectures and Applications (MTAAP)}, role = {proceedings}, OPTtags = {parallel; graph; streaming; community detection}, year = 2013, month = may, dom = 24, doi = {10.1109/IPDPSW.2013.229}, address = {Boston, MA}, acc-note = {(11/16 papers accepted, 69\% acceptance)}, abstract = {Analyzing static snapshots of massive, graph-structured data cannot keep pace with the growth of social networks, financial transactions, and other valuable data sources. Current state-of-the-art industrial methods analyze these streaming sources using only simple, aggregate metrics. There are few existing scalable algorithms for monitoring complex global quantities like decomposition into community structure. Using our framework STING, we present the first known parallel algorithm specifically for monitoring communities in this massive, streaming, graph-structured data. Our algorithm performs incremental re-agglomeration rather than starting from scratch after each batch of changes, reducing the problem's size to that of the change rather than the entire graph. We analyze our initial implementation's performance on multithreaded platforms for execution time and latency. On an Intel-based multithreaded platform, our algorithm handles up to 100 million updates per second on social networks with one to 30 million edges, providing a speed-up from 4$\times$ to 3700$\times$ over statically recomputing the decomposition after each batch of changes. Possibly because of our artificial graph generator, resulting communities' modularity varies little from the initial graph.}, projtag = {xscala, grateful, cassmt, intel-sting}, keywords = {hpda, graph analysis, streaming data, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {grateful, xscala, intel-sting, cassmt}, } @InProceedings{stinger-hpec12, author = {David Ediger and Robert McColl and Jason Riedy and David A. Bader}, ejr-withauthor ={David Ediger and Robert McColl and David A. Bader}, title = {{STINGER}: High Performance Data Structure for Streaming Graphs}, OPTtags = {parallel; graph; streaming}, booktitle = {The IEEE High Performance Extreme Computing Conference (HPEC)}, year = 2012, month = sep, address = {Waltham, MA}, note = {Best paper award}, dom = 12, role = {proceedings}, doi = {10.1109/HPEC.2012.6408680}, file = {material/hpec12-stinger.pdf}, abstract = {The current research focus on ``big data'' problems highlights the scale and complexity of analytics required and the high rate at which data may be changing. In this paper, we present our high performance, scalable and portable software, Spatio-Temporal Interaction Networks and Graphs Extensible Representation (STINGER), that includes a graph data structure that enables these applications. Key attributes of STINGER are fast insertions, deletions, and updates on semantic graphs with skewed degree distributions. We demonstrate a process of algorithmic and architectural optimizations that enable high performance on the Cray XMT family and Intel multicore servers. Our implementation of STINGER on the Cray XMT processes over 3 million updates per second on a scale-free graph with 537 million edges.}, projtag = {cassmt, intel-sting}, keywords = {hpda, graph analysis, streaming data, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {intel-sting, cassmt}, } @InProceedings{mtaap12, author = {E. Jason Riedy and David A. Bader and Henning Meyerhenke}, ejr-withauthor ={David A. Bader and Henning Meyerhenke}, title = {Scalable Multi-threaded Community Detection in Social Networks}, booktitle = {6th Workshop on Multithreaded Architectures and Applications (MTAAP)}, role = {proceedings}, OPTtags = {parallel; graph; community detection}, year = 2012, month = may, dom = 25, abstract = {The volume of existing graph-structured data requires improved parallel tools and algorithms. Finding communities, smaller subgraphs densely connected within the subgraph than to the rest of the graph, plays a role both in developing new parallel algorithms as well as opening smaller portions of the data to current analysis tools. We improve performance of our parallel community detection algorithm by 20\% on the massively multithreaded Cray XMT, evaluate its performance on the next-generation Cray XMT2, and extend its reach to Intel-based platforms with OpenMP. To our knowledge, not only is this the first massively parallel community detection algorithm but also the only such algorithm that achieves excellent performance and good parallel scalability across all these platforms. Our implementation analyzes a moderate sized graph with 105 million vertices and 3.3 billion edges in around 500 seconds on a four processor, 80-logical-core Intel-based system and 1100 seconds on a 64-processor Cray XMT2.}, acc-note = {(9/15 papers accepted, 60\% acceptance)}, slide-url = {http://www.slideshare.net/jasonriedy/mtaap12-scalable-community-detection}, doi = {10.1109/IPDPSW.2012.203}, file = {material/mtaap12.pdf}, projtag = {cassmt, intel-sting}, keywords = {hpda, graph analysis, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {intel-sting, cassmt}, } @InCollection{icassp2012-stinger, author = {Jason Riedy and Henning Meyerhenke and David A. Bader and David Ediger and Timothy G. Mattson}, ejr-withauthor ={Henning Meyerhenke and David Ediger and David A. Bader and Timothy G. Mattson}, booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})}, title = {Analysis of Streaming Social Networks and Graphs on Multicore Architectures}, year = 2012, month = mar, dom = 29, address = {Kyoto, Japan}, file = {material/icassp2012.pdf}, slide-url = {http://ur1.ca/i6dz6}, doi = {10.1109/ICASSP.2012.6289126}, url = {http://www.slideshare.net/jasonriedy/icassp-2012-analysis-of-streaming-social-networks-and-graphs-on-multicore-architectures}, abstract = {Analyzing static snapshots of massive, graph-structured data cannot keep pace with the growth of social networks, financial transactions, and other valuable data sources. We introduce a framework, STING (Spatio-Temporal Interaction Networks and Graphs), and evaluate its performance on multicore, multisocket Intel(R)-based platforms. STING achieves rates of around 100\,000 edge updates per second on large, dynamic graphs with a single, general data structure. We achieve speed-ups of up to 1000$\times$ over parallel static computation, improve monitoring a dynamic graph's connected components, and show an exact algorithm for maintaining local clustering coefficients performs better on Intel-based platforms than our earlier approximate algorithm.}, projtag = {cassmt, intel-sting}, keywords = {hpda, graph analysis, streaming data, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {intel-sting, cassmt}, } @InCollection{dimacs10-workshop, author = {E. Jason Riedy and Henning Meyerhenke and David Ediger and David A. Bader}, ejr-withauthor ={Henning Meyerhenke and David Ediger and David A. Bader}, title = {Parallel Community Detection for Massive Graphs}, booktitle = {10th DIMACS Implementation Challenge Workshop - Graph Partitioning and Graph Clustering}, OPTtags = {parallel; graph; community detection}, publisher = {(workshop paper)}, year = 2012, month = feb, dom = 14, address = {Atlanta, Georgia}, note = {Won first place in the Mix Challenge and Mix Pareto Challenge}, url = {http://www.cc.gatech.edu/dimacs10/papers/\&\#91;15\&\#93;-dimacs10-community-detection.pdf}, file = {material/dimacs10-community-detection.pdf}, abstract = {Tackling the current volume of graph-structured data requires parallel tools. We extend our work on analyzing such massive graph data with a massively parallel algorithm for community detection that scales to current data sizes, clustering a real-world graph of over 100 million vertices and over 3 billion edges in under 500 seconds on a four- processor Intel E7-8870-based server. Our algorithm achieves moderate parallel scalability without sacrificing sequential operational complexity. Community detection partitions a graph into subgraphs more densely connected within the subgraph than to the rest of the graph. We take an agglomerative approach similar to Clauset, Newman, and Moore’s sequential algorithm, merging pairs of connected intermediate subgraphs to optimize different graph properties. Working in parallel opens new approaches to high performance. We improve performance of our parallel community detection algorithm on both the Cray XMT2 and OpenMP platforms and adapt our algorithm to the DIMACS Implementation Challenge data set.}, projtag = {cassmt, intel-sting}, keywords = {hpda, graph analysis, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {intel-sting, cassmt}, } @InProceedings{ppam11, author = {E. Jason Riedy and Henning Meyerhenke and David Ediger and David A. Bader}, ejr-withauthor ={Henning Meyerhenke and David Ediger and David A. Bader}, title = {Parallel Community Detection for Massive Graphs}, booktitle = {9th International Conference on Parallel Processing and Applied Mathematics (PPAM11)}, year = 2011, month = sep, publisher = {Springer}, role = {proceedings}, OPTtags = {parallel; graph; community detection}, abstract = {Tackling the current volume of graph-structured data requires parallel tools. We extend our work on analyzing such massive graph data with the first massively parallel algorithm for community detection that scales to current data sizes, scaling to graphs of over 122 million vertices and nearly 2 billion edges in under 7300 seconds on a massively multithreaded Cray XMT. Our algorithm achieves moderate parallel scalability without sacrificing sequential operational complexity. Community detection partitions a graph into subgraphs more densely connected within the subgraph than to the rest of the graph. We take an agglomerative approach similar to Clauset, Newman, and Moore's sequential algorithm, merging pairs of connected intermediate subgraphs to optimize different graph properties. Working in parallel opens new approaches to high performance. On smaller data sets, we find the output's modularity compares well with the standard sequential algorithms.}, acc-note = {(134/243 papers accepted, 55\% acceptance rate)}, doi = {10.1007/978-3-642-31464-3\_29}, file = {material/ppam11-community-detection.pdf}, projtag = {cassmt, intel-sting}, keywords = {hpda, graph analysis, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {intel-sting, cassmt}, } @InProceedings{mtaap11, author = {David Ediger and E. Jason Riedy and David A. Bader and Henning Meyerhenke}, ejr-withauthor ={David Ediger and David A. Bader and Henning Meyerhenke}, title = {Tracking Structure of Streaming Social Networks}, booktitle = {5th Workshop on Multithreaded Architectures and Applications (MTAAP)}, role = {proceedings}, OPTtags = {parallel; graph; streaming}, year = 2011, month = may, abstract = {Current online social networks are massive and still growing. For example, Facebook has over 500 million active users sharing over 30 billion items per month. The scale within these data streams has outstripped traditional graph analysis methods. Monitoring requires dynamic analysis rather than repeated static analysis. The massive state behind multiple persistent queries requires shared data structures and not problem-specific representations. We present a framework based on the STINGER data structure that can monitor a global property, connected components, on a graph of 16 million vertices at rates of up to 240\,000 updates per second on a 32 processor Cray XMT. For very large scale-free graphs, our implementation uses novel batching techniques that exploit the scale-free nature of the data and run over three times faster than prior methods. Our framework handles, for the first time, real-world data rates, opening the door to higher-level analytics such as community and anomaly detection.}, acc-note = {(10/17 papers accepted, 59\% acceptance rate)}, doi = {10.1109/IPDPS.2011.326}, file = {material/TrackingComponents-MTAAP11.pdf}, projtag = {cassmt, intel-sting}, keywords = {hpda, graph analysis, streaming data, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {intel-sting, cassmt}, } @InProceedings{icpp10, author = {David Ediger and Karl Jiang and E. Jason Riedy and David A. Bader and Courtney Corley and Rob Farber and William N. Reynolds}, ejr-withauthor ={David Ediger and Karl Jiang and David A. Bader and Courtney Corley and Rob Farber and William N. Reynolds}, title = {Massive Social Network Analysis: Mining Twitter for Social Good}, booktitle = {39th International Conference on Parallel Processing ({ICPP})}, role = {proceedings}, OPTtags = {parallel; graph}, year = 2010, address = {San Diego, CA}, month = sep, dom = 16, acc-note = {(70/225 papers accepted: 31.1\% acceptance rate)}, doi = {10.1109/ICPP.2010.66}, file = {material/ICPP10-GraphCT.pdf}, abstract = {Social networks produce an enormous quantity of data. Facebook consists of over 400 million active users sharing over 5 \emph{billion} pieces of information each month. Analyzing this vast quantity of unstructured data presents challenges for software and hardware. We present GraphCT, a \emph{Graph} \emph{C}haracterization \emph{T}ooklit for massive graphs representing social network data. On a 128-processor Cray XMT, GraphCT estimates the betweenness centrality of an artificially generated (R-MAT) 537 million vertex, 8.6 billion edge graph in 55 minutes. We use GraphCT to analyze public data from Twitter, a microblogging network. Twitter's message connections appear primarily tree-structured as a news dissemination system. Within the public data, however, are clusters of conversations. Using GraphCT, we can rank actors within these conversations and help analysts focus attention on a much smaller data subset.}, projtag = {cassmt, intel-sting}, keywords = {hpda, graph analysis, streaming data, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {intel-sting, cassmt}, } @InProceedings{mtaap10, author = {David Ediger and Karl Jiang and E. Jason Riedy and David A. Bader}, ejr-withauthor ={David Ediger and Karl Jiang and David A. Bader}, title = {Massive Streaming Data Analytics: A Case Study with Clustering Coefficients}, booktitle = {4th Workshop on Multithreaded Architectures and Applications (MTAAP)}, role = {proceedings}, OPTtags = {parallel; graph; streaming}, year = 2010, address = {Atlanta, GA}, month = apr, dom = 23, acc-note = {(11/22 papers accepted, 50\% acceptance rate)}, doi = {10.1109/IPDPSW.2010.5470687}, file = {material/StreamingCC-MTAAP10.pdf}, abstract = {We present a new approach for parallel massive graph analysis of streaming, temporal data with a dynamic and extensible representation. Handling the constant stream of new data from health care, security, business, and social network applications requires new algorithms and data structures. We examine data structure and algorithm trade-offs that extract the parallelism necessary for high-performance updating analysis of massive graphs. Static analysis kernels often rely on storing input data in a specific structure. Maintaining these structures for each possible kernel with high data rates incurs a significant performance cost. A case study computing clustering coefficients on a general-purpose data structure demonstrates incremental updates can be more efficient than global recomputation. Within this kernel, we compare three methods for dynamically updating local clustering coefficients: a brute-force local recalculation, a sorting algorithm, and our new approximation method using a Bloom filter. On 32 processors of a Cray XMT with a synthetic scale-free graph of $2^{24} \approx 16$ million vertices and $2^{29} \approx 537$ million edges, the brute-force method processes a mean of over 50\,000 updates per second and our Bloom filter approaches 200\,000 updates per second.}, projtag = {cassmt, intel-sting}, keywords = {hpda, graph analysis, streaming data, parallel algorithm}, ejr-proj = {high-performance-data-analysis, graph-analysis}, ejr-grant = {intel-sting, cassmt}, } @InProceedings{lapack-prospectus, author = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov}, ejr-withauthor ={James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov}, title = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries}, booktitle = {{PARA'06}: State-of-the-Art in Scientific and Parallel Computing}, year = 2006, address = {Ume{\aa}, Sweden}, month = jun, organization = {High Performance Computing Center North ({HPC2N}) and the Department of Computing Science, Ume{\aa} University}, publisher = {Springer}, role = {proceedings}, OPTtags = {lapack}, url = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/para06-lapack.pdf}, abstract = {LAPACK and ScaLAPACK are widely used software libraries for numerical linear algebra. There have been over 68M web hits at www.netlib.org for the associated libraries LAPACK, ScaLAPACK, CLAPACK and LAPACK95. LAPACK and ScaLAPACK are used to solve leading edge science problems and they have been adopted by many vendors and software providers as the basis for their own libraries, including AMD, Apple (under Mac OS X), Cray, Fujitsu, HP, IBM, Intel, NEC, SGI, several Linux distributions (such as Debian), NAG, IMSL, the MathWorks (producers of MATLAB), Interactive Supercomputing, and PGI. Future improvements in these libraries will therefore have a large impact on users.}, doi = {10.1007/978-3-540-75755-9\_2}, file = {material/lapack-prospectus.pdf}, projtag = {lapack}, keywords = {lapack, linear algebra, floating point}, ejr-proj = {linear-algebra, floating-point}, } @InProceedings{arith-lang, author = {David Hough and Bill Hay and Jeff Kidder and E. Jason Riedy and Guy L. Steele Jr. and Jim Thomas}, ejr-withauthor ={David Hough and Bill Hay and Jeff Kidder and Guy L. Steele Jr. and Jim Thomas}, title = {Arithmetic Interactions: From Hardware to Applications}, booktitle = {17th {IEEE} Symposium on Computer Arithmetic ({ARITH}'05)}, year = 2005, dom = 28, month = jun, note = {See \hrefx{http://purl.oclc.org/NET/jason-riedy/resume/material/arith17-slides.pdf}{related presentation}}, ISBN = "0-7695-2366-8", role = {proceedings; panel}, OPTtags = {ieee754; floating point}, doi = {10.1109/ARITH.2005.10}, abstract = {The entire process of creating and executing applications that solve interesting problems with acceptable cost and accuracy involves a complex interaction among hardware, system software, programming environments, mathematical software libraries, and applications software, all mediated by standards for arithmetic, operating systems, and programming environments. This panel will discuss various issues arising among these various contending points of view, sometimes from the point of view of issues raised during the current IEEE 754R standards revision effort.}, projtag = {ieee754}, keywords = {ieee754, floating point}, ejr-proj = {floating-point}, } @InProceedings{ia-cost, author = {Joseph N. Wilson and E. Jason Riedy}, ejr-withauthor ={Joseph N. Wilson}, title = {Efficient {SIMD} evaluation of image processing programs}, booktitle = {Parallel and Distributed Methods for Image Processing}, pages = {199--210}, year = 1997, month = jul, dom = 28, editor = {Hongchi Shi and Patrick C. Coffield}, volume = 3166, address = {San Diego, CA}, organization = {SPIE}, role = {proceedings}, OPTtags = {spie; image algebra; parallel algorithms}, file = {material/ia-cost.pdf}, doi = {10.1117/12.279618}, abstract = {SIMD parallel systems have been employed for image processing and computer vision applications since their inception. This paper describes a system in which parallel programs are implemented using a machine-independent, retargetable object library that provides SIMD execution on the Lockheed Martin PAL-I SIMD parallel processor. Programs' performance on this machine is improved through on-the-fly execution analysis and scheduling. We describe the relevant elements of the system structure, the general scheme for execution analysis, and the current cost model for scheduling.}, projtag = {image-algebra}, keywords = {image algebra, parallel algorithm}, }