/PTXInstructionGeneratorAndTimer

Converts CUDA PTX ISA instructions into their fully expanded variants, with timings/benchmarks for each.

Primary language: C# · License: MIT

PTX Instruction Generator and Timer

A tool that creates a list of all the different types of PTX instructions from a main list.

This tool takes a list of PTX instructions, expands each one into all of its possible variants, and then times the latency of each resulting instruction.

### What this tool does

  1. It will take an input file like so:
[class=add,popularity=4] add.(rn|rz).(|sat).(f32) _d$3, _s$3, _s$3;
[class=sub,popularity=4] sub.(rn|rz).(|sat).(f32) _d$3, _s$3, _s$3;
[class=mul,popularity=4] mul.(rn|rz).(|sat).(f32) _d$3, _s$3, _s$3;
[class=mad,popularity=3] mad.(lo|hi).(s32|u32) _d$2, _s$2, _s$2, _s$2;
[class=mad,popularity=3] mad.hi.sat.(s32) _d$1, _s$1, _s$1, _s$1;
[class=div,popularity=4] div.approx.(f32) _d$1, _s$1, _s$1;
  2. It will then expand each line into every possible version of that instruction. For the first line above, it takes "add.(rn|rz).(|sat).(f32) _d$3, _s$3, _s$3;" and combines each option from ("rn" or "rz") with each option from ("" or "sat") to generate four different combinations:

add.rn.f32 _df32, _sf32, _sf32;
add.rn.sat.f32 _df32, _sf32, _sf32;
add.rz.f32 _df32, _sf32, _sf32;
add.rz.sat.f32 _df32, _sf32, _sf32;

The "$3" in the parameters is replaced by the contents of the 3rd set of parentheses, which in this case is "f32".

One final note: the "s" or "d" immediately after the underscore in each parameter specifies whether it is a source (input) or destination (output) register.

  3. After expanding each item it will then time it. Please note that some items will report zero ticks. This happens when the PTX is converted to SASS and the compiler optimizes out the instruction we are trying to test. So in a way it really is 0 clock cycles, but if the test code is changed a bit, the result will probably reflect the real clock-cycle cost.

  4. Finally, everything is written to an XML file and a C# class file. Here are some examples of each:

#### The XML output file:

<?xml version="1.0"?>
<!--PTX Instructions List-->
<!--Generated by the PTX instruction generator-->
<Everything>
<InstructionClasses>
  <Class text="add" count="4" popularity="3640" id="0"/>
  <Class text="sub" count="4" popularity="3640" id="1"/>
  <Class text="mul" count="4" popularity="3640" id="2"/>
  <Class text="mad" count="4" popularity="3640" id="3"/>
  <Class text="mad" count="1" popularity="3640" id="4"/>
  <Class text="div" count="1" popularity="3640" id="5"/>
  <Class text="abs" count="1" popularity="3640" id="6"/>
  <Class text="neg" count="1" popularity="3640" id="7"/>
  <Class text="max" count="1" popularity="3640" id="8"/>
   ...
</InstructionClasses>
<Instructions>
  <Instruction text="add.rn.f32" id="0" outputType="0" ticks="3" subpopularity="910.0000000" cat="0"/>
  <Instruction text="add.rn.sat.f32" id="1" outputType="0" ticks="14" subpopularity="910.0000000" cat="0"/>
  <Instruction text="add.rz.f32" id="2" outputType="0" ticks="12" subpopularity="910.0000000" cat="0"/>
  <Instruction text="add.rz.sat.f32" id="3" outputType="0" ticks="12" subpopularity="910.0000000" cat="0"/>
  <Instruction text="sub.rn.f32" id="4" outputType="0" ticks="3" subpopularity="910.0000000" cat="1"/>
  <Instruction text="sub.rn.sat.f32" id="5" outputType="0" ticks="3" subpopularity="910.0000000" cat="1"/>
  <Instruction text="sub.rz.f32" id="6" outputType="0" ticks="3" subpopularity="910.0000000" cat="1"/>
  <Instruction text="sub.rz.sat.f32" id="7" outputType="0" ticks="3" subpopularity="910.0000000" cat="1"/>
  <Instruction text="mul.rn.f32" id="8" outputType="0" ticks="3" subpopularity="910.0000000" cat="2"/>
  <Instruction text="mul.rn.sat.f32" id="9" outputType="0" ticks="3" subpopularity="910.0000000" cat="2"/>
  <Instruction text="mul.rz.f32" id="10" outputType="0" ticks="12" subpopularity="910.0000000" cat="2"/>
  <Instruction text="mul.rz.sat.f32" id="11" outputType="0" ticks="14" subpopularity="910.0000000" cat="2"/>
  <Instruction text="mad.lo.s32" id="12" outputType="10" ticks="3" subpopularity="910.0000000" cat="3"/>
  <Instruction text="mad.lo.u32" id="13" outputType="6" ticks="3" subpopularity="910.0000000" cat="3"/>
  <Instruction text="mad.hi.s32" id="14" outputType="10" ticks="3" subpopularity="910.0000000" cat="3"/>
  <Instruction text="mad.hi.u32" id="15" outputType="6" ticks="12" subpopularity="910.0000000" cat="3"/>
  <Instruction text="mad.hi.sat.s32" id="16" outputType="10" ticks="12" subpopularity="3640.0000000" cat="4"/>
  <Instruction text="div.approx.f32" id="17" outputType="0" ticks="44" subpopularity="3640.0000000" cat="5"/>
  <Instruction text="abs.f32" id="18" outputType="0" ticks="14" subpopularity="3640.0000000" cat="6"/>
  <Instruction text="neg.f32" id="19" outputType="0" ticks="3" subpopularity="3640.0000000" cat="7"/>
  <Instruction text="max.f32" id="20" outputType="0" ticks="3" subpopularity="3640.0000000" cat="8"/>
   ....
</Instructions>
</Everything>

#### The C# output file:

using System;
using System.Collections.Generic;

namespace InstNS
{
    public static class Tools
    {
        public static InstCat[] instCats = {
		  new InstCat(0,4,3640,"add"),
          new InstCat(1,4,3640,"sub"),
          new InstCat(2,4,3640,"mul"),
          new InstCat(3,4,3640,"mad"),
          new InstCat(4,1,3640,"mad"),
          new InstCat(5,1,3640,"div"),
          new InstCat(6,1,3640,"abs"),
          new InstCat(7,1,3640,"neg"),
          new InstCat(8,1,3640,"max"),
        };
public static Inst[] insts = {
          new Inst(0,0,910,1,"add.rn.f32",VarType.f32,0, false),
          new Inst(1,0,910,4,"add.rn.sat.f32",VarType.f32,0, false),
          new Inst(2,0,910,4,"add.rz.f32",VarType.f32,0, false),
          new Inst(3,0,910,4,"add.rz.sat.f32",VarType.f32,0, false),
          new Inst(4,1,910,1,"sub.rn.f32",VarType.f32,0, false),
          new Inst(5,1,910,1,"sub.rn.sat.f32",VarType.f32,0, false),
          new Inst(6,1,910,1,"sub.rz.f32",VarType.f32,0, false),
          new Inst(7,1,910,1,"sub.rz.sat.f32",VarType.f32,0, false),
          new Inst(8,2,910,1,"mul.rn.f32",VarType.f32,0, false),
          new Inst(9,2,910,1,"mul.rn.sat.f32",VarType.f32,0, false),
          new Inst(10,2,910,4,"mul.rz.f32",VarType.f32,0, false),
          new Inst(11,2,910,4,"mul.rz.sat.f32",VarType.f32,0, false),
          new Inst(12,3,910,1,"mad.lo.s32",VarType.s32,0, false),
          new Inst(13,3,910,1,"mad.lo.u32",VarType.u32,0, false),
          new Inst(14,3,910,1,"mad.hi.s32",VarType.s32,0, false),
          new Inst(15,3,910,4,"mad.hi.u32",VarType.u32,0, false),
          new Inst(16,4,3640,4,"mad.hi.sat.s32",VarType.s32,0, false),
          new Inst(17,5,3640,14,"div.approx.f32",VarType.f32,0, false),
          new Inst(18,6,3640,4,"abs.f32",VarType.f32,0, false),
          new Inst(19,7,3640,1,"neg.f32",VarType.f32,0, false),
          new Inst(20,8,3640,1,"max.f32",VarType.f32,0, false),
          new Inst(21,9,3640,1,"min.f32",VarType.f32,0, false),
        };
  }
... plus some useful functions and classes
}

### Change History

- **Apr 27 2009** - Created first working version for PTX 1.x
- **Sep 25 2011** - Updated for newer PTX version
- **Sep 9 2016** - Posted on GitHub and added license headers ... exactly 5 years later =)
- **Oct 2 2016** - Updated C# CUDA wrapper from Cuda.Net to ManagedCuda. It now runs, but it could still use more updating. Some documentation and code cleanup was also done.