# HG changeset patch
# User Nina Engelhardt <nengel@mailbox.tu-berlin.de>
# Date 1348581333 -7200
# Node ID 897f711a7157c133b3bbdd2d9bb2e924baa9de8b
# Parent  11d15c47beaf309fb4478842555b5448df024fbf
rearrange to work with autoconf

diff -r 11d15c47beaf -r 897f711a7157 COPYING.GPLv3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COPYING.GPLv3	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff -r 11d15c47beaf -r 897f711a7157 README.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.txt	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,79 @@
+App: h264dec
+
+This application decodes H.264 raw videos.
+
+Build Sequential/Pthreads:
+
+autoreconf -i -f
+mkdir build
+cd build
+../configure --enable-ssse3 --enable-sdl2
+make
+
+Build OmpSs:
+
+autoreconf -i -f
+mkdir build-ss
+cd build-ss
+../configure CC=sscc --enable-ssse3 --enable-sdl2
+make
+
+--enable-ssse3 enables assembler optimizations up to SSSE3 (optional)
+--enable-sdl2  enables a rudimentary viewing capability   (optional)
+
+Usage Sequential/Pthreads:
+./h264dec -i $(INPUT_VIDEO) -s
+./h264dec -i $(INPUT_VIDEO) -t $(THREADS)
+
+Usage OmpSs:
+NX_PES=<numthreads> ./h264dec -i <inputfile> -e <num parallel entropy frames> -z <width> <height> --static-3d
+
+-e specifies the number of entropy decode pipeline buffers; ideally this should be
+the same as the number of threads.
+
+-z sets the grouped block size used for macroblock (MB) reconstruction. A size between 6 by 6
+and 10 by 10 was found to strike a good balance between overhead and parallelism, but the best
+value is machine and input dependent.
+
+--static-3d performs overlapping wavefront decoding.
+
+General usage:
+-d 				displays output
+-f 				fullscreen
+-o $(OUT_FILE)  write raw YUV
+-v  			show framerate
+
+
+The INPUT_VIDEOs are in "inputs_encore", but the decoder should be able to decode any raw H.264
+stream that uses one slice per frame, is non-interlaced, and uses CABAC and YUV420.
+
+
+Integrated OmpSs player demo
+----------------------------
+NOTE: for the player demo SDL2 must be installed.
+
+1. Go to the OmpSs build directory (/home/cchi/Projects/ffmpeg_smp/build-ss)
+
+2. Launch the H.264 decoder with the desired options:
+
+NX_PES=<numthreads> ./h264dec -i <inputfile> -v (verbose) -e <num parallel entropy frames> -z <width> <height> -d (display) -f (fullscreen)
+
+Note that <num parallel entropy frames> should be equal to or higher than <numthreads> for optimal performance.
+
+Examples:
+
+NX_PES=7 ./h264dec -i ../../h264_movies/park_joy_2160px5.h264 -v -z 8 8 -df -e 9
+NX_PES=7 ./h264dec -i ../../h264_movies/big_buck_bunny_1080p24.h264 -v -d  -z 6 6 -e 9
+
+Interacting with the program
+----------------------------
+<CTRL+F>    Fullscreen mode
+<ESCAPE>    Window mode
+<SPACE>     Pause/resume
+<M>         Show/hide macroblock borders
+<arrows>    When macroblock borders are shown resizes the macroblocks
+<ALT+F4>    Close
+
+Force close in case of lockup
+-----------------------------
+On a terminal: killall -9 h264dec
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/benchmark.sh
--- a/ffmpeg_smp/benchmark.sh	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,126 +0,0 @@
-#! /bin/bash
-
-workers=(1 4 8 12 16 20 24 28 32)
-cpus=(0 3 7 15 15 23 23 31 31)
-nodes=(0 0 0 1 1 2 2 3 3)
-
-confs=( "1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 8" "3 6 10" "4 7 12" "4 8 15" "5 8 17"		#small
-		"1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 7" "3 6 9" "4 7 12" "4 8 13" "5 10 15")    #large
-
-
-
-#confsmall=("1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 8" "3 6 10" "4 7 12" "4 8 15" "5 8 17")
-# "7 10 21" "8 12 25" "10 15 29" "11 17 32")
-#conflarge=("1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 7" "3 6 9" "4 7 12" "4 8 13" "5 10 15")
-#"5 12 21" "6 15 25" "7 17 30" "8 19 36")
-
-
-configs=9
-
-average_ompss_2d=0
-average_ompss_3d=0
-average_pthread=0
-average_serial=0
-
-iterations_low=4
-iterations_high=8
-
-nframes=10000  # max frames limit for debug purpose
-inputs=("14" "10")
-inputs_vebose=("Big Bug Bunny 1920x1080 10000 frames" "Park Joy 3840x2160 2500 frames")
-osargs=("-z 8 8" "-z 12 12 --static-3d")
-
-time_stamp=`date +%Y.%m.%d_%H.%M.%S`
-outputdir="/home/stefan.hauser/ffmpeg_smp/ppopp_results/rx600s5-1t/$time_stamp"
-ompss_2d="$outputdir/ompss_2d.txt"
-ompss_3d="$outputdir/ompss_3d.txt"
-pthread="$outputdir/pthread.txt"
-serial="$outputdir/serial.txt"
-
-#executes the experiments for a single conf $1=confnum $2 iterations $3 input_idx
-function execute_single_conf {
-	conf=$1
-	iter=$2
-	iidx=$3
-
-	average_ompss_2d=0
-	average_ompss_3d=0
-	average_pthread=0
-
-	echo "Workers: " ${workers[$conf]} | tee -a $ompss_2d $ompss_3d $pthread $serial
-
-	cd build-ss
-	for ((i=1;i<=$iter;i+=1)); do
-	    # OMPSS
-	    #export CSS_NUM_CPUS=$worker
-	    NX_PES=${workers[$conf]} numactl --interleave=0-${nodes[$conf]} time -p ./ffmpeg -i ${inputs[$iidx]} -n $nframes -e $((${workers[$conf]}+1)) ${osargs[0]} 2> output
-		runtime=$(cat output | grep real | sed s/^.*l.//g)
-	    average_ompss_2d=$(echo "$average_ompss_2d + $runtime"|bc)
-	    echo -n $runtime " " >> $ompss_2d
-	done
-
-	for ((i=1;i<=$iter;i+=1)); do
-		NX_PES=${workers[$conf]} numactl --interleave=0-${nodes[$conf]} time -p ./ffmpeg -i ${inputs[$iidx]} -n $nframes -e $((${workers[$conf]}+1)) ${osargs[1]} 2> output
-		runtime=$(cat output | grep real | sed s/^.*l.//g)
-	    average_ompss_3d=$(echo "$average_ompss_3d + $runtime"|bc)
-	    echo -n $runtime " " >> $ompss_3d
-	done
-	cd ..
-
-	cd build
-	for ((i=1;i<=$iter;i+=1)); do
-		# Pthreads
-	    numactl --physcpubind=0-$((${cpus[$conf]})) time -p ./ffmpeg -i ${inputs[$iidx]} -n $nframes -t ${confs[$(($conf + $iidx * $configs))]} 2> output
-		runtime=$(cat output | grep real | sed s/^.*l.//g)
-	    average_pthread=$(echo "$average_pthread + $runtime"|bc)
-	    echo -n $runtime " " >> $pthread
-	done
-	cd ..
-
-	echo "" | tee -a $pthread $ompss_2d $ompss_3d
-	average_ompss_2d=$(echo "scale=5;$average_ompss_2d/$iter"|bc)
-	average_ompss_3d=$(echo "scale=5;$average_ompss_3d/$iter"|bc)
-	average_pthread=$(echo "scale=5;$average_pthread/$iter"|bc)
-
-	echo "time: " $average_ompss_2d >> $ompss_2d
-	echo "time: " $average_ompss_3d >> $ompss_3d
-	echo "time: " $average_pthread >> $pthread
-	echo "time: " $average_serial >> $serial
-}
-
-
-mkdir $outputdir
-
-echo "Processing inputs ..."
-
-echo "h264dec Benchmark" | tee $ompss_2d $ompss_3d $pthread $serial
-
-for n in 0 1; do
-	echo "Input: ${inputs_vebose[$n]}" | tee -a $ompss_2d $ompss_3d $pthread $serial
-	echo "" | tee -a $ompss_2d $ompss_3d $pthread $serial
-
-	# Serial
-	cd build
-	numactl --physcpubind=0 time -p ./ffmpeg -i ${inputs[$n]} -n $nframes -s 2> output
-	runtime=$(cat output | grep real | sed s/^.*l.//g)
-	average_serial=$runtime
-	cd ..
-
-	execute_single_conf 0 1 $n
-
-	#Parallel
-	for ((confidx=1;confidx<=4;confidx+=1)); do
-		execute_single_conf $confidx $iterations_low $n		
-	done
-
-	for ((confidx=5;confidx<=$(($configs-1));confidx+=1)); do
-		execute_single_conf $confidx $iterations_high $n		
-	done
-
-	echo "-------------------" | tee -a $ompss_2d $ompss_3d $pthread $serial
-done
-
-echo "FINISHED"
-
-rm build/output build-ss/output
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/COPYING.GPLv3
--- a/ffmpeg_smp/h264dec/COPYING.GPLv3	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,674 +0,0 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-                            Preamble
-
-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.  We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors.  You can apply it to
-your programs, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
-  To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights.  Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received.  You must make sure that they, too, receive
-or can get the source code.  And you must show them these terms so they
-know their rights.
-
-  Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-
-  For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software.  For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
-  Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so.  This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software.  The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable.  Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products.  If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
-  Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary.  To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-
-                       TERMS AND CONDITIONS
-
-  0. Definitions.
-
-  "This License" refers to version 3 of the GNU General Public License.
-
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
-  1. Source Code.
-
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
-  The Corresponding Source for a work in source code form is that
-same work.
-
-  2. Basic Permissions.
-
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
-  4. Conveying Verbatim Copies.
-
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
-  5. Conveying Modified Source Versions.
-
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
-  6. Conveying Non-Source Forms.
-
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
-  7. Additional Terms.
-
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
-  8. Termination.
-
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
-  9. Acceptance Not Required for Having Copies.
-
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
-  10. Automatic Licensing of Downstream Recipients.
-
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
-  11. Patents.
-
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
-  12. No Surrender of Others' Freedom.
-
-  If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
-  13. Use with the GNU Affero General Public License.
-
-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
-  14. Revised Versions of this License.
-
-  The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
-  If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
-  15. Disclaimer of Warranty.
-
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. Limitation of Liability.
-
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
-  17. Interpretation of Sections 15 and 16.
-
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
-                     END OF TERMS AND CONDITIONS
-
-            How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
-    <program>  Copyright (C) <year>  <name of author>
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<http://www.gnu.org/licenses/>.
-
-  The GNU General Public License does not permit incorporating your program
-into proprietary programs.  If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library.  If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.  But first, please read
-<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/README.txt
--- a/ffmpeg_smp/h264dec/README.txt	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,79 +0,0 @@
-App: h264dec
-
-This application decodes H.264 raw videos.
-
-Build Sequential/Pthreads:
-
-autoreconf -i -f
-mkdir build
-cd build
-../configure --enable-ssse3 --enable-sdl2
-make
-
-Build OmpSs:
-
-autoreconf -i -f
-mkdir build
-cd build-ss
-../configure CC=sscc --enable-ssse3 --enable-sdl2
-make
-
-ssse3 enables assembler optimizations up to ssse3 (optional)
-sdl enables a rudimentary viewing capability      (optional)
-
-Usage Sequential/Pthreads:
-./h264dec -i $(INPUT_VIDEO) -s
-./h264dec -i $(INPUT_VIDEO) -t $(THREADS)
-
-Usage OmpSs:
-NX_PES=<numthreads> ./h264dec -i <inputfile> -e <num parallel entropy frames> -z <width> <height> --static-3d
-
--e specify the number of entropy decode pipeline buffers and should be ideally
-the same as the number of threads.
-
--z allows to set the MB reconstruction grouped block size. A size between 6 by 6 to 10 by 10
-was found to strike a good balance between overhead and parallelism, but is machine and input
-dependent.
-
---static-3d performs overlapping wavefront decoding.
-
-General usage:
--d 				displays output
--f 				fullscreen
--o $(OUT_FILE)  write raw YUV
--v  			show framerate
-
-
-The INPUT_VIDEOs are in "inputs_encore", but should be able to decode any raw H.264 stream using
-one slice per frame, non-interlaced, and CABAC, YUV420.
-
-
-Integrated OmpSs player demo
-----------------------------
-NOTE: for the player demo SDL2 must be installed.
-
-1. Go to the OmpSs build directory (/home/cchi/Projects/ffmpeg_smp/build-ss)
-
-2. Launch the H.264 decoder with the desired options:
-
-NX_PES=<numthreads> ./h264dec <inputfile> -v (verbose) -e <num parallel entropy frames> -z <width> <height> -d (display) -f (fullscreen)
-
-note that <num parallel entropy frames> should be equal or higher than <numthreads> for optimal performance
-
-Examples:
-
-NX_PES=7 ./h264dec -i ../../h264_movies/park_joy_2160px5.h264 -v -z 8 8 -df -e 9
-NX_PES=7 ./h264dec -i ../../h264_movies/big_buck_bunny_1080p24.h264 -v -d  -z 6 6 -e 9
-
-Interacting with the program
-----------------------------
-<CTRL+F>    Fullscreen mode
-<ESCAPE>    Window mode
-<SPACE>     Pause/resume
-<M>         Show/hide macroblock borders
-<arrows>    When macroblock borders are shown resizes the macroblocks
-<ALT+F4>    Close
-
-Force close in case of lockup
------------------------------
-On a terminal: killall -9 h264dec
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/configure.ac
--- a/ffmpeg_smp/h264dec/configure.ac	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,171 +0,0 @@
-#                                               -*- Autoconf -*-
-# Process this file with autoconf to produce a configure script.
-
-AC_PREREQ(2.61)
-AC_INIT([h264_mt], [0.1], [cchi@cs.tu-berlin.de])
-#AM_INIT_AUTOMAKE(AC_PACKAGE_NAME, AC_PACKAGE_VERSION)
-AM_INIT_AUTOMAKE([-Wall -Werror foreign])
-
-AC_CONFIG_SRCDIR([h264dec.c])
-AC_PROG_RANLIB
-
-# Checks for programs.
-AC_GNU_SOURCE
-AC_PROG_CC
-AM_CONDITIONAL([HAVE_OMPSS], [test $CC = "sscc"])
-AC_DEFINE([OMPSS], [0], [Define to 1 on when using the OmpSs compiler sscc])
-if test $CC = "sscc";then
-AC_DEFINE([OMPSS], [1], [Define to 1 on when using the OmpSs compiler sscc])
-fi
-
-#if [ test -n "${CFLAGS+x}" ] ; then
-#    CFLAGS="-O3 -g"
-#fi
-
-# Checks for libraries.
-AC_CHECK_LIB([pthread], [pthread_yield])
-AC_CHECK_LIB([spe2], [spe_image_open])
-AC_CHECK_LIB([sync], [mutex_init])
-AC_CHECK_LIB([rt], [clock_gettime])
-
-AC_ARG_ENABLE([sdl2], AS_HELP_STRING([--enable-sdl2], [Enable SDL2 playback]))
-if test "$enable_sdl2" = "yes"; then
-	AC_CHECK_LIB([SDL2], [SDL_CreateWindow], [], [echo "Error! libSDL2 required for playback." exit -1])
-fi
-
-if test "$enable_sdl2" = "yes"; then
-	AC_CHECK_LIB([X11], [XInitThreads], [], [echo "Error! libX11 currently required for SDL2 workaround." exit -1])
-fi
-
-AC_ARG_ENABLE([sdl_ttf], AS_HELP_STRING([--enable-sdl_ttf], [Enable SDL_ttf for overlaying fonts]))
-if test "$enable_sdl_ttf" = "yes"; then
-    AC_CHECK_LIB([SDL_ttf], [TTF_Init], [], [echo "Error! libSDL_ttf required for font rendering." exit -1])
-fi
-
-
-
-AC_ARG_ENABLE([opencl], AS_HELP_STRING([--enable-opencl], [Enable GPU decoder]))
-if test "$enable_opencl" = "yes"; then
-	AC_CHECK_LIB([OpenCL], [clGetPlatformIDs], [], [echo "Error! libOpenCL required for GPU functionality." exit -1])
-fi
-AM_CONDITIONAL([HAVE_OPENCL], [test "$enable_opencl" = "yes"])
-
-
-# Checks for header files.
-AC_HEADER_STDC
-AC_CHECK_HEADERS([stdint.h stdlib.h string.h unistd.h])
-
-# Checks for typedefs, structures, and compiler characteristics.
-AC_C_CONST
-AC_TYPE_UINT32_T
-AC_TYPE_UINT64_T
-AC_TYPE_UINT8_T
-AC_C_VOLATILE
-AC_C_BIGENDIAN
-
-# Checks for library functions.
-AC_CHECK_FUNCS([malloc realloc memalign posix_memalign memmove memset])
-
-AC_CANONICAL_HOST
-AC_CANONICAL_BUILD
-
-AC_MSG_CHECKING([for architecture])
-
-AC_DEFINE([ARCH_ARM], [0], [Define to 1 on arm architectures.])
-AC_DEFINE([ARCH_X86_32], [0], [Define to 1 on x86 architectures.])
-AC_DEFINE([ARCH_X86_64], [0], [Define to 1 on x86_64 architectures.])
-AC_DEFINE([ARCH_X86], [ARCH_X86_32 ||ARCH_X86_64], [True on x86])
-AC_DEFINE([ARCH_PPC], [0], [Define to 1 on ppc architectures.])
-AC_DEFINE([ARCH_PPC64], [0], [Define to 1 on ppc64 architectures.])
-AC_DEFINE([ARCH_CELL], [0], [Define to 1 on cell architectures.])
-
-if test "$enable_optimizations" != "no"; then
-	case $build_cpu in
-		arm )
-			arch="arm"
-			AC_MSG_RESULT([arm])
-			AC_DEFINE([ARCH_ARM], [1], [Define to 1 on arm architectures.])
-			;;
-		i686 )
-			arch="x86"
-			AC_MSG_RESULT([x86])
-			AC_DEFINE([ARCH_X86_32], [1], [Define to 1 on x86 architectures.])
-			;;
-		x86_64 )
-			arch="x86_64"
-			AC_MSG_RESULT([x86_64])
-			AC_DEFINE([ARCH_X86_64], [1], [Define to 1 on x86 architectures.])
-			;;
-		powerpc64 )
-			AC_DEFINE([HAVE_BIGENDIAN], [1], [Define to 1 on bigendian architectures.])
-			if grep -E ^cpu /proc/cpuinfo | grep -q Cell ; then
-				arch="cell"
-				AC_MSG_RESULT([cell])
-				AC_DEFINE([ARCH_CELL], [1], [Define to 1 on cell architectures.])
-			else
-				arch="powerpc64"
-				AC_MSG_RESULT([ppc64])
-				AC_DEFINE([ARCH_PPC64], [1], [Define to 1 on ppc64 architectures.])
-			fi
-			;;
-		* )
-			AC_MSG_RESULT([default (little endian).])
-			;;
-	esac
-fi
-
-AM_CONDITIONAL([HAVE_CELL], [test $arch = "cell"])
-
-# Additional options
-AC_ARG_ENABLE([optimizations], AS_HELP_STRING([--disable-optimizations], [Disable all architecture specific optimizations. Compiler optimizations are not disabled.]))
-
-AC_DEFINE([HAVE_SSE], [0], [Define to 1 to enable sse optimizations.])
-AC_DEFINE([HAVE_MMX], [0], [Define to 1 to enable mmx optimizations.])
-AC_DEFINE([HAVE_MMX2], [0], [Define to 1 to enable mmx2 optimizations.])
-AC_DEFINE([HAVE_SSSE3], [0], [Define to 1 to enable ssse3 optimizations.])
-AC_DEFINE([HAVE_ALTIVEC], [0], [Define to 1 to enable altivec optimizations.])
-AC_DEFINE([HAVE_NEON], [0], [Define to 1 to enable neon optimizations.])
-
-AC_ARG_ENABLE([ssse3], AS_HELP_STRING([--enable-ssse3], [Enable ssse3 optimizations]))
-if test "$enable_ssse3" = "yes"; then
-	AC_DEFINE([HAVE_SSSE3], [1], [Define to 1 to enable ssse3 optimizations.])
-	AC_DEFINE([HAVE_SSE], [1], [Define to 1 to enable sse optimizations.])
-	AC_DEFINE([HAVE_MMX], [1], [Define to 1 to enable mmx optimizations.])
-	AC_DEFINE([HAVE_MMX2], [1], [Define to 1 to enable mmx2 optimizations.])
-	ARCH_SUBDIR=x86
-fi
-
-AC_ARG_ENABLE([sse], AS_HELP_STRING([--enable-sse], [Enable sse optimizations]))
-if test "$enable_sse" = "yes"; then
-	AC_DEFINE([HAVE_SSE], [1], [Define to 1 to enable sse optimizations.])
-	AC_DEFINE([HAVE_MMX], [1], [Define to 1 to enable mmx optimizations.])
-	AC_DEFINE([HAVE_MMX2], [1], [Define to 1 to enable mmx2 optimizations.])
-	ARCH_SUBDIR=x86
-fi
-
-AC_ARG_ENABLE([altivec], AS_HELP_STRING([--enable-altivec], [Enable altivec optimizations]))
-if test "$enable_altivec" = "yes"; then
-	AC_DEFINE([HAVE_ALTIVEC], [1], [Define to 1 to enable altivec optimizations.])
-	ARCH_SUBDIR="$ARCH_SUBDIR ppc"
-	TMPCLAGS=$CFLAGS
-	CFLAGS="$CFLAGS -maltivec"
-	AC_CHECK_HEADERS(altivec.h)
-	CFLAGS=$TMPCLAGS
-fi
-
-AC_ARG_ENABLE([neon], AS_HELP_STRING([--enable-neon], [Enable neon optimizations]))
-if test "$enable_neon" = "yes"; then
-	AC_DEFINE([HAVE_NEON], [1], [Define to 1 to enable neon optimizations.])
-	ARCH_SUBDIR=arm
-fi
-
-AM_CONDITIONAL([HAVE_ARCH_SUBDIR], [test "$ARCH_SUBDIR" != ""])
-AC_SUBST([ARCH_SUBDIR])
-
-AC_DEFINE([HAVE_NEON], [0], [Define to 1 to enable neon optimizations.])
-
-AC_CONFIG_HEADER([config.h])
-
-AC_CONFIG_FILES([Makefile libavutil/Makefile libavcodec/Makefile libavcodec/x86/Makefile libavcodec/ppc/Makefile libavcodec/cell/Makefile])
-
-AC_OUTPUT
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/h264dec.c
--- a/ffmpeg_smp/h264dec/h264dec.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,288 +0,0 @@
-/*
-* H264 decoder main
-*/
-
-#include "config.h"
-#include "libavcodec/h264.h"
-
-#include <string.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <signal.h>
-#include <unistd.h>
-#include <getopt.h>
-#include <fcntl.h>
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <time.h>
-
-#include <assert.h>
-
-
-static const char program_name[] = "h264dec";
-static const int program_birth_year = 2010;
-
-static const char *file_name;
-static int ifile, ofile;
-static int no_arch =0;
-static int parallel = 1;
-static int frame_width  = 0;
-static int frame_height = 0;
-
-static void av_exit(int ret)
-{
-    //do some free calls
-#undef exit
-    exit(ret);
-}
-
-static void opt_input_file(const char *filename)
-{
-    /* open the input file */
-    ifile = open(filename, O_RDONLY, 0666);
-    if (ifile < 0){
-        fprintf(stderr, "Failed to open %s\n", filename);
-        av_exit(-1);
-    }
-
-    //parse first frame to get resolution (other information available but not used)
-    H264Slice slice;
-    PictureInfo pi;
-    GetBitContext gb = {0,};
-    ParserContext *pc;
-    NalContext *nc;
-
-    pc = get_parse_context(ifile);
-    nc = get_nal_context(0, 0);
-
-    memset(&slice, 0, sizeof(H264Slice));
-    slice.current_picture_info=&pi;
-
-    av_read_frame_internal(pc, &gb);
-    decode_nal_units(nc, &slice, &gb);
-
-    frame_width = nc->width;
-    frame_height= nc->height;
-
-    //clean up
-    av_freep(&gb.raw);
-    if (gb.rbsp)
-        av_freep(&gb.rbsp);
-    free_parse_context(pc);
-    free_nal_context(nc);
-
-    //rewind file
-    int offset;
-    if ( (offset=lseek(ifile, 0, SEEK_SET)) ){
-        fprintf(stderr, "Rewind input file %s failed at offset %d\n", filename, offset);
-    }
-
-}
-
-static void opt_output_file(const char *filename)
-{
-    if (filename){
-        if (!strcmp(filename, "-"))
-            filename = "pipe:";
-
-        ofile = open(filename, O_CREAT | O_TRUNC | O_WRONLY, 0666);
-    }else{
-        ofile =0;
-    }
-}
-
-static void show_usage(void)
-{
-    printf("usage: ffmpeg [options] -i infile }...\n");
-    printf("\n");
-}
-
-static struct option long_options[] = {
-    {"static-sched", 0, 0, 0},
-    {"static-mbd", 0, 0, 0},
-    {"numamap", 0, 0, 0},
-    {"no-mbd", 0, 0, 0},
-    {"static-3d", 0, 0, 0},
-    {"slice-bufs", 1, 0, 0},
-    {"smt", 0, 0, 0},
-    {"noarch", 0, 0, 'a'},
-    {"display", 0, 0, 'd'},
-    {"fullscreen", 0, 0, 'f'},
-    {"numframes", 1, 0, 'n'},
-    {"use-ppe-ed", 1, 0, 'p'},
-    {"sequential", 0, 0, 's'},
-    {"threads", 1, 0, 't'},
-    {"verbose", 1, 0, 'v'},
-    {"wave-order", 1, 0, 'w'},
-    {"smb-size", 1, 0, 'z'},
-    {"pipe-bufs", 1, 0, 'e'},
-    {0, 0, 0, 0}
-};
-
-static h264_options cli_opts;
-static void parse_cmd(int argc, char **argv)
-{
-    int c;
-    int digit_optind = 0;
-    int option_index = 0;
-    char ofile_name[1024];
-    extern char *optarg;
-    extern int optind, optopt;
-
-    cli_opts.statsched =0;
-    cli_opts.numamap =0;
-    cli_opts.statmbd =0;
-    cli_opts.no_mbd= 0;
-    cli_opts.numframes = INT_MAX;
-    cli_opts.display=0;
-    cli_opts.fullscreen=0;
-    cli_opts.verbose=0;
-    cli_opts.ppe_ed=0;
-    cli_opts.profile=0;
-    cli_opts.threads = 1;
-    cli_opts.smb_size[0] = cli_opts.smb_size[1] = 1;
-    cli_opts.wave_order=0;
-    cli_opts.static_3d=0;
-    cli_opts.pipe_bufs=8;
-    cli_opts.slice_bufs=1;
-    cli_opts.smt= 0;
-    while ((c = getopt_long(argc, argv, "ade:fi:n:o:p:st:vwz:", long_options, &option_index)) != -1 ){
-        int this_option_optind = optind ? optind : 1;
-
-        switch (c){
-            case 0:
-                if (option_index==0){
-                    cli_opts.statsched=1;
-                }else if (option_index==1){
-                    cli_opts.statmbd= 1;
-                }else if (option_index==2){
-                    cli_opts.numamap= 1;
-                }else if (option_index==3){
-                    cli_opts.no_mbd= 1;
-                }else if (option_index==4){
-                    cli_opts.static_3d= 1;
-                }else if (option_index==5){
-                    cli_opts.slice_bufs= (unsigned) atoi(optarg);
-                }else if (option_index==6){
-                    cli_opts.smt= 1;
-                }
-                break;
-            case '0':
-            case '1':
-            case '2':
-                if (digit_optind != 0 && digit_optind != this_option_optind)
-                    printf("digits occur in two different argv-elements.\n");
-                digit_optind = this_option_optind;
-                printf("option %c\n", c);
-                break;
-            case 'a':
-                no_arch=1;
-                break;
-            case 'd':
-                cli_opts.display=1;
-                break;
-            case 'f':
-                cli_opts.fullscreen=1;
-                break;
-            case 'i':
-                file_name = (const char *)optarg;
-                opt_input_file(file_name);
-                break;
-            case 'n':
-                cli_opts.numframes = (unsigned) atoi(optarg);
-                break;
-            case 'o':
-                strcpy(ofile_name, optarg);
-                opt_output_file(ofile_name);
-                break;
-            case 'p':
-                cli_opts.profile = (unsigned) atoi(optarg);
-                break;
-            case 's':
-                cli_opts.threads = 0;
-                parallel = 0;
-                break;
-            case 't':
-                cli_opts.threads = atoi(optarg);
-                if (cli_opts.threads<=0){
-                    fprintf(stderr, "Option -%c requires thread numbers > 0\n", c);
-                    av_exit(-1);
-                }
-                break;
-            case 'v':
-                cli_opts.verbose = 1;
-                break;
-            case 'w':
-                cli_opts.wave_order = 1;
-                break;
-            case 'z': // only useful in ompss
-                if (argc < optind +1){
-                    fprintf(stderr, "Option -%c (--smb-size) requires 2 arguments\n", c);
-                    av_exit(-1);
-                }
-                optind--;
-                for (int i=0; i<2; i++){
-                    cli_opts.smb_size[i] = atoi(argv[optind++]);
-                    if (!(cli_opts.smb_size > 0)){
-                        fprintf(stderr, "Option -%c (--smb-size) requires dimensions > 0\n", c);
-                        av_exit(-1);
-                    }
-                }
-                break;
-            case 'e':
-                cli_opts.pipe_bufs = atoi(optarg);
-                break;
-            case ':':
-                fprintf(stderr, "Option -%c requires an operand\n", optopt);
-                av_exit(-1);
-                break;
-            case '?':
-                fprintf(stderr, "Unrecognized option: -%c\n", optopt);
-                av_exit(-1);
-                break;
-        }
-    }
-
-}
-
-int main(int argc, char **argv)
-{
-    /* parse options */
-    parse_cmd(argc, argv);
-
-    if(!ifile ) {
-        show_usage();
-        av_exit(1);
-    }
-
-    H264Context *h = get_h264dec_context(file_name, ifile, ofile, frame_width, frame_height, &cli_opts);
-#if OMPSS
-    if (h264_decode_ompss( h ) < 0)
-        av_exit(-1);
-#else
-    if (parallel){
-        if (ARCH_CELL && !no_arch){
-            if (h264_decode_cell( h ) < 0)
-                av_exit(-1);
-        }else{
-            if (h264_decode_pthread( h ) < 0)
-                av_exit(1);
-        }
-    }else{
-        if (ARCH_CELL && !no_arch){
-            if (h264_decode_cell_seq( h ) < 0)
-                av_exit(1);
-        }else{
-            if (h264_decode_seq( h ) < 0)
-                av_exit(1);
-        }
-    }
-#endif
-    free_h264dec_context(h);
-    close(ifile);
-    close(ofile);
-
-    return 0;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/aac.h
--- a/ffmpeg_smp/h264dec/libavcodec/arm/aac.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_AAC_H
-#define AVCODEC_ARM_AAC_H
-
-#include "config.h"
-
-#if HAVE_NEON && HAVE_INLINE_ASM
-
-#define VMUL2 VMUL2
-static inline float *VMUL2(float *dst, const float *v, unsigned idx,
-                           const float *scale)
-{
-    unsigned v0, v1;
-    __asm__ volatile ("ubfx     %0,  %4,  #0, #4      \n\t"
-                      "ubfx     %1,  %4,  #4, #4      \n\t"
-                      "ldr      %0,  [%3, %0, lsl #2] \n\t"
-                      "ldr      %1,  [%3, %1, lsl #2] \n\t"
-                      "vld1.32  {d1[]},   [%5,:32]    \n\t"
-                      "vmov     d0,  %0,  %1          \n\t"
-                      "vmul.f32 d0,  d0,  d1          \n\t"
-                      "vst1.32  {d0},     [%2,:64]!   \n\t"
-                      : "=&r"(v0), "=&r"(v1), "+r"(dst)
-                      : "r"(v), "r"(idx), "r"(scale)
-                      : "d0", "d1");
-    return dst;
-}
-
-#define VMUL4 VMUL4
-static inline float *VMUL4(float *dst, const float *v, unsigned idx,
-                           const float *scale)
-{
-    unsigned v0, v1, v2, v3;
-    __asm__ volatile ("ubfx     %0,  %6,  #0, #2      \n\t"
-                      "ubfx     %1,  %6,  #2, #2      \n\t"
-                      "ldr      %0,  [%5, %0, lsl #2] \n\t"
-                      "ubfx     %2,  %6,  #4, #2      \n\t"
-                      "ldr      %1,  [%5, %1, lsl #2] \n\t"
-                      "ubfx     %3,  %6,  #6, #2      \n\t"
-                      "ldr      %2,  [%5, %2, lsl #2] \n\t"
-                      "vmov     d0,  %0,  %1          \n\t"
-                      "ldr      %3,  [%5, %3, lsl #2] \n\t"
-                      "vld1.32  {d2[],d3[]},[%7,:32]  \n\t"
-                      "vmov     d1,  %2,  %3          \n\t"
-                      "vmul.f32 q0,  q0,  q1          \n\t"
-                      "vst1.32  {q0},     [%4,:128]!  \n\t"
-                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst)
-                      : "r"(v), "r"(idx), "r"(scale)
-                      : "d0", "d1", "d2", "d3");
-    return dst;
-}
-
-#define VMUL2S VMUL2S
-static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
-                            unsigned sign, const float *scale)
-{
-    unsigned v0, v1, v2, v3;
-    __asm__ volatile ("ubfx     %0,  %6,  #0, #4      \n\t"
-                      "ubfx     %1,  %6,  #4, #4      \n\t"
-                      "ldr      %0,  [%5, %0, lsl #2] \n\t"
-                      "lsl      %2,  %8,  #30         \n\t"
-                      "ldr      %1,  [%5, %1, lsl #2] \n\t"
-                      "lsl      %3,  %8,  #31         \n\t"
-                      "vmov     d0,  %0,  %1          \n\t"
-                      "bic      %2,  %2,  #1<<30      \n\t"
-                      "vld1.32  {d1[]},   [%7,:32]    \n\t"
-                      "vmov     d2,  %2,  %3          \n\t"
-                      "veor     d0,  d0,  d2          \n\t"
-                      "vmul.f32 d0,  d0,  d1          \n\t"
-                      "vst1.32  {d0},     [%4,:64]!   \n\t"
-                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst)
-                      : "r"(v), "r"(idx), "r"(scale), "r"(sign)
-                      : "d0", "d1", "d2");
-    return dst;
-}
-
-#define VMUL4S VMUL4S
-static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
-                            unsigned sign, const float *scale)
-{
-    unsigned v0, v1, v2, v3, nz;
-    __asm__ volatile ("vld1.32  {d2[],d3[]},[%9,:32]  \n\t"
-                      "ubfx     %0,  %8,  #0, #2      \n\t"
-                      "ubfx     %1,  %8,  #2, #2      \n\t"
-                      "ldr      %0,  [%7, %0, lsl #2] \n\t"
-                      "ubfx     %2,  %8,  #4, #2      \n\t"
-                      "ldr      %1,  [%7, %1, lsl #2] \n\t"
-                      "ubfx     %3,  %8,  #6, #2      \n\t"
-                      "ldr      %2,  [%7, %2, lsl #2] \n\t"
-                      "vmov     d0,  %0,  %1          \n\t"
-                      "ldr      %3,  [%7, %3, lsl #2] \n\t"
-                      "lsr      %6,  %8,  #12         \n\t"
-                      "rbit     %6,  %6               \n\t"
-                      "vmov     d1,  %2,  %3          \n\t"
-                      "lsls     %6,  %6,  #1          \n\t"
-                      "and      %0,  %5,  #1<<31      \n\t"
-                      "lslcs    %5,  %5,  #1          \n\t"
-                      "lsls     %6,  %6,  #1          \n\t"
-                      "and      %1,  %5,  #1<<31      \n\t"
-                      "lslcs    %5,  %5,  #1          \n\t"
-                      "lsls     %6,  %6,  #1          \n\t"
-                      "and      %2,  %5,  #1<<31      \n\t"
-                      "lslcs    %5,  %5,  #1          \n\t"
-                      "vmov     d4,  %0,  %1          \n\t"
-                      "and      %3,  %5,  #1<<31      \n\t"
-                      "vmov     d5,  %2,  %3          \n\t"
-                      "veor     q0,  q0,  q2          \n\t"
-                      "vmul.f32 q0,  q0,  q1          \n\t"
-                      "vst1.32  {q0},     [%4,:128]!  \n\t"
-                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
-                        "+r"(sign), "=r"(nz)
-                      : "r"(v), "r"(idx), "r"(scale)
-                      : "d0", "d1", "d2", "d3", "d4", "d5");
-    return dst;
-}
-
-#endif /* HAVE_NEON && HAVE_INLINE_ASM */
-
-#endif /* AVCODEC_ARM_AAC_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/asm.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/asm.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#ifdef __ELF__
-#   define ELF
-#else
-#   define ELF @
-#endif
-
-        .macro require8, val=1
-ELF     .eabi_attribute 24, \val
-        .endm
-
-        .macro preserve8, val=1
-ELF     .eabi_attribute 25, \val
-        .endm
-
-        .macro function name, export=0
-        .macro endfunc
-ELF     .size   \name, . - \name
-        .endfunc
-        .purgem endfunc
-        .endm
-.if \export
-        .global EXTERN_ASM\name
-EXTERN_ASM\name:
-.endif
-ELF     .type   \name, %function
-        .func   \name
-\name:
-        .endm
-
-        .macro movrel rd, val
-#if HAVE_ARMV6T2 && !CONFIG_PIC
-        movw            \rd, #:lower16:\val
-        movt            \rd, #:upper16:\val
-#else
-        ldr             \rd, =\val
-#endif
-        .endm
-
-#if HAVE_VFP_ARGS
-        .eabi_attribute 28, 1
-#   define VFP
-#   define NOVFP @
-#else
-#   define VFP   @
-#   define NOVFP
-#endif
-
-#define GLUE(a, b) a ## b
-#define JOIN(a, b) GLUE(a, b)
-#define X(s) JOIN(EXTERN_ASM, s)
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_init_arm.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavcodec/dcadsp.h"
-
-void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
-                         int decifactor, float scale, float bias);
-
-void av_cold ff_dcadsp_init_arm(DCADSPContext *s)
-{
-    if (HAVE_NEON)
-        s->lfe_fir = ff_dca_lfe_fir_neon;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-function ff_dca_lfe_fir_neon, export=1
-        push            {r4-r6,lr}
-
-        add             r4,  r0,  r3,  lsl #2   @ out2
-        add             r5,  r2,  #256*4-16     @ cf1
-        sub             r1,  r1,  #12
-        cmp             r3,  #32
-        moveq           r6,  #256/32
-        movne           r6,  #256/64
-NOVFP   vldr            d0,  [sp, #16]          @ scale, bias
-        mov             lr,  #-16
-1:
-        vmov.f32        q2,  #0.0               @ v0
-        vmov.f32        q3,  #0.0               @ v1
-        mov             r12, r6
-2:
-        vld1.32         {q8},     [r2,:128]!    @ cf0
-        vld1.32         {q9},     [r5,:128], lr @ cf1
-        vld1.32         {q1},     [r1], lr      @ in
-        subs            r12, r12, #4
-        vrev64.32       q10, q8
-        vmla.f32        q3,  q1,  q9
-        vmla.f32        d4,  d2,  d21
-        vmla.f32        d5,  d3,  d20
-        bne             2b
-
-        add             r1,  r1,  r6,  lsl #2
-        subs            r3,  r3,  #1
-        vadd.f32        d4,  d4,  d5
-        vadd.f32        d6,  d6,  d7
-        vpadd.f32       d4,  d4,  d6
-        vdup.32         d5,  d0[1]
-        vmla.f32        d5,  d4,  d0[0]
-        vst1.32         {d5[0]},  [r0,:32]!
-        vst1.32         {d5[1]},  [r4,:32]!
-        bne             1b
-
-        pop             {r4-r6,pc}
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,712 +0,0 @@
-@
-@ ARMv4 optimized DSP utils
-@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
-@
-@ This file is part of FFmpeg.
-@
-@ FFmpeg is free software; you can redistribute it and/or
-@ modify it under the terms of the GNU Lesser General Public
-@ License as published by the Free Software Foundation; either
-@ version 2.1 of the License, or (at your option) any later version.
-@
-@ FFmpeg is distributed in the hope that it will be useful,
-@ but WITHOUT ANY WARRANTY; without even the implied warranty of
-@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-@ Lesser General Public License for more details.
-@
-@ You should have received a copy of the GNU Lesser General Public
-@ License along with FFmpeg; if not, write to the Free Software
-@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-@
-
-#include "config.h"
-#include "asm.S"
-
-        preserve8
-
-#if !HAVE_PLD
-.macro pld reg
-.endm
-#endif
-
-#if HAVE_ARMV5TE
-function ff_prefetch_arm, export=1
-        subs            r2,  r2,  #1
-        pld             [r0]
-        add             r0,  r0,  r1
-        bne             ff_prefetch_arm
-        bx              lr
-endfunc
-#endif
-
-.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
-        mov             \Rd0, \Rn0, lsr #(\shift * 8)
-        mov             \Rd1, \Rn1, lsr #(\shift * 8)
-        mov             \Rd2, \Rn2, lsr #(\shift * 8)
-        mov             \Rd3, \Rn3, lsr #(\shift * 8)
-        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
-        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
-        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
-        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
-.endm
-.macro  ALIGN_DWORD shift, R0, R1, R2
-        mov             \R0, \R0, lsr #(\shift * 8)
-        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
-        mov             \R1, \R1, lsr #(\shift * 8)
-        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)
-.endm
-.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
-        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
-        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
-        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
-        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
-.endm
-
-.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
-        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
-        @ Rmask = 0xFEFEFEFE
-        @ Rn = destroy
-        eor             \Rd0, \Rn0, \Rm0
-        eor             \Rd1, \Rn1, \Rm1
-        orr             \Rn0, \Rn0, \Rm0
-        orr             \Rn1, \Rn1, \Rm1
-        and             \Rd0, \Rd0, \Rmask
-        and             \Rd1, \Rd1, \Rmask
-        sub             \Rd0, \Rn0, \Rd0, lsr #1
-        sub             \Rd1, \Rn1, \Rd1, lsr #1
-.endm
-
-.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
-        @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
-        @ Rmask = 0xFEFEFEFE
-        @ Rn = destroy
-        eor             \Rd0, \Rn0, \Rm0
-        eor             \Rd1, \Rn1, \Rm1
-        and             \Rn0, \Rn0, \Rm0
-        and             \Rn1, \Rn1, \Rm1
-        and             \Rd0, \Rd0, \Rmask
-        and             \Rd1, \Rd1, \Rmask
-        add             \Rd0, \Rn0, \Rd0, lsr #1
-        add             \Rd1, \Rn1, \Rd1, lsr #1
-.endm
-
-.macro  JMP_ALIGN tmp, reg
-        ands            \tmp, \reg, #3
-        bic             \reg, \reg, #3
-        beq             1f
-        subs            \tmp, \tmp, #1
-        beq             2f
-        subs            \tmp, \tmp, #1
-        beq             3f
-        b    4f
-.endm
-
-@ ----------------------------------------------------------------
-        .align 5
-function ff_put_pixels16_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld             [r1]
-        push            {r4-r11, lr}
-        JMP_ALIGN       r5,  r1
-1:
-        ldm             r1,  {r4-r7}
-        add             r1,  r1,  r2
-        stm             r0,  {r4-r7}
-        pld             [r1]
-        subs            r3,  r3,  #1
-        add             r0,  r0,  r2
-        bne             1b
-        pop             {r4-r11, pc}
-        .align 5
-2:
-        ldm             r1,  {r4-r8}
-        add             r1,  r1,  r2
-        ALIGN_QWORD_D   1,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
-        pld             [r1]
-        subs            r3,  r3,  #1
-        stm             r0,  {r9-r12}
-        add             r0,  r0,  r2
-        bne             2b
-        pop             {r4-r11, pc}
-        .align 5
-3:
-        ldm             r1,  {r4-r8}
-        add             r1,  r1,  r2
-        ALIGN_QWORD_D   2,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
-        pld             [r1]
-        subs            r3,  r3,  #1
-        stm             r0,  {r9-r12}
-        add             r0,  r0,  r2
-        bne             3b
-        pop             {r4-r11, pc}
-        .align 5
-4:
-        ldm             r1,  {r4-r8}
-        add             r1,  r1,  r2
-        ALIGN_QWORD_D   3,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
-        pld             [r1]
-        subs            r3,  r3,  #1
-        stm             r0,  {r9-r12}
-        add             r0,  r0,  r2
-        bne             4b
-        pop             {r4-r11,pc}
-endfunc
-
-@ ----------------------------------------------------------------
-        .align 5
-function ff_put_pixels8_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld             [r1]
-        push            {r4-r5,lr}
-        JMP_ALIGN       r5,  r1
-1:
-        ldm             r1,  {r4-r5}
-        add             r1,  r1,  r2
-        subs            r3,  r3,  #1
-        pld             [r1]
-        stm             r0,  {r4-r5}
-        add             r0,  r0,  r2
-        bne             1b
-        pop             {r4-r5,pc}
-        .align 5
-2:
-        ldm             r1,  {r4-r5, r12}
-        add             r1,  r1,  r2
-        ALIGN_DWORD     1,   r4,  r5,  r12
-        pld             [r1]
-        subs            r3,  r3,  #1
-        stm             r0,  {r4-r5}
-        add             r0,  r0,  r2
-        bne             2b
-        pop             {r4-r5,pc}
-        .align 5
-3:
-        ldm             r1,  {r4-r5, r12}
-        add             r1,  r1,  r2
-        ALIGN_DWORD     2,   r4,  r5,  r12
-        pld             [r1]
-        subs            r3,  r3,  #1
-        stm             r0,  {r4-r5}
-        add             r0,  r0,  r2
-        bne             3b
-        pop             {r4-r5,pc}
-        .align 5
-4:
-        ldm             r1,  {r4-r5, r12}
-        add             r1,  r1,  r2
-        ALIGN_DWORD     3,   r4,  r5,  r12
-        pld             [r1]
-        subs            r3,  r3,  #1
-        stm             r0,  {r4-r5}
-        add             r0,  r0,  r2
-        bne             4b
-        pop             {r4-r5,pc}
-endfunc
-
-@ ----------------------------------------------------------------
-        .align 5
-function ff_put_pixels8_x2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld             [r1]
-        push            {r4-r10,lr}
-        ldr             r12, =0xfefefefe
-        JMP_ALIGN       r5,  r1
-1:
-        ldm             r1,  {r4-r5, r10}
-        add             r1,  r1,  r2
-        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
-        pld             [r1]
-        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r8-r9}
-        add             r0,  r0,  r2
-        bne             1b
-        pop             {r4-r10,pc}
-        .align 5
-2:
-        ldm             r1,  {r4-r5, r10}
-        add             r1,  r1,  r2
-        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
-        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
-        pld             [r1]
-        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r4-r5}
-        add             r0,  r0,  r2
-        bne             2b
-        pop             {r4-r10,pc}
-        .align 5
-3:
-        ldm             r1,  {r4-r5, r10}
-        add             r1,  r1,  r2
-        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
-        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
-        pld             [r1]
-        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r4-r5}
-        add             r0,  r0,  r2
-        bne             3b
-        pop             {r4-r10,pc}
-        .align 5
-4:
-        ldm             r1,  {r4-r5, r10}
-        add             r1,  r1,  r2
-        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
-        pld             [r1]
-        RND_AVG32       r8,  r9,  r6,  r7,  r5,  r10, r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r8-r9}
-        add             r0,  r0,  r2
-        bne             4b
-        pop             {r4-r10,pc}
-endfunc
-
-        .align 5
-function ff_put_no_rnd_pixels8_x2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld             [r1]
-        push            {r4-r10,lr}
-        ldr             r12, =0xfefefefe
-        JMP_ALIGN       r5,  r1
-1:
-        ldm             r1,  {r4-r5, r10}
-        add             r1,  r1,  r2
-        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
-        pld             [r1]
-        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r8-r9}
-        add             r0,  r0,  r2
-        bne             1b
-        pop             {r4-r10,pc}
-        .align 5
-2:
-        ldm             r1,  {r4-r5, r10}
-        add             r1,  r1,  r2
-        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
-        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
-        pld             [r1]
-        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r4-r5}
-        add             r0,  r0,  r2
-        bne             2b
-        pop             {r4-r10,pc}
-        .align 5
-3:
-        ldm             r1,  {r4-r5, r10}
-        add             r1,  r1,  r2
-        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
-        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
-        pld             [r1]
-        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r4-r5}
-        add             r0,  r0,  r2
-        bne             3b
-        pop             {r4-r10,pc}
-        .align 5
-4:
-        ldm             r1,  {r4-r5, r10}
-        add             r1,  r1,  r2
-        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
-        pld             [r1]
-        NO_RND_AVG32    r8,  r9,  r6,  r7,  r5,  r10, r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r8-r9}
-        add             r0,  r0,  r2
-        bne             4b
-        pop             {r4-r10,pc}
-endfunc
-
-
-@ ----------------------------------------------------------------
-        .align 5
-function ff_put_pixels8_y2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld             [r1]
-        push            {r4-r11,lr}
-        mov             r3,  r3,  lsr #1
-        ldr             r12, =0xfefefefe
-        JMP_ALIGN       r5,  r1
-1:
-        ldm             r1,  {r4-r5}
-        add             r1,  r1,  r2
-6:      ldm             r1,  {r6-r7}
-        add             r1,  r1,  r2
-        pld             [r1]
-        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
-        ldm             r1,  {r4-r5}
-        add             r1,  r1,  r2
-        stm             r0,  {r8-r9}
-        add             r0,  r0,  r2
-        pld             [r1]
-        RND_AVG32       r8,  r9,  r6,  r7,  r4,  r5,  r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r8-r9}
-        add             r0,  r0,  r2
-        bne             6b
-        pop             {r4-r11,pc}
-        .align 5
-2:
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     1,   r4,  r5,  r6
-6:      ldm             r1,  {r7-r9}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     1,   r7,  r8,  r9
-        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     1,   r4,  r5,  r6
-        subs            r3,  r3,  #1
-        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        bne             6b
-        pop             {r4-r11,pc}
-        .align 5
-3:
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     2,   r4,  r5,  r6
-6:      ldm             r1,  {r7-r9}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     2,   r7,  r8,  r9
-        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     2,   r4,  r5,  r6
-        subs            r3,  r3,  #1
-        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        bne             6b
-        pop             {r4-r11,pc}
-        .align 5
-4:
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     3,   r4,  r5,  r6
-6:      ldm             r1,  {r7-r9}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     3,   r7,  r8,  r9
-        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     3,   r4,  r5,  r6
-        subs            r3,  r3,  #1
-        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        bne             6b
-        pop             {r4-r11,pc}
-endfunc
-
-        .align 5
-function ff_put_no_rnd_pixels8_y2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld             [r1]
-        push            {r4-r11,lr}
-        mov             r3,  r3,  lsr #1
-        ldr             r12, =0xfefefefe
-        JMP_ALIGN       r5,  r1
-1:
-        ldm             r1,  {r4-r5}
-        add             r1,  r1,  r2
-6:      ldm             r1,  {r6-r7}
-        add             r1,  r1,  r2
-        pld             [r1]
-        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
-        ldm             r1,  {r4-r5}
-        add             r1,  r1,  r2
-        stm             r0,  {r8-r9}
-        add             r0,  r0,  r2
-        pld             [r1]
-        NO_RND_AVG32    r8,  r9,  r6,  r7,  r4,  r5,  r12
-        subs            r3,  r3,  #1
-        stm             r0,  {r8-r9}
-        add             r0,  r0,  r2
-        bne             6b
-        pop             {r4-r11,pc}
-        .align 5
-2:
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     1,   r4,  r5,  r6
-6:      ldm             r1,  {r7-r9}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     1,   r7,  r8,  r9
-        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     1,   r4,  r5,  r6
-        subs            r3,  r3,  #1
-        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        bne             6b
-        pop             {r4-r11,pc}
-        .align 5
-3:
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     2,   r4,  r5,  r6
-6:      ldm             r1,  {r7-r9}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     2,   r7,  r8,  r9
-        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     2,   r4,  r5,  r6
-        subs            r3,  r3,  #1
-        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        bne             6b
-        pop             {r4-r11,pc}
-        .align 5
-4:
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     3,   r4,  r5,  r6
-6:      ldm             r1,  {r7-r9}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     3,   r7,  r8,  r9
-        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        ldm             r1,  {r4-r6}
-        add             r1,  r1,  r2
-        pld             [r1]
-        ALIGN_DWORD     3,   r4,  r5,  r6
-        subs            r3,  r3,  #1
-        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
-        stm             r0,  {r10-r11}
-        add             r0,  r0,  r2
-        bne             6b
-        pop             {r4-r11,pc}
-endfunc
-
-        .ltorg
-
-@ ----------------------------------------------------------------
-.macro  RND_XY2_IT align, rnd
-        @ l1=  (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
-        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
-.if \align == 0
-        ldm             r1,  {r6-r8}
-.elseif \align == 3
-        ldm             r1,  {r5-r7}
-.else
-        ldm             r1,  {r8-r10}
-.endif
-        add             r1,  r1,  r2
-        pld             [r1]
-.if \align == 0
-        ALIGN_DWORD_D   1,   r4,  r5,  r6,  r7,  r8
-.elseif \align == 1
-        ALIGN_DWORD_D   1,   r4,  r5,  r8,  r9,  r10
-        ALIGN_DWORD_D   2,   r6,  r7,  r8,  r9,  r10
-.elseif \align == 2
-        ALIGN_DWORD_D   2,   r4,  r5,  r8,  r9,  r10
-        ALIGN_DWORD_D   3,   r6,  r7,  r8,  r9,  r10
-.elseif \align == 3
-        ALIGN_DWORD_D   3,   r4,  r5,  r5,  r6,  r7
-.endif
-        ldr             r14, =0x03030303
-        tst             r3,  #1
-        and             r8,  r4,  r14
-        and             r9,  r5,  r14
-        and             r10, r6,  r14
-        and             r11, r7,  r14
-        andeq           r14, r14, r14, \rnd #1
-        add             r8,  r8,  r10
-        add             r9,  r9,  r11
-        ldr             r12, =0xfcfcfcfc >> 2
-        addeq           r8,  r8,  r14
-        addeq           r9,  r9,  r14
-        and             r4,  r12, r4,  lsr #2
-        and             r5,  r12, r5,  lsr #2
-        and             r6,  r12, r6,  lsr #2
-        and             r7,  r12, r7,  lsr #2
-        add             r10, r4,  r6
-        add             r11, r5,  r7
-        subs            r3,  r3,  #1
-.endm
-
-.macro RND_XY2_EXPAND align, rnd
-        RND_XY2_IT      \align, \rnd
-6:      push            {r8-r11}
-        RND_XY2_IT      \align, \rnd
-        pop             {r4-r7}
-        add             r4,  r4,  r8
-        add             r5,  r5,  r9
-        ldr             r14, =0x0f0f0f0f
-        add             r6,  r6,  r10
-        add             r7,  r7,  r11
-        and             r4,  r14, r4,  lsr #2
-        and             r5,  r14, r5,  lsr #2
-        add             r4,  r4,  r6
-        add             r5,  r5,  r7
-        stm             r0,  {r4-r5}
-        add             r0,  r0,  r2
-        bge             6b
-        pop             {r4-r11,pc}
-.endm
-
-        .align 5
-function ff_put_pixels8_xy2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld             [r1]
-        push            {r4-r11,lr} @ R14 is also called LR
-        JMP_ALIGN       r5,  r1
-1:      RND_XY2_EXPAND  0, lsl
-        .align 5
-2:      RND_XY2_EXPAND  1, lsl
-        .align 5
-3:      RND_XY2_EXPAND  2, lsl
-        .align 5
-4:      RND_XY2_EXPAND  3, lsl
-endfunc
-
-        .align 5
-function ff_put_no_rnd_pixels8_xy2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld             [r1]
-        push            {r4-r11,lr}
-        JMP_ALIGN       r5,  r1
-1:      RND_XY2_EXPAND  0, lsr
-        .align 5
-2:      RND_XY2_EXPAND  1, lsr
-        .align 5
-3:      RND_XY2_EXPAND  2, lsr
-        .align 5
-4:      RND_XY2_EXPAND  3, lsr
-endfunc
-
-        .align 5
-@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
-function ff_add_pixels_clamped_arm, export=1
-        push            {r4-r10}
-        mov             r10, #8
-1:
-        ldr             r4,  [r1]               /* load dest */
-        /* block[0] and block[1]*/
-        ldrsh           r5,  [r0]
-        ldrsh           r7,  [r0, #2]
-        and             r6,  r4,  #0xFF
-        and             r8,  r4,  #0xFF00
-        add             r6,  r5,  r6
-        add             r8,  r7,  r8,  lsr #8
-        mvn             r5,  r5
-        mvn             r7,  r7
-        tst             r6,  #0x100
-        movne           r6,  r5,  lsr #24
-        tst             r8,  #0x100
-        movne           r8,  r7,  lsr #24
-        mov             r9,  r6
-        ldrsh           r5,  [r0, #4]           /* moved form [A] */
-        orr             r9,  r9,  r8,  lsl #8
-        /* block[2] and block[3] */
-        /* [A] */
-        ldrsh           r7,  [r0, #6]
-        and             r6,  r4,  #0xFF0000
-        and             r8,  r4,  #0xFF000000
-        add             r6,  r5,  r6,  lsr #16
-        add             r8,  r7,  r8,  lsr #24
-        mvn             r5,  r5
-        mvn             r7,  r7
-        tst             r6,  #0x100
-        movne           r6,  r5,  lsr #24
-        tst             r8,  #0x100
-        movne           r8,  r7,  lsr #24
-        orr             r9,  r9,  r6,  lsl #16
-        ldr             r4,  [r1, #4]           /* moved form [B] */
-        orr             r9,  r9,  r8,  lsl #24
-        /* store dest */
-        ldrsh           r5,  [r0, #8]           /* moved form [C] */
-        str             r9,  [r1]
-
-        /* load dest */
-        /* [B] */
-        /* block[4] and block[5] */
-        /* [C] */
-        ldrsh           r7,  [r0, #10]
-        and             r6,  r4,  #0xFF
-        and             r8,  r4,  #0xFF00
-        add             r6,  r5,  r6
-        add             r8,  r7,  r8,  lsr #8
-        mvn             r5,  r5
-        mvn             r7,  r7
-        tst             r6,  #0x100
-        movne           r6,  r5,  lsr #24
-        tst             r8,  #0x100
-        movne           r8,  r7,  lsr #24
-        mov             r9,  r6
-        ldrsh           r5,  [r0, #12]          /* moved from [D] */
-        orr             r9,  r9,  r8,  lsl #8
-        /* block[6] and block[7] */
-        /* [D] */
-        ldrsh           r7,  [r0, #14]
-        and             r6,  r4,  #0xFF0000
-        and             r8,  r4,  #0xFF000000
-        add             r6,  r5,  r6,  lsr #16
-        add             r8,  r7,  r8,  lsr #24
-        mvn             r5,  r5
-        mvn             r7,  r7
-        tst             r6,  #0x100
-        movne           r6,  r5,  lsr #24
-        tst             r8,  #0x100
-        movne           r8,  r7,  lsr #24
-        orr             r9,  r9,  r6,  lsl #16
-        add             r0,  r0,  #16           /* moved from [E] */
-        orr             r9,  r9,  r8,  lsl #24
-        subs            r10, r10, #1            /* moved from [F] */
-        /* store dest */
-        str             r9,  [r1, #4]
-
-        /* [E] */
-        /* [F] */
-        add             r1,  r1,  r2
-        bne             1b
-
-        pop             {r4-r10}
-        bx              lr
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.h
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_DSPUTIL_H
-#define AVCODEC_ARM_DSPUTIL_H
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-
-void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx);
-void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx);
-void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx);
-void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
-void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_armv6.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_armv6.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,623 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-        preserve8
-
-        .text
-
-.macro  call_2x_pixels  type, subp
-function ff_\type\()_pixels16\subp\()_armv6, export=1
-        push            {r0-r3, lr}
-        bl              ff_\type\()_pixels8\subp\()_armv6
-        pop             {r0-r3, lr}
-        add             r0,  r0,  #8
-        add             r1,  r1,  #8
-        b               ff_\type\()_pixels8\subp\()_armv6
-endfunc
-.endm
-
-call_2x_pixels          avg
-call_2x_pixels          put, _x2
-call_2x_pixels          put, _y2
-call_2x_pixels          put, _x2_no_rnd
-call_2x_pixels          put, _y2_no_rnd
-
-function ff_put_pixels16_armv6, export=1
-        push            {r4-r11}
-1:
-        ldr             r5,  [r1, #4]
-        ldr             r6,  [r1, #8]
-        ldr             r7,  [r1, #12]
-        ldr             r4,  [r1], r2
-        strd            r6,  r7,  [r0, #8]
-        ldr             r9,  [r1, #4]
-        strd            r4,  r5,  [r0],  r2
-        ldr             r10, [r1, #8]
-        ldr             r11, [r1, #12]
-        ldr             r8,  [r1], r2
-        strd            r10, r11, [r0, #8]
-        subs            r3,  r3,  #2
-        strd            r8,  r9,  [r0],  r2
-        bne             1b
-
-        pop             {r4-r11}
-        bx              lr
-endfunc
-
-function ff_put_pixels8_armv6, export=1
-        push            {r4-r7}
-1:
-        ldr             r5,  [r1, #4]
-        ldr             r4,  [r1], r2
-        ldr             r7,  [r1, #4]
-        strd            r4,  r5,  [r0],  r2
-        ldr             r6,  [r1], r2
-        subs            r3,  r3,  #2
-        strd            r6,  r7,  [r0],  r2
-        bne             1b
-
-        pop             {r4-r7}
-        bx              lr
-endfunc
-
-function ff_put_pixels8_x2_armv6, export=1
-        push            {r4-r11, lr}
-        mov             r12, #1
-        orr             r12, r12, r12, lsl #8
-        orr             r12, r12, r12, lsl #16
-1:
-        ldr             r4,  [r1]
-        subs            r3,  r3,  #2
-        ldr             r5,  [r1, #4]
-        ldr             r7,  [r1, #5]
-        lsr             r6,  r4,  #8
-        ldr             r8,  [r1, r2]!
-        orr             r6,  r6,  r5,  lsl #24
-        ldr             r9,  [r1, #4]
-        ldr             r11, [r1, #5]
-        lsr             r10, r8,  #8
-        add             r1,  r1,  r2
-        orr             r10, r10, r9,  lsl #24
-        eor             r14, r4,  r6
-        uhadd8          r4,  r4,  r6
-        eor             r6,  r5,  r7
-        uhadd8          r5,  r5,  r7
-        and             r14, r14, r12
-        and             r6,  r6,  r12
-        uadd8           r4,  r4,  r14
-        eor             r14, r8,  r10
-        uadd8           r5,  r5,  r6
-        eor             r6,  r9,  r11
-        uhadd8          r8,  r8,  r10
-        and             r14, r14, r12
-        uhadd8          r9,  r9,  r11
-        and             r6,  r6,  r12
-        uadd8           r8,  r8,  r14
-        strd            r4,  r5,  [r0],  r2
-        uadd8           r9,  r9,  r6
-        strd            r8,  r9,  [r0],  r2
-        bne             1b
-
-        pop             {r4-r11, pc}
-endfunc
-
-function ff_put_pixels8_y2_armv6, export=1
-        push            {r4-r11}
-        mov             r12, #1
-        orr             r12, r12, r12, lsl #8
-        orr             r12, r12, r12, lsl #16
-        ldr             r4,  [r1]
-        ldr             r5,  [r1, #4]
-        ldr             r6,  [r1, r2]!
-        ldr             r7,  [r1, #4]
-1:
-        subs            r3,  r3,  #2
-        uhadd8          r8,  r4,  r6
-        eor             r10, r4,  r6
-        uhadd8          r9,  r5,  r7
-        eor             r11, r5,  r7
-        and             r10, r10, r12
-        ldr             r4,  [r1, r2]!
-        uadd8           r8,  r8,  r10
-        and             r11, r11, r12
-        uadd8           r9,  r9,  r11
-        ldr             r5,  [r1, #4]
-        uhadd8          r10, r4,  r6
-        eor             r6,  r4,  r6
-        uhadd8          r11, r5,  r7
-        and             r6,  r6,  r12
-        eor             r7,  r5,  r7
-        uadd8           r10, r10, r6
-        and             r7,  r7,  r12
-        ldr             r6,  [r1, r2]!
-        uadd8           r11, r11, r7
-        strd            r8,  r9,  [r0],  r2
-        ldr             r7,  [r1, #4]
-        strd            r10, r11, [r0],  r2
-        bne             1b
-
-        pop             {r4-r11}
-        bx              lr
-endfunc
-
-function ff_put_pixels8_x2_no_rnd_armv6, export=1
-        push            {r4-r9, lr}
-1:
-        subs            r3,  r3,  #2
-        ldr             r4,  [r1]
-        ldr             r5,  [r1, #4]
-        ldr             r7,  [r1, #5]
-        ldr             r8,  [r1, r2]!
-        ldr             r9,  [r1, #4]
-        ldr             r14, [r1, #5]
-        add             r1,  r1,  r2
-        lsr             r6,  r4,  #8
-        orr             r6,  r6,  r5,  lsl #24
-        lsr             r12, r8,  #8
-        orr             r12, r12, r9,  lsl #24
-        uhadd8          r4,  r4,  r6
-        uhadd8          r5,  r5,  r7
-        uhadd8          r8,  r8,  r12
-        uhadd8          r9,  r9,  r14
-        stm             r0,  {r4,r5}
-        add             r0,  r0,  r2
-        stm             r0,  {r8,r9}
-        add             r0,  r0,  r2
-        bne             1b
-
-        pop             {r4-r9, pc}
-endfunc
-
-function ff_put_pixels8_y2_no_rnd_armv6, export=1
-        push            {r4-r9, lr}
-        ldr             r4,  [r1]
-        ldr             r5,  [r1, #4]
-        ldr             r6,  [r1, r2]!
-        ldr             r7,  [r1, #4]
-1:
-        subs            r3,  r3,  #2
-        uhadd8          r8,  r4,  r6
-        ldr             r4,  [r1, r2]!
-        uhadd8          r9,  r5,  r7
-        ldr             r5,  [r1, #4]
-        uhadd8          r12, r4,  r6
-        ldr             r6,  [r1, r2]!
-        uhadd8          r14, r5,  r7
-        ldr             r7,  [r1, #4]
-        stm             r0,  {r8,r9}
-        add             r0,  r0,  r2
-        stm             r0,  {r12,r14}
-        add             r0,  r0,  r2
-        bne             1b
-
-        pop             {r4-r9, pc}
-endfunc
-
-function ff_avg_pixels8_armv6, export=1
-        pld             [r1, r2]
-        push            {r4-r10, lr}
-        mov             lr,  #1
-        orr             lr,  lr,  lr,  lsl #8
-        orr             lr,  lr,  lr,  lsl #16
-        ldrd            r4,  r5,  [r0]
-        ldr             r10, [r1, #4]
-        ldr             r9,  [r1], r2
-        subs            r3,  r3,  #2
-1:
-        pld             [r1, r2]
-        eor             r8,  r4,  r9
-        uhadd8          r4,  r4,  r9
-        eor             r12, r5,  r10
-        ldrd            r6,  r7,  [r0, r2]
-        uhadd8          r5,  r5,  r10
-        and             r8,  r8,  lr
-        ldr             r10, [r1, #4]
-        and             r12, r12, lr
-        uadd8           r4,  r4,  r8
-        ldr             r9,  [r1], r2
-        eor             r8,  r6,  r9
-        uadd8           r5,  r5,  r12
-        pld             [r1, r2,  lsl #1]
-        eor             r12, r7,  r10
-        uhadd8          r6,  r6,  r9
-        strd            r4,  r5,  [r0], r2
-        uhadd8          r7,  r7,  r10
-        beq             2f
-        and             r8,  r8,  lr
-        ldrd            r4,  r5,  [r0, r2]
-        uadd8           r6,  r6,  r8
-        ldr             r10, [r1, #4]
-        and             r12, r12, lr
-        subs            r3,  r3,  #2
-        uadd8           r7,  r7,  r12
-        ldr             r9,  [r1], r2
-        strd            r6,  r7,  [r0], r2
-        b               1b
-2:
-        and             r8,  r8,  lr
-        and             r12, r12, lr
-        uadd8           r6,  r6,  r8
-        uadd8           r7,  r7,  r12
-        strd            r6,  r7,  [r0], r2
-
-        pop             {r4-r10, pc}
-endfunc
-
-function ff_add_pixels_clamped_armv6, export=1
-        push            {r4-r8,lr}
-        mov             r3,  #8
-1:
-        ldm             r0!, {r4,r5,r12,lr}
-        ldrd            r6,  r7,  [r1]
-        pkhbt           r8,  r4,  r5,  lsl #16
-        pkhtb           r5,  r5,  r4,  asr #16
-        pkhbt           r4,  r12, lr,  lsl #16
-        pkhtb           lr,  lr,  r12, asr #16
-        pld             [r1, r2]
-        uxtab16         r8,  r8,  r6
-        uxtab16         r5,  r5,  r6,  ror #8
-        uxtab16         r4,  r4,  r7
-        uxtab16         lr,  lr,  r7,  ror #8
-        usat16          r8,  #8,  r8
-        usat16          r5,  #8,  r5
-        usat16          r4,  #8,  r4
-        usat16          lr,  #8,  lr
-        orr             r6,  r8,  r5,  lsl #8
-        orr             r7,  r4,  lr,  lsl #8
-        subs            r3,  r3,  #1
-        strd            r6,  r7,  [r1],  r2
-        bgt             1b
-        pop             {r4-r8,pc}
-endfunc
-
-function ff_get_pixels_armv6, export=1
-        pld             [r1, r2]
-        push            {r4-r8, lr}
-        mov             lr,  #8
-1:
-        ldrd            r4,  r5,  [r1],  r2
-        subs            lr,  lr,  #1
-        uxtb16          r6,  r4
-        uxtb16          r4,  r4,  ror #8
-        uxtb16          r12, r5
-        uxtb16          r8,  r5,  ror #8
-        pld             [r1, r2]
-        pkhbt           r5,  r6,  r4,  lsl #16
-        pkhtb           r6,  r4,  r6,  asr #16
-        pkhbt           r7,  r12, r8,  lsl #16
-        pkhtb           r12, r8,  r12, asr #16
-        stm             r0!, {r5,r6,r7,r12}
-        bgt             1b
-
-        pop             {r4-r8, pc}
-endfunc
-
-function ff_diff_pixels_armv6, export=1
-        pld             [r1, r3]
-        pld             [r2, r3]
-        push            {r4-r9, lr}
-        mov             lr,  #8
-1:
-        ldrd            r4,  r5,  [r1],  r3
-        ldrd            r6,  r7,  [r2],  r3
-        uxtb16          r8,  r4
-        uxtb16          r4,  r4,  ror #8
-        uxtb16          r9,  r6
-        uxtb16          r6,  r6,  ror #8
-        pld             [r1, r3]
-        ssub16          r9,  r8,  r9
-        ssub16          r6,  r4,  r6
-        uxtb16          r8,  r5
-        uxtb16          r5,  r5,  ror #8
-        pld             [r2, r3]
-        pkhbt           r4,  r9,  r6,  lsl #16
-        pkhtb           r6,  r6,  r9,  asr #16
-        uxtb16          r9,  r7
-        uxtb16          r7,  r7,  ror #8
-        ssub16          r9,  r8,  r9
-        ssub16          r5,  r5,  r7
-        subs            lr,  lr,  #1
-        pkhbt           r8,  r9,  r5,  lsl #16
-        pkhtb           r9,  r5,  r9,  asr #16
-        stm             r0!, {r4,r6,r8,r9}
-        bgt             1b
-
-        pop             {r4-r9, pc}
-endfunc
-
-function ff_pix_abs16_armv6, export=1
-        ldr             r0,  [sp]
-        push            {r4-r9, lr}
-        mov             r12, #0
-        mov             lr,  #0
-        ldm             r1,  {r4-r7}
-        ldr             r8,  [r2]
-1:
-        ldr             r9,  [r2, #4]
-        pld             [r1, r3]
-        usada8          r12, r4,  r8,  r12
-        ldr             r8,  [r2, #8]
-        pld             [r2, r3]
-        usada8          lr,  r5,  r9,  lr
-        ldr             r9,  [r2, #12]
-        usada8          r12, r6,  r8,  r12
-        subs            r0,  r0,  #1
-        usada8          lr,  r7,  r9,  lr
-        beq             2f
-        add             r1,  r1,  r3
-        ldm             r1,  {r4-r7}
-        add             r2,  r2,  r3
-        ldr             r8,  [r2]
-        b               1b
-2:
-        add             r0,  r12, lr
-        pop             {r4-r9, pc}
-endfunc
-
-function ff_pix_abs16_x2_armv6, export=1
-        ldr             r12, [sp]
-        push            {r4-r11, lr}
-        mov             r0,  #0
-        mov             lr,  #1
-        orr             lr,  lr,  lr,  lsl #8
-        orr             lr,  lr,  lr,  lsl #16
-1:
-        ldr             r8,  [r2]
-        ldr             r9,  [r2, #4]
-        lsr             r10, r8,  #8
-        ldr             r4,  [r1]
-        lsr             r6,  r9,  #8
-        orr             r10, r10, r9,  lsl #24
-        ldr             r5,  [r2, #8]
-        eor             r11, r8,  r10
-        uhadd8          r7,  r8,  r10
-        orr             r6,  r6,  r5,  lsl #24
-        and             r11, r11, lr
-        uadd8           r7,  r7,  r11
-        ldr             r8,  [r1, #4]
-        usada8          r0,  r4,  r7,  r0
-        eor             r7,  r9,  r6
-        lsr             r10, r5,  #8
-        and             r7,  r7,  lr
-        uhadd8          r4,  r9,  r6
-        ldr             r6,  [r2, #12]
-        uadd8           r4,  r4,  r7
-        pld             [r1, r3]
-        orr             r10, r10, r6,  lsl #24
-        usada8          r0,  r8,  r4,  r0
-        ldr             r4,  [r1, #8]
-        eor             r11, r5,  r10
-        ldrb            r7,  [r2, #16]
-        and             r11, r11, lr
-        uhadd8          r8,  r5,  r10
-        ldr             r5,  [r1, #12]
-        uadd8           r8,  r8,  r11
-        pld             [r2, r3]
-        lsr             r10, r6,  #8
-        usada8          r0,  r4,  r8,  r0
-        orr             r10, r10, r7,  lsl #24
-        subs            r12,  r12,  #1
-        eor             r11, r6,  r10
-        add             r1,  r1,  r3
-        uhadd8          r9,  r6,  r10
-        and             r11, r11, lr
-        uadd8           r9,  r9,  r11
-        add             r2,  r2,  r3
-        usada8          r0,  r5,  r9,  r0
-        bgt             1b
-
-        pop             {r4-r11, pc}
-endfunc
-
-.macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
-        ldr             \n0, [r2]
-        eor             \n1, \p0, \n0
-        uhadd8          \p0, \p0, \n0
-        and             \n1, \n1, lr
-        ldr             \n2, [r1]
-        uadd8           \p0, \p0, \n1
-        ldr             \n1, [r2, #4]
-        usada8          r0,  \p0, \n2, r0
-        pld             [r1,  r3]
-        eor             \n3, \p1, \n1
-        uhadd8          \p1, \p1, \n1
-        and             \n3, \n3, lr
-        ldr             \p0, [r1, #4]
-        uadd8           \p1, \p1, \n3
-        ldr             \n2, [r2, #8]
-        usada8          r0,  \p1, \p0, r0
-        pld             [r2,  r3]
-        eor             \p0, \p2, \n2
-        uhadd8          \p2, \p2, \n2
-        and             \p0, \p0, lr
-        ldr             \p1, [r1, #8]
-        uadd8           \p2, \p2, \p0
-        ldr             \n3, [r2, #12]
-        usada8          r0,  \p2, \p1, r0
-        eor             \p1, \p3, \n3
-        uhadd8          \p3, \p3, \n3
-        and             \p1, \p1, lr
-        ldr             \p0,  [r1, #12]
-        uadd8           \p3, \p3, \p1
-        add             r1,  r1,  r3
-        usada8          r0,  \p3, \p0,  r0
-        add             r2,  r2,  r3
-.endm
-
-function ff_pix_abs16_y2_armv6, export=1
-        pld             [r1]
-        pld             [r2]
-        ldr             r12, [sp]
-        push            {r4-r11, lr}
-        mov             r0,  #0
-        mov             lr,  #1
-        orr             lr,  lr,  lr,  lsl #8
-        orr             lr,  lr,  lr,  lsl #16
-        ldr             r4,  [r2]
-        ldr             r5,  [r2, #4]
-        ldr             r6,  [r2, #8]
-        ldr             r7,  [r2, #12]
-        add             r2,  r2,  r3
-1:
-        usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11
-        subs            r12, r12, #2
-        usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7
-        bgt             1b
-
-        pop             {r4-r11, pc}
-endfunc
-
-function ff_pix_abs8_armv6, export=1
-        pld             [r2, r3]
-        ldr             r12, [sp]
-        push            {r4-r9, lr}
-        mov             r0,  #0
-        mov             lr,  #0
-        ldrd            r4,  r5,  [r1], r3
-1:
-        subs            r12, r12, #2
-        ldr             r7,  [r2, #4]
-        ldr             r6,  [r2], r3
-        ldrd            r8,  r9,  [r1], r3
-        usada8          r0,  r4,  r6,  r0
-        pld             [r2, r3]
-        usada8          lr,  r5,  r7,  lr
-        ldr             r7,  [r2, #4]
-        ldr             r6,  [r2], r3
-        beq             2f
-        ldrd            r4,  r5,  [r1], r3
-        usada8          r0,  r8,  r6,  r0
-        pld             [r2, r3]
-        usada8          lr,  r9,  r7,  lr
-        b               1b
-2:
-        usada8          r0,  r8,  r6,  r0
-        usada8          lr,  r9,  r7,  lr
-        add             r0,  r0,  lr
-        pop             {r4-r9, pc}
-endfunc
-
-function ff_sse16_armv6, export=1
-        ldr             r12, [sp]
-        push            {r4-r9, lr}
-        mov             r0,  #0
-1:
-        ldrd            r4,  r5,  [r1]
-        ldr             r8,  [r2]
-        uxtb16          lr,  r4
-        uxtb16          r4,  r4,  ror #8
-        uxtb16          r9,  r8
-        uxtb16          r8,  r8,  ror #8
-        ldr             r7,  [r2, #4]
-        usub16          lr,  lr,  r9
-        usub16          r4,  r4,  r8
-        smlad           r0,  lr,  lr,  r0
-        uxtb16          r6,  r5
-        uxtb16          lr,  r5,  ror #8
-        uxtb16          r8,  r7
-        uxtb16          r9,  r7,  ror #8
-        smlad           r0,  r4,  r4,  r0
-        ldrd            r4,  r5,  [r1, #8]
-        usub16          r6,  r6,  r8
-        usub16          r8,  lr,  r9
-        ldr             r7,  [r2, #8]
-        smlad           r0,  r6,  r6,  r0
-        uxtb16          lr,  r4
-        uxtb16          r4,  r4,  ror #8
-        uxtb16          r9,  r7
-        uxtb16          r7,  r7, ror #8
-        smlad           r0,  r8,  r8,  r0
-        ldr             r8,  [r2, #12]
-        usub16          lr,  lr,  r9
-        usub16          r4,  r4,  r7
-        smlad           r0,  lr,  lr,  r0
-        uxtb16          r6,  r5
-        uxtb16          r5,  r5,  ror #8
-        uxtb16          r9,  r8
-        uxtb16          r8,  r8,  ror #8
-        smlad           r0,  r4,  r4,  r0
-        usub16          r6,  r6,  r9
-        usub16          r5,  r5,  r8
-        smlad           r0,  r6,  r6,  r0
-        add             r1,  r1,  r3
-        add             r2,  r2,  r3
-        subs            r12, r12, #1
-        smlad           r0,  r5,  r5,  r0
-        bgt             1b
-
-        pop             {r4-r9, pc}
-endfunc
-
-function ff_pix_norm1_armv6, export=1
-        push            {r4-r6, lr}
-        mov             r12, #16
-        mov             lr,  #0
-1:
-        ldm             r0,  {r2-r5}
-        uxtb16          r6,  r2
-        uxtb16          r2,  r2,  ror #8
-        smlad           lr,  r6,  r6,  lr
-        uxtb16          r6,  r3
-        smlad           lr,  r2,  r2,  lr
-        uxtb16          r3,  r3,  ror #8
-        smlad           lr,  r6,  r6,  lr
-        uxtb16          r6,  r4
-        smlad           lr,  r3,  r3,  lr
-        uxtb16          r4,  r4,  ror #8
-        smlad           lr,  r6,  r6,  lr
-        uxtb16          r6,  r5
-        smlad           lr,  r4,  r4,  lr
-        uxtb16          r5,  r5,  ror #8
-        smlad           lr,  r6,  r6,  lr
-        subs            r12, r12, #1
-        add             r0,  r0,  r1
-        smlad           lr,  r5,  r5,  lr
-        bgt             1b
-
-        mov             r0,  lr
-        pop             {r4-r6, pc}
-endfunc
-
-function ff_pix_sum_armv6, export=1
-        push            {r4-r7, lr}
-        mov             r12, #16
-        mov             r2,  #0
-        mov             r3,  #0
-        mov             lr,  #0
-        ldr             r4,  [r0]
-1:
-        subs            r12, r12, #1
-        ldr             r5,  [r0, #4]
-        usada8          r2,  r4,  lr,  r2
-        ldr             r6,  [r0, #8]
-        usada8          r3,  r5,  lr,  r3
-        ldr             r7,  [r0, #12]
-        usada8          r2,  r6,  lr,  r2
-        beq             2f
-        ldr             r4,  [r0, r1]!
-        usada8          r3,  r7,  lr,  r3
-        bgt             1b
-2:
-        usada8          r3,  r7,  lr,  r3
-        add             r0,  r2,  r3
-        pop             {r4-r7, pc}
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_arm.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,112 +0,0 @@
-/*
- * ARM optimized DSP utils
- * Copyright (c) 2001 Lionel Ulmer
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-#include "dsputil_arm.h"
-
-void ff_j_rev_dct_arm(DCTELEM *data);
-void ff_simple_idct_arm(DCTELEM *data);
-
-/* XXX: local hack */
-static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
-static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
-
-void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-CALL_2X_PIXELS(ff_put_pixels16_x2_arm,         ff_put_pixels8_x2_arm,        8)
-CALL_2X_PIXELS(ff_put_pixels16_y2_arm,         ff_put_pixels8_y2_arm,        8)
-CALL_2X_PIXELS(ff_put_pixels16_xy2_arm,        ff_put_pixels8_xy2_arm,       8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm,  ff_put_no_rnd_pixels8_x2_arm, 8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm,  ff_put_no_rnd_pixels8_y2_arm, 8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
-
-void ff_add_pixels_clamped_arm(const DCTELEM *block, uint8_t *dest,
-                               int line_size);
-
-/* XXX: those functions should be suppressed ASAP when all IDCTs are
-   converted */
-static void j_rev_dct_arm_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    ff_j_rev_dct_arm (block);
-    ff_put_pixels_clamped(block, dest, line_size);
-}
-static void j_rev_dct_arm_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    ff_j_rev_dct_arm (block);
-    ff_add_pixels_clamped(block, dest, line_size);
-}
-static void simple_idct_arm_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    ff_simple_idct_arm (block);
-    ff_put_pixels_clamped(block, dest, line_size);
-}
-static void simple_idct_arm_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    ff_simple_idct_arm (block);
-    ff_add_pixels_clamped(block, dest, line_size);
-}
-
-int mm_support(void)
-{
-    return HAVE_IWMMXT * FF_MM_IWMMXT;
-}
-
-void dsputil_init_arm(DSPContext* c)
-{
-    ff_put_pixels_clamped = c->put_pixels_clamped;
-    ff_add_pixels_clamped = c->add_pixels_clamped;
-  
-    c->idct_put              = simple_idct_arm_put;
-    c->idct_add              = simple_idct_arm_add;
-    c->idct                  = ff_simple_idct_arm;
-    c->idct_permutation_type = FF_NO_IDCT_PERM;
-
-    c->add_pixels_clamped = ff_add_pixels_clamped_arm;
-
-    c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
-    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
-    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
-    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
-    c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
-    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
-    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
-    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
-
-    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
-    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
-    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
-    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
-    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
-    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
-    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
-    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
-
-    if (HAVE_NEON)    ff_dsputil_init_neon(c);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv5te.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv5te.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-#include "dsputil_arm.h"
-
-void ff_simple_idct_armv5te(DCTELEM *data);
-void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
-void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
-
-void ff_prefetch_arm(void *mem, int stride, int h);
-
-void av_cold ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx)
-{
-    if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO ||
-                           avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
-        c->idct_put              = ff_simple_idct_put_armv5te;
-        c->idct_add              = ff_simple_idct_add_armv5te;
-        c->idct                  = ff_simple_idct_armv5te;
-        c->idct_permutation_type = FF_NO_IDCT_PERM;
-    }
-
-    c->prefetch = ff_prefetch_arm;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv6.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv6.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_arm.h"
-
-void ff_simple_idct_armv6(DCTELEM *data);
-void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
-void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
-
-void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_add_pixels_clamped_armv6(const DCTELEM *block,
-                                 uint8_t *restrict pixels,
-                                 int line_size);
-
-void ff_get_pixels_armv6(DCTELEM *block, const uint8_t *pixels, int stride);
-void ff_diff_pixels_armv6(DCTELEM *block, const uint8_t *s1,
-                          const uint8_t *s2, int stride);
-
-int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
-                       int line_size, int h);
-int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
-                          int line_size, int h);
-int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
-                          int line_size, int h);
-
-int ff_pix_abs8_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
-                       int line_size, int h);
-
-int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
-                   int line_size, int h);
-
-int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
-int ff_pix_sum_armv6(uint8_t *pix, int line_size);
-
-void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx)
-{
-    if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO ||
-                           avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) {
-        c->idct_put              = ff_simple_idct_put_armv6;
-        c->idct_add              = ff_simple_idct_add_armv6;
-        c->idct                  = ff_simple_idct_armv6;
-        c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
-    }
-
-    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
-    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
-    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
-/*     c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
-    c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
-    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
-    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
-/*     c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
-
-    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
-    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
-    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
-/*     c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
-    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
-    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
-    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
-/*     c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
-
-    c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
-    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
-
-    c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
-    c->get_pixels = ff_get_pixels_armv6;
-    c->diff_pixels = ff_diff_pixels_armv6;
-
-    c->pix_abs[0][0] = ff_pix_abs16_armv6;
-    c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
-    c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
-
-    c->pix_abs[1][0] = ff_pix_abs8_armv6;
-
-    c->sad[0] = ff_pix_abs16_armv6;
-    c->sad[1] = ff_pix_abs8_armv6;
-
-    c->sse[0] = ff_sse16_armv6;
-
-    c->pix_norm1 = ff_pix_norm1_armv6;
-    c->pix_sum   = ff_pix_sum_armv6;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_neon.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_neon.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,308 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_arm.h"
-
-void ff_simple_idct_neon(DCTELEM *data);
-void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
-void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
-
-void ff_vp3_idct_neon(DCTELEM *data);
-void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
-void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
-void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
-
-void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-
-void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, int, int);
-
-void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
-void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
-void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
-
-void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int);
-
-void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int);
-void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int);
-
-void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int);
-
-void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int);
-void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int);
-
-void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
-
-void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
-
-void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
-void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
-
-void ff_vector_fmul_neon(float *dst, const float *src, int len);
-void ff_vector_fmul_window_neon(float *dst, const float *src0,
-                                const float *src1, const float *win,
-                                float add_bias, int len);
-void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
-                                int len);
-void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src,
-                                     const float **vp, float mul, int len);
-void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src,
-                                     const float **vp, float mul, int len);
-void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul,
-                              int len);
-void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
-                              int len);
-void ff_butterflies_float_neon(float *v1, float *v2, int len);
-float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
-                                        float mul, int len);
-void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
-                                 const float *src1, int len);
-void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
-                             const float *src2, int len);
-
-void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
-                          int len);
-void ff_float_to_int16_neon(int16_t *, const float *, long);
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
-
-void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
-
-int32_t ff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len,
-                                    int shift);
-int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, int16_t *v2,
-                                             int16_t *v3, int len, int mul);
-
-void ff_dsputil_init_neon(DSPContext *c)
-{
-
-    {
-        c->idct_put              = ff_simple_idct_put_neon;
-        c->idct_add              = ff_simple_idct_add_neon;
-        c->idct                  = ff_simple_idct_neon;
-        c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
-
-    }
-
-    c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
-    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
-    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
-    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
-    c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
-    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
-    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
-    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
-
-    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
-    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
-    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
-    c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
-    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
-    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
-    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
-    c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
-
-    c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
-    c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
-
-    c->add_pixels_clamped = ff_add_pixels_clamped_neon;
-    c->put_pixels_clamped = ff_put_pixels_clamped_neon;
-    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
-
-
-	c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
-	c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
-	c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
-
-	c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
-	c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
-	c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
-
-	c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
-	c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
-	c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
-	c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
-	c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
-	c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
-	c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
-	c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
-	c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
-	c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
-	c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
-	c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
-	c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
-	c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
-	c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
-	c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
-
-	c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
-	c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
-	c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
-	c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
-	c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
-	c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
-	c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
-	c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
-	c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
-	c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
-	c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
-	c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
-	c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
-	c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
-	c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
-	c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
-
-	c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
-	c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
-	c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
-	c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
-	c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
-	c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
-	c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
-	c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
-	c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
-	c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
-	c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
-	c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
-	c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
-	c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
-	c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
-	c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
-
-	c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
-	c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
-	c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
-	c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
-	c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
-	c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
-	c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
-	c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
-	c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
-	c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
-	c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
-	c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
-	c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
-	c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
-	c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
-	c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;    
-
-    c->vector_fmul                = ff_vector_fmul_neon;
-    c->vector_fmul_window         = ff_vector_fmul_window_neon;
-    c->vector_fmul_scalar         = ff_vector_fmul_scalar_neon;
-    c->butterflies_float          = ff_butterflies_float_neon;
-    c->scalarproduct_float        = ff_scalarproduct_float_neon;
-    c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
-    c->vector_fmul_reverse        = ff_vector_fmul_reverse_neon;
-    c->vector_fmul_add            = ff_vector_fmul_add_neon;
-    c->vector_clipf               = ff_vector_clipf_neon;
-
-    c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon;
-    c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon;
-
-    c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
-    c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
-
-
-    c->float_to_int16            = ff_float_to_int16_neon;
-    c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
-
-    c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_vfp.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_vfp.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-#include "dsputil_arm.h"
-
-void ff_vector_fmul_vfp(float *dst, const float *src, int len);
-void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
-                                const float *src1, int len);
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
-
-void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
-{
-    c->vector_fmul = ff_vector_fmul_vfp;
-    c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
-#if HAVE_ARMV6
-    c->float_to_int16 = ff_float_to_int16_vfp;
-#endif
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,205 +0,0 @@
-/*
- * iWMMXt optimized DSP utils
- * Copyright (c) 2004 AGAWA Koji
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-
-#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
-#define SET_RND(regd)  __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
-#define WAVG2B "wavg2b"
-#include "dsputil_iwmmxt_rnd_template.c"
-#undef DEF
-#undef SET_RND
-#undef WAVG2B
-
-#define DEF(x, y) x ## _ ## y ##_iwmmxt
-#define SET_RND(regd)  __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
-#define WAVG2B "wavg2br"
-#include "dsputil_iwmmxt_rnd_template.c"
-#undef DEF
-#undef SET_RND
-#undef WAVG2BR
-
-// need scheduling
-#define OP(AVG)                                         \
-    __asm__ volatile (                                      \
-        /* alignment */                                 \
-        "and r12, %[pixels], #7 \n\t"                   \
-        "bic %[pixels], %[pixels], #7 \n\t"             \
-        "tmcr wcgr1, r12 \n\t"                          \
-                                                        \
-        "wldrd wr0, [%[pixels]] \n\t"                   \
-        "wldrd wr1, [%[pixels], #8] \n\t"               \
-        "add %[pixels], %[pixels], %[line_size] \n\t"   \
-        "walignr1 wr4, wr0, wr1 \n\t"                   \
-                                                        \
-        "1: \n\t"                                       \
-                                                        \
-        "wldrd wr2, [%[pixels]] \n\t"                   \
-        "wldrd wr3, [%[pixels], #8] \n\t"               \
-        "add %[pixels], %[pixels], %[line_size] \n\t"   \
-        "pld [%[pixels]] \n\t"                          \
-        "walignr1 wr5, wr2, wr3 \n\t"                   \
-        AVG " wr6, wr4, wr5 \n\t"                       \
-        "wstrd wr6, [%[block]] \n\t"                    \
-        "add %[block], %[block], %[line_size] \n\t"     \
-                                                        \
-        "wldrd wr0, [%[pixels]] \n\t"                   \
-        "wldrd wr1, [%[pixels], #8] \n\t"               \
-        "add %[pixels], %[pixels], %[line_size] \n\t"   \
-        "walignr1 wr4, wr0, wr1 \n\t"                   \
-        "pld [%[pixels]] \n\t"                          \
-        AVG " wr6, wr4, wr5 \n\t"                       \
-        "wstrd wr6, [%[block]] \n\t"                    \
-        "add %[block], %[block], %[line_size] \n\t"     \
-                                                        \
-        "subs %[h], %[h], #2 \n\t"                      \
-        "bne 1b \n\t"                                   \
-        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)  \
-        : [line_size]"r"(line_size) \
-        : "memory", "r12");
-void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    OP("wavg2br");
-}
-void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    OP("wavg2b");
-}
-#undef OP
-
-void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
-{
-    uint8_t *pixels2 = pixels + line_size;
-
-    __asm__ volatile (
-        "mov            r12, #4                 \n\t"
-        "1:                                     \n\t"
-        "pld            [%[pixels], %[line_size2]]              \n\t"
-        "pld            [%[pixels2], %[line_size2]]             \n\t"
-        "wldrd          wr4, [%[pixels]]        \n\t"
-        "wldrd          wr5, [%[pixels2]]       \n\t"
-        "pld            [%[block], #32]         \n\t"
-        "wunpckelub     wr6, wr4                \n\t"
-        "wldrd          wr0, [%[block]]         \n\t"
-        "wunpckehub     wr7, wr4                \n\t"
-        "wldrd          wr1, [%[block], #8]     \n\t"
-        "wunpckelub     wr8, wr5                \n\t"
-        "wldrd          wr2, [%[block], #16]    \n\t"
-        "wunpckehub     wr9, wr5                \n\t"
-        "wldrd          wr3, [%[block], #24]    \n\t"
-        "add            %[block], %[block], #32 \n\t"
-        "waddhss        wr10, wr0, wr6          \n\t"
-        "waddhss        wr11, wr1, wr7          \n\t"
-        "waddhss        wr12, wr2, wr8          \n\t"
-        "waddhss        wr13, wr3, wr9          \n\t"
-        "wpackhus       wr14, wr10, wr11        \n\t"
-        "wpackhus       wr15, wr12, wr13        \n\t"
-        "wstrd          wr14, [%[pixels]]       \n\t"
-        "add            %[pixels], %[pixels], %[line_size2]     \n\t"
-        "subs           r12, r12, #1            \n\t"
-        "wstrd          wr15, [%[pixels2]]      \n\t"
-        "add            %[pixels2], %[pixels2], %[line_size2]   \n\t"
-        "bne            1b                      \n\t"
-        : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
-        : [line_size2]"r"(line_size << 1)
-        : "cc", "memory", "r12");
-}
-
-static void clear_blocks_iwmmxt(DCTELEM *blocks)
-{
-    __asm__ volatile(
-                "wzero wr0                      \n\t"
-                "mov r1, #(128 * 6 / 32)        \n\t"
-                "1:                             \n\t"
-                "wstrd wr0, [%0]                \n\t"
-                "wstrd wr0, [%0, #8]            \n\t"
-                "wstrd wr0, [%0, #16]           \n\t"
-                "wstrd wr0, [%0, #24]           \n\t"
-                "subs r1, r1, #1                \n\t"
-                "add %0, %0, #32                \n\t"
-                "bne 1b                         \n\t"
-                : "+r"(blocks)
-                :
-                : "r1"
-        );
-}
-
-static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    return;
-}
-
-/* A run time test is not simple. If this file is compiled in
- * then we should install the functions
- */
-int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */
-
-void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
-{
-    if (avctx->dsp_mask) {
-        if (avctx->dsp_mask & FF_MM_FORCE)
-            mm_flags |= (avctx->dsp_mask & 0xffff);
-        else
-            mm_flags &= ~(avctx->dsp_mask & 0xffff);
-    }
-
-    if (!(mm_flags & FF_MM_IWMMXT)) return;
-
-    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
-
-    c->clear_blocks = clear_blocks_iwmmxt;
-
-    c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
-    c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
-    c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
-    c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
-    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
-    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
-    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
-    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
-
-    c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
-    c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
-    c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
-    c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
-    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
-    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
-    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
-    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
-
-    c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
-    c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
-    c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
-    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
-    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
-    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
-    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
-    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
-
-    c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
-    c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
-    c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
-    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
-    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
-    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
-    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
-    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt_rnd_template.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt_rnd_template.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1114 +0,0 @@
-/*
- * iWMMXt optimized DSP utils
- * copyright (c) 2004 AGAWA Koji
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    __asm__ volatile (
-        "and r12, %[pixels], #7 \n\t"
-        "bic %[pixels], %[pixels], #7 \n\t"
-        "tmcr wcgr1, r12 \n\t"
-        "add r4, %[pixels], %[line_size] \n\t"
-        "add r5, %[block], %[line_size] \n\t"
-        "mov %[line_size], %[line_size], lsl #1 \n\t"
-        "1: \n\t"
-        "wldrd wr0, [%[pixels]] \n\t"
-        "subs %[h], %[h], #2 \n\t"
-        "wldrd wr1, [%[pixels], #8] \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "wldrd wr3, [r4] \n\t"
-        "pld [%[pixels]] \n\t"
-        "pld [%[pixels], #32] \n\t"
-        "wldrd wr4, [r4, #8] \n\t"
-        "add r4, r4, %[line_size] \n\t"
-        "walignr1 wr8, wr0, wr1 \n\t"
-        "pld [r4] \n\t"
-        "pld [r4, #32] \n\t"
-        "walignr1 wr10, wr3, wr4 \n\t"
-        "wstrd wr8, [%[block]] \n\t"
-        "add %[block], %[block], %[line_size] \n\t"
-        "wstrd wr10, [r5] \n\t"
-        "add r5, r5, %[line_size] \n\t"
-        "bne 1b \n\t"
-        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
-        :
-        : "memory", "r4", "r5", "r12");
-}
-
-void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    __asm__ volatile (
-        "and r12, %[pixels], #7 \n\t"
-        "bic %[pixels], %[pixels], #7 \n\t"
-        "tmcr wcgr1, r12 \n\t"
-        "add r4, %[pixels], %[line_size] \n\t"
-        "add r5, %[block], %[line_size] \n\t"
-        "mov %[line_size], %[line_size], lsl #1 \n\t"
-        "1: \n\t"
-        "wldrd wr0, [%[pixels]] \n\t"
-        "subs %[h], %[h], #2 \n\t"
-        "wldrd wr1, [%[pixels], #8] \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "wldrd wr3, [r4] \n\t"
-        "pld [%[pixels]] \n\t"
-        "pld [%[pixels], #32] \n\t"
-        "wldrd wr4, [r4, #8] \n\t"
-        "add r4, r4, %[line_size] \n\t"
-        "walignr1 wr8, wr0, wr1 \n\t"
-        "wldrd wr0, [%[block]] \n\t"
-        "wldrd wr2, [r5] \n\t"
-        "pld [r4] \n\t"
-        "pld [r4, #32] \n\t"
-        "walignr1 wr10, wr3, wr4 \n\t"
-        WAVG2B" wr8, wr8, wr0 \n\t"
-        WAVG2B" wr10, wr10, wr2 \n\t"
-        "wstrd wr8, [%[block]] \n\t"
-        "add %[block], %[block], %[line_size] \n\t"
-        "wstrd wr10, [r5] \n\t"
-        "pld [%[block]] \n\t"
-        "pld [%[block], #32] \n\t"
-        "add r5, r5, %[line_size] \n\t"
-        "pld [r5] \n\t"
-        "pld [r5, #32] \n\t"
-        "bne 1b \n\t"
-        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
-        :
-        : "memory", "r4", "r5", "r12");
-}
-
-void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    __asm__ volatile (
-        "and r12, %[pixels], #7 \n\t"
-        "bic %[pixels], %[pixels], #7 \n\t"
-        "tmcr wcgr1, r12 \n\t"
-        "add r4, %[pixels], %[line_size] \n\t"
-        "add r5, %[block], %[line_size] \n\t"
-        "mov %[line_size], %[line_size], lsl #1 \n\t"
-        "1: \n\t"
-        "wldrd wr0, [%[pixels]] \n\t"
-        "wldrd wr1, [%[pixels], #8] \n\t"
-        "subs %[h], %[h], #2 \n\t"
-        "wldrd wr2, [%[pixels], #16] \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "wldrd wr3, [r4] \n\t"
-        "pld [%[pixels]] \n\t"
-        "pld [%[pixels], #32] \n\t"
-        "walignr1 wr8, wr0, wr1 \n\t"
-        "wldrd wr4, [r4, #8] \n\t"
-        "walignr1 wr9, wr1, wr2 \n\t"
-        "wldrd wr5, [r4, #16] \n\t"
-        "add r4, r4, %[line_size] \n\t"
-        "pld [r4] \n\t"
-        "pld [r4, #32] \n\t"
-        "walignr1 wr10, wr3, wr4 \n\t"
-        "wstrd wr8, [%[block]] \n\t"
-        "walignr1 wr11, wr4, wr5 \n\t"
-        "wstrd wr9, [%[block], #8] \n\t"
-        "add %[block], %[block], %[line_size] \n\t"
-        "wstrd wr10, [r5] \n\t"
-        "wstrd wr11, [r5, #8] \n\t"
-        "add r5, r5, %[line_size] \n\t"
-        "bne 1b \n\t"
-        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
-        :
-        : "memory", "r4", "r5", "r12");
-}
-
-void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    __asm__ volatile (
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-        "and r12, %[pixels], #7         \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-        "tmcr wcgr1, r12                \n\t"
-        "add r4, %[pixels], %[line_size]\n\t"
-        "add r5, %[block], %[line_size] \n\t"
-        "mov %[line_size], %[line_size], lsl #1 \n\t"
-        "1:                             \n\t"
-        "wldrd wr0, [%[pixels]]         \n\t"
-        "wldrd wr1, [%[pixels], #8]     \n\t"
-        "subs %[h], %[h], #2            \n\t"
-        "wldrd wr2, [%[pixels], #16]    \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "wldrd wr3, [r4]                \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr8, wr0, wr1         \n\t"
-        "wldrd wr4, [r4, #8]            \n\t"
-        "walignr1 wr9, wr1, wr2         \n\t"
-        "wldrd wr5, [r4, #16]           \n\t"
-        "add r4, r4, %[line_size]       \n\t"
-        "wldrd wr0, [%[block]]          \n\t"
-        "pld [r4]                       \n\t"
-        "wldrd wr1, [%[block], #8]      \n\t"
-        "pld [r4, #32]                  \n\t"
-        "wldrd wr2, [r5]                \n\t"
-        "walignr1 wr10, wr3, wr4        \n\t"
-        "wldrd wr3, [r5, #8]            \n\t"
-        WAVG2B" wr8, wr8, wr0           \n\t"
-        WAVG2B" wr9, wr9, wr1           \n\t"
-        WAVG2B" wr10, wr10, wr2         \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "walignr1 wr11, wr4, wr5        \n\t"
-        WAVG2B" wr11, wr11, wr3         \n\t"
-        "wstrd wr9, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size] \n\t"
-        "wstrd wr10, [r5]               \n\t"
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-        "wstrd wr11, [r5, #8]           \n\t"
-        "add r5, r5, %[line_size]       \n\t"
-        "pld [r5]                       \n\t"
-        "pld [r5, #32]                  \n\t"
-        "bne 1b \n\t"
-        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
-        :
-        : "memory", "r4", "r5", "r12");
-}
-
-void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "and r12, %[pixels], #7         \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-        "tmcr wcgr1, r12                \n\t"
-        "add r12, r12, #1               \n\t"
-        "add r4, %[pixels], %[line_size]\n\t"
-        "tmcr wcgr2, r12                \n\t"
-        "add r5, %[block], %[line_size] \n\t"
-        "mov %[line_size], %[line_size], lsl #1 \n\t"
-
-        "1:                             \n\t"
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "wldrd wr13, [r4]               \n\t"
-        "pld [%[pixels]]                \n\t"
-        "wldrd wr14, [r4, #8]           \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "add r4, r4, %[line_size]       \n\t"
-        "walignr1 wr0, wr10, wr11       \n\t"
-        "pld [r4]                       \n\t"
-        "pld [r4, #32]                  \n\t"
-        "walignr1 wr2, wr13, wr14       \n\t"
-        "wmoveq wr4, wr11               \n\t"
-        "wmoveq wr6, wr14               \n\t"
-        "walignr2ne wr4, wr10, wr11     \n\t"
-        "walignr2ne wr6, wr13, wr14     \n\t"
-        WAVG2B" wr0, wr0, wr4           \n\t"
-        WAVG2B" wr2, wr2, wr6           \n\t"
-        "wstrd wr0, [%[block]]          \n\t"
-        "subs %[h], %[h], #2            \n\t"
-        "wstrd wr2, [r5]                \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-        "add r5, r5, %[line_size]       \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
-        :
-        : "r4", "r5", "r12", "memory");
-}
-
-void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "and r12, %[pixels], #7         \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-        "tmcr wcgr1, r12                \n\t"
-        "add r12, r12, #1               \n\t"
-        "add r4, %[pixels], %[line_size]\n\t"
-        "tmcr wcgr2, r12                \n\t"
-        "add r5, %[block], %[line_size] \n\t"
-        "mov %[line_size], %[line_size], lsl #1 \n\t"
-
-        "1:                             \n\t"
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "wldrd wr12, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "wldrd wr13, [r4]               \n\t"
-        "pld [%[pixels]]                \n\t"
-        "wldrd wr14, [r4, #8]           \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "wldrd wr15, [r4, #16]          \n\t"
-        "add r4, r4, %[line_size]       \n\t"
-        "walignr1 wr0, wr10, wr11       \n\t"
-        "pld [r4]                       \n\t"
-        "pld [r4, #32]                  \n\t"
-        "walignr1 wr1, wr11, wr12       \n\t"
-        "walignr1 wr2, wr13, wr14       \n\t"
-        "walignr1 wr3, wr14, wr15       \n\t"
-        "wmoveq wr4, wr11               \n\t"
-        "wmoveq wr5, wr12               \n\t"
-        "wmoveq wr6, wr14               \n\t"
-        "wmoveq wr7, wr15               \n\t"
-        "walignr2ne wr4, wr10, wr11     \n\t"
-        "walignr2ne wr5, wr11, wr12     \n\t"
-        "walignr2ne wr6, wr13, wr14     \n\t"
-        "walignr2ne wr7, wr14, wr15     \n\t"
-        WAVG2B" wr0, wr0, wr4           \n\t"
-        WAVG2B" wr1, wr1, wr5           \n\t"
-        "wstrd wr0, [%[block]]          \n\t"
-        WAVG2B" wr2, wr2, wr6           \n\t"
-        "wstrd wr1, [%[block], #8]      \n\t"
-        WAVG2B" wr3, wr3, wr7           \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-        "wstrd wr2, [r5]                \n\t"
-        "subs %[h], %[h], #2            \n\t"
-        "wstrd wr3, [r5, #8]            \n\t"
-        "add r5, r5, %[line_size]       \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
-        :
-        : "r4", "r5", "r12", "memory");
-}
-
-void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-        "and r12, %[pixels], #7         \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-        "tmcr wcgr1, r12                \n\t"
-        "add r12, r12, #1               \n\t"
-        "add r4, %[pixels], %[line_size]\n\t"
-        "tmcr wcgr2, r12                \n\t"
-        "add r5, %[block], %[line_size] \n\t"
-        "mov %[line_size], %[line_size], lsl #1 \n\t"
-        "pld [r5]                       \n\t"
-        "pld [r5, #32]                  \n\t"
-
-        "1:                             \n\t"
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "wldrd wr13, [r4]               \n\t"
-        "pld [%[pixels]]                \n\t"
-        "wldrd wr14, [r4, #8]           \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "add r4, r4, %[line_size]       \n\t"
-        "walignr1 wr0, wr10, wr11       \n\t"
-        "pld [r4]                       \n\t"
-        "pld [r4, #32]                  \n\t"
-        "walignr1 wr2, wr13, wr14       \n\t"
-        "wmoveq wr4, wr11               \n\t"
-        "wmoveq wr6, wr14               \n\t"
-        "walignr2ne wr4, wr10, wr11     \n\t"
-        "wldrd wr10, [%[block]]         \n\t"
-        "walignr2ne wr6, wr13, wr14     \n\t"
-        "wldrd wr12, [r5]               \n\t"
-        WAVG2B" wr0, wr0, wr4           \n\t"
-        WAVG2B" wr2, wr2, wr6           \n\t"
-        WAVG2B" wr0, wr0, wr10          \n\t"
-        WAVG2B" wr2, wr2, wr12          \n\t"
-        "wstrd wr0, [%[block]]          \n\t"
-        "subs %[h], %[h], #2            \n\t"
-        "wstrd wr2, [r5]                \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-        "add r5, r5, %[line_size]       \n\t"
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-        "pld [r5]                       \n\t"
-        "pld [r5, #32]                  \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
-        :
-        : "r4", "r5", "r12", "memory");
-}
-
-void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-        "and r12, %[pixels], #7         \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-        "tmcr wcgr1, r12                \n\t"
-        "add r12, r12, #1               \n\t"
-        "add r4, %[pixels], %[line_size]\n\t"
-        "tmcr wcgr2, r12                \n\t"
-        "add r5, %[block], %[line_size] \n\t"
-        "mov %[line_size], %[line_size], lsl #1 \n\t"
-        "pld [r5]                       \n\t"
-        "pld [r5, #32]                  \n\t"
-
-        "1:                             \n\t"
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "wldrd wr12, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "wldrd wr13, [r4]               \n\t"
-        "pld [%[pixels]]                \n\t"
-        "wldrd wr14, [r4, #8]           \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "wldrd wr15, [r4, #16]          \n\t"
-        "add r4, r4, %[line_size]       \n\t"
-        "walignr1 wr0, wr10, wr11       \n\t"
-        "pld [r4]                       \n\t"
-        "pld [r4, #32]                  \n\t"
-        "walignr1 wr1, wr11, wr12       \n\t"
-        "walignr1 wr2, wr13, wr14       \n\t"
-        "walignr1 wr3, wr14, wr15       \n\t"
-        "wmoveq wr4, wr11               \n\t"
-        "wmoveq wr5, wr12               \n\t"
-        "wmoveq wr6, wr14               \n\t"
-        "wmoveq wr7, wr15               \n\t"
-        "walignr2ne wr4, wr10, wr11     \n\t"
-        "walignr2ne wr5, wr11, wr12     \n\t"
-        "walignr2ne wr6, wr13, wr14     \n\t"
-        "walignr2ne wr7, wr14, wr15     \n\t"
-        "wldrd wr10, [%[block]]         \n\t"
-        WAVG2B" wr0, wr0, wr4           \n\t"
-        "wldrd wr11, [%[block], #8]     \n\t"
-        WAVG2B" wr1, wr1, wr5           \n\t"
-        "wldrd wr12, [r5]               \n\t"
-        WAVG2B" wr2, wr2, wr6           \n\t"
-        "wldrd wr13, [r5, #8]           \n\t"
-        WAVG2B" wr3, wr3, wr7           \n\t"
-        WAVG2B" wr0, wr0, wr10          \n\t"
-        WAVG2B" wr1, wr1, wr11          \n\t"
-        WAVG2B" wr2, wr2, wr12          \n\t"
-        WAVG2B" wr3, wr3, wr13          \n\t"
-        "wstrd wr0, [%[block]]          \n\t"
-        "subs %[h], %[h], #2            \n\t"
-        "wstrd wr1, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-        "wstrd wr2, [r5]                \n\t"
-        "pld [%[block]]                 \n\t"
-        "wstrd wr3, [r5, #8]            \n\t"
-        "add r5, r5, %[line_size]       \n\t"
-        "pld [%[block], #32]            \n\t"
-        "pld [r5]                       \n\t"
-        "pld [r5, #32]                  \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
-        :
-        :"r4", "r5", "r12", "memory");
-}
-
-void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    __asm__ volatile(
-        "pld            [%[pixels]]                             \n\t"
-        "pld            [%[pixels], #32]                        \n\t"
-        "and            r12, %[pixels], #7                      \n\t"
-        "tmcr           wcgr1, r12                              \n\t"
-        "bic            %[pixels], %[pixels], #7                \n\t"
-
-        "wldrd          wr10, [%[pixels]]                       \n\t"
-        "wldrd          wr11, [%[pixels], #8]                   \n\t"
-        "pld            [%[block]]                              \n\t"
-        "add            %[pixels], %[pixels], %[line_size]      \n\t"
-        "walignr1       wr0, wr10, wr11                         \n\t"
-        "pld            [%[pixels]]                             \n\t"
-        "pld            [%[pixels], #32]                        \n\t"
-
-      "1:                                                       \n\t"
-        "wldrd          wr10, [%[pixels]]                       \n\t"
-        "wldrd          wr11, [%[pixels], #8]                   \n\t"
-        "add            %[pixels], %[pixels], %[line_size]      \n\t"
-        "pld            [%[pixels]]                             \n\t"
-        "pld            [%[pixels], #32]                        \n\t"
-        "walignr1       wr4, wr10, wr11                         \n\t"
-        "wldrd          wr10, [%[block]]                        \n\t"
-         WAVG2B"        wr8, wr0, wr4                           \n\t"
-         WAVG2B"        wr8, wr8, wr10                          \n\t"
-        "wstrd          wr8, [%[block]]                         \n\t"
-        "add            %[block], %[block], %[line_size]        \n\t"
-
-        "wldrd          wr10, [%[pixels]]                       \n\t"
-        "wldrd          wr11, [%[pixels], #8]                   \n\t"
-        "pld            [%[block]]                              \n\t"
-        "add            %[pixels], %[pixels], %[line_size]      \n\t"
-        "pld            [%[pixels]]                             \n\t"
-        "pld            [%[pixels], #32]                        \n\t"
-        "walignr1       wr0, wr10, wr11                         \n\t"
-        "wldrd          wr10, [%[block]]                        \n\t"
-         WAVG2B"        wr8, wr0, wr4                           \n\t"
-         WAVG2B"        wr8, wr8, wr10                          \n\t"
-        "wstrd          wr8, [%[block]]                         \n\t"
-        "add            %[block], %[block], %[line_size]        \n\t"
-
-        "subs           %[h], %[h], #2                          \n\t"
-        "pld            [%[block]]                              \n\t"
-        "bne            1b                                      \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
-        :
-        : "cc", "memory", "r12");
-}
-
-void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    __asm__ volatile(
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "and r12, %[pixels], #7         \n\t"
-        "tmcr wcgr1, r12                \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "wldrd wr12, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr0, wr10, wr11       \n\t"
-        "walignr1 wr1, wr11, wr12       \n\t"
-
-        "1:                             \n\t"
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "wldrd wr12, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr4, wr10, wr11       \n\t"
-        "walignr1 wr5, wr11, wr12       \n\t"
-        WAVG2B" wr8, wr0, wr4           \n\t"
-        WAVG2B" wr9, wr1, wr5           \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "wstrd wr9, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "wldrd wr12, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr0, wr10, wr11       \n\t"
-        "walignr1 wr1, wr11, wr12       \n\t"
-        WAVG2B" wr8, wr0, wr4           \n\t"
-        WAVG2B" wr9, wr1, wr5           \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "wstrd wr9, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-
-        "subs %[h], %[h], #2            \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
-        :
-        : "r4", "r5", "r12", "memory");
-}
-
-void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    int stride = line_size;
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    __asm__ volatile(
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "and r12, %[pixels], #7         \n\t"
-        "tmcr wcgr1, r12                \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "pld [%[block]]                 \n\t"
-        "wldrd wr12, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr0, wr10, wr11       \n\t"
-        "walignr1 wr1, wr11, wr12       \n\t"
-
-        "1:                             \n\t"
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "wldrd wr12, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr4, wr10, wr11       \n\t"
-        "walignr1 wr5, wr11, wr12       \n\t"
-        "wldrd wr10, [%[block]]         \n\t"
-        "wldrd wr11, [%[block], #8]     \n\t"
-        WAVG2B" wr8, wr0, wr4           \n\t"
-        WAVG2B" wr9, wr1, wr5           \n\t"
-        WAVG2B" wr8, wr8, wr10          \n\t"
-        WAVG2B" wr9, wr9, wr11          \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "wstrd wr9, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-
-        "wldrd wr10, [%[pixels]]        \n\t"
-        "wldrd wr11, [%[pixels], #8]    \n\t"
-        "pld [%[block]]                 \n\t"
-        "wldrd wr12, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr0, wr10, wr11       \n\t"
-        "walignr1 wr1, wr11, wr12       \n\t"
-        "wldrd wr10, [%[block]]         \n\t"
-        "wldrd wr11, [%[block], #8]     \n\t"
-        WAVG2B" wr8, wr0, wr4           \n\t"
-        WAVG2B" wr9, wr1, wr5           \n\t"
-        WAVG2B" wr8, wr8, wr10          \n\t"
-        WAVG2B" wr9, wr9, wr11          \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "wstrd wr9, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-
-        "subs %[h], %[h], #2            \n\t"
-        "pld [%[block]]                 \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
-        :
-        : "r4", "r5", "r12", "memory");
-}
-
-void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "pld [%[pixels]]                \n\t"
-        "mov r12, #2                    \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "tmcr wcgr0, r12                \n\t" /* for shift value */
-        "and r12, %[pixels], #7         \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-        "tmcr wcgr1, r12                \n\t"
-
-        // [wr0 wr1 wr2 wr3] <= *
-        // [wr4 wr5 wr6 wr7]
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "add r12, r12, #1               \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "tmcr wcgr2, r12                \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "cmp r12, #8                    \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr2, wr12, wr13       \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "wunpckelub wr0, wr2            \n\t"
-        "wunpckehub wr1, wr2            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "waddhus wr0, wr0, wr8          \n\t"
-        "waddhus wr1, wr1, wr9          \n\t"
-
-        "1:                             \n\t"
-        // [wr0 wr1 wr2 wr3]
-        // [wr4 wr5 wr6 wr7] <= *
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "walignr1 wr6, wr12, wr13       \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "wunpckelub wr4, wr6            \n\t"
-        "wunpckehub wr5, wr6            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "waddhus wr4, wr4, wr8          \n\t"
-        "waddhus wr5, wr5, wr9          \n\t"
-        "waddhus wr8, wr0, wr4          \n\t"
-        "waddhus wr9, wr1, wr5          \n\t"
-        "waddhus wr8, wr8, wr15         \n\t"
-        "waddhus wr9, wr9, wr15         \n\t"
-        "wsrlhg wr8, wr8, wcgr0         \n\t"
-        "wsrlhg wr9, wr9, wcgr0         \n\t"
-        "wpackhus wr8, wr8, wr9         \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-
-        // [wr0 wr1 wr2 wr3] <= *
-        // [wr4 wr5 wr6 wr7]
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "walignr1 wr2, wr12, wr13       \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "wunpckelub wr0, wr2            \n\t"
-        "wunpckehub wr1, wr2            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "waddhus wr0, wr0, wr8          \n\t"
-        "waddhus wr1, wr1, wr9          \n\t"
-        "waddhus wr8, wr0, wr4          \n\t"
-        "waddhus wr9, wr1, wr5          \n\t"
-        "waddhus wr8, wr8, wr15         \n\t"
-        "waddhus wr9, wr9, wr15         \n\t"
-        "wsrlhg wr8, wr8, wcgr0         \n\t"
-        "wsrlhg wr9, wr9, wcgr0         \n\t"
-        "wpackhus wr8, wr8, wr9         \n\t"
-        "subs %[h], %[h], #2            \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
-        : [line_size]"r"(line_size)
-        : "r12", "memory");
-}
-
-void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "pld [%[pixels]]                \n\t"
-        "mov r12, #2                    \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "tmcr wcgr0, r12                \n\t" /* for shift value */
-        /* alignment */
-        "and r12, %[pixels], #7         \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-        "tmcr wcgr1, r12                \n\t"
-        "add r12, r12, #1               \n\t"
-        "tmcr wcgr2, r12                \n\t"
-
-        // [wr0 wr1 wr2 wr3] <= *
-        // [wr4 wr5 wr6 wr7]
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "wldrd wr14, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "pld [%[pixels]]                \n\t"
-        "walignr1 wr2, wr12, wr13       \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr3, wr13, wr14       \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "wmoveq wr11, wr14              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "walignr2ne wr11, wr13, wr14    \n\t"
-        "wunpckelub wr0, wr2            \n\t"
-        "wunpckehub wr1, wr2            \n\t"
-        "wunpckelub wr2, wr3            \n\t"
-        "wunpckehub wr3, wr3            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "wunpckelub wr10, wr11          \n\t"
-        "wunpckehub wr11, wr11          \n\t"
-        "waddhus wr0, wr0, wr8          \n\t"
-        "waddhus wr1, wr1, wr9          \n\t"
-        "waddhus wr2, wr2, wr10         \n\t"
-        "waddhus wr3, wr3, wr11         \n\t"
-
-        "1:                             \n\t"
-        // [wr0 wr1 wr2 wr3]
-        // [wr4 wr5 wr6 wr7] <= *
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "wldrd wr14, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "walignr1 wr6, wr12, wr13       \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr7, wr13, wr14       \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "wmoveq wr11, wr14              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "walignr2ne wr11, wr13, wr14    \n\t"
-        "wunpckelub wr4, wr6            \n\t"
-        "wunpckehub wr5, wr6            \n\t"
-        "wunpckelub wr6, wr7            \n\t"
-        "wunpckehub wr7, wr7            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "wunpckelub wr10, wr11          \n\t"
-        "wunpckehub wr11, wr11          \n\t"
-        "waddhus wr4, wr4, wr8          \n\t"
-        "waddhus wr5, wr5, wr9          \n\t"
-        "waddhus wr6, wr6, wr10         \n\t"
-        "waddhus wr7, wr7, wr11         \n\t"
-        "waddhus wr8, wr0, wr4          \n\t"
-        "waddhus wr9, wr1, wr5          \n\t"
-        "waddhus wr10, wr2, wr6         \n\t"
-        "waddhus wr11, wr3, wr7         \n\t"
-        "waddhus wr8, wr8, wr15         \n\t"
-        "waddhus wr9, wr9, wr15         \n\t"
-        "waddhus wr10, wr10, wr15       \n\t"
-        "waddhus wr11, wr11, wr15       \n\t"
-        "wsrlhg wr8, wr8, wcgr0         \n\t"
-        "wsrlhg wr9, wr9, wcgr0         \n\t"
-        "wsrlhg wr10, wr10, wcgr0       \n\t"
-        "wsrlhg wr11, wr11, wcgr0       \n\t"
-        "wpackhus wr8, wr8, wr9         \n\t"
-        "wpackhus wr9, wr10, wr11       \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "wstrd wr9, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-
-        // [wr0 wr1 wr2 wr3] <= *
-        // [wr4 wr5 wr6 wr7]
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "wldrd wr14, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "walignr1 wr2, wr12, wr13       \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr3, wr13, wr14       \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "wmoveq wr11, wr14              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "walignr2ne wr11, wr13, wr14    \n\t"
-        "wunpckelub wr0, wr2            \n\t"
-        "wunpckehub wr1, wr2            \n\t"
-        "wunpckelub wr2, wr3            \n\t"
-        "wunpckehub wr3, wr3            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "wunpckelub wr10, wr11          \n\t"
-        "wunpckehub wr11, wr11          \n\t"
-        "waddhus wr0, wr0, wr8          \n\t"
-        "waddhus wr1, wr1, wr9          \n\t"
-        "waddhus wr2, wr2, wr10         \n\t"
-        "waddhus wr3, wr3, wr11         \n\t"
-        "waddhus wr8, wr0, wr4          \n\t"
-        "waddhus wr9, wr1, wr5          \n\t"
-        "waddhus wr10, wr2, wr6         \n\t"
-        "waddhus wr11, wr3, wr7         \n\t"
-        "waddhus wr8, wr8, wr15         \n\t"
-        "waddhus wr9, wr9, wr15         \n\t"
-        "waddhus wr10, wr10, wr15       \n\t"
-        "waddhus wr11, wr11, wr15       \n\t"
-        "wsrlhg wr8, wr8, wcgr0         \n\t"
-        "wsrlhg wr9, wr9, wcgr0         \n\t"
-        "wsrlhg wr10, wr10, wcgr0       \n\t"
-        "wsrlhg wr11, wr11, wcgr0       \n\t"
-        "wpackhus wr8, wr8, wr9         \n\t"
-        "wpackhus wr9, wr10, wr11       \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "wstrd wr9, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-
-        "subs %[h], %[h], #2            \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
-        : [line_size]"r"(line_size)
-        : "r12", "memory");
-}
-
-void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-        "pld [%[pixels]]                \n\t"
-        "mov r12, #2                    \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "tmcr wcgr0, r12                \n\t" /* for shift value */
-        "and r12, %[pixels], #7         \n\t"
-        "bic %[pixels], %[pixels], #7   \n\t"
-        "tmcr wcgr1, r12                \n\t"
-
-        // [wr0 wr1 wr2 wr3] <= *
-        // [wr4 wr5 wr6 wr7]
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "add r12, r12, #1               \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "tmcr wcgr2, r12                \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "cmp r12, #8                    \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr2, wr12, wr13       \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "wunpckelub wr0, wr2            \n\t"
-        "wunpckehub wr1, wr2            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "waddhus wr0, wr0, wr8          \n\t"
-        "waddhus wr1, wr1, wr9          \n\t"
-
-        "1:                             \n\t"
-        // [wr0 wr1 wr2 wr3]
-        // [wr4 wr5 wr6 wr7] <= *
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "walignr1 wr6, wr12, wr13       \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "wunpckelub wr4, wr6            \n\t"
-        "wunpckehub wr5, wr6            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "waddhus wr4, wr4, wr8          \n\t"
-        "waddhus wr5, wr5, wr9          \n\t"
-        "waddhus wr8, wr0, wr4          \n\t"
-        "waddhus wr9, wr1, wr5          \n\t"
-        "waddhus wr8, wr8, wr15         \n\t"
-        "waddhus wr9, wr9, wr15         \n\t"
-        "wldrd wr12, [%[block]]         \n\t"
-        "wsrlhg wr8, wr8, wcgr0         \n\t"
-        "wsrlhg wr9, wr9, wcgr0         \n\t"
-        "wpackhus wr8, wr8, wr9         \n\t"
-        WAVG2B" wr8, wr8, wr12          \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-
-        // [wr0 wr1 wr2 wr3] <= *
-        // [wr4 wr5 wr6 wr7]
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "walignr1 wr2, wr12, wr13       \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "wunpckelub wr0, wr2            \n\t"
-        "wunpckehub wr1, wr2            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "waddhus wr0, wr0, wr8          \n\t"
-        "waddhus wr1, wr1, wr9          \n\t"
-        "waddhus wr8, wr0, wr4          \n\t"
-        "waddhus wr9, wr1, wr5          \n\t"
-        "waddhus wr8, wr8, wr15         \n\t"
-        "waddhus wr9, wr9, wr15         \n\t"
-        "wldrd wr12, [%[block]]         \n\t"
-        "wsrlhg wr8, wr8, wcgr0         \n\t"
-        "wsrlhg wr9, wr9, wcgr0         \n\t"
-        "wpackhus wr8, wr8, wr9         \n\t"
-        "subs %[h], %[h], #2            \n\t"
-        WAVG2B" wr8, wr8, wr12          \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
-        : [line_size]"r"(line_size)
-        : "r12", "memory");
-}
-
-void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
-    // [wr0 wr1 wr2 wr3] for previous line
-    // [wr4 wr5 wr6 wr7] for current line
-    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-        "pld [%[pixels]]                \n\t"
-        "mov r12, #2                    \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "tmcr wcgr0, r12                \n\t" /* for shift value */
-        /* alignment */
-        "and r12, %[pixels], #7         \n\t"
-        "bic %[pixels], %[pixels], #7           \n\t"
-        "tmcr wcgr1, r12                \n\t"
-        "add r12, r12, #1               \n\t"
-        "tmcr wcgr2, r12                \n\t"
-
-        // [wr0 wr1 wr2 wr3] <= *
-        // [wr4 wr5 wr6 wr7]
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "wldrd wr14, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "pld [%[pixels]]                \n\t"
-        "walignr1 wr2, wr12, wr13       \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr3, wr13, wr14       \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "wmoveq wr11, wr14              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "walignr2ne wr11, wr13, wr14    \n\t"
-        "wunpckelub wr0, wr2            \n\t"
-        "wunpckehub wr1, wr2            \n\t"
-        "wunpckelub wr2, wr3            \n\t"
-        "wunpckehub wr3, wr3            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "wunpckelub wr10, wr11          \n\t"
-        "wunpckehub wr11, wr11          \n\t"
-        "waddhus wr0, wr0, wr8          \n\t"
-        "waddhus wr1, wr1, wr9          \n\t"
-        "waddhus wr2, wr2, wr10         \n\t"
-        "waddhus wr3, wr3, wr11         \n\t"
-
-        "1:                             \n\t"
-        // [wr0 wr1 wr2 wr3]
-        // [wr4 wr5 wr6 wr7] <= *
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "cmp r12, #8                    \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "wldrd wr14, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "walignr1 wr6, wr12, wr13       \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr7, wr13, wr14       \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "wmoveq wr11, wr14              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "walignr2ne wr11, wr13, wr14    \n\t"
-        "wunpckelub wr4, wr6            \n\t"
-        "wunpckehub wr5, wr6            \n\t"
-        "wunpckelub wr6, wr7            \n\t"
-        "wunpckehub wr7, wr7            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "wunpckelub wr10, wr11          \n\t"
-        "wunpckehub wr11, wr11          \n\t"
-        "waddhus wr4, wr4, wr8          \n\t"
-        "waddhus wr5, wr5, wr9          \n\t"
-        "waddhus wr6, wr6, wr10         \n\t"
-        "waddhus wr7, wr7, wr11         \n\t"
-        "waddhus wr8, wr0, wr4          \n\t"
-        "waddhus wr9, wr1, wr5          \n\t"
-        "waddhus wr10, wr2, wr6         \n\t"
-        "waddhus wr11, wr3, wr7         \n\t"
-        "waddhus wr8, wr8, wr15         \n\t"
-        "waddhus wr9, wr9, wr15         \n\t"
-        "waddhus wr10, wr10, wr15       \n\t"
-        "waddhus wr11, wr11, wr15       \n\t"
-        "wsrlhg wr8, wr8, wcgr0         \n\t"
-        "wsrlhg wr9, wr9, wcgr0         \n\t"
-        "wldrd wr12, [%[block]]         \n\t"
-        "wldrd wr13, [%[block], #8]     \n\t"
-        "wsrlhg wr10, wr10, wcgr0       \n\t"
-        "wsrlhg wr11, wr11, wcgr0       \n\t"
-        "wpackhus wr8, wr8, wr9         \n\t"
-        "wpackhus wr9, wr10, wr11       \n\t"
-        WAVG2B" wr8, wr8, wr12          \n\t"
-        WAVG2B" wr9, wr9, wr13          \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "wstrd wr9, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-
-        // [wr0 wr1 wr2 wr3] <= *
-        // [wr4 wr5 wr6 wr7]
-        "wldrd wr12, [%[pixels]]        \n\t"
-        "pld [%[block]]                 \n\t"
-        "wldrd wr13, [%[pixels], #8]    \n\t"
-        "pld [%[block], #32]            \n\t"
-        "wldrd wr14, [%[pixels], #16]   \n\t"
-        "add %[pixels], %[pixels], %[line_size] \n\t"
-        "walignr1 wr2, wr12, wr13       \n\t"
-        "pld [%[pixels]]                \n\t"
-        "pld [%[pixels], #32]           \n\t"
-        "walignr1 wr3, wr13, wr14       \n\t"
-        "wmoveq wr10, wr13              \n\t"
-        "wmoveq wr11, wr14              \n\t"
-        "walignr2ne wr10, wr12, wr13    \n\t"
-        "walignr2ne wr11, wr13, wr14    \n\t"
-        "wunpckelub wr0, wr2            \n\t"
-        "wunpckehub wr1, wr2            \n\t"
-        "wunpckelub wr2, wr3            \n\t"
-        "wunpckehub wr3, wr3            \n\t"
-        "wunpckelub wr8, wr10           \n\t"
-        "wunpckehub wr9, wr10           \n\t"
-        "wunpckelub wr10, wr11          \n\t"
-        "wunpckehub wr11, wr11          \n\t"
-        "waddhus wr0, wr0, wr8          \n\t"
-        "waddhus wr1, wr1, wr9          \n\t"
-        "waddhus wr2, wr2, wr10         \n\t"
-        "waddhus wr3, wr3, wr11         \n\t"
-        "waddhus wr8, wr0, wr4          \n\t"
-        "waddhus wr9, wr1, wr5          \n\t"
-        "waddhus wr10, wr2, wr6         \n\t"
-        "waddhus wr11, wr3, wr7         \n\t"
-        "waddhus wr8, wr8, wr15         \n\t"
-        "waddhus wr9, wr9, wr15         \n\t"
-        "waddhus wr10, wr10, wr15       \n\t"
-        "waddhus wr11, wr11, wr15       \n\t"
-        "wsrlhg wr8, wr8, wcgr0         \n\t"
-        "wsrlhg wr9, wr9, wcgr0         \n\t"
-        "wldrd wr12, [%[block]]         \n\t"
-        "wldrd wr13, [%[block], #8]     \n\t"
-        "wsrlhg wr10, wr10, wcgr0       \n\t"
-        "wsrlhg wr11, wr11, wcgr0       \n\t"
-        "wpackhus wr8, wr8, wr9         \n\t"
-        "wpackhus wr9, wr10, wr11       \n\t"
-        WAVG2B" wr8, wr8, wr12          \n\t"
-        WAVG2B" wr9, wr9, wr13          \n\t"
-        "wstrd wr8, [%[block]]          \n\t"
-        "wstrd wr9, [%[block], #8]      \n\t"
-        "add %[block], %[block], %[line_size]   \n\t"
-        "subs %[h], %[h], #2            \n\t"
-        "pld [%[block]]                 \n\t"
-        "pld [%[block], #32]            \n\t"
-        "bne 1b                         \n\t"
-        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
-        : [line_size]"r"(line_size)
-        : "r12", "memory");
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1146 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "asm.S"
-
-        preserve8
-        .text
-
-        .macro pixels16 avg=0
-.if \avg
-        mov             ip,  r0
-.endif
-1:      vld1.64         {d0, d1},  [r1], r2
-        vld1.64         {d2, d3},  [r1], r2
-        vld1.64         {d4, d5},  [r1], r2
-        pld             [r1, r2, lsl #2]
-        vld1.64         {d6, d7},  [r1], r2
-        pld             [r1]
-        pld             [r1, r2]
-        pld             [r1, r2, lsl #1]
-.if \avg
-        vld1.64         {d16,d17}, [ip,:128], r2
-        vrhadd.u8       q0,  q0,  q8
-        vld1.64         {d18,d19}, [ip,:128], r2
-        vrhadd.u8       q1,  q1,  q9
-        vld1.64         {d20,d21}, [ip,:128], r2
-        vrhadd.u8       q2,  q2,  q10
-        vld1.64         {d22,d23}, [ip,:128], r2
-        vrhadd.u8       q3,  q3,  q11
-.endif
-        subs            r3,  r3,  #4
-        vst1.64         {d0, d1},  [r0,:128], r2
-        vst1.64         {d2, d3},  [r0,:128], r2
-        vst1.64         {d4, d5},  [r0,:128], r2
-        vst1.64         {d6, d7},  [r0,:128], r2
-        bne             1b
-        bx              lr
-        .endm
-
-        .macro pixels16_x2 vhadd=vrhadd.u8
-1:      vld1.64         {d0-d2},   [r1], r2
-        vld1.64         {d4-d6},   [r1], r2
-        pld             [r1]
-        pld             [r1, r2]
-        subs            r3,  r3,  #2
-        vext.8          q1,  q0,  q1,  #1
-        \vhadd          q0,  q0,  q1
-        vext.8          q3,  q2,  q3,  #1
-        \vhadd          q2,  q2,  q3
-        vst1.64         {d0, d1},  [r0,:128], r2
-        vst1.64         {d4, d5},  [r0,:128], r2
-        bne             1b
-        bx              lr
-        .endm
-
-        .macro pixels16_y2 vhadd=vrhadd.u8
-        vld1.64         {d0, d1},  [r1], r2
-        vld1.64         {d2, d3},  [r1], r2
-1:      subs            r3,  r3,  #2
-        \vhadd          q2,  q0,  q1
-        vld1.64         {d0, d1},  [r1], r2
-        \vhadd          q3,  q0,  q1
-        vld1.64         {d2, d3},  [r1], r2
-        pld             [r1]
-        pld             [r1, r2]
-        vst1.64         {d4, d5},  [r0,:128], r2
-        vst1.64         {d6, d7},  [r0,:128], r2
-        bne             1b
-        bx              lr
-        .endm
-
-        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
-        vld1.64         {d0-d2},   [r1], r2
-        vld1.64         {d4-d6},   [r1], r2
-.if \no_rnd
-        vmov.i16        q13, #1
-.endif
-        pld             [r1]
-        pld             [r1, r2]
-        vext.8          q1,  q0,  q1,  #1
-        vext.8          q3,  q2,  q3,  #1
-        vaddl.u8        q8,  d0,  d2
-        vaddl.u8        q10, d1,  d3
-        vaddl.u8        q9,  d4,  d6
-        vaddl.u8        q11, d5,  d7
-1:      subs            r3,  r3,  #2
-        vld1.64         {d0-d2},   [r1], r2
-        vadd.u16        q12, q8,  q9
-        pld             [r1]
-.if \no_rnd
-        vadd.u16        q12, q12, q13
-.endif
-        vext.8          q15, q0,  q1,  #1
-        vadd.u16        q1 , q10, q11
-        \vshrn          d28, q12, #2
-.if \no_rnd
-        vadd.u16        q1,  q1,  q13
-.endif
-        \vshrn          d29, q1,  #2
-        vaddl.u8        q8,  d0,  d30
-        vld1.64         {d2-d4},   [r1], r2
-        vaddl.u8        q10, d1,  d31
-        vst1.64         {d28,d29}, [r0,:128], r2
-        vadd.u16        q12, q8,  q9
-        pld             [r1, r2]
-.if \no_rnd
-        vadd.u16        q12, q12, q13
-.endif
-        vext.8          q2,  q1,  q2,  #1
-        vadd.u16        q0,  q10, q11
-        \vshrn          d30, q12, #2
-.if \no_rnd
-        vadd.u16        q0,  q0,  q13
-.endif
-        \vshrn          d31, q0,  #2
-        vaddl.u8        q9,  d2,  d4
-        vaddl.u8        q11, d3,  d5
-        vst1.64         {d30,d31}, [r0,:128], r2
-        bgt             1b
-        bx              lr
-        .endm
-
-        .macro pixels8 avg=0
-1:      vld1.64         {d0}, [r1], r2
-        vld1.64         {d1}, [r1], r2
-        vld1.64         {d2}, [r1], r2
-        pld             [r1, r2, lsl #2]
-        vld1.64         {d3}, [r1], r2
-        pld             [r1]
-        pld             [r1, r2]
-        pld             [r1, r2, lsl #1]
-.if \avg
-        vld1.64         {d4}, [r0,:64], r2
-        vrhadd.u8       d0,  d0,  d4
-        vld1.64         {d5}, [r0,:64], r2
-        vrhadd.u8       d1,  d1,  d5
-        vld1.64         {d6}, [r0,:64], r2
-        vrhadd.u8       d2,  d2,  d6
-        vld1.64         {d7}, [r0,:64], r2
-        vrhadd.u8       d3,  d3,  d7
-        sub             r0,  r0,  r2,  lsl #2
-.endif
-        subs            r3,  r3,  #4
-        vst1.64         {d0}, [r0,:64], r2
-        vst1.64         {d1}, [r0,:64], r2
-        vst1.64         {d2}, [r0,:64], r2
-        vst1.64         {d3}, [r0,:64], r2
-        bne             1b
-        bx              lr
-        .endm
-
-        .macro pixels8_x2 vhadd=vrhadd.u8
-1:      vld1.64         {d0, d1},  [r1], r2
-        vext.8          d1,  d0,  d1,  #1
-        vld1.64         {d2, d3},  [r1], r2
-        vext.8          d3,  d2,  d3,  #1
-        pld             [r1]
-        pld             [r1, r2]
-        subs            r3,  r3,  #2
-        vswp            d1,  d2
-        \vhadd          q0,  q0,  q1
-        vst1.64         {d0},      [r0,:64], r2
-        vst1.64         {d1},      [r0,:64], r2
-        bne             1b
-        bx              lr
-        .endm
-
-        .macro pixels8_y2 vhadd=vrhadd.u8
-        vld1.64         {d0},      [r1], r2
-        vld1.64         {d1},      [r1], r2
-1:      subs            r3,  r3,  #2
-        \vhadd          d4,  d0,  d1
-        vld1.64         {d0},      [r1], r2
-        \vhadd          d5,  d0,  d1
-        vld1.64         {d1},      [r1], r2
-        pld             [r1]
-        pld             [r1, r2]
-        vst1.64         {d4},      [r0,:64], r2
-        vst1.64         {d5},      [r0,:64], r2
-        bne             1b
-        bx              lr
-        .endm
-
-        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
-        vld1.64         {d0, d1},  [r1], r2
-        vld1.64         {d2, d3},  [r1], r2
-.if \no_rnd
-        vmov.i16        q11, #1
-.endif
-        pld             [r1]
-        pld             [r1, r2]
-        vext.8          d4,  d0,  d1,  #1
-        vext.8          d6,  d2,  d3,  #1
-        vaddl.u8        q8,  d0,  d4
-        vaddl.u8        q9,  d2,  d6
-1:      subs            r3,  r3,  #2
-        vld1.64         {d0, d1},  [r1], r2
-        pld             [r1]
-        vadd.u16        q10, q8,  q9
-        vext.8          d4,  d0,  d1,  #1
-.if \no_rnd
-        vadd.u16        q10, q10, q11
-.endif
-        vaddl.u8        q8,  d0,  d4
-        \vshrn          d5,  q10, #2
-        vld1.64         {d2, d3},  [r1], r2
-        vadd.u16        q10, q8,  q9
-        pld             [r1, r2]
-.if \no_rnd
-        vadd.u16        q10, q10, q11
-.endif
-        vst1.64         {d5},      [r0,:64], r2
-        \vshrn          d7,  q10, #2
-        vext.8          d6,  d2,  d3,  #1
-        vaddl.u8        q9,  d2,  d6
-        vst1.64         {d7},      [r0,:64], r2
-        bgt             1b
-        bx              lr
-        .endm
-
-        .macro pixfunc pfx name suf rnd_op args:vararg
-function ff_\pfx\name\suf\()_neon, export=1
-        \name \rnd_op \args
-endfunc
-        .endm
-
-        .macro pixfunc2 pfx name args:vararg
-        pixfunc \pfx \name
-        pixfunc \pfx \name \args
-        .endm
-
-function ff_put_h264_qpel16_mc00_neon, export=1
-        mov             r3,  #16
-endfunc
-
-        pixfunc  put_ pixels16
-        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
-        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
-        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
-
-function ff_avg_h264_qpel16_mc00_neon, export=1
-        mov             r3,  #16
-endfunc
-
-        pixfunc  avg_ pixels16,, 1
-
-function ff_put_h264_qpel8_mc00_neon, export=1
-        mov             r3,  #8
-endfunc
-
-        pixfunc  put_ pixels8
-        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
-        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
-        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1
-
-function ff_avg_h264_qpel8_mc00_neon, export=1
-        mov             r3,  #8
-endfunc
-
-        pixfunc  avg_ pixels8,, 1
-
-function ff_put_pixels_clamped_neon, export=1
-        vld1.64         {d16-d19}, [r0,:128]!
-        vqmovun.s16     d0, q8
-        vld1.64         {d20-d23}, [r0,:128]!
-        vqmovun.s16     d1, q9
-        vld1.64         {d24-d27}, [r0,:128]!
-        vqmovun.s16     d2, q10
-        vld1.64         {d28-d31}, [r0,:128]!
-        vqmovun.s16     d3, q11
-        vst1.64         {d0},      [r1,:64], r2
-        vqmovun.s16     d4, q12
-        vst1.64         {d1},      [r1,:64], r2
-        vqmovun.s16     d5, q13
-        vst1.64         {d2},      [r1,:64], r2
-        vqmovun.s16     d6, q14
-        vst1.64         {d3},      [r1,:64], r2
-        vqmovun.s16     d7, q15
-        vst1.64         {d4},      [r1,:64], r2
-        vst1.64         {d5},      [r1,:64], r2
-        vst1.64         {d6},      [r1,:64], r2
-        vst1.64         {d7},      [r1,:64], r2
-        bx              lr
-endfunc
-
-function ff_put_signed_pixels_clamped_neon, export=1
-        vmov.u8         d31, #128
-        vld1.64         {d16-d17}, [r0,:128]!
-        vqmovn.s16      d0, q8
-        vld1.64         {d18-d19}, [r0,:128]!
-        vqmovn.s16      d1, q9
-        vld1.64         {d16-d17}, [r0,:128]!
-        vqmovn.s16      d2, q8
-        vld1.64         {d18-d19}, [r0,:128]!
-        vadd.u8         d0, d0, d31
-        vld1.64         {d20-d21}, [r0,:128]!
-        vadd.u8         d1, d1, d31
-        vld1.64         {d22-d23}, [r0,:128]!
-        vadd.u8         d2, d2, d31
-        vst1.64         {d0},      [r1,:64], r2
-        vqmovn.s16      d3, q9
-        vst1.64         {d1},      [r1,:64], r2
-        vqmovn.s16      d4, q10
-        vst1.64         {d2},      [r1,:64], r2
-        vqmovn.s16      d5, q11
-        vld1.64         {d24-d25}, [r0,:128]!
-        vadd.u8         d3, d3, d31
-        vld1.64         {d26-d27}, [r0,:128]!
-        vadd.u8         d4, d4, d31
-        vadd.u8         d5, d5, d31
-        vst1.64         {d3},      [r1,:64], r2
-        vqmovn.s16      d6, q12
-        vst1.64         {d4},      [r1,:64], r2
-        vqmovn.s16      d7, q13
-        vst1.64         {d5},      [r1,:64], r2
-        vadd.u8         d6, d6, d31
-        vadd.u8         d7, d7, d31
-        vst1.64         {d6},      [r1,:64], r2
-        vst1.64         {d7},      [r1,:64], r2
-        bx              lr
-endfunc
-
-function ff_add_pixels_clamped_neon, export=1
-        mov             r3, r1
-        vld1.64         {d16},   [r1,:64], r2
-        vld1.64         {d0-d1}, [r0,:128]!
-        vaddw.u8        q0, q0, d16
-        vld1.64         {d17},   [r1,:64], r2
-        vld1.64         {d2-d3}, [r0,:128]!
-        vqmovun.s16     d0, q0
-        vld1.64         {d18},   [r1,:64], r2
-        vaddw.u8        q1, q1, d17
-        vld1.64         {d4-d5}, [r0,:128]!
-        vaddw.u8        q2, q2, d18
-        vst1.64         {d0},    [r3,:64], r2
-        vqmovun.s16     d2, q1
-        vld1.64         {d19},   [r1,:64], r2
-        vld1.64         {d6-d7}, [r0,:128]!
-        vaddw.u8        q3, q3, d19
-        vqmovun.s16     d4, q2
-        vst1.64         {d2},    [r3,:64], r2
-        vld1.64         {d16},   [r1,:64], r2
-        vqmovun.s16     d6, q3
-        vld1.64         {d0-d1}, [r0,:128]!
-        vaddw.u8        q0, q0, d16
-        vst1.64         {d4},    [r3,:64], r2
-        vld1.64         {d17},   [r1,:64], r2
-        vld1.64         {d2-d3}, [r0,:128]!
-        vaddw.u8        q1, q1, d17
-        vst1.64         {d6},    [r3,:64], r2
-        vqmovun.s16     d0, q0
-        vld1.64         {d18},   [r1,:64], r2
-        vld1.64         {d4-d5}, [r0,:128]!
-        vaddw.u8        q2, q2, d18
-        vst1.64         {d0},    [r3,:64], r2
-        vqmovun.s16     d2, q1
-        vld1.64         {d19},   [r1,:64], r2
-        vqmovun.s16     d4, q2
-        vld1.64         {d6-d7}, [r0,:128]!
-        vaddw.u8        q3, q3, d19
-        vst1.64         {d2},    [r3,:64], r2
-        vqmovun.s16     d6, q3
-        vst1.64         {d4},    [r3,:64], r2
-        vst1.64         {d6},    [r3,:64], r2
-        bx              lr
-endfunc
-
-function ff_float_to_int16_neon, export=1
-        subs            r2,  r2,  #8
-        vld1.64         {d0-d1},  [r1,:128]!
-        vcvt.s32.f32    q8,  q0,  #16
-        vld1.64         {d2-d3},  [r1,:128]!
-        vcvt.s32.f32    q9,  q1,  #16
-        beq             3f
-        bics            ip,  r2,  #15
-        beq             2f
-1:      subs            ip,  ip,  #16
-        vshrn.s32       d4,  q8,  #16
-        vld1.64         {d0-d1},  [r1,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vshrn.s32       d5,  q9,  #16
-        vld1.64         {d2-d3},  [r1,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vshrn.s32       d6,  q0,  #16
-        vst1.64         {d4-d5},  [r0,:128]!
-        vshrn.s32       d7,  q1,  #16
-        vld1.64         {d16-d17},[r1,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vld1.64         {d18-d19},[r1,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vst1.64         {d6-d7},  [r0,:128]!
-        bne             1b
-        ands            r2,  r2,  #15
-        beq             3f
-2:      vld1.64         {d0-d1},  [r1,:128]!
-        vshrn.s32       d4,  q8,  #16
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r1,:128]!
-        vshrn.s32       d5,  q9,  #16
-        vcvt.s32.f32    q1,  q1,  #16
-        vshrn.s32       d6,  q0,  #16
-        vst1.64         {d4-d5},  [r0,:128]!
-        vshrn.s32       d7,  q1,  #16
-        vst1.64         {d6-d7},  [r0,:128]!
-        bx              lr
-3:      vshrn.s32       d4,  q8,  #16
-        vshrn.s32       d5,  q9,  #16
-        vst1.64         {d4-d5},  [r0,:128]!
-        bx              lr
-endfunc
-
-function ff_float_to_int16_interleave_neon, export=1
-        cmp             r3, #2
-        ldrlt           r1, [r1]
-        blt             ff_float_to_int16_neon
-        bne             4f
-
-        ldr             r3, [r1]
-        ldr             r1, [r1, #4]
-
-        subs            r2,  r2,  #8
-        vld1.64         {d0-d1},  [r3,:128]!
-        vcvt.s32.f32    q8,  q0,  #16
-        vld1.64         {d2-d3},  [r3,:128]!
-        vcvt.s32.f32    q9,  q1,  #16
-        vld1.64         {d20-d21},[r1,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vld1.64         {d22-d23},[r1,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        beq             3f
-        bics            ip,  r2,  #15
-        beq             2f
-1:      subs            ip,  ip,  #16
-        vld1.64         {d0-d1},  [r3,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vsri.32         q10, q8,  #16
-        vld1.64         {d2-d3},  [r3,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vld1.64         {d24-d25},[r1,:128]!
-        vcvt.s32.f32    q12, q12, #16
-        vld1.64         {d26-d27},[r1,:128]!
-        vsri.32         q11, q9,  #16
-        vst1.64         {d20-d21},[r0,:128]!
-        vcvt.s32.f32    q13, q13, #16
-        vst1.64         {d22-d23},[r0,:128]!
-        vsri.32         q12, q0,  #16
-        vld1.64         {d16-d17},[r3,:128]!
-        vsri.32         q13, q1,  #16
-        vst1.64         {d24-d25},[r0,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vld1.64         {d18-d19},[r3,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vld1.64         {d20-d21},[r1,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vld1.64         {d22-d23},[r1,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        vst1.64         {d26-d27},[r0,:128]!
-        bne             1b
-        ands            r2,  r2,  #15
-        beq             3f
-2:      vsri.32         q10, q8,  #16
-        vld1.64         {d0-d1},  [r3,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r3,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vld1.64         {d24-d25},[r1,:128]!
-        vcvt.s32.f32    q12, q12, #16
-        vsri.32         q11, q9,  #16
-        vld1.64         {d26-d27},[r1,:128]!
-        vcvt.s32.f32    q13, q13, #16
-        vst1.64         {d20-d21},[r0,:128]!
-        vsri.32         q12, q0,  #16
-        vst1.64         {d22-d23},[r0,:128]!
-        vsri.32         q13, q1,  #16
-        vst1.64         {d24-d27},[r0,:128]!
-        bx              lr
-3:      vsri.32         q10, q8,  #16
-        vsri.32         q11, q9,  #16
-        vst1.64         {d20-d23},[r0,:128]!
-        bx              lr
-
-4:      push            {r4-r8,lr}
-        cmp             r3,  #4
-        lsl             ip,  r3,  #1
-        blt             4f
-
-        @ 4 channels
-5:      ldmia           r1!, {r4-r7}
-        mov             lr,  r2
-        mov             r8,  r0
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vld1.64         {d20-d21},[r6,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vld1.64         {d22-d23},[r7,:128]!
-        vcvt.s32.f32    q11, q11, #16
-6:      subs            lr,  lr,  #8
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vsri.32         q9,  q8,  #16
-        vld1.64         {d2-d3},  [r5,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vsri.32         q11, q10, #16
-        vld1.64         {d4-d5},  [r6,:128]!
-        vcvt.s32.f32    q2,  q2,  #16
-        vzip.32         d18, d22
-        vld1.64         {d6-d7},  [r7,:128]!
-        vcvt.s32.f32    q3,  q3,  #16
-        vzip.32         d19, d23
-        vst1.64         {d18},    [r8], ip
-        vsri.32         q1,  q0,  #16
-        vst1.64         {d22},    [r8], ip
-        vsri.32         q3,  q2,  #16
-        vst1.64         {d19},    [r8], ip
-        vzip.32         d2,  d6
-        vst1.64         {d23},    [r8], ip
-        vzip.32         d3,  d7
-        beq             7f
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vst1.64         {d2},     [r8], ip
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vst1.64         {d6},     [r8], ip
-        vld1.64         {d20-d21},[r6,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vst1.64         {d3},     [r8], ip
-        vld1.64         {d22-d23},[r7,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        vst1.64         {d7},     [r8], ip
-        b               6b
-7:      vst1.64         {d2},     [r8], ip
-        vst1.64         {d6},     [r8], ip
-        vst1.64         {d3},     [r8], ip
-        vst1.64         {d7},     [r8], ip
-        subs            r3,  r3,  #4
-        popeq           {r4-r8,pc}
-        cmp             r3,  #4
-        add             r0,  r0,  #8
-        bge             5b
-
-        @ 2 channels
-4:      cmp             r3,  #2
-        blt             4f
-        ldmia           r1!, {r4-r5}
-        mov             lr,  r2
-        mov             r8,  r0
-        tst             lr,  #8
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vld1.64         {d20-d21},[r4,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vld1.64         {d22-d23},[r5,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        beq             6f
-        subs            lr,  lr,  #8
-        beq             7f
-        vsri.32         d18, d16, #16
-        vsri.32         d19, d17, #16
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vst1.32         {d18[0]}, [r8], ip
-        vsri.32         d22, d20, #16
-        vst1.32         {d18[1]}, [r8], ip
-        vsri.32         d23, d21, #16
-        vst1.32         {d19[0]}, [r8], ip
-        vst1.32         {d19[1]}, [r8], ip
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vst1.32         {d22[0]}, [r8], ip
-        vst1.32         {d22[1]}, [r8], ip
-        vld1.64         {d20-d21},[r4,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vst1.32         {d23[0]}, [r8], ip
-        vst1.32         {d23[1]}, [r8], ip
-        vld1.64         {d22-d23},[r5,:128]!
-        vcvt.s32.f32    q11, q11, #16
-6:      subs            lr,  lr,  #16
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vsri.32         d18, d16, #16
-        vld1.64         {d2-d3},  [r5,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vsri.32         d19, d17, #16
-        vld1.64         {d4-d5},  [r4,:128]!
-        vcvt.s32.f32    q2,  q2,  #16
-        vld1.64         {d6-d7},  [r5,:128]!
-        vcvt.s32.f32    q3,  q3,  #16
-        vst1.32         {d18[0]}, [r8], ip
-        vsri.32         d22, d20, #16
-        vst1.32         {d18[1]}, [r8], ip
-        vsri.32         d23, d21, #16
-        vst1.32         {d19[0]}, [r8], ip
-        vsri.32         d2,  d0,  #16
-        vst1.32         {d19[1]}, [r8], ip
-        vsri.32         d3,  d1,  #16
-        vst1.32         {d22[0]}, [r8], ip
-        vsri.32         d6,  d4,  #16
-        vst1.32         {d22[1]}, [r8], ip
-        vsri.32         d7,  d5,  #16
-        vst1.32         {d23[0]}, [r8], ip
-        vst1.32         {d23[1]}, [r8], ip
-        beq             6f
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vst1.32         {d2[0]},  [r8], ip
-        vst1.32         {d2[1]},  [r8], ip
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vst1.32         {d3[0]},  [r8], ip
-        vst1.32         {d3[1]},  [r8], ip
-        vld1.64         {d20-d21},[r4,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vst1.32         {d6[0]},  [r8], ip
-        vst1.32         {d6[1]},  [r8], ip
-        vld1.64         {d22-d23},[r5,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        vst1.32         {d7[0]},  [r8], ip
-        vst1.32         {d7[1]},  [r8], ip
-        bgt             6b
-6:      vst1.32         {d2[0]},  [r8], ip
-        vst1.32         {d2[1]},  [r8], ip
-        vst1.32         {d3[0]},  [r8], ip
-        vst1.32         {d3[1]},  [r8], ip
-        vst1.32         {d6[0]},  [r8], ip
-        vst1.32         {d6[1]},  [r8], ip
-        vst1.32         {d7[0]},  [r8], ip
-        vst1.32         {d7[1]},  [r8], ip
-        b               8f
-7:      vsri.32         d18, d16, #16
-        vsri.32         d19, d17, #16
-        vst1.32         {d18[0]}, [r8], ip
-        vsri.32         d22, d20, #16
-        vst1.32         {d18[1]}, [r8], ip
-        vsri.32         d23, d21, #16
-        vst1.32         {d19[0]}, [r8], ip
-        vst1.32         {d19[1]}, [r8], ip
-        vst1.32         {d22[0]}, [r8], ip
-        vst1.32         {d22[1]}, [r8], ip
-        vst1.32         {d23[0]}, [r8], ip
-        vst1.32         {d23[1]}, [r8], ip
-8:      subs            r3,  r3,  #2
-        add             r0,  r0,  #4
-        popeq           {r4-r8,pc}
-
-        @ 1 channel
-4:      ldr             r4,  [r1],#4
-        tst             r2,  #8
-        mov             lr,  r2
-        mov             r5,  r0
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r4,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        bne             8f
-6:      subs            lr,  lr,  #16
-        vld1.64         {d4-d5},  [r4,:128]!
-        vcvt.s32.f32    q2,  q2,  #16
-        vld1.64         {d6-d7},  [r4,:128]!
-        vcvt.s32.f32    q3,  q3,  #16
-        vst1.16         {d0[1]},  [r5,:16], ip
-        vst1.16         {d0[3]},  [r5,:16], ip
-        vst1.16         {d1[1]},  [r5,:16], ip
-        vst1.16         {d1[3]},  [r5,:16], ip
-        vst1.16         {d2[1]},  [r5,:16], ip
-        vst1.16         {d2[3]},  [r5,:16], ip
-        vst1.16         {d3[1]},  [r5,:16], ip
-        vst1.16         {d3[3]},  [r5,:16], ip
-        beq             7f
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r4,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-7:      vst1.16         {d4[1]},  [r5,:16], ip
-        vst1.16         {d4[3]},  [r5,:16], ip
-        vst1.16         {d5[1]},  [r5,:16], ip
-        vst1.16         {d5[3]},  [r5,:16], ip
-        vst1.16         {d6[1]},  [r5,:16], ip
-        vst1.16         {d6[3]},  [r5,:16], ip
-        vst1.16         {d7[1]},  [r5,:16], ip
-        vst1.16         {d7[3]},  [r5,:16], ip
-        bgt             6b
-        pop             {r4-r8,pc}
-8:      subs            lr,  lr,  #8
-        vst1.16         {d0[1]},  [r5,:16], ip
-        vst1.16         {d0[3]},  [r5,:16], ip
-        vst1.16         {d1[1]},  [r5,:16], ip
-        vst1.16         {d1[3]},  [r5,:16], ip
-        vst1.16         {d2[1]},  [r5,:16], ip
-        vst1.16         {d2[3]},  [r5,:16], ip
-        vst1.16         {d3[1]},  [r5,:16], ip
-        vst1.16         {d3[3]},  [r5,:16], ip
-        popeq           {r4-r8,pc}
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r4,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        b               6b
-endfunc
-
-function ff_vector_fmul_neon, export=1
-        mov             r3,  r0
-        subs            r2,  r2,  #8
-        vld1.64         {d0-d3},  [r0,:128]!
-        vld1.64         {d4-d7},  [r1,:128]!
-        vmul.f32        q8,  q0,  q2
-        vmul.f32        q9,  q1,  q3
-        beq             3f
-        bics            ip,  r2,  #15
-        beq             2f
-1:      subs            ip,  ip,  #16
-        vld1.64         {d0-d1},  [r0,:128]!
-        vld1.64         {d4-d5},  [r1,:128]!
-        vmul.f32        q10, q0,  q2
-        vld1.64         {d2-d3},  [r0,:128]!
-        vld1.64         {d6-d7},  [r1,:128]!
-        vmul.f32        q11, q1,  q3
-        vst1.64         {d16-d19},[r3,:128]!
-        vld1.64         {d0-d1},  [r0,:128]!
-        vld1.64         {d4-d5},  [r1,:128]!
-        vmul.f32        q8,  q0,  q2
-        vld1.64         {d2-d3},  [r0,:128]!
-        vld1.64         {d6-d7},  [r1,:128]!
-        vmul.f32        q9,  q1,  q3
-        vst1.64         {d20-d23},[r3,:128]!
-        bne             1b
-        ands            r2,  r2,  #15
-        beq             3f
-2:      vld1.64         {d0-d1},  [r0,:128]!
-        vld1.64         {d4-d5},  [r1,:128]!
-        vst1.64         {d16-d17},[r3,:128]!
-        vmul.f32        q8,  q0,  q2
-        vld1.64         {d2-d3},  [r0,:128]!
-        vld1.64         {d6-d7},  [r1,:128]!
-        vst1.64         {d18-d19},[r3,:128]!
-        vmul.f32        q9,  q1,  q3
-3:      vst1.64         {d16-d19},[r3,:128]!
-        bx              lr
-endfunc
-
-function ff_vector_fmul_window_neon, export=1
-VFP     vdup.32         q8,  d0[0]
-NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
-        push            {r4,r5,lr}
-VFP     ldr             lr,  [sp, #12]
-NOVFP   ldr             lr,  [sp, #16]
-        sub             r2,  r2,  #8
-        sub             r5,  lr,  #2
-        add             r2,  r2,  r5, lsl #2
-        add             r4,  r3,  r5, lsl #3
-        add             ip,  r0,  r5, lsl #3
-        mov             r5,  #-16
-        vld1.64         {d0,d1},  [r1,:128]!
-        vld1.64         {d2,d3},  [r2,:128], r5
-        vld1.64         {d4,d5},  [r3,:128]!
-        vld1.64         {d6,d7},  [r4,:128], r5
-1:      subs            lr,  lr,  #4
-        vmov            q11, q8
-        vmla.f32        d22, d0,  d4
-        vmov            q10, q8
-        vmla.f32        d23, d1,  d5
-        vrev64.32       q3,  q3
-        vmla.f32        d20, d0,  d7
-        vrev64.32       q1,  q1
-        vmla.f32        d21, d1,  d6
-        beq             2f
-        vmla.f32        d22, d3,  d7
-        vld1.64         {d0,d1},  [r1,:128]!
-        vmla.f32        d23, d2,  d6
-        vld1.64         {d18,d19},[r2,:128], r5
-        vmls.f32        d20, d3,  d4
-        vld1.64         {d24,d25},[r3,:128]!
-        vmls.f32        d21, d2,  d5
-        vld1.64         {d6,d7},  [r4,:128], r5
-        vmov            q1,  q9
-        vrev64.32       q11, q11
-        vmov            q2,  q12
-        vswp            d22, d23
-        vst1.64         {d20,d21},[r0,:128]!
-        vst1.64         {d22,d23},[ip,:128], r5
-        b               1b
-2:      vmla.f32        d22, d3,  d7
-        vmla.f32        d23, d2,  d6
-        vmls.f32        d20, d3,  d4
-        vmls.f32        d21, d2,  d5
-        vrev64.32       q11, q11
-        vswp            d22, d23
-        vst1.64         {d20,d21},[r0,:128]!
-        vst1.64         {d22,d23},[ip,:128], r5
-        pop             {r4,r5,pc}
-endfunc
-
-#if CONFIG_VORBIS_DECODER
-function ff_vorbis_inverse_coupling_neon, export=1
-        vmov.i32        q10, #1<<31
-        subs            r2,  r2,  #4
-        mov             r3,  r0
-        mov             r12, r1
-        beq             3f
-
-        vld1.32         {d24-d25},[r1,:128]!
-        vld1.32         {d22-d23},[r0,:128]!
-        vcle.s32        q8,  q12, #0
-        vand            q9,  q11, q10
-        veor            q12, q12, q9
-        vand            q2,  q12, q8
-        vbic            q3,  q12, q8
-        vadd.f32        q12, q11, q2
-        vsub.f32        q11, q11, q3
-1:      vld1.32         {d2-d3},  [r1,:128]!
-        vld1.32         {d0-d1},  [r0,:128]!
-        vcle.s32        q8,  q1,  #0
-        vand            q9,  q0,  q10
-        veor            q1,  q1,  q9
-        vst1.32         {d24-d25},[r3, :128]!
-        vst1.32         {d22-d23},[r12,:128]!
-        vand            q2,  q1,  q8
-        vbic            q3,  q1,  q8
-        vadd.f32        q1,  q0,  q2
-        vsub.f32        q0,  q0,  q3
-        subs            r2,  r2,  #8
-        ble             2f
-        vld1.32         {d24-d25},[r1,:128]!
-        vld1.32         {d22-d23},[r0,:128]!
-        vcle.s32        q8,  q12, #0
-        vand            q9,  q11, q10
-        veor            q12, q12, q9
-        vst1.32         {d2-d3},  [r3, :128]!
-        vst1.32         {d0-d1},  [r12,:128]!
-        vand            q2,  q12, q8
-        vbic            q3,  q12, q8
-        vadd.f32        q12, q11, q2
-        vsub.f32        q11, q11, q3
-        b               1b
-
-2:      vst1.32         {d2-d3},  [r3, :128]!
-        vst1.32         {d0-d1},  [r12,:128]!
-        bxlt            lr
-
-3:      vld1.32         {d2-d3},  [r1,:128]
-        vld1.32         {d0-d1},  [r0,:128]
-        vcle.s32        q8,  q1,  #0
-        vand            q9,  q0,  q10
-        veor            q1,  q1,  q9
-        vand            q2,  q1,  q8
-        vbic            q3,  q1,  q8
-        vadd.f32        q1,  q0,  q2
-        vsub.f32        q0,  q0,  q3
-        vst1.32         {d2-d3},  [r0,:128]!
-        vst1.32         {d0-d1},  [r1,:128]!
-        bx              lr
-endfunc
-#endif
-
-function ff_vector_fmul_scalar_neon, export=1
-VFP     len .req r2
-NOVFP   len .req r3
-VFP     vdup.32         q8,  d0[0]
-NOVFP   vdup.32         q8,  r2
-        bics            r12, len, #15
-        beq             3f
-        vld1.32         {q0},[r1,:128]!
-        vld1.32         {q1},[r1,:128]!
-1:      vmul.f32        q0,  q0,  q8
-        vld1.32         {q2},[r1,:128]!
-        vmul.f32        q1,  q1,  q8
-        vld1.32         {q3},[r1,:128]!
-        vmul.f32        q2,  q2,  q8
-        vst1.32         {q0},[r0,:128]!
-        vmul.f32        q3,  q3,  q8
-        vst1.32         {q1},[r0,:128]!
-        subs            r12, r12, #16
-        beq             2f
-        vld1.32         {q0},[r1,:128]!
-        vst1.32         {q2},[r0,:128]!
-        vld1.32         {q1},[r1,:128]!
-        vst1.32         {q3},[r0,:128]!
-        b               1b
-2:      vst1.32         {q2},[r0,:128]!
-        vst1.32         {q3},[r0,:128]!
-        ands            len, len, #15
-        bxeq            lr
-3:      vld1.32         {q0},[r1,:128]!
-        vmul.f32        q0,  q0,  q8
-        vst1.32         {q0},[r0,:128]!
-        subs            len, len, #4
-        bgt             3b
-        bx              lr
-        .unreq          len
-endfunc
-
-function ff_vector_fmul_sv_scalar_2_neon, export=1
-VFP     vdup.32         d16, d0[0]
-NOVFP   vdup.32         d16, r3
-NOVFP   ldr             r3,  [sp]
-        vld1.32         {d0},[r1,:64]!
-        vld1.32         {d1},[r1,:64]!
-1:      subs            r3,  r3,  #4
-        vmul.f32        d4,  d0,  d16
-        vmul.f32        d5,  d1,  d16
-        ldr             r12, [r2], #4
-        vld1.32         {d2},[r12,:64]
-        ldr             r12, [r2], #4
-        vld1.32         {d3},[r12,:64]
-        vmul.f32        d4,  d4,  d2
-        vmul.f32        d5,  d5,  d3
-        beq             2f
-        vld1.32         {d0},[r1,:64]!
-        vld1.32         {d1},[r1,:64]!
-        vst1.32         {d4},[r0,:64]!
-        vst1.32         {d5},[r0,:64]!
-        b               1b
-2:      vst1.32         {d4},[r0,:64]!
-        vst1.32         {d5},[r0,:64]!
-        bx              lr
-endfunc
-
-function ff_vector_fmul_sv_scalar_4_neon, export=1
-VFP     vdup.32         q10, d0[0]
-NOVFP   vdup.32         q10, r3
-NOVFP   ldr             r3,  [sp]
-        push            {lr}
-        bics            lr,  r3,  #7
-        beq             3f
-        vld1.32         {q0},[r1,:128]!
-        vld1.32         {q2},[r1,:128]!
-1:      ldr             r12, [r2], #4
-        vld1.32         {q1},[r12,:128]
-        ldr             r12, [r2], #4
-        vld1.32         {q3},[r12,:128]
-        vmul.f32        q8,  q0,  q10
-        vmul.f32        q8,  q8,  q1
-        vmul.f32        q9,  q2,  q10
-        vmul.f32        q9,  q9,  q3
-        subs            lr,  lr,  #8
-        beq             2f
-        vld1.32         {q0},[r1,:128]!
-        vld1.32         {q2},[r1,:128]!
-        vst1.32         {q8},[r0,:128]!
-        vst1.32         {q9},[r0,:128]!
-        b               1b
-2:      vst1.32         {q8},[r0,:128]!
-        vst1.32         {q9},[r0,:128]!
-        ands            r3,  r3,  #7
-        popeq           {pc}
-3:      vld1.32         {q0},[r1,:128]!
-        ldr             r12, [r2], #4
-        vld1.32         {q1},[r12,:128]
-        vmul.f32        q0,  q0,  q10
-        vmul.f32        q0,  q0,  q1
-        vst1.32         {q0},[r0,:128]!
-        subs            r3,  r3,  #4
-        bgt             3b
-        pop             {pc}
-endfunc
-
-function ff_sv_fmul_scalar_2_neon, export=1
-VFP     len .req r2
-NOVFP   len .req r3
-VFP     vdup.32         q8,  d0[0]
-NOVFP   vdup.32         q8,  r2
-        ldr             r12, [r1], #4
-        vld1.32         {d0},[r12,:64]
-        ldr             r12, [r1], #4
-        vld1.32         {d1},[r12,:64]
-1:      vmul.f32        q1,  q0,  q8
-        subs            len, len, #4
-        beq             2f
-        ldr             r12, [r1], #4
-        vld1.32         {d0},[r12,:64]
-        ldr             r12, [r1], #4
-        vld1.32         {d1},[r12,:64]
-        vst1.32         {q1},[r0,:128]!
-        b               1b
-2:      vst1.32         {q1},[r0,:128]!
-        bx              lr
-        .unreq          len
-endfunc
-
-function ff_sv_fmul_scalar_4_neon, export=1
-VFP     len .req r2
-NOVFP   len .req r3
-VFP     vdup.32         q8,  d0[0]
-NOVFP   vdup.32         q8,  r2
-1:      ldr             r12, [r1], #4
-        vld1.32         {q0},[r12,:128]
-        vmul.f32        q0,  q0,  q8
-        vst1.32         {q0},[r0,:128]!
-        subs            len, len, #4
-        bgt             1b
-        bx              lr
-        .unreq          len
-endfunc
-
-function ff_butterflies_float_neon, export=1
-1:      vld1.32         {q0},[r0,:128]
-        vld1.32         {q1},[r1,:128]
-        vsub.f32        q2,  q0,  q1
-        vadd.f32        q1,  q0,  q1
-        vst1.32         {q2},[r1,:128]!
-        vst1.32         {q1},[r0,:128]!
-        subs            r2,  r2,  #4
-        bgt             1b
-        bx              lr
-endfunc
-
-function ff_scalarproduct_float_neon, export=1
-        vmov.f32        q2,  #0.0
-1:      vld1.32         {q0},[r0,:128]!
-        vld1.32         {q1},[r1,:128]!
-        vmla.f32        q2,  q0,  q1
-        subs            r2,  r2,  #4
-        bgt             1b
-        vadd.f32        d0,  d4,  d5
-        vpadd.f32       d0,  d0,  d0
-NOVFP   vmov.32         r0,  d0[0]
-        bx              lr
-endfunc
-
-function ff_int32_to_float_fmul_scalar_neon, export=1
-VFP     vdup.32         q0,  d0[0]
-VFP     len     .req    r2
-NOVFP   vdup.32         q0,  r2
-NOVFP   len     .req    r3
-
-        vld1.32         {q1},[r1,:128]!
-        vcvt.f32.s32    q3,  q1
-        vld1.32         {q2},[r1,:128]!
-        vcvt.f32.s32    q8,  q2
-1:      subs            len, len, #8
-        pld             [r1, #16]
-        vmul.f32        q9,  q3,  q0
-        vmul.f32        q10, q8,  q0
-        beq             2f
-        vld1.32         {q1},[r1,:128]!
-        vcvt.f32.s32    q3,  q1
-        vld1.32         {q2},[r1,:128]!
-        vcvt.f32.s32    q8,  q2
-        vst1.32         {q9}, [r0,:128]!
-        vst1.32         {q10},[r0,:128]!
-        b               1b
-2:      vst1.32         {q9}, [r0,:128]!
-        vst1.32         {q10},[r0,:128]!
-        bx              lr
-        .unreq  len
-endfunc
-
-function ff_vector_fmul_reverse_neon, export=1
-        add             r2,  r2,  r3,  lsl #2
-        sub             r2,  r2,  #32
-        mov             r12, #-32
-        vld1.32         {q0-q1},  [r1,:128]!
-        vld1.32         {q2-q3},  [r2,:128], r12
-1:      pld             [r1, #32]
-        vrev64.32       q3,  q3
-        vmul.f32        d16, d0,  d7
-        vmul.f32        d17, d1,  d6
-        pld             [r2, #-32]
-        vrev64.32       q2,  q2
-        vmul.f32        d18, d2,  d5
-        vmul.f32        d19, d3,  d4
-        subs            r3,  r3,  #8
-        beq             2f
-        vld1.32         {q0-q1},  [r1,:128]!
-        vld1.32         {q2-q3},  [r2,:128], r12
-        vst1.32         {q8-q9},  [r0,:128]!
-        b               1b
-2:      vst1.32         {q8-q9},  [r0,:128]!
-        bx              lr
-endfunc
-
-function ff_vector_fmul_add_neon, export=1
-        ldr             r12, [sp]
-        vld1.32         {q0-q1},  [r1,:128]!
-        vld1.32         {q8-q9},  [r2,:128]!
-        vld1.32         {q2-q3},  [r3,:128]!
-        vmul.f32        q10, q0,  q8
-        vmul.f32        q11, q1,  q9
-1:      vadd.f32        q12, q2,  q10
-        vadd.f32        q13, q3,  q11
-        pld             [r1, #16]
-        pld             [r2, #16]
-        pld             [r3, #16]
-        subs            r12, r12, #8
-        beq             2f
-        vld1.32         {q0},     [r1,:128]!
-        vld1.32         {q8},     [r2,:128]!
-        vmul.f32        q10, q0,  q8
-        vld1.32         {q1},     [r1,:128]!
-        vld1.32         {q9},     [r2,:128]!
-        vmul.f32        q11, q1,  q9
-        vld1.32         {q2-q3},  [r3,:128]!
-        vst1.32         {q12-q13},[r0,:128]!
-        b               1b
-2:      vst1.32         {q12-q13},[r0,:128]!
-        bx              lr
-endfunc
-
-function ff_vector_clipf_neon, export=1
-VFP     vdup.32         q1,  d0[1]
-VFP     vdup.32         q0,  d0[0]
-NOVFP   vdup.32         q0,  r2
-NOVFP   vdup.32         q1,  r3
-NOVFP   ldr             r2,  [sp]
-        vld1.f32        {q2},[r1,:128]!
-        vmin.f32        q10, q2,  q1
-        vld1.f32        {q3},[r1,:128]!
-        vmin.f32        q11, q3,  q1
-1:      vmax.f32        q8,  q10, q0
-        vmax.f32        q9,  q11, q0
-        subs            r2,  r2,  #8
-        beq             2f
-        vld1.f32        {q2},[r1,:128]!
-        vmin.f32        q10, q2,  q1
-        vld1.f32        {q3},[r1,:128]!
-        vmin.f32        q11, q3,  q1
-        vst1.f32        {q8},[r0,:128]!
-        vst1.f32        {q9},[r0,:128]!
-        b               1b
-2:      vst1.f32        {q8},[r0,:128]!
-        vst1.f32        {q9},[r0,:128]!
-        bx              lr
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_vfp.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_vfp.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "asm.S"
-
-        .syntax unified
-/*
- * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
- * throughput for almost all the instructions (except for double precision
- * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles
- * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
- * important for performance. One more interesting feature is that VFP has
- * independent load/store and arithmetics pipelines, so it is possible to make
- * them work simultaneously and get more than 1 operation per cycle. Load/store
- * pipeline can process 2 single precision floating point values per cycle and
- * supports bulk loads and stores for large sets of registers. Arithmetic operations
- * can be done on vectors, which allows to keep the arithmetics pipeline busy,
- * while the processor may issue and execute other instructions. Detailed
- * optimization manuals can be found at http://www.arm.com
- */
-
-/**
- * ARM VFP optimized implementation of 'vector_fmul_c' function.
- * Assume that len is a positive number and is multiple of 8
- */
-@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
-function ff_vector_fmul_vfp, export=1
-        vpush           {d8-d15}
-        mov             r3,  r0
-        fmrx            r12, fpscr
-        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
-        fmxr            fpscr, r12
-
-        vldmia          r3!, {s0-s3}
-        vldmia          r1!, {s8-s11}
-        vldmia          r3!, {s4-s7}
-        vldmia          r1!, {s12-s15}
-        vmul.f32        s8,  s0,  s8
-1:
-        subs            r2,  r2,  #16
-        vmul.f32        s12, s4,  s12
-        vldmiage        r3!, {s16-s19}
-        vldmiage        r1!, {s24-s27}
-        vldmiage        r3!, {s20-s23}
-        vldmiage        r1!, {s28-s31}
-        vmulge.f32      s24, s16, s24
-        vstmia          r0!, {s8-s11}
-        vstmia          r0!, {s12-s15}
-        vmulge.f32      s28, s20, s28
-        vldmiagt        r3!, {s0-s3}
-        vldmiagt        r1!, {s8-s11}
-        vldmiagt        r3!, {s4-s7}
-        vldmiagt        r1!, {s12-s15}
-        vmulge.f32      s8,  s0,  s8
-        vstmiage        r0!, {s24-s27}
-        vstmiage        r0!, {s28-s31}
-        bgt             1b
-
-        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
-        fmxr            fpscr, r12
-        vpop            {d8-d15}
-        bx              lr
-endfunc
-
-/**
- * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
- * Assume that len is a positive number and is multiple of 8
- */
-@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
-@                                 const float *src1, int len)
-function ff_vector_fmul_reverse_vfp, export=1
-        vpush           {d8-d15}
-        add             r2,  r2,  r3, lsl #2
-        vldmdb          r2!, {s0-s3}
-        vldmia          r1!, {s8-s11}
-        vldmdb          r2!, {s4-s7}
-        vldmia          r1!, {s12-s15}
-        vmul.f32        s8,  s3,  s8
-        vmul.f32        s9,  s2,  s9
-        vmul.f32        s10, s1,  s10
-        vmul.f32        s11, s0,  s11
-1:
-        subs            r3,  r3,  #16
-        vldmdbge        r2!, {s16-s19}
-        vmul.f32        s12, s7,  s12
-        vldmiage        r1!, {s24-s27}
-        vmul.f32        s13, s6,  s13
-        vldmdbge        r2!, {s20-s23}
-        vmul.f32        s14, s5,  s14
-        vldmiage        r1!, {s28-s31}
-        vmul.f32        s15, s4,  s15
-        vmulge.f32      s24, s19, s24
-        vldmdbgt        r2!, {s0-s3}
-        vmulge.f32      s25, s18, s25
-        vstmia          r0!, {s8-s13}
-        vmulge.f32      s26, s17, s26
-        vldmiagt        r1!, {s8-s11}
-        vmulge.f32      s27, s16, s27
-        vmulge.f32      s28, s23, s28
-        vldmdbgt        r2!, {s4-s7}
-        vmulge.f32      s29, s22, s29
-        vstmia          r0!, {s14-s15}
-        vmulge.f32      s30, s21, s30
-        vmulge.f32      s31, s20, s31
-        vmulge.f32      s8,  s3,  s8
-        vldmiagt        r1!, {s12-s15}
-        vmulge.f32      s9,  s2,  s9
-        vmulge.f32      s10, s1,  s10
-        vstmiage        r0!, {s24-s27}
-        vmulge.f32      s11, s0,  s11
-        vstmiage        r0!, {s28-s31}
-        bgt             1b
-
-        vpop            {d8-d15}
-        bx              lr
-endfunc
-
-#if HAVE_ARMV6
-/**
- * ARM VFP optimized float to int16 conversion.
- * Assume that len is a positive number and is multiple of 8, destination
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for
- * performance), little endian byte sex
- */
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
-function ff_float_to_int16_vfp, export=1
-        push            {r4-r8,lr}
-        vpush           {d8-d11}
-        vldmia          r1!, {s16-s23}
-        vcvt.s32.f32    s0,  s16
-        vcvt.s32.f32    s1,  s17
-        vcvt.s32.f32    s2,  s18
-        vcvt.s32.f32    s3,  s19
-        vcvt.s32.f32    s4,  s20
-        vcvt.s32.f32    s5,  s21
-        vcvt.s32.f32    s6,  s22
-        vcvt.s32.f32    s7,  s23
-1:
-        subs            r2,  r2,  #8
-        vmov            r3,  r4,  s0, s1
-        vmov            r5,  r6,  s2, s3
-        vmov            r7,  r8,  s4, s5
-        vmov            ip,  lr,  s6, s7
-        vldmiagt        r1!, {s16-s23}
-        ssat            r4,  #16, r4
-        ssat            r3,  #16, r3
-        ssat            r6,  #16, r6
-        ssat            r5,  #16, r5
-        pkhbt           r3,  r3,  r4, lsl #16
-        pkhbt           r4,  r5,  r6, lsl #16
-        vcvtgt.s32.f32  s0,  s16
-        vcvtgt.s32.f32  s1,  s17
-        vcvtgt.s32.f32  s2,  s18
-        vcvtgt.s32.f32  s3,  s19
-        vcvtgt.s32.f32  s4,  s20
-        vcvtgt.s32.f32  s5,  s21
-        vcvtgt.s32.f32  s6,  s22
-        vcvtgt.s32.f32  s7,  s23
-        ssat            r8,  #16, r8
-        ssat            r7,  #16, r7
-        ssat            lr,  #16, lr
-        ssat            ip,  #16, ip
-        pkhbt           r5,  r7,  r8, lsl #16
-        pkhbt           r6,  ip,  lr, lsl #16
-        stmia           r0!, {r3-r6}
-        bgt             1b
-
-        vpop            {d8-d11}
-        pop             {r4-r8,pc}
-endfunc
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/fft_init_arm.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/fft_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/fft.h"
-#include "libavcodec/synth_filter.h"
-
-void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
-
-void ff_synth_filter_float_neon(FFTContext *imdct,
-                                float *synth_buf_ptr, int *synth_buf_offset,
-                                float synth_buf2[32], const float window[512],
-                                float out[32], const float in[32],
-                                float scale, float bias);
-
-av_cold void ff_fft_init_arm(FFTContext *s)
-{
-    if (HAVE_NEON) {
-        s->fft_permute  = ff_fft_permute_neon;
-        s->fft_calc     = ff_fft_calc_neon;
-        s->imdct_calc   = ff_imdct_calc_neon;
-        s->imdct_half   = ff_imdct_half_neon;
-        s->mdct_calc    = ff_mdct_calc_neon;
-        s->permutation  = FF_MDCT_PERM_INTERLEAVE;
-    }
-}
-
-#if CONFIG_RDFT
-av_cold void ff_rdft_init_arm(RDFTContext *s)
-{
-    if (HAVE_NEON)
-        s->rdft_calc    = ff_rdft_calc_neon;
-}
-#endif
-
-#if CONFIG_DCA_DECODER
-av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
-{
-    if (HAVE_NEON)
-        s->synth_filter_float = ff_synth_filter_float_neon;
-}
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/fft_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/fft_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,371 +0,0 @@
-/*
- * ARM NEON optimised FFT
- *
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- * Copyright (c) 2009 Naotoshi Nojiri
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-#define M_SQRT1_2 0.70710678118654752440
-
-        .text
-
-function fft4_neon
-        vld1.32         {d0-d3}, [r0,:128]
-
-        vext.32         q8,  q1,  q1,  #1       @ i2,r3 d3=i3,r2
-        vsub.f32        d6,  d0,  d1            @ r0-r1,i0-i1
-        vsub.f32        d7,  d16, d17           @ r3-r2,i2-i3
-        vadd.f32        d4,  d0,  d1            @ r0+r1,i0+i1
-        vadd.f32        d5,  d2,  d3            @ i2+i3,r2+r3
-        vadd.f32        d1,  d6,  d7
-        vsub.f32        d3,  d6,  d7
-        vadd.f32        d0,  d4,  d5
-        vsub.f32        d2,  d4,  d5
-
-        vst1.32         {d0-d3}, [r0,:128]
-
-        bx              lr
-endfunc
-
-function fft8_neon
-        mov             r1,  r0
-        vld1.32         {d0-d3},   [r1,:128]!
-        vld1.32         {d16-d19}, [r1,:128]
-
-        movw            r2,  #0x04f3            @ sqrt(1/2)
-        movt            r2,  #0x3f35
-        eor             r3,  r2,  #1<<31
-        vdup.32         d31, r2
-
-        vext.32         q11, q1,  q1,  #1       @ i2,r3,i3,r2
-        vadd.f32        d4,  d16, d17           @ r4+r5,i4+i5
-        vmov            d28, r3,  r2
-        vadd.f32        d5,  d18, d19           @ r6+r7,i6+i7
-        vsub.f32        d17, d16, d17           @ r4-r5,i4-i5
-        vsub.f32        d19, d18, d19           @ r6-r7,i6-i7
-        vrev64.32       d29, d28
-        vadd.f32        d20, d0,  d1            @ r0+r1,i0+i1
-        vadd.f32        d21, d2,  d3            @ r2+r3,i2+i3
-        vmul.f32        d26, d17, d28           @ -a2r*w,a2i*w
-        vext.32         q3,  q2,  q2,  #1
-        vmul.f32        d27, d19, d29           @ a3r*w,-a3i*w
-        vsub.f32        d23, d22, d23           @ i2-i3,r3-r2
-        vsub.f32        d22, d0,  d1            @ r0-r1,i0-i1
-        vmul.f32        d24, d17, d31           @ a2r*w,a2i*w
-        vmul.f32        d25, d19, d31           @ a3r*w,a3i*w
-        vadd.f32        d0,  d20, d21
-        vsub.f32        d2,  d20, d21
-        vadd.f32        d1,  d22, d23
-        vrev64.32       q13, q13
-        vsub.f32        d3,  d22, d23
-        vsub.f32        d6,  d6,  d7
-        vadd.f32        d24, d24, d26           @ a2r+a2i,a2i-a2r   t1,t2
-        vadd.f32        d25, d25, d27           @ a3r-a3i,a3i+a3r   t5,t6
-        vadd.f32        d7,  d4,  d5
-        vsub.f32        d18, d2,  d6
-        vext.32         q13, q12, q12, #1
-        vadd.f32        d2,  d2,  d6
-        vsub.f32        d16, d0,  d7
-        vadd.f32        d5,  d25, d24
-        vsub.f32        d4,  d26, d27
-        vadd.f32        d0,  d0,  d7
-        vsub.f32        d17, d1,  d5
-        vsub.f32        d19, d3,  d4
-        vadd.f32        d3,  d3,  d4
-        vadd.f32        d1,  d1,  d5
-
-        vst1.32         {d16-d19}, [r1,:128]
-        vst1.32         {d0-d3},   [r0,:128]
-
-        bx              lr
-endfunc
-
-function fft16_neon
-        movrel          r1, mppm
-        vld1.32         {d16-d19}, [r0,:128]!   @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
-        pld             [r0, #32]
-        vld1.32         {d2-d3}, [r1,:128]
-        vext.32         q13, q9,  q9,  #1
-        vld1.32         {d22-d25}, [r0,:128]!   @ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7}
-        vadd.f32        d4,  d16, d17
-        vsub.f32        d5,  d16, d17
-        vadd.f32        d18, d18, d19
-        vsub.f32        d19, d26, d27
-
-        vadd.f32        d20, d22, d23
-        vsub.f32        d22, d22, d23
-        vsub.f32        d23, d24, d25
-        vadd.f32        q8,  q2,  q9            @ {r0,i0,r1,i1}
-        vadd.f32        d21, d24, d25
-        vmul.f32        d24, d22, d2
-        vsub.f32        q9,  q2,  q9            @ {r2,i2,r3,i3}
-        vmul.f32        d25, d23, d3
-        vuzp.32         d16, d17                @ {r0,r1,i0,i1}
-        vmul.f32        q1,  q11, d2[1]
-        vuzp.32         d18, d19                @ {r2,r3,i2,i3}
-        vrev64.32       q12, q12
-        vadd.f32        q11, q12, q1            @ {t1a,t2a,t5,t6}
-        vld1.32         {d24-d27}, [r0,:128]!   @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
-        vzip.32         q10, q11
-        vld1.32         {d28-d31}, [r0,:128]    @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
-        vadd.f32        d0,  d22, d20
-        vadd.f32        d1,  d21, d23
-        vsub.f32        d2,  d21, d23
-        vsub.f32        d3,  d22, d20
-        sub             r0,  r0,  #96
-        vext.32         q13, q13, q13, #1
-        vsub.f32        q10, q8,  q0            @ {r4,r5,i4,i5}
-        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
-        vext.32         q15, q15, q15, #1
-        vsub.f32        q11, q9,  q1            @ {r6,r7,i6,i7}
-        vswp            d25, d26                @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
-        vadd.f32        q9,  q9,  q1            @ {r2,r3,i2,i3}
-        vswp            d29, d30                @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
-        vadd.f32        q0,  q12, q13           @ {t1,t2,t5,t6}
-        vadd.f32        q1,  q14, q15           @ {t1a,t2a,t5a,t6a}
-        movrel          r2,  X(ff_cos_16)
-        vsub.f32        q13, q12, q13           @ {t3,t4,t7,t8}
-        vrev64.32       d1,  d1
-        vsub.f32        q15, q14, q15           @ {t3a,t4a,t7a,t8a}
-        vrev64.32       d3,  d3
-        movrel          r3,  pmmp
-        vswp            d1,  d26                @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
-        vswp            d3,  d30                @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
-        vadd.f32        q12, q0,  q13           @ {r8,i8,r9,i9}
-        vadd.f32        q14, q1,  q15           @ {r12,i12,r13,i13}
-        vld1.32         {d4-d5},  [r2,:64]
-        vsub.f32        q13, q0,  q13           @ {r10,i10,r11,i11}
-        vsub.f32        q15, q1,  q15           @ {r14,i14,r15,i15}
-        vswp            d25, d28                @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
-        vld1.32         {d6-d7},  [r3,:128]
-        vrev64.32       q1,  q14
-        vmul.f32        q14, q14, d4[1]
-        vmul.f32        q1,  q1,  q3
-        vmla.f32        q14, q1,  d5[1]         @ {t1a,t2a,t5a,t6a}
-        vswp            d27, d30                @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
-        vzip.32         q12, q14
-        vadd.f32        d0,  d28, d24
-        vadd.f32        d1,  d25, d29
-        vsub.f32        d2,  d25, d29
-        vsub.f32        d3,  d28, d24
-        vsub.f32        q12, q8,  q0            @ {r8,r9,i8,i9}
-        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
-        vsub.f32        q14, q10, q1            @ {r12,r13,i12,i13}
-        mov             r1,  #32
-        vadd.f32        q10, q10, q1            @ {r4,r5,i4,i5}
-        vrev64.32       q0,  q13
-        vmul.f32        q13, q13, d5[0]
-        vrev64.32       q1,  q15
-        vmul.f32        q15, q15, d5[1]
-        vst2.32         {d16-d17},[r0,:128], r1
-        vmul.f32        q0,  q0,  q3
-        vst2.32         {d20-d21},[r0,:128], r1
-        vmul.f32        q1,  q1,  q3
-        vmla.f32        q13, q0,  d5[0]         @ {t1,t2,t5,t6}
-        vmla.f32        q15, q1,  d4[1]         @ {t1a,t2a,t5a,t6a}
-        vst2.32         {d24-d25},[r0,:128], r1
-        vst2.32         {d28-d29},[r0,:128]
-        vzip.32         q13, q15
-        sub             r0, r0, #80
-        vadd.f32        d0,  d30, d26
-        vadd.f32        d1,  d27, d31
-        vsub.f32        d2,  d27, d31
-        vsub.f32        d3,  d30, d26
-        vsub.f32        q13, q9,  q0            @ {r10,r11,i10,i11}
-        vadd.f32        q9,  q9,  q0            @ {r2,r3,i2,i3}
-        vsub.f32        q15, q11, q1            @ {r14,r15,i14,i15}
-        vadd.f32        q11, q11, q1            @ {r6,r7,i6,i7}
-        vst2.32         {d18-d19},[r0,:128], r1
-        vst2.32         {d22-d23},[r0,:128], r1
-        vst2.32         {d26-d27},[r0,:128], r1
-        vst2.32         {d30-d31},[r0,:128]
-        bx              lr
-endfunc
-
-function fft_pass_neon
-        push            {r4-r6,lr}
-        mov             r6,  r2                 @ n
-        lsl             r5,  r2,  #3            @ 2 * n * sizeof FFTSample
-        lsl             r4,  r2,  #4            @ 2 * n * sizeof FFTComplex
-        lsl             r2,  r2,  #5            @ 4 * n * sizeof FFTComplex
-        add             r3,  r2,  r4
-        add             r4,  r4,  r0            @ &z[o1]
-        add             r2,  r2,  r0            @ &z[o2]
-        add             r3,  r3,  r0            @ &z[o3]
-        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
-        movrel          r12, pmmp
-        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
-        add             r5,  r5,  r1            @ wim
-        vld1.32         {d6-d7},  [r12,:128]    @ pmmp
-        vswp            d21, d22
-        vld1.32         {d4},     [r1,:64]!     @ {wre[0],wre[1]}
-        sub             r5,  r5,  #4            @ wim--
-        vrev64.32       q1,  q11
-        vmul.f32        q11, q11, d4[1]
-        vmul.f32        q1,  q1,  q3
-        vld1.32         {d5[0]},  [r5,:32]      @ d5[0] = wim[-1]
-        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
-        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
-        sub             r6, r6, #1              @ n--
-        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
-        vzip.32         q10, q11
-        vadd.f32        d0,  d22, d20
-        vadd.f32        d1,  d21, d23
-        vsub.f32        d2,  d21, d23
-        vsub.f32        d3,  d22, d20
-        vsub.f32        q10, q8,  q0
-        vadd.f32        q8,  q8,  q0
-        vsub.f32        q11, q9,  q1
-        vadd.f32        q9,  q9,  q1
-        vst2.32         {d20-d21},[r2,:128]!    @ {z[o2],z[o2+1]}
-        vst2.32         {d16-d17},[r0,:128]!    @ {z[0],z[1]}
-        vst2.32         {d22-d23},[r3,:128]!    @ {z[o3],z[o3+1]}
-        vst2.32         {d18-d19},[r4,:128]!    @ {z[o1],z[o1+1]}
-        sub             r5,  r5,  #8            @ wim -= 2
-1:
-        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
-        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
-        vswp            d21, d22
-        vld1.32         {d4}, [r1]!             @ {wre[0],wre[1]}
-        vrev64.32       q0,  q10
-        vmul.f32        q10, q10, d4[0]
-        vrev64.32       q1,  q11
-        vmul.f32        q11, q11, d4[1]
-        vld1.32         {d5}, [r5]              @ {wim[-1],wim[0]}
-        vmul.f32        q0,  q0,  q3
-        sub             r5,  r5,  #8            @ wim -= 2
-        vmul.f32        q1,  q1,  q3
-        vmla.f32        q10, q0,  d5[1]         @ {t1,t2,t5,t6}
-        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
-        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
-        subs            r6,  r6,  #1            @ n--
-        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
-        vzip.32         q10, q11
-        vadd.f32        d0,  d22, d20
-        vadd.f32        d1,  d21, d23
-        vsub.f32        d2,  d21, d23
-        vsub.f32        d3,  d22, d20
-        vsub.f32        q10, q8,  q0
-        vadd.f32        q8,  q8,  q0
-        vsub.f32        q11, q9,  q1
-        vadd.f32        q9,  q9,  q1
-        vst2.32         {d20-d21}, [r2,:128]!   @ {z[o2],z[o2+1]}
-        vst2.32         {d16-d17}, [r0,:128]!   @ {z[0],z[1]}
-        vst2.32         {d22-d23}, [r3,:128]!   @ {z[o3],z[o3+1]}
-        vst2.32         {d18-d19}, [r4,:128]!   @ {z[o1],z[o1+1]}
-        bne             1b
-
-        pop             {r4-r6,pc}
-endfunc
-
-.macro  def_fft n, n2, n4
-        .align 6
-function fft\n\()_neon
-        push            {r4, lr}
-        mov             r4,  r0
-        bl              fft\n2\()_neon
-        add             r0,  r4,  #\n4*2*8
-        bl              fft\n4\()_neon
-        add             r0,  r4,  #\n4*3*8
-        bl              fft\n4\()_neon
-        mov             r0,  r4
-        pop             {r4, lr}
-        movrel          r1,  X(ff_cos_\n)
-        mov             r2,  #\n4/2
-        b               fft_pass_neon
-endfunc
-.endm
-
-        def_fft    32,    16,     8
-        def_fft    64,    32,    16
-        def_fft   128,    64,    32
-        def_fft   256,   128,    64
-        def_fft   512,   256,   128
-        def_fft  1024,   512,   256
-        def_fft  2048,  1024,   512
-        def_fft  4096,  2048,  1024
-        def_fft  8192,  4096,  2048
-        def_fft 16384,  8192,  4096
-        def_fft 32768, 16384,  8192
-        def_fft 65536, 32768, 16384
-
-function ff_fft_calc_neon, export=1
-        ldr             r2,  [r0]
-        sub             r2,  r2,  #2
-        movrel          r3,  fft_tab_neon
-        ldr             r3,  [r3, r2, lsl #2]
-        mov             r0,  r1
-        bx              r3
-endfunc
-
-function ff_fft_permute_neon, export=1
-        push            {r4,lr}
-        mov             r12, #1
-        ldr             r2,  [r0]       @ nbits
-        ldr             r3,  [r0, #20]  @ tmp_buf
-        ldr             r0,  [r0, #8]   @ revtab
-        lsl             r12, r12, r2
-        mov             r2,  r12
-1:
-        vld1.32         {d0-d1}, [r1,:128]!
-        ldr             r4,  [r0], #4
-        uxth            lr,  r4
-        uxth            r4,  r4,  ror #16
-        add             lr,  r3,  lr,  lsl #3
-        add             r4,  r3,  r4,  lsl #3
-        vst1.32         {d0}, [lr,:64]
-        vst1.32         {d1}, [r4,:64]
-        subs            r12, r12, #2
-        bgt             1b
-
-        sub             r1,  r1,  r2,  lsl #3
-1:
-        vld1.32         {d0-d3}, [r3,:128]!
-        vst1.32         {d0-d3}, [r1,:128]!
-        subs            r2,  r2,  #4
-        bgt             1b
-
-        pop             {r4,pc}
-endfunc
-
-        .section .rodata
-        .align 4
-fft_tab_neon:
-        .word fft4_neon
-        .word fft8_neon
-        .word fft16_neon
-        .word fft32_neon
-        .word fft64_neon
-        .word fft128_neon
-        .word fft256_neon
-        .word fft512_neon
-        .word fft1024_neon
-        .word fft2048_neon
-        .word fft4096_neon
-        .word fft8192_neon
-        .word fft16384_neon
-        .word fft32768_neon
-        .word fft65536_neon
-        .size fft_tab_neon, . - fft_tab_neon
-
-        .align 4
-pmmp:   .float  +1.0, -1.0, -1.0, +1.0
-mppm:   .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_init_arm.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavcodec/dsputil.h"
-#include "libavcodec/h264dsp.h"
-
-void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
-                                     int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
-                                     int beta, int8_t *tc0);
-void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
-                                       int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
-                                       int beta, int8_t *tc0);
-
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
-                                      int weight, int offset);
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                        int log2_den, int weightd, int weights,
-                                        int offset);
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-
-void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
-                             DCTELEM *block, int stride,
-                             const uint8_t nnzc[6*8]);
-void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
-                                  DCTELEM *block, int stride,
-                                  const uint8_t nnzc[6*8]);
-void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
-                            DCTELEM *block, int stride,
-                            const uint8_t nnzc[6*8]);
-
-#if HAVE_NEON
-static void ff_h264dsp_init_neon(H264DSPContext *c)
-{
-    c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
-    c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
-    c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
-    c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
-
-    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
-    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
-    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
-    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
-    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
-    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
-    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
-    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
-
-    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
-    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
-    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
-    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
-    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
-    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
-    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
-    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
-
-    c->h264_idct_add        = ff_h264_idct_add_neon;
-    c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
-    c->h264_idct_add16      = ff_h264_idct_add16_neon;
-    c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
-    c->h264_idct_add8       = ff_h264_idct_add8_neon;
-}
-#endif
-
-void ff_h264dsp_init_arm(H264DSPContext *c)
-{
-    if (HAVE_NEON) ff_h264dsp_init_neon(c);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1883 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
-        vtrn.32         \r0, \r4
-        vtrn.32         \r1, \r5
-        vtrn.32         \r2, \r6
-        vtrn.32         \r3, \r7
-        vtrn.16         \r0, \r2
-        vtrn.16         \r1, \r3
-        vtrn.16         \r4, \r6
-        vtrn.16         \r5, \r7
-        vtrn.8          \r0, \r1
-        vtrn.8          \r2, \r3
-        vtrn.8          \r4, \r5
-        vtrn.8          \r6, \r7
-        .endm
-
-        .macro transpose_4x4 r0 r1 r2 r3
-        vtrn.16         \r0, \r2
-        vtrn.16         \r1, \r3
-        vtrn.8          \r0, \r1
-        vtrn.8          \r2, \r3
-        .endm
-
-        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
-        vswp            \r0, \r4
-        vswp            \r1, \r5
-        vswp            \r2, \r6
-        vswp            \r3, \r7
-        .endm
-
-        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
-        vtrn.32         \r0, \r2
-        vtrn.32         \r1, \r3
-        vtrn.32         \r4, \r6
-        vtrn.32         \r5, \r7
-        vtrn.16         \r0, \r1
-        vtrn.16         \r2, \r3
-        vtrn.16         \r4, \r5
-        vtrn.16         \r6, \r7
-        .endm
-
-/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
-        .macro  h264_chroma_mc8 type
-function ff_\type\()_h264_chroma_mc8_neon, export=1
-        push            {r4-r7, lr}
-        ldrd            r4,  [sp, #20]
-.ifc \type,avg
-        mov             lr,  r0
-.endif
-        pld             [r1]
-        pld             [r1, r2]
-
-        muls            r7,  r4,  r5
-        rsb             r6,  r7,  r5,  lsl #3
-        rsb             ip,  r7,  r4,  lsl #3
-        sub             r4,  r7,  r4,  lsl #3
-        sub             r4,  r4,  r5,  lsl #3
-        add             r4,  r4,  #64
-
-        beq             2f
-
-        add             r5,  r1,  r2
-
-        vdup.8          d0,  r4
-        lsl             r4,  r2,  #1
-        vdup.8          d1,  ip
-        vld1.64         {d4, d5}, [r1], r4
-        vdup.8          d2,  r6
-        vld1.64         {d6, d7}, [r5], r4
-        vdup.8          d3,  r7
-
-        vext.8          d5,  d4,  d5,  #1
-        vext.8          d7,  d6,  d7,  #1
-
-1:      pld             [r5]
-        vmull.u8        q8,  d4,  d0
-        vmlal.u8        q8,  d5,  d1
-        vld1.64         {d4, d5}, [r1], r4
-        vmlal.u8        q8,  d6,  d2
-        vext.8          d5,  d4,  d5,  #1
-        vmlal.u8        q8,  d7,  d3
-        vmull.u8        q9,  d6,  d0
-        subs            r3,  r3,  #2
-        vmlal.u8        q9,  d7,  d1
-        vmlal.u8        q9,  d4,  d2
-        vmlal.u8        q9,  d5,  d3
-        vrshrn.u16      d16, q8,  #6
-        vld1.64         {d6, d7}, [r5], r4
-        pld             [r1]
-        vrshrn.u16      d17, q9,  #6
-.ifc \type,avg
-        vld1.64         {d20}, [lr,:64], r2
-        vld1.64         {d21}, [lr,:64], r2
-        vrhadd.u8       q8,  q8,  q10
-.endif
-        vext.8          d7,  d6,  d7,  #1
-        vst1.64         {d16}, [r0,:64], r2
-        vst1.64         {d17}, [r0,:64], r2
-        bgt             1b
-
-        pop             {r4-r7, pc}
-
-2:      tst             r6,  r6
-        add             ip,  ip,  r6
-        vdup.8          d0,  r4
-        vdup.8          d1,  ip
-
-        beq             4f
-
-        add             r5,  r1,  r2
-        lsl             r4,  r2,  #1
-        vld1.64         {d4}, [r1], r4
-        vld1.64         {d6}, [r5], r4
-
-3:      pld             [r5]
-        vmull.u8        q8,  d4,  d0
-        vmlal.u8        q8,  d6,  d1
-        vld1.64         {d4}, [r1], r4
-        vmull.u8        q9,  d6,  d0
-        vmlal.u8        q9,  d4,  d1
-        vld1.64         {d6}, [r5], r4
-        vrshrn.u16      d16, q8,  #6
-        vrshrn.u16      d17, q9,  #6
-.ifc \type,avg
-        vld1.64         {d20}, [lr,:64], r2
-        vld1.64         {d21}, [lr,:64], r2
-        vrhadd.u8       q8,  q8,  q10
-.endif
-        subs            r3,  r3,  #2
-        pld             [r1]
-        vst1.64         {d16}, [r0,:64], r2
-        vst1.64         {d17}, [r0,:64], r2
-        bgt             3b
-
-        pop             {r4-r7, pc}
-
-4:      vld1.64         {d4, d5}, [r1], r2
-        vld1.64         {d6, d7}, [r1], r2
-        vext.8          d5,  d4,  d5,  #1
-        vext.8          d7,  d6,  d7,  #1
-
-5:      pld             [r1]
-        subs            r3,  r3,  #2
-        vmull.u8        q8,  d4,  d0
-        vmlal.u8        q8,  d5,  d1
-        vld1.64         {d4, d5}, [r1], r2
-        vmull.u8        q9,  d6,  d0
-        vmlal.u8        q9,  d7,  d1
-        pld             [r1]
-        vext.8          d5,  d4,  d5,  #1
-        vrshrn.u16      d16, q8,  #6
-        vrshrn.u16      d17, q9,  #6
-.ifc \type,avg
-        vld1.64         {d20}, [lr,:64], r2
-        vld1.64         {d21}, [lr,:64], r2
-        vrhadd.u8       q8,  q8,  q10
-.endif
-        vld1.64         {d6, d7}, [r1], r2
-        vext.8          d7,  d6,  d7,  #1
-        vst1.64         {d16}, [r0,:64], r2
-        vst1.64         {d17}, [r0,:64], r2
-        bgt             5b
-
-        pop             {r4-r7, pc}
-endfunc
-        .endm
-
-/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
-        .macro  h264_chroma_mc4 type
-function ff_\type\()_h264_chroma_mc4_neon, export=1
-        push            {r4-r7, lr}
-        ldrd            r4,  [sp, #20]
-.ifc \type,avg
-        mov             lr,  r0
-.endif
-        pld             [r1]
-        pld             [r1, r2]
-
-        muls            r7,  r4,  r5
-        rsb             r6,  r7,  r5,  lsl #3
-        rsb             ip,  r7,  r4,  lsl #3
-        sub             r4,  r7,  r4,  lsl #3
-        sub             r4,  r4,  r5,  lsl #3
-        add             r4,  r4,  #64
-
-        beq             2f
-
-        add             r5,  r1,  r2
-
-        vdup.8          d0,  r4
-        lsl             r4,  r2,  #1
-        vdup.8          d1,  ip
-        vld1.64         {d4},     [r1], r4
-        vdup.8          d2,  r6
-        vld1.64         {d6},     [r5], r4
-        vdup.8          d3,  r7
-
-        vext.8          d5,  d4,  d5,  #1
-        vext.8          d7,  d6,  d7,  #1
-        vtrn.32         d4,  d5
-        vtrn.32         d6,  d7
-
-        vtrn.32         d0,  d1
-        vtrn.32         d2,  d3
-
-1:      pld             [r5]
-        vmull.u8        q8,  d4,  d0
-        vmlal.u8        q8,  d6,  d2
-        vld1.64         {d4},     [r1], r4
-        vext.8          d5,  d4,  d5,  #1
-        vtrn.32         d4,  d5
-        vmull.u8        q9,  d6,  d0
-        vmlal.u8        q9,  d4,  d2
-        vld1.64         {d6},     [r5], r4
-        vadd.i16        d16, d16, d17
-        vadd.i16        d17, d18, d19
-        vrshrn.u16      d16, q8,  #6
-        subs            r3,  r3,  #2
-        pld             [r1]
-.ifc \type,avg
-        vld1.32         {d20[0]}, [lr,:32], r2
-        vld1.32         {d20[1]}, [lr,:32], r2
-        vrhadd.u8       d16, d16, d20
-.endif
-        vext.8          d7,  d6,  d7,  #1
-        vtrn.32         d6,  d7
-        vst1.32         {d16[0]}, [r0,:32], r2
-        vst1.32         {d16[1]}, [r0,:32], r2
-        bgt             1b
-
-        pop             {r4-r7, pc}
-
-2:      tst             r6,  r6
-        add             ip,  ip,  r6
-        vdup.8          d0,  r4
-        vdup.8          d1,  ip
-        vtrn.32         d0,  d1
-
-        beq             4f
-
-        vext.32         d1,  d0,  d1,  #1
-        add             r5,  r1,  r2
-        lsl             r4,  r2,  #1
-        vld1.32         {d4[0]},  [r1], r4
-        vld1.32         {d4[1]},  [r5], r4
-
-3:      pld             [r5]
-        vmull.u8        q8,  d4,  d0
-        vld1.32         {d4[0]},  [r1], r4
-        vmull.u8        q9,  d4,  d1
-        vld1.32         {d4[1]},  [r5], r4
-        vadd.i16        d16, d16, d17
-        vadd.i16        d17, d18, d19
-        vrshrn.u16      d16, q8,  #6
-.ifc \type,avg
-        vld1.32         {d20[0]}, [lr,:32], r2
-        vld1.32         {d20[1]}, [lr,:32], r2
-        vrhadd.u8       d16, d16, d20
-.endif
-        subs            r3,  r3,  #2
-        pld             [r1]
-        vst1.32         {d16[0]}, [r0,:32], r2
-        vst1.32         {d16[1]}, [r0,:32], r2
-        bgt             3b
-
-        pop             {r4-r7, pc}
-
-4:      vld1.64         {d4},     [r1], r2
-        vld1.64         {d6},     [r1], r2
-        vext.8          d5,  d4,  d5,  #1
-        vext.8          d7,  d6,  d7,  #1
-        vtrn.32         d4,  d5
-        vtrn.32         d6,  d7
-
-5:      vmull.u8        q8,  d4,  d0
-        vmull.u8        q9,  d6,  d0
-        subs            r3,  r3,  #2
-        vld1.64         {d4},     [r1], r2
-        vext.8          d5,  d4,  d5,  #1
-        vtrn.32         d4,  d5
-        vadd.i16        d16, d16, d17
-        vadd.i16        d17, d18, d19
-        pld             [r1]
-        vrshrn.u16      d16, q8,  #6
-.ifc \type,avg
-        vld1.32         {d20[0]}, [lr,:32], r2
-        vld1.32         {d20[1]}, [lr,:32], r2
-        vrhadd.u8       d16, d16, d20
-.endif
-        vld1.64         {d6},     [r1], r2
-        vext.8          d7,  d6,  d7,  #1
-        vtrn.32         d6,  d7
-        pld             [r1]
-        vst1.32         {d16[0]}, [r0,:32], r2
-        vst1.32         {d16[1]}, [r0,:32], r2
-        bgt             5b
-
-        pop             {r4-r7, pc}
-endfunc
-        .endm
-
-        .macro  h264_chroma_mc2 type
-function ff_\type\()_h264_chroma_mc2_neon, export=1
-        push            {r4-r6, lr}
-        ldr             r4,  [sp, #16]
-        ldr             lr,  [sp, #20]
-        pld             [r1]
-        pld             [r1, r2]
-        orrs            r5,  r4,  lr
-        beq             2f
-
-        mul             r5,  r4,  lr
-        rsb             r6,  r5,  lr,  lsl #3
-        rsb             r12, r5,  r4,  lsl #3
-        sub             r4,  r5,  r4,  lsl #3
-        sub             r4,  r4,  lr,  lsl #3
-        add             r4,  r4,  #64
-        vdup.8          d0,  r4
-        vdup.8          d2,  r12
-        vdup.8          d1,  r6
-        vdup.8          d3,  r5
-        vtrn.16         q0,  q1
-1:
-        vld1.32         {d4[0]},  [r1], r2
-        vld1.32         {d4[1]},  [r1], r2
-        vrev64.32       d5,  d4
-        vld1.32         {d5[1]},  [r1]
-        vext.8          q3,  q2,  q2,  #1
-        vtrn.16         q2,  q3
-        vmull.u8        q8,  d4,  d0
-        vmlal.u8        q8,  d5,  d1
-.ifc \type,avg
-        vld1.16         {d18[0]}, [r0,:16], r2
-        vld1.16         {d18[1]}, [r0,:16]
-        sub             r0,  r0,  r2
-.endif
-        vtrn.32         d16, d17
-        vadd.i16        d16, d16, d17
-        vrshrn.u16      d16, q8,  #6
-.ifc \type,avg
-        vrhadd.u8       d16, d16, d18
-.endif
-        vst1.16         {d16[0]}, [r0,:16], r2
-        vst1.16         {d16[1]}, [r0,:16], r2
-        subs            r3,  r3,  #2
-        bgt             1b
-        pop             {r4-r6, pc}
-2:
-.ifc \type,put
-        ldrh            r5,  [r1], r2
-        strh            r5,  [r0], r2
-        ldrh            r6,  [r1], r2
-        strh            r6,  [r0], r2
-.else
-        vld1.16         {d16[0]}, [r1], r2
-        vld1.16         {d16[1]}, [r1], r2
-        vld1.16         {d18[0]}, [r0,:16], r2
-        vld1.16         {d18[1]}, [r0,:16]
-        sub             r0,  r0,  r2
-        vrhadd.u8       d16, d16, d18
-        vst1.16         {d16[0]}, [r0,:16], r2
-        vst1.16         {d16[1]}, [r0,:16], r2
-.endif
-        subs            r3,  r3,  #2
-        bgt             2b
-        pop             {r4-r6, pc}
-endfunc
-.endm
-
-        .text
-        .align
-
-        h264_chroma_mc8 put
-        h264_chroma_mc8 avg
-        h264_chroma_mc4 put
-        h264_chroma_mc4 avg
-        h264_chroma_mc2 put
-        h264_chroma_mc2 avg
-
-        /* H.264 loop filter */
-
-        .macro h264_loop_filter_start
-        ldr             ip,  [sp]
-        tst             r2,  r2
-        ldr             ip,  [ip]
-        tstne           r3,  r3
-        vmov.32         d24[0], ip
-        and             ip,  ip,  ip, lsl #16
-        bxeq            lr
-        ands            ip,  ip,  ip, lsl #8
-        bxlt            lr
-        .endm
-
-        .macro align_push_regs
-        and             ip,  sp,  #15
-        add             ip,  ip,  #32
-        sub             sp,  sp,  ip
-        vst1.64         {d12-d15}, [sp,:128]
-        sub             sp,  sp,  #32
-        vst1.64         {d8-d11},  [sp,:128]
-        .endm
-
-        .macro align_pop_regs
-        vld1.64         {d8-d11},  [sp,:128]!
-        vld1.64         {d12-d15}, [sp,:128], ip
-        .endm
-
-        .macro h264_loop_filter_luma
-        vdup.8          q11, r2         @ alpha
-        vmovl.u8        q12, d24
-        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
-        vmovl.u16       q12, d24
-        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
-        vsli.16         q12, q12, #8
-        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
-        vsli.32         q12, q12, #16
-        vclt.u8         q6,  q6,  q11   @ < alpha
-        vdup.8          q11, r3         @ beta
-        vclt.s8         q7,  q12, #0
-        vclt.u8         q14, q14, q11   @ < beta
-        vclt.u8         q15, q15, q11   @ < beta
-        vbic            q6,  q6,  q7
-        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
-        vand            q6,  q6,  q14
-        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
-        vclt.u8         q4,  q4,  q11   @ < beta
-        vand            q6,  q6,  q15
-        vclt.u8         q5,  q5,  q11   @ < beta
-        vand            q4,  q4,  q6
-        vand            q5,  q5,  q6
-        vand            q12, q12, q6
-        vrhadd.u8       q14, q8,  q0
-        vsub.i8         q6,  q12, q4
-        vqadd.u8        q7,  q9,  q12
-        vhadd.u8        q10, q10, q14
-        vsub.i8         q6,  q6,  q5
-        vhadd.u8        q14, q2,  q14
-        vmin.u8         q7,  q7,  q10
-        vqsub.u8        q11, q9,  q12
-        vqadd.u8        q2,  q1,  q12
-        vmax.u8         q7,  q7,  q11
-        vqsub.u8        q11, q1,  q12
-        vmin.u8         q14, q2,  q14
-        vmovl.u8        q2,  d0
-        vmax.u8         q14, q14, q11
-        vmovl.u8        q10, d1
-        vsubw.u8        q2,  q2,  d16
-        vsubw.u8        q10, q10, d17
-        vshl.i16        q2,  q2,  #2
-        vshl.i16        q10, q10, #2
-        vaddw.u8        q2,  q2,  d18
-        vaddw.u8        q10, q10, d19
-        vsubw.u8        q2,  q2,  d2
-        vsubw.u8        q10, q10, d3
-        vrshrn.i16      d4,  q2,  #3
-        vrshrn.i16      d5,  q10, #3
-        vbsl            q4,  q7,  q9
-        vbsl            q5,  q14, q1
-        vneg.s8         q7,  q6
-        vmovl.u8        q14, d16
-        vmin.s8         q2,  q2,  q6
-        vmovl.u8        q6,  d17
-        vmax.s8         q2,  q2,  q7
-        vmovl.u8        q11, d0
-        vmovl.u8        q12, d1
-        vaddw.s8        q14, q14, d4
-        vaddw.s8        q6,  q6,  d5
-        vsubw.s8        q11, q11, d4
-        vsubw.s8        q12, q12, d5
-        vqmovun.s16     d16, q14
-        vqmovun.s16     d17, q6
-        vqmovun.s16     d0,  q11
-        vqmovun.s16     d1,  q12
-        .endm
-
-function ff_h264_v_loop_filter_luma_neon, export=1
-        h264_loop_filter_start
-
-        vld1.64         {d0, d1},  [r0,:128], r1
-        vld1.64         {d2, d3},  [r0,:128], r1
-        vld1.64         {d4, d5},  [r0,:128], r1
-        sub             r0,  r0,  r1, lsl #2
-        sub             r0,  r0,  r1, lsl #1
-        vld1.64         {d20,d21}, [r0,:128], r1
-        vld1.64         {d18,d19}, [r0,:128], r1
-        vld1.64         {d16,d17}, [r0,:128], r1
-
-        align_push_regs
-
-        h264_loop_filter_luma
-
-        sub             r0,  r0,  r1, lsl #1
-        vst1.64         {d8, d9},  [r0,:128], r1
-        vst1.64         {d16,d17}, [r0,:128], r1
-        vst1.64         {d0, d1},  [r0,:128], r1
-        vst1.64         {d10,d11}, [r0,:128]
-
-        align_pop_regs
-        bx              lr
-endfunc
-
-function ff_h264_h_loop_filter_luma_neon, export=1
-        h264_loop_filter_start
-
-        sub             r0,  r0,  #4
-        vld1.64         {d6},  [r0], r1
-        vld1.64         {d20}, [r0], r1
-        vld1.64         {d18}, [r0], r1
-        vld1.64         {d16}, [r0], r1
-        vld1.64         {d0},  [r0], r1
-        vld1.64         {d2},  [r0], r1
-        vld1.64         {d4},  [r0], r1
-        vld1.64         {d26}, [r0], r1
-        vld1.64         {d7},  [r0], r1
-        vld1.64         {d21}, [r0], r1
-        vld1.64         {d19}, [r0], r1
-        vld1.64         {d17}, [r0], r1
-        vld1.64         {d1},  [r0], r1
-        vld1.64         {d3},  [r0], r1
-        vld1.64         {d5},  [r0], r1
-        vld1.64         {d27}, [r0], r1
-
-        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
-
-        align_push_regs
-
-        h264_loop_filter_luma
-
-        transpose_4x4   q4, q8, q0, q5
-
-        sub             r0,  r0,  r1, lsl #4
-        add             r0,  r0,  #2
-        vst1.32         {d8[0]},  [r0], r1
-        vst1.32         {d16[0]}, [r0], r1
-        vst1.32         {d0[0]},  [r0], r1
-        vst1.32         {d10[0]}, [r0], r1
-        vst1.32         {d8[1]},  [r0], r1
-        vst1.32         {d16[1]}, [r0], r1
-        vst1.32         {d0[1]},  [r0], r1
-        vst1.32         {d10[1]}, [r0], r1
-        vst1.32         {d9[0]},  [r0], r1
-        vst1.32         {d17[0]}, [r0], r1
-        vst1.32         {d1[0]},  [r0], r1
-        vst1.32         {d11[0]}, [r0], r1
-        vst1.32         {d9[1]},  [r0], r1
-        vst1.32         {d17[1]}, [r0], r1
-        vst1.32         {d1[1]},  [r0], r1
-        vst1.32         {d11[1]}, [r0], r1
-
-        align_pop_regs
-        bx              lr
-endfunc
-
-        .macro h264_loop_filter_chroma
-        vdup.8          d22, r2         @ alpha
-        vmovl.u8        q12, d24
-        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
-        vmovl.u8        q2,  d0
-        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
-        vsubw.u8        q2,  q2,  d16
-        vsli.16         d24, d24, #8
-        vshl.i16        q2,  q2,  #2
-        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
-        vaddw.u8        q2,  q2,  d18
-        vclt.u8         d26, d26, d22   @ < alpha
-        vsubw.u8        q2,  q2,  d2
-        vdup.8          d22, r3         @ beta
-        vclt.s8         d25, d24, #0
-        vrshrn.i16      d4,  q2,  #3
-        vclt.u8         d28, d28, d22   @ < beta
-        vbic            d26, d26, d25
-        vclt.u8         d30, d30, d22   @ < beta
-        vand            d26, d26, d28
-        vneg.s8         d25, d24
-        vand            d26, d26, d30
-        vmin.s8         d4,  d4,  d24
-        vmovl.u8        q14, d16
-        vand            d4,  d4,  d26
-        vmax.s8         d4,  d4,  d25
-        vmovl.u8        q11, d0
-        vaddw.s8        q14, q14, d4
-        vsubw.s8        q11, q11, d4
-        vqmovun.s16     d16, q14
-        vqmovun.s16     d0,  q11
-        .endm
-
-function ff_h264_v_loop_filter_chroma_neon, export=1
-        h264_loop_filter_start
-
-        sub             r0,  r0,  r1, lsl #1
-        vld1.64         {d18}, [r0,:64], r1
-        vld1.64         {d16}, [r0,:64], r1
-        vld1.64         {d0},  [r0,:64], r1
-        vld1.64         {d2},  [r0,:64]
-
-        h264_loop_filter_chroma
-
-        sub             r0,  r0,  r1, lsl #1
-        vst1.64         {d16}, [r0,:64], r1
-        vst1.64         {d0},  [r0,:64], r1
-
-        bx              lr
-endfunc
-
-function ff_h264_h_loop_filter_chroma_neon, export=1
-        h264_loop_filter_start
-
-        sub             r0,  r0,  #2
-        vld1.32         {d18[0]}, [r0], r1
-        vld1.32         {d16[0]}, [r0], r1
-        vld1.32         {d0[0]},  [r0], r1
-        vld1.32         {d2[0]},  [r0], r1
-        vld1.32         {d18[1]}, [r0], r1
-        vld1.32         {d16[1]}, [r0], r1
-        vld1.32         {d0[1]},  [r0], r1
-        vld1.32         {d2[1]},  [r0], r1
-
-        vtrn.16         d18, d0
-        vtrn.16         d16, d2
-        vtrn.8          d18, d16
-        vtrn.8          d0,  d2
-
-        h264_loop_filter_chroma
-
-        vtrn.16         d18, d0
-        vtrn.16         d16, d2
-        vtrn.8          d18, d16
-        vtrn.8          d0,  d2
-
-        sub             r0,  r0,  r1, lsl #3
-        vst1.32         {d18[0]}, [r0], r1
-        vst1.32         {d16[0]}, [r0], r1
-        vst1.32         {d0[0]},  [r0], r1
-        vst1.32         {d2[0]},  [r0], r1
-        vst1.32         {d18[1]}, [r0], r1
-        vst1.32         {d16[1]}, [r0], r1
-        vst1.32         {d0[1]},  [r0], r1
-        vst1.32         {d2[1]},  [r0], r1
-
-        bx              lr
-endfunc
-
-        /* H.264 qpel MC */
-
-        .macro  lowpass_const r
-        movw            \r,  #5
-        movt            \r,  #20
-        vmov.32         d6[0], \r
-        .endm
-
-        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
-.if \narrow
-        t0 .req q0
-        t1 .req q8
-.else
-        t0 .req \d0
-        t1 .req \d1
-.endif
-        vext.8          d2,  \r0, \r1, #2
-        vext.8          d3,  \r0, \r1, #3
-        vaddl.u8        q1,  d2,  d3
-        vext.8          d4,  \r0, \r1, #1
-        vext.8          d5,  \r0, \r1, #4
-        vaddl.u8        q2,  d4,  d5
-        vext.8          d30, \r0, \r1, #5
-        vaddl.u8        t0,  \r0, d30
-        vext.8          d18, \r2, \r3, #2
-        vmla.i16        t0,  q1,  d6[1]
-        vext.8          d19, \r2, \r3, #3
-        vaddl.u8        q9,  d18, d19
-        vext.8          d20, \r2, \r3, #1
-        vmls.i16        t0,  q2,  d6[0]
-        vext.8          d21, \r2, \r3, #4
-        vaddl.u8        q10, d20, d21
-        vext.8          d31, \r2, \r3, #5
-        vaddl.u8        t1,  \r2, d31
-        vmla.i16        t1,  q9,  d6[1]
-        vmls.i16        t1,  q10, d6[0]
-.if \narrow
-        vqrshrun.s16    \d0, t0,  #5
-        vqrshrun.s16    \d1, t1,  #5
-.endif
-        .unreq  t0
-        .unreq  t1
-        .endm
-
-        .macro  lowpass_8_1 r0, r1, d0, narrow=1
-.if \narrow
-        t0 .req q0
-.else
-        t0 .req \d0
-.endif
-        vext.8          d2,  \r0, \r1, #2
-        vext.8          d3,  \r0, \r1, #3
-        vaddl.u8        q1,  d2,  d3
-        vext.8          d4,  \r0, \r1, #1
-        vext.8          d5,  \r0, \r1, #4
-        vaddl.u8        q2,  d4,  d5
-        vext.8          d30, \r0, \r1, #5
-        vaddl.u8        t0,  \r0, d30
-        vmla.i16        t0,  q1,  d6[1]
-        vmls.i16        t0,  q2,  d6[0]
-.if \narrow
-        vqrshrun.s16    \d0, t0,  #5
-.endif
-        .unreq  t0
-        .endm
-
-        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
-        vext.16         q1,  \r0, \r1, #2
-        vext.16         q0,  \r0, \r1, #3
-        vaddl.s16       q9,  d2,  d0
-        vext.16         q2,  \r0, \r1, #1
-        vaddl.s16       q1,  d3,  d1
-        vext.16         q3,  \r0, \r1, #4
-        vaddl.s16       q10, d4,  d6
-        vext.16         \r1, \r0, \r1, #5
-        vaddl.s16       q2,  d5,  d7
-        vaddl.s16       q0,  \h0, \h1
-        vaddl.s16       q8,  \l0, \l1
-
-        vshl.i32        q3,  q9,  #4
-        vshl.i32        q9,  q9,  #2
-        vshl.i32        q15, q10, #2
-        vadd.i32        q9,  q9,  q3
-        vadd.i32        q10, q10, q15
-
-        vshl.i32        q3,  q1,  #4
-        vshl.i32        q1,  q1,  #2
-        vshl.i32        q15, q2,  #2
-        vadd.i32        q1,  q1,  q3
-        vadd.i32        q2,  q2,  q15
-
-        vadd.i32        q9,  q9,  q8
-        vsub.i32        q9,  q9,  q10
-
-        vadd.i32        q1,  q1,  q0
-        vsub.i32        q1,  q1,  q2
-
-        vrshrn.s32      d18, q9,  #10
-        vrshrn.s32      d19, q1,  #10
-
-        vqmovun.s16     \d,  q9
-        .endm
-
-function put_h264_qpel16_h_lowpass_neon_packed
-        mov             r4,  lr
-        mov             ip,  #16
-        mov             r3,  #8
-        bl              put_h264_qpel8_h_lowpass_neon
-        sub             r1,  r1,  r2, lsl #4
-        add             r1,  r1,  #8
-        mov             ip,  #16
-        mov             lr,  r4
-        b               put_h264_qpel8_h_lowpass_neon
-endfunc
-
-        .macro h264_qpel_h_lowpass type
-function \type\()_h264_qpel16_h_lowpass_neon
-        push            {lr}
-        mov             ip,  #16
-        bl              \type\()_h264_qpel8_h_lowpass_neon
-        sub             r0,  r0,  r3, lsl #4
-        sub             r1,  r1,  r2, lsl #4
-        add             r0,  r0,  #8
-        add             r1,  r1,  #8
-        mov             ip,  #16
-        pop             {lr}
-endfunc
-
-function \type\()_h264_qpel8_h_lowpass_neon
-1:      vld1.64         {d0, d1},  [r1], r2
-        vld1.64         {d16,d17}, [r1], r2
-        subs            ip,  ip,  #2
-        lowpass_8       d0,  d1,  d16, d17, d0,  d16
-.ifc \type,avg
-        vld1.8          {d2},     [r0,:64], r3
-        vrhadd.u8       d0,  d0,  d2
-        vld1.8          {d3},     [r0,:64]
-        vrhadd.u8       d16, d16, d3
-        sub             r0,  r0,  r3
-.endif
-        vst1.64         {d0},     [r0,:64], r3
-        vst1.64         {d16},    [r0,:64], r3
-        bne             1b
-        bx              lr
-endfunc
-        .endm
-
-        h264_qpel_h_lowpass put
-        h264_qpel_h_lowpass avg
-
-        .macro h264_qpel_h_lowpass_l2 type
-function \type\()_h264_qpel16_h_lowpass_l2_neon
-        push            {lr}
-        mov             ip,  #16
-        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
-        sub             r0,  r0,  r2, lsl #4
-        sub             r1,  r1,  r2, lsl #4
-        sub             r3,  r3,  r2, lsl #4
-        add             r0,  r0,  #8
-        add             r1,  r1,  #8
-        add             r3,  r3,  #8
-        mov             ip,  #16
-        pop             {lr}
-endfunc
-
-function \type\()_h264_qpel8_h_lowpass_l2_neon
-1:      vld1.64         {d0, d1},  [r1], r2
-        vld1.64         {d16,d17}, [r1], r2
-        vld1.64         {d28},     [r3], r2
-        vld1.64         {d29},     [r3], r2
-        subs            ip,  ip,  #2
-        lowpass_8       d0,  d1,  d16, d17, d0,  d1
-        vrhadd.u8       q0,  q0,  q14
-.ifc \type,avg
-        vld1.8          {d2},      [r0,:64], r2
-        vrhadd.u8       d0,  d0,  d2
-        vld1.8          {d3},      [r0,:64]
-        vrhadd.u8       d1,  d1,  d3
-        sub             r0,  r0,  r2
-.endif
-        vst1.64         {d0},      [r0,:64], r2
-        vst1.64         {d1},      [r0,:64], r2
-        bne             1b
-        bx              lr
-endfunc
-        .endm
-
-        h264_qpel_h_lowpass_l2 put
-        h264_qpel_h_lowpass_l2 avg
-
-function put_h264_qpel16_v_lowpass_neon_packed
-        mov             r4,  lr
-        mov             r2,  #8
-        bl              put_h264_qpel8_v_lowpass_neon
-        sub             r1,  r1,  r3, lsl #2
-        bl              put_h264_qpel8_v_lowpass_neon
-        sub             r1,  r1,  r3, lsl #4
-        sub             r1,  r1,  r3, lsl #2
-        add             r1,  r1,  #8
-        bl              put_h264_qpel8_v_lowpass_neon
-        sub             r1,  r1,  r3, lsl #2
-        mov             lr,  r4
-        b               put_h264_qpel8_v_lowpass_neon
-endfunc
-
-        .macro h264_qpel_v_lowpass type
-function \type\()_h264_qpel16_v_lowpass_neon
-        mov             r4,  lr
-        bl              \type\()_h264_qpel8_v_lowpass_neon
-        sub             r1,  r1,  r3, lsl #2
-        bl              \type\()_h264_qpel8_v_lowpass_neon
-        sub             r0,  r0,  r2, lsl #4
-        add             r0,  r0,  #8
-        sub             r1,  r1,  r3, lsl #4
-        sub             r1,  r1,  r3, lsl #2
-        add             r1,  r1,  #8
-        bl              \type\()_h264_qpel8_v_lowpass_neon
-        sub             r1,  r1,  r3, lsl #2
-        mov             lr,  r4
-endfunc
-
-function \type\()_h264_qpel8_v_lowpass_neon
-        vld1.64         {d8},  [r1], r3
-        vld1.64         {d10}, [r1], r3
-        vld1.64         {d12}, [r1], r3
-        vld1.64         {d14}, [r1], r3
-        vld1.64         {d22}, [r1], r3
-        vld1.64         {d24}, [r1], r3
-        vld1.64         {d26}, [r1], r3
-        vld1.64         {d28}, [r1], r3
-        vld1.64         {d9},  [r1], r3
-        vld1.64         {d11}, [r1], r3
-        vld1.64         {d13}, [r1], r3
-        vld1.64         {d15}, [r1], r3
-        vld1.64         {d23}, [r1]
-
-        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
-        lowpass_8       d8,  d9,  d10, d11, d8,  d10
-        lowpass_8       d12, d13, d14, d15, d12, d14
-        lowpass_8       d22, d23, d24, d25, d22, d24
-        lowpass_8       d26, d27, d28, d29, d26, d28
-        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
-
-.ifc \type,avg
-        vld1.8          {d9},  [r0,:64], r2
-        vrhadd.u8       d8,  d8,  d9
-        vld1.8          {d11}, [r0,:64], r2
-        vrhadd.u8       d10, d10, d11
-        vld1.8          {d13}, [r0,:64], r2
-        vrhadd.u8       d12, d12, d13
-        vld1.8          {d15}, [r0,:64], r2
-        vrhadd.u8       d14, d14, d15
-        vld1.8          {d23}, [r0,:64], r2
-        vrhadd.u8       d22, d22, d23
-        vld1.8          {d25}, [r0,:64], r2
-        vrhadd.u8       d24, d24, d25
-        vld1.8          {d27}, [r0,:64], r2
-        vrhadd.u8       d26, d26, d27
-        vld1.8          {d29}, [r0,:64], r2
-        vrhadd.u8       d28, d28, d29
-        sub             r0,  r0,  r2,  lsl #3
-.endif
-
-        vst1.64         {d8},  [r0,:64], r2
-        vst1.64         {d10}, [r0,:64], r2
-        vst1.64         {d12}, [r0,:64], r2
-        vst1.64         {d14}, [r0,:64], r2
-        vst1.64         {d22}, [r0,:64], r2
-        vst1.64         {d24}, [r0,:64], r2
-        vst1.64         {d26}, [r0,:64], r2
-        vst1.64         {d28}, [r0,:64], r2
-
-        bx              lr
-endfunc
-        .endm
-
-        h264_qpel_v_lowpass put
-        h264_qpel_v_lowpass avg
-
-        .macro h264_qpel_v_lowpass_l2 type
-function \type\()_h264_qpel16_v_lowpass_l2_neon
-        mov             r4,  lr
-        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
-        sub             r1,  r1,  r3, lsl #2
-        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
-        sub             r0,  r0,  r3, lsl #4
-        sub             ip,  ip,  r2, lsl #4
-        add             r0,  r0,  #8
-        add             ip,  ip,  #8
-        sub             r1,  r1,  r3, lsl #4
-        sub             r1,  r1,  r3, lsl #2
-        add             r1,  r1,  #8
-        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
-        sub             r1,  r1,  r3, lsl #2
-        mov             lr,  r4
-endfunc
-
-function \type\()_h264_qpel8_v_lowpass_l2_neon
-        vld1.64         {d8},  [r1], r3
-        vld1.64         {d10}, [r1], r3
-        vld1.64         {d12}, [r1], r3
-        vld1.64         {d14}, [r1], r3
-        vld1.64         {d22}, [r1], r3
-        vld1.64         {d24}, [r1], r3
-        vld1.64         {d26}, [r1], r3
-        vld1.64         {d28}, [r1], r3
-        vld1.64         {d9},  [r1], r3
-        vld1.64         {d11}, [r1], r3
-        vld1.64         {d13}, [r1], r3
-        vld1.64         {d15}, [r1], r3
-        vld1.64         {d23}, [r1]
-
-        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
-        lowpass_8       d8,  d9,  d10, d11, d8,  d9
-        lowpass_8       d12, d13, d14, d15, d12, d13
-        lowpass_8       d22, d23, d24, d25, d22, d23
-        lowpass_8       d26, d27, d28, d29, d26, d27
-        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27
-
-        vld1.64         {d0},  [ip], r2
-        vld1.64         {d1},  [ip], r2
-        vld1.64         {d2},  [ip], r2
-        vld1.64         {d3},  [ip], r2
-        vld1.64         {d4},  [ip], r2
-        vrhadd.u8       q0,  q0,  q4
-        vld1.64         {d5},  [ip], r2
-        vrhadd.u8       q1,  q1,  q6
-        vld1.64         {d10}, [ip], r2
-        vrhadd.u8       q2,  q2,  q11
-        vld1.64         {d11}, [ip], r2
-        vrhadd.u8       q5,  q5,  q13
-
-.ifc \type,avg
-        vld1.8          {d16}, [r0,:64], r3
-        vrhadd.u8       d0,  d0,  d16
-        vld1.8          {d17}, [r0,:64], r3
-        vrhadd.u8       d1,  d1,  d17
-        vld1.8          {d16}, [r0,:64], r3
-        vrhadd.u8       d2,  d2,  d16
-        vld1.8          {d17}, [r0,:64], r3
-        vrhadd.u8       d3,  d3,  d17
-        vld1.8          {d16}, [r0,:64], r3
-        vrhadd.u8       d4,  d4,  d16
-        vld1.8          {d17}, [r0,:64], r3
-        vrhadd.u8       d5,  d5,  d17
-        vld1.8          {d16}, [r0,:64], r3
-        vrhadd.u8       d10, d10, d16
-        vld1.8          {d17}, [r0,:64], r3
-        vrhadd.u8       d11, d11, d17
-        sub             r0,  r0,  r3,  lsl #3
-.endif
-
-        vst1.64         {d0},  [r0,:64], r3
-        vst1.64         {d1},  [r0,:64], r3
-        vst1.64         {d2},  [r0,:64], r3
-        vst1.64         {d3},  [r0,:64], r3
-        vst1.64         {d4},  [r0,:64], r3
-        vst1.64         {d5},  [r0,:64], r3
-        vst1.64         {d10}, [r0,:64], r3
-        vst1.64         {d11}, [r0,:64], r3
-
-        bx              lr
-endfunc
-        .endm
-
-        h264_qpel_v_lowpass_l2 put
-        h264_qpel_v_lowpass_l2 avg
-
-function put_h264_qpel8_hv_lowpass_neon_top
-        lowpass_const   ip
-        mov             ip,  #12
-1:      vld1.64         {d0, d1},  [r1], r3
-        vld1.64         {d16,d17}, [r1], r3
-        subs            ip,  ip,  #2
-        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
-        vst1.64         {d22-d25}, [r4,:128]!
-        bne             1b
-
-        vld1.64         {d0, d1},  [r1]
-        lowpass_8_1     d0,  d1,  q12, narrow=0
-
-        mov             ip,  #-16
-        add             r4,  r4,  ip
-        vld1.64         {d30,d31}, [r4,:128], ip
-        vld1.64         {d20,d21}, [r4,:128], ip
-        vld1.64         {d18,d19}, [r4,:128], ip
-        vld1.64         {d16,d17}, [r4,:128], ip
-        vld1.64         {d14,d15}, [r4,:128], ip
-        vld1.64         {d12,d13}, [r4,:128], ip
-        vld1.64         {d10,d11}, [r4,:128], ip
-        vld1.64         {d8, d9},  [r4,:128], ip
-        vld1.64         {d6, d7},  [r4,:128], ip
-        vld1.64         {d4, d5},  [r4,:128], ip
-        vld1.64         {d2, d3},  [r4,:128], ip
-        vld1.64         {d0, d1},  [r4,:128]
-
-        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
-        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
-
-        swap4           d17, d19, d21, d31, d24, d26, d28, d22
-        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
-
-        vst1.64         {d30,d31}, [r4,:128]!
-        vst1.64         {d6, d7},  [r4,:128]!
-        vst1.64         {d20,d21}, [r4,:128]!
-        vst1.64         {d4, d5},  [r4,:128]!
-        vst1.64         {d18,d19}, [r4,:128]!
-        vst1.64         {d2, d3},  [r4,:128]!
-        vst1.64         {d16,d17}, [r4,:128]!
-        vst1.64         {d0, d1},  [r4,:128]
-
-        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
-        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
-        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
-        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
-
-        vld1.64         {d16,d17}, [r4,:128], ip
-        vld1.64         {d30,d31}, [r4,:128], ip
-        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
-        vld1.64         {d16,d17}, [r4,:128], ip
-        vld1.64         {d30,d31}, [r4,:128], ip
-        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
-        vld1.64         {d16,d17}, [r4,:128], ip
-        vld1.64         {d30,d31}, [r4,:128], ip
-        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
-        vld1.64         {d16,d17}, [r4,:128], ip
-        vld1.64         {d30,d31}, [r4,:128]
-        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
-
-        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
-
-        bx              lr
-endfunc
-
-        .macro h264_qpel8_hv_lowpass type
-function \type\()_h264_qpel8_hv_lowpass_neon
-        mov             r10, lr
-        bl              put_h264_qpel8_hv_lowpass_neon_top
-.ifc \type,avg
-        vld1.8          {d0},      [r0,:64], r2
-        vrhadd.u8       d12, d12, d0
-        vld1.8          {d1},      [r0,:64], r2
-        vrhadd.u8       d13, d13, d1
-        vld1.8          {d2},      [r0,:64], r2
-        vrhadd.u8       d14, d14, d2
-        vld1.8          {d3},      [r0,:64], r2
-        vrhadd.u8       d15, d15, d3
-        vld1.8          {d4},      [r0,:64], r2
-        vrhadd.u8       d8,  d8,  d4
-        vld1.8          {d5},      [r0,:64], r2
-        vrhadd.u8       d9,  d9,  d5
-        vld1.8          {d6},      [r0,:64], r2
-        vrhadd.u8       d10, d10, d6
-        vld1.8          {d7},      [r0,:64], r2
-        vrhadd.u8       d11, d11, d7
-        sub             r0,  r0,  r2,  lsl #3
-.endif
-        vst1.64         {d12},     [r0,:64], r2
-        vst1.64         {d13},     [r0,:64], r2
-        vst1.64         {d14},     [r0,:64], r2
-        vst1.64         {d15},     [r0,:64], r2
-        vst1.64         {d8},      [r0,:64], r2
-        vst1.64         {d9},      [r0,:64], r2
-        vst1.64         {d10},     [r0,:64], r2
-        vst1.64         {d11},     [r0,:64], r2
-
-        mov             lr,  r10
-        bx              lr
-endfunc
-        .endm
-
-        h264_qpel8_hv_lowpass put
-        h264_qpel8_hv_lowpass avg
-
-        .macro h264_qpel8_hv_lowpass_l2 type
-function \type\()_h264_qpel8_hv_lowpass_l2_neon
-        mov             r10, lr
-        bl              put_h264_qpel8_hv_lowpass_neon_top
-
-        vld1.64         {d0, d1},  [r2,:128]!
-        vld1.64         {d2, d3},  [r2,:128]!
-        vrhadd.u8       q0,  q0,  q6
-        vld1.64         {d4, d5},  [r2,:128]!
-        vrhadd.u8       q1,  q1,  q7
-        vld1.64         {d6, d7},  [r2,:128]!
-        vrhadd.u8       q2,  q2,  q4
-        vrhadd.u8       q3,  q3,  q5
-.ifc \type,avg
-        vld1.8          {d16},     [r0,:64], r3
-        vrhadd.u8       d0,  d0,  d16
-        vld1.8          {d17},     [r0,:64], r3
-        vrhadd.u8       d1,  d1,  d17
-        vld1.8          {d18},     [r0,:64], r3
-        vrhadd.u8       d2,  d2,  d18
-        vld1.8          {d19},     [r0,:64], r3
-        vrhadd.u8       d3,  d3,  d19
-        vld1.8          {d20},     [r0,:64], r3
-        vrhadd.u8       d4,  d4,  d20
-        vld1.8          {d21},     [r0,:64], r3
-        vrhadd.u8       d5,  d5,  d21
-        vld1.8          {d22},     [r0,:64], r3
-        vrhadd.u8       d6,  d6,  d22
-        vld1.8          {d23},     [r0,:64], r3
-        vrhadd.u8       d7,  d7,  d23
-        sub             r0,  r0,  r3,  lsl #3
-.endif
-        vst1.64         {d0},      [r0,:64], r3
-        vst1.64         {d1},      [r0,:64], r3
-        vst1.64         {d2},      [r0,:64], r3
-        vst1.64         {d3},      [r0,:64], r3
-        vst1.64         {d4},      [r0,:64], r3
-        vst1.64         {d5},      [r0,:64], r3
-        vst1.64         {d6},      [r0,:64], r3
-        vst1.64         {d7},      [r0,:64], r3
-
-        mov             lr,  r10
-        bx              lr
-endfunc
-        .endm
-
-        h264_qpel8_hv_lowpass_l2 put
-        h264_qpel8_hv_lowpass_l2 avg
-
-        .macro h264_qpel16_hv type
-function \type\()_h264_qpel16_hv_lowpass_neon
-        mov             r9,  lr
-        bl              \type\()_h264_qpel8_hv_lowpass_neon
-        sub             r1,  r1,  r3, lsl #2
-        bl              \type\()_h264_qpel8_hv_lowpass_neon
-        sub             r1,  r1,  r3, lsl #4
-        sub             r1,  r1,  r3, lsl #2
-        add             r1,  r1,  #8
-        sub             r0,  r0,  r2, lsl #4
-        add             r0,  r0,  #8
-        bl              \type\()_h264_qpel8_hv_lowpass_neon
-        sub             r1,  r1,  r3, lsl #2
-        mov             lr,  r9
-        b               \type\()_h264_qpel8_hv_lowpass_neon
-endfunc
-
-function \type\()_h264_qpel16_hv_lowpass_l2_neon
-        mov             r9,  lr
-        sub             r2,  r4,  #256
-        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
-        sub             r1,  r1,  r3, lsl #2
-        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
-        sub             r1,  r1,  r3, lsl #4
-        sub             r1,  r1,  r3, lsl #2
-        add             r1,  r1,  #8
-        sub             r0,  r0,  r3, lsl #4
-        add             r0,  r0,  #8
-        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
-        sub             r1,  r1,  r3, lsl #2
-        mov             lr,  r9
-        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
-endfunc
-        .endm
-
-        h264_qpel16_hv put
-        h264_qpel16_hv avg
-
-        .macro h264_qpel8 type
-function ff_\type\()_h264_qpel8_mc10_neon, export=1
-        lowpass_const   r3
-        mov             r3,  r1
-        sub             r1,  r1,  #2
-        mov             ip,  #8
-        b               \type\()_h264_qpel8_h_lowpass_l2_neon
-endfunc
-
-function ff_\type\()_h264_qpel8_mc20_neon, export=1
-        lowpass_const   r3
-        sub             r1,  r1,  #2
-        mov             r3,  r2
-        mov             ip,  #8
-        b               \type\()_h264_qpel8_h_lowpass_neon
-endfunc
-
-function ff_\type\()_h264_qpel8_mc30_neon, export=1
-        lowpass_const   r3
-        add             r3,  r1,  #1
-        sub             r1,  r1,  #2
-        mov             ip,  #8
-        b               \type\()_h264_qpel8_h_lowpass_l2_neon
-endfunc
-
-function ff_\type\()_h264_qpel8_mc01_neon, export=1
-        push            {lr}
-        mov             ip,  r1
-\type\()_h264_qpel8_mc01:
-        lowpass_const   r3
-        mov             r3,  r2
-        sub             r1,  r1,  r2, lsl #1
-        vpush           {d8-d15}
-        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
-        vpop            {d8-d15}
-        pop             {pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc11_neon, export=1
-        push            {r0, r1, r11, lr}
-\type\()_h264_qpel8_mc11:
-        lowpass_const   r3
-        mov             r11, sp
-        bic             sp,  sp,  #15
-        sub             sp,  sp,  #64
-        mov             r0,  sp
-        sub             r1,  r1,  #2
-        mov             r3,  #8
-        mov             ip,  #8
-        vpush           {d8-d15}
-        bl              put_h264_qpel8_h_lowpass_neon
-        ldrd            r0,  [r11]
-        mov             r3,  r2
-        add             ip,  sp,  #64
-        sub             r1,  r1,  r2, lsl #1
-        mov             r2,  #8
-        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
-        vpop            {d8-d15}
-        add             sp,  r11, #8
-        pop             {r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc21_neon, export=1
-        push            {r0, r1, r4, r10, r11, lr}
-\type\()_h264_qpel8_mc21:
-        lowpass_const   r3
-        mov             r11, sp
-        bic             sp,  sp,  #15
-        sub             sp,  sp,  #(8*8+16*12)
-        sub             r1,  r1,  #2
-        mov             r3,  #8
-        mov             r0,  sp
-        mov             ip,  #8
-        vpush           {d8-d15}
-        bl              put_h264_qpel8_h_lowpass_neon
-        mov             r4,  r0
-        ldrd            r0,  [r11]
-        sub             r1,  r1,  r2, lsl #1
-        sub             r1,  r1,  #2
-        mov             r3,  r2
-        sub             r2,  r4,  #64
-        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
-        vpop            {d8-d15}
-        add             sp,  r11,  #8
-        pop             {r4, r10, r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc31_neon, export=1
-        add             r1,  r1,  #1
-        push            {r0, r1, r11, lr}
-        sub             r1,  r1,  #1
-        b               \type\()_h264_qpel8_mc11
-endfunc
-
-function ff_\type\()_h264_qpel8_mc02_neon, export=1
-        push            {lr}
-        lowpass_const   r3
-        sub             r1,  r1,  r2, lsl #1
-        mov             r3,  r2
-        vpush           {d8-d15}
-        bl              \type\()_h264_qpel8_v_lowpass_neon
-        vpop            {d8-d15}
-        pop             {pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc12_neon, export=1
-        push            {r0, r1, r4, r10, r11, lr}
-\type\()_h264_qpel8_mc12:
-        lowpass_const   r3
-        mov             r11, sp
-        bic             sp,  sp,  #15
-        sub             sp,  sp,  #(8*8+16*12)
-        sub             r1,  r1,  r2, lsl #1
-        mov             r3,  r2
-        mov             r2,  #8
-        mov             r0,  sp
-        vpush           {d8-d15}
-        bl              put_h264_qpel8_v_lowpass_neon
-        mov             r4,  r0
-        ldrd            r0,  [r11]
-        sub             r1,  r1,  r3, lsl #1
-        sub             r1,  r1,  #2
-        sub             r2,  r4,  #64
-        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
-        vpop            {d8-d15}
-        add             sp,  r11,  #8
-        pop             {r4, r10, r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc22_neon, export=1
-        push            {r4, r10, r11, lr}
-        mov             r11, sp
-        bic             sp,  sp,  #15
-        sub             r1,  r1,  r2, lsl #1
-        sub             r1,  r1,  #2
-        mov             r3,  r2
-        sub             sp,  sp,  #(16*12)
-        mov             r4,  sp
-        vpush           {d8-d15}
-        bl              \type\()_h264_qpel8_hv_lowpass_neon
-        vpop            {d8-d15}
-        mov             sp,  r11
-        pop             {r4, r10, r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc32_neon, export=1
-        push            {r0, r1, r4, r10, r11, lr}
-        add             r1,  r1,  #1
-        b               \type\()_h264_qpel8_mc12
-endfunc
-
-function ff_\type\()_h264_qpel8_mc03_neon, export=1
-        push            {lr}
-        add             ip,  r1,  r2
-        b               \type\()_h264_qpel8_mc01
-endfunc
-
-function ff_\type\()_h264_qpel8_mc13_neon, export=1
-        push            {r0, r1, r11, lr}
-        add             r1,  r1,  r2
-        b               \type\()_h264_qpel8_mc11
-endfunc
-
-function ff_\type\()_h264_qpel8_mc23_neon, export=1
-        push            {r0, r1, r4, r10, r11, lr}
-        add             r1,  r1,  r2
-        b               \type\()_h264_qpel8_mc21
-endfunc
-
-function ff_\type\()_h264_qpel8_mc33_neon, export=1
-        add             r1,  r1,  #1
-        push            {r0, r1, r11, lr}
-        add             r1,  r1,  r2
-        sub             r1,  r1,  #1
-        b               \type\()_h264_qpel8_mc11
-endfunc
-        .endm
-
-        h264_qpel8 put
-        h264_qpel8 avg
-
-        .macro h264_qpel16 type
-function ff_\type\()_h264_qpel16_mc10_neon, export=1
-        lowpass_const   r3
-        mov             r3,  r1
-        sub             r1,  r1,  #2
-        b               \type\()_h264_qpel16_h_lowpass_l2_neon
-endfunc
-
-function ff_\type\()_h264_qpel16_mc20_neon, export=1
-        lowpass_const   r3
-        sub             r1,  r1,  #2
-        mov             r3,  r2
-        b               \type\()_h264_qpel16_h_lowpass_neon
-endfunc
-
-function ff_\type\()_h264_qpel16_mc30_neon, export=1
-        lowpass_const   r3
-        add             r3,  r1,  #1
-        sub             r1,  r1,  #2
-        b               \type\()_h264_qpel16_h_lowpass_l2_neon
-endfunc
-
-function ff_\type\()_h264_qpel16_mc01_neon, export=1
-        push            {r4, lr}
-        mov             ip,  r1
-\type\()_h264_qpel16_mc01:
-        lowpass_const   r3
-        mov             r3,  r2
-        sub             r1,  r1,  r2, lsl #1
-        vpush           {d8-d15}
-        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
-        vpop            {d8-d15}
-        pop             {r4, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc11_neon, export=1
-        push            {r0, r1, r4, r11, lr}
-\type\()_h264_qpel16_mc11:
-        lowpass_const   r3
-        mov             r11, sp
-        bic             sp,  sp,  #15
-        sub             sp,  sp,  #256
-        mov             r0,  sp
-        sub             r1,  r1,  #2
-        mov             r3,  #16
-        vpush           {d8-d15}
-        bl              put_h264_qpel16_h_lowpass_neon
-        ldrd            r0,  [r11]
-        mov             r3,  r2
-        add             ip,  sp,  #64
-        sub             r1,  r1,  r2, lsl #1
-        mov             r2,  #16
-        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
-        vpop            {d8-d15}
-        add             sp,  r11, #8
-        pop             {r4, r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc21_neon, export=1
-        push            {r0, r1, r4-r5, r9-r11, lr}
-\type\()_h264_qpel16_mc21:
-        lowpass_const   r3
-        mov             r11, sp
-        bic             sp,  sp,  #15
-        sub             sp,  sp,  #(16*16+16*12)
-        sub             r1,  r1,  #2
-        mov             r0,  sp
-        vpush           {d8-d15}
-        bl              put_h264_qpel16_h_lowpass_neon_packed
-        mov             r4,  r0
-        ldrd            r0,  [r11]
-        sub             r1,  r1,  r2, lsl #1
-        sub             r1,  r1,  #2
-        mov             r3,  r2
-        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
-        vpop            {d8-d15}
-        add             sp,  r11,  #8
-        pop             {r4-r5, r9-r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc31_neon, export=1
-        add             r1,  r1,  #1
-        push            {r0, r1, r4, r11, lr}
-        sub             r1,  r1,  #1
-        b               \type\()_h264_qpel16_mc11
-endfunc
-
-function ff_\type\()_h264_qpel16_mc02_neon, export=1
-        push            {r4, lr}
-        lowpass_const   r3
-        sub             r1,  r1,  r2, lsl #1
-        mov             r3,  r2
-        vpush           {d8-d15}
-        bl              \type\()_h264_qpel16_v_lowpass_neon
-        vpop            {d8-d15}
-        pop             {r4, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc12_neon, export=1
-        push            {r0, r1, r4-r5, r9-r11, lr}
-\type\()_h264_qpel16_mc12:
-        lowpass_const   r3
-        mov             r11, sp
-        bic             sp,  sp,  #15
-        sub             sp,  sp,  #(16*16+16*12)
-        sub             r1,  r1,  r2, lsl #1
-        mov             r0,  sp
-        mov             r3,  r2
-        vpush           {d8-d15}
-        bl              put_h264_qpel16_v_lowpass_neon_packed
-        mov             r4,  r0
-        ldrd            r0,  [r11]
-        sub             r1,  r1,  r3, lsl #1
-        sub             r1,  r1,  #2
-        mov             r2,  r3
-        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
-        vpop            {d8-d15}
-        add             sp,  r11,  #8
-        pop             {r4-r5, r9-r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc22_neon, export=1
-        push            {r4, r9-r11, lr}
-        lowpass_const   r3
-        mov             r11, sp
-        bic             sp,  sp,  #15
-        sub             r1,  r1,  r2, lsl #1
-        sub             r1,  r1,  #2
-        mov             r3,  r2
-        sub             sp,  sp,  #(16*12)
-        mov             r4,  sp
-        vpush           {d8-d15}
-        bl              \type\()_h264_qpel16_hv_lowpass_neon
-        vpop            {d8-d15}
-        mov             sp,  r11
-        pop             {r4, r9-r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc32_neon, export=1
-        push            {r0, r1, r4-r5, r9-r11, lr}
-        add             r1,  r1,  #1
-        b               \type\()_h264_qpel16_mc12
-endfunc
-
-function ff_\type\()_h264_qpel16_mc03_neon, export=1
-        push            {r4, lr}
-        add             ip,  r1,  r2
-        b               \type\()_h264_qpel16_mc01
-endfunc
-
-function ff_\type\()_h264_qpel16_mc13_neon, export=1
-        push            {r0, r1, r4, r11, lr}
-        add             r1,  r1,  r2
-        b               \type\()_h264_qpel16_mc11
-endfunc
-
-function ff_\type\()_h264_qpel16_mc23_neon, export=1
-        push            {r0, r1, r4-r5, r9-r11, lr}
-        add             r1,  r1,  r2
-        b               \type\()_h264_qpel16_mc21
-endfunc
-
-function ff_\type\()_h264_qpel16_mc33_neon, export=1
-        add             r1,  r1,  #1
-        push            {r0, r1, r4, r11, lr}
-        add             r1,  r1,  r2
-        sub             r1,  r1,  #1
-        b               \type\()_h264_qpel16_mc11
-endfunc
-        .endm
-
-        h264_qpel16 put
-        h264_qpel16 avg
-
-@ Biweighted prediction
-
-        .macro  biweight_16 macs, macd
-        vdup.8          d0,  r4
-        vdup.8          d1,  r5
-        vmov            q2,  q8
-        vmov            q3,  q8
-1:      subs            ip,  ip,  #2
-        vld1.8          {d20-d21},[r0,:128], r2
-        \macd           q2,  d0,  d20
-        pld             [r0]
-        \macd           q3,  d0,  d21
-        vld1.8          {d22-d23},[r1,:128], r2
-        \macs           q2,  d1,  d22
-        pld             [r1]
-        \macs           q3,  d1,  d23
-        vmov            q12, q8
-        vld1.8          {d28-d29},[r0,:128], r2
-        vmov            q13, q8
-        \macd           q12, d0,  d28
-        pld             [r0]
-        \macd           q13, d0,  d29
-        vld1.8          {d30-d31},[r1,:128], r2
-        \macs           q12, d1,  d30
-        pld             [r1]
-        \macs           q13, d1,  d31
-        vshl.s16        q2,  q2,  q9
-        vshl.s16        q3,  q3,  q9
-        vqmovun.s16     d4,  q2
-        vqmovun.s16     d5,  q3
-        vshl.s16        q12, q12, q9
-        vshl.s16        q13, q13, q9
-        vqmovun.s16     d24, q12
-        vqmovun.s16     d25, q13
-        vmov            q3,  q8
-        vst1.8          {d4- d5}, [r6,:128], r2
-        vmov            q2,  q8
-        vst1.8          {d24-d25},[r6,:128], r2
-        bne             1b
-        pop             {r4-r6, pc}
-        .endm
-
-        .macro  biweight_8 macs, macd
-        vdup.8          d0,  r4
-        vdup.8          d1,  r5
-        vmov            q1,  q8
-        vmov            q10, q8
-1:      subs            ip,  ip,  #2
-        vld1.8          {d4},[r0,:64], r2
-        \macd           q1,  d0,  d4
-        pld             [r0]
-        vld1.8          {d5},[r1,:64], r2
-        \macs           q1,  d1,  d5
-        pld             [r1]
-        vld1.8          {d6},[r0,:64], r2
-        \macd           q10, d0,  d6
-        pld             [r0]
-        vld1.8          {d7},[r1,:64], r2
-        \macs           q10, d1,  d7
-        pld             [r1]
-        vshl.s16        q1,  q1,  q9
-        vqmovun.s16     d2,  q1
-        vshl.s16        q10, q10, q9
-        vqmovun.s16     d4,  q10
-        vmov            q10, q8
-        vst1.8          {d2},[r6,:64], r2
-        vmov            q1,  q8
-        vst1.8          {d4},[r6,:64], r2
-        bne             1b
-        pop             {r4-r6, pc}
-        .endm
-
-        .macro  biweight_4 macs, macd
-        vdup.8          d0,  r4
-        vdup.8          d1,  r5
-        vmov            q1,  q8
-        vmov            q10, q8
-1:      subs            ip,  ip,  #4
-        vld1.32         {d4[0]},[r0,:32], r2
-        vld1.32         {d4[1]},[r0,:32], r2
-        \macd           q1,  d0,  d4
-        pld             [r0]
-        vld1.32         {d5[0]},[r1,:32], r2
-        vld1.32         {d5[1]},[r1,:32], r2
-        \macs           q1,  d1,  d5
-        pld             [r1]
-        blt             2f
-        vld1.32         {d6[0]},[r0,:32], r2
-        vld1.32         {d6[1]},[r0,:32], r2
-        \macd           q10, d0,  d6
-        pld             [r0]
-        vld1.32         {d7[0]},[r1,:32], r2
-        vld1.32         {d7[1]},[r1,:32], r2
-        \macs           q10, d1,  d7
-        pld             [r1]
-        vshl.s16        q1,  q1,  q9
-        vqmovun.s16     d2,  q1
-        vshl.s16        q10, q10, q9
-        vqmovun.s16     d4,  q10
-        vmov            q10, q8
-        vst1.32         {d2[0]},[r6,:32], r2
-        vst1.32         {d2[1]},[r6,:32], r2
-        vmov            q1,  q8
-        vst1.32         {d4[0]},[r6,:32], r2
-        vst1.32         {d4[1]},[r6,:32], r2
-        bne             1b
-        pop             {r4-r6, pc}
-2:      vshl.s16        q1,  q1,  q9
-        vqmovun.s16     d2,  q1
-        vst1.32         {d2[0]},[r6,:32], r2
-        vst1.32         {d2[1]},[r6,:32], r2
-        pop             {r4-r6, pc}
-        .endm
-
-        .macro  biweight_func w
-function biweight_h264_pixels_\w\()_neon
-        push            {r4-r6, lr}
-        add             r4,  sp,  #16
-        ldm             r4,  {r4-r6}
-        lsr             lr,  r4,  #31
-        add             r6,  r6,  #1
-        eors            lr,  lr,  r5,  lsr #30
-        orr             r6,  r6,  #1
-        vdup.16         q9,  r3
-        lsl             r6,  r6,  r3
-        vmvn            q9,  q9
-        vdup.16         q8,  r6
-        mov             r6,  r0
-        beq             10f
-        subs            lr,  lr,  #1
-        beq             20f
-        subs            lr,  lr,  #1
-        beq             30f
-        b               40f
-10:     biweight_\w     vmlal.u8, vmlal.u8
-20:     rsb             r4,  r4,  #0
-        biweight_\w     vmlal.u8, vmlsl.u8
-30:     rsb             r4,  r4,  #0
-        rsb             r5,  r5,  #0
-        biweight_\w     vmlsl.u8, vmlsl.u8
-40:     rsb             r5,  r5,  #0
-        biweight_\w     vmlsl.u8, vmlal.u8
-endfunc
-        .endm
-
-        .macro  biweight_entry w, h, b=1
-function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
-        mov             ip,  #\h
-.if \b
-        b               biweight_h264_pixels_\w\()_neon
-.endif
-endfunc
-        .endm
-
-        biweight_entry  16, 8
-        biweight_entry  16, 16, b=0
-        biweight_func   16
-
-        biweight_entry  8,  16
-        biweight_entry  8,  4
-        biweight_entry  8,  8,  b=0
-        biweight_func   8
-
-        biweight_entry  4,  8
-        biweight_entry  4,  2
-        biweight_entry  4,  4,  b=0
-        biweight_func   4
-
-@ Weighted prediction
-
-        .macro  weight_16 add
-        vdup.8          d0,  r3
-1:      subs            ip,  ip,  #2
-        vld1.8          {d20-d21},[r0,:128], r1
-        vmull.u8        q2,  d0,  d20
-        pld             [r0]
-        vmull.u8        q3,  d0,  d21
-        vld1.8          {d28-d29},[r0,:128], r1
-        vmull.u8        q12, d0,  d28
-        pld             [r0]
-        vmull.u8        q13, d0,  d29
-        \add            q2,  q8,  q2
-        vrshl.s16       q2,  q2,  q9
-        \add            q3,  q8,  q3
-        vrshl.s16       q3,  q3,  q9
-        vqmovun.s16     d4,  q2
-        vqmovun.s16     d5,  q3
-        \add            q12, q8,  q12
-        vrshl.s16       q12, q12, q9
-        \add            q13, q8,  q13
-        vrshl.s16       q13, q13, q9
-        vqmovun.s16     d24, q12
-        vqmovun.s16     d25, q13
-        vst1.8          {d4- d5}, [r4,:128], r1
-        vst1.8          {d24-d25},[r4,:128], r1
-        bne             1b
-        pop             {r4, pc}
-        .endm
-
-        .macro  weight_8 add
-        vdup.8          d0,  r3
-1:      subs            ip,  ip,  #2
-        vld1.8          {d4},[r0,:64], r1
-        vmull.u8        q1,  d0,  d4
-        pld             [r0]
-        vld1.8          {d6},[r0,:64], r1
-        vmull.u8        q10, d0,  d6
-        \add            q1,  q8,  q1
-        pld             [r0]
-        vrshl.s16       q1,  q1,  q9
-        vqmovun.s16     d2,  q1
-        \add            q10, q8,  q10
-        vrshl.s16       q10, q10, q9
-        vqmovun.s16     d4,  q10
-        vst1.8          {d2},[r4,:64], r1
-        vst1.8          {d4},[r4,:64], r1
-        bne             1b
-        pop             {r4, pc}
-        .endm
-
-        .macro  weight_4 add
-        vdup.8          d0,  r3
-        vmov            q1,  q8
-        vmov            q10, q8
-1:      subs            ip,  ip,  #4
-        vld1.32         {d4[0]},[r0,:32], r1
-        vld1.32         {d4[1]},[r0,:32], r1
-        vmull.u8        q1,  d0,  d4
-        pld             [r0]
-        blt             2f
-        vld1.32         {d6[0]},[r0,:32], r1
-        vld1.32         {d6[1]},[r0,:32], r1
-        vmull.u8        q10, d0,  d6
-        pld             [r0]
-        \add            q1,  q8,  q1
-        vrshl.s16       q1,  q1,  q9
-        vqmovun.s16     d2,  q1
-        \add            q10, q8,  q10
-        vrshl.s16       q10, q10, q9
-        vqmovun.s16     d4,  q10
-        vmov            q10, q8
-        vst1.32         {d2[0]},[r4,:32], r1
-        vst1.32         {d2[1]},[r4,:32], r1
-        vmov            q1,  q8
-        vst1.32         {d4[0]},[r4,:32], r1
-        vst1.32         {d4[1]},[r4,:32], r1
-        bne             1b
-        pop             {r4, pc}
-2:      \add            q1,  q8,  q1
-        vrshl.s16       q1,  q1,  q9
-        vqmovun.s16     d2,  q1
-        vst1.32         {d2[0]},[r4,:32], r1
-        vst1.32         {d2[1]},[r4,:32], r1
-        pop             {r4, pc}
-        .endm
-
-        .macro  weight_func w
-function weight_h264_pixels_\w\()_neon
-        push            {r4, lr}
-        ldr             r4,  [sp, #8]
-        cmp             r2,  #1
-        lsl             r4,  r4,  r2
-        vdup.16         q8,  r4
-        mov             r4,  r0
-        ble             20f
-        rsb             lr,  r2,  #1
-        vdup.16         q9,  lr
-        cmp             r3,  #0
-        blt             10f
-        weight_\w       vhadd.s16
-10:     rsb             r3,  r3,  #0
-        weight_\w       vhsub.s16
-20:     rsb             lr,  r2,  #0
-        vdup.16         q9,  lr
-        cmp             r3,  #0
-        blt             10f
-        weight_\w       vadd.s16
-10:     rsb             r3,  r3,  #0
-        weight_\w       vsub.s16
-endfunc
-        .endm
-
-        .macro  weight_entry w, h, b=1
-function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
-        mov             ip,  #\h
-.if \b
-        b               weight_h264_pixels_\w\()_neon
-.endif
-endfunc
-        .endm
-
-        weight_entry    16, 8
-        weight_entry    16, 16, b=0
-        weight_func     16
-
-        weight_entry    8,  16
-        weight_entry    8,  4
-        weight_entry    8,  8,  b=0
-        weight_func     8
-
-        weight_entry    4,  8
-        weight_entry    4,  2
-        weight_entry    4,  4,  b=0
-        weight_func     4
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/h264idct_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/h264idct_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-        preserve8
-        .text
-
-function ff_h264_idct_add_neon, export=1
-        vld1.64         {d0-d3},  [r1,:128]
-
-        vswp            d1,  d2
-        vadd.i16        d4,  d0,  d1
-        vshr.s16        q8,  q1,  #1
-        vsub.i16        d5,  d0,  d1
-        vadd.i16        d6,  d2,  d17
-        vsub.i16        d7,  d16, d3
-        vadd.i16        q0,  q2,  q3
-        vsub.i16        q1,  q2,  q3
-
-        vtrn.16         d0,  d1
-        vtrn.16         d3,  d2
-        vtrn.32         d0,  d3
-        vtrn.32         d1,  d2
-
-        vadd.i16        d4,  d0,  d3
-        vld1.32         {d18[0]}, [r0,:32], r2
-        vswp            d1,  d3
-        vshr.s16        q8,  q1,  #1
-        vld1.32         {d19[1]}, [r0,:32], r2
-        vsub.i16        d5,  d0,  d1
-        vld1.32         {d18[1]}, [r0,:32], r2
-        vadd.i16        d6,  d16, d3
-        vld1.32         {d19[0]}, [r0,:32], r2
-        vsub.i16        d7,  d2,  d17
-        sub             r0,  r0,  r2, lsl #2
-        vadd.i16        q0,  q2,  q3
-        vsub.i16        q1,  q2,  q3
-
-        vrshr.s16       q0,  q0,  #6
-        vrshr.s16       q1,  q1,  #6
-
-        vaddw.u8        q0,  q0,  d18
-        vaddw.u8        q1,  q1,  d19
-
-        vqmovun.s16     d0,  q0
-        vqmovun.s16     d1,  q1
-
-        vst1.32         {d0[0]},  [r0,:32], r2
-        vst1.32         {d1[1]},  [r0,:32], r2
-        vst1.32         {d0[1]},  [r0,:32], r2
-        vst1.32         {d1[0]},  [r0,:32], r2
-
-        bx              lr
-endfunc
-
-function ff_h264_idct_dc_add_neon, export=1
-        vld1.16         {d2[],d3[]}, [r1,:16]
-        vrshr.s16       q1,  q1,  #6
-        vld1.32         {d0[0]},  [r0,:32], r2
-        vld1.32         {d0[1]},  [r0,:32], r2
-        vaddw.u8        q2,  q1,  d0
-        vld1.32         {d1[0]},  [r0,:32], r2
-        vld1.32         {d1[1]},  [r0,:32], r2
-        vaddw.u8        q1,  q1,  d1
-        vqmovun.s16     d0,  q2
-        vqmovun.s16     d1,  q1
-        sub             r0,  r0,  r2, lsl #2
-        vst1.32         {d0[0]},  [r0,:32], r2
-        vst1.32         {d0[1]},  [r0,:32], r2
-        vst1.32         {d1[0]},  [r0,:32], r2
-        vst1.32         {d1[1]},  [r0,:32], r2
-        bx              lr
-endfunc
-
-function ff_h264_idct_add16_neon, export=1
-        push            {r4-r8,lr}
-        mov             r4,  r0
-        mov             r5,  r1
-        mov             r1,  r2
-        mov             r2,  r3
-        ldr             r6,  [sp, #24]
-        movrel          r7,  scan8
-        mov             ip,  #16
-1:      ldrb            r8,  [r7], #1
-        ldr             r0,  [r5], #4
-        ldrb            r8,  [r6, r8]
-        subs            r8,  r8,  #1
-        blt             2f
-        ldrsh           lr,  [r1]
-        add             r0,  r0,  r4
-        movne           lr,  #0
-        cmp             lr,  #0
-        adrne           lr,  ff_h264_idct_dc_add_neon
-        adreq           lr,  ff_h264_idct_add_neon
-        blx             lr
-2:      subs            ip,  ip,  #1
-        add             r1,  r1,  #32
-        bne             1b
-        pop             {r4-r8,pc}
-endfunc
-
-function ff_h264_idct_add16intra_neon, export=1
-        push            {r4-r8,lr}
-        mov             r4,  r0
-        mov             r5,  r1
-        mov             r1,  r2
-        mov             r2,  r3
-        ldr             r6,  [sp, #24]
-        movrel          r7,  scan8
-        mov             ip,  #16
-1:      ldrb            r8,  [r7], #1
-        ldr             r0,  [r5], #4
-        ldrb            r8,  [r6, r8]
-        add             r0,  r0,  r4
-        cmp             r8,  #0
-        ldrsh           r8,  [r1]
-        adrne           lr,  ff_h264_idct_add_neon
-        adreq           lr,  ff_h264_idct_dc_add_neon
-        cmpeq           r8,  #0
-        blxne           lr
-        subs            ip,  ip,  #1
-        add             r1,  r1,  #32
-        bne             1b
-        pop             {r4-r8,pc}
-endfunc
-
-function ff_h264_idct_add8_neon, export=1
-        push            {r4-r10,lr}
-        ldm             r0,  {r4,r9}
-        add             r5,  r1,  #16*4
-        add             r1,  r2,  #16*32
-        mov             r2,  r3
-        ldr             r6,  [sp, #32]
-        movrel          r7,  scan8+16
-        mov             ip,  #8
-1:      ldrb            r8,  [r7], #1
-        ldr             r0,  [r5], #4
-        ldrb            r8,  [r6, r8]
-        tst             ip,  #4
-        addeq           r0,  r0,  r4
-        addne           r0,  r0,  r9
-        cmp             r8,  #0
-        ldrsh           r8,  [r1]
-        adrne           lr,  ff_h264_idct_add_neon
-        adreq           lr,  ff_h264_idct_dc_add_neon
-        cmpeq           r8,  #0
-        blxne           lr
-        subs            ip,  ip,  #1
-        add             r1,  r1,  #32
-        bne             1b
-        pop             {r4-r10,pc}
-endfunc
-
-        .section .rodata
-scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8
-        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8
-        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8
-        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8
-        .byte           1+1*8, 2+1*8
-        .byte           1+2*8, 2+2*8
-        .byte           1+4*8, 2+4*8
-        .byte           1+5*8, 2+5*8
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/h264pred_init_arm.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/h264pred_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavcodec/h264pred.h"
-
-void ff_pred16x16_vert_neon(uint8_t *src, int stride);
-void ff_pred16x16_hor_neon(uint8_t *src, int stride);
-void ff_pred16x16_plane_neon(uint8_t *src, int stride);
-void ff_pred16x16_dc_neon(uint8_t *src, int stride);
-void ff_pred16x16_128_dc_neon(uint8_t *src, int stride);
-void ff_pred16x16_left_dc_neon(uint8_t *src, int stride);
-void ff_pred16x16_top_dc_neon(uint8_t *src, int stride);
-
-void ff_pred8x8_vert_neon(uint8_t *src, int stride);
-void ff_pred8x8_hor_neon(uint8_t *src, int stride);
-void ff_pred8x8_plane_neon(uint8_t *src, int stride);
-void ff_pred8x8_dc_neon(uint8_t *src, int stride);
-void ff_pred8x8_128_dc_neon(uint8_t *src, int stride);
-void ff_pred8x8_left_dc_neon(uint8_t *src, int stride);
-void ff_pred8x8_top_dc_neon(uint8_t *src, int stride);
-void ff_pred8x8_l0t_dc_neon(uint8_t *src, int stride);
-void ff_pred8x8_0lt_dc_neon(uint8_t *src, int stride);
-void ff_pred8x8_l00_dc_neon(uint8_t *src, int stride);
-void ff_pred8x8_0l0_dc_neon(uint8_t *src, int stride);
-
-#if HAVE_NEON
-static void ff_h264_pred_init_neon(H264PredContext *h)
-{
-    h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
-    h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
-    h->pred8x8[PLANE_PRED8x8    ] = ff_pred8x8_plane_neon;
-    h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
-    
-    h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
-    h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
-    h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
-    h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
-    h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
-    h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
-    h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
-
-
-    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
-    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
-    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
-    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
-    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
-    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
-    h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
-}
-#endif
-
-void ff_h264_pred_init_arm(H264PredContext *h)
-{
-    if (HAVE_NEON)    ff_h264_pred_init_neon(h);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/h264pred_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/h264pred_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,362 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
-.if \n == 8 || \hi == 0
-        vld1.8          {\rd[0]}, [\rs], \rt
-        vld1.8          {\rd[1]}, [\rs], \rt
-        vld1.8          {\rd[2]}, [\rs], \rt
-        vld1.8          {\rd[3]}, [\rs], \rt
-.endif
-.if \n == 8 || \hi == 1
-        vld1.8          {\rd[4]}, [\rs], \rt
-        vld1.8          {\rd[5]}, [\rs], \rt
-        vld1.8          {\rd[6]}, [\rs], \rt
-        vld1.8          {\rd[7]}, [\rs], \rt
-.endif
-        .endm
-
-        .macro add16x8  dq,  dl,  dh,  rl,  rh
-        vaddl.u8        \dq, \rl, \rh
-        vadd.u16        \dl, \dl, \dh
-        vpadd.u16       \dl, \dl, \dl
-        vpadd.u16       \dl, \dl, \dl
-        .endm
-
-function ff_pred16x16_128_dc_neon, export=1
-        vmov.i8         q0,  #128
-        b               .L_pred16x16_dc_end
-endfunc
-
-function ff_pred16x16_top_dc_neon, export=1
-        sub             r2,  r0,  r1
-        vld1.8          {q0},     [r2,:128]
-        add16x8         q0,  d0,  d1,  d0,  d1
-        vrshrn.u16      d0,  q0,  #4
-        vdup.8          q0,  d0[0]
-        b               .L_pred16x16_dc_end
-endfunc
-
-function ff_pred16x16_left_dc_neon, export=1
-        sub             r2,  r0,  #1
-        ldcol.8         d0,  r2,  r1
-        ldcol.8         d1,  r2,  r1
-        add16x8         q0,  d0,  d1,  d0,  d1
-        vrshrn.u16      d0,  q0,  #4
-        vdup.8          q0,  d0[0]
-        b               .L_pred16x16_dc_end
-endfunc
-
-function ff_pred16x16_dc_neon, export=1
-        sub             r2,  r0,  r1
-        vld1.8          {q0},     [r2,:128]
-        sub             r2,  r0,  #1
-        ldcol.8         d2,  r2,  r1
-        ldcol.8         d3,  r2,  r1
-        vaddl.u8        q0,  d0,  d1
-        vaddl.u8        q1,  d2,  d3
-        vadd.u16        q0,  q0,  q1
-        vadd.u16        d0,  d0,  d1
-        vpadd.u16       d0,  d0,  d0
-        vpadd.u16       d0,  d0,  d0
-        vrshrn.u16      d0,  q0,  #5
-        vdup.8          q0,  d0[0]
-.L_pred16x16_dc_end:
-        mov             r3,  #8
-6:      vst1.8          {q0},     [r0,:128], r1
-        vst1.8          {q0},     [r0,:128], r1
-        subs            r3,  r3,  #1
-        bne             6b
-        bx              lr
-endfunc
-
-function ff_pred16x16_hor_neon, export=1
-        sub             r2,  r0,  #1
-        mov             r3,  #16
-1:      vld1.8          {d0[],d1[]},[r2],      r1
-        vst1.8          {q0},       [r0,:128], r1
-        subs            r3,  r3,  #1
-        bne             1b
-        bx              lr
-endfunc
-
-function ff_pred16x16_vert_neon, export=1
-        sub             r0,  r0,  r1
-        vld1.8          {q0},     [r0,:128], r1
-        mov             r3,  #8
-1:      vst1.8          {q0},     [r0,:128], r1
-        vst1.8          {q0},     [r0,:128], r1
-        subs            r3,  r3,  #1
-        bne             1b
-        bx              lr
-endfunc
-
-function ff_pred16x16_plane_neon, export=1
-        sub             r3,  r0,  r1
-        add             r2,  r3,  #8
-        sub             r3,  r3,  #1
-        vld1.8          {d0},     [r3]
-        vld1.8          {d2},     [r2,:64], r1
-        ldcol.8         d1,  r3,  r1
-        add             r3,  r3,  r1
-        ldcol.8         d3,  r3,  r1
-        vrev64.8        q0,  q0
-        vaddl.u8        q8,  d2,  d3
-        vsubl.u8        q2,  d2,  d0
-        vsubl.u8        q3,  d3,  d1
-        movrel          r3,  p16weight
-        vld1.8          {q0},     [r3,:128]
-        vmul.s16        q2,  q2,  q0
-        vmul.s16        q3,  q3,  q0
-        vadd.i16        d4,  d4,  d5
-        vadd.i16        d5,  d6,  d7
-        vpadd.i16       d4,  d4,  d5
-        vpadd.i16       d4,  d4,  d4
-        vshl.i16        d5,  d4,  #2
-        vaddl.s16       q2,  d4,  d5
-        vrshrn.s32      d4,  q2,  #6
-        mov             r3,  #0
-        vtrn.16         d4,  d5
-        vadd.i16        d2,  d4,  d5
-        vshl.i16        d3,  d2,  #3
-        vrev64.16       d16, d17
-        vsub.i16        d3,  d3,  d2
-        vadd.i16        d16, d16, d0
-        vshl.i16        d2,  d16, #4
-        vsub.i16        d2,  d2,  d3
-        vshl.i16        d3,  d4,  #4
-        vext.16         q0,  q0,  q0,  #7
-        vsub.i16        d6,  d5,  d3
-        vmov.16         d0[0], r3
-        vmul.i16        q0,  q0,  d4[0]
-        vdup.16         q1,  d2[0]
-        vdup.16         q2,  d4[0]
-        vdup.16         q3,  d6[0]
-        vshl.i16        q2,  q2,  #3
-        vadd.i16        q1,  q1,  q0
-        vadd.i16        q3,  q3,  q2
-        mov             r3,  #16
-1:
-        vqshrun.s16     d0,  q1,  #5
-        vadd.i16        q1,  q1,  q2
-        vqshrun.s16     d1,  q1,  #5
-        vadd.i16        q1,  q1,  q3
-        vst1.8          {q0},     [r0,:128], r1
-        subs            r3,  r3,  #1
-        bne             1b
-        bx              lr
-endfunc
-
-        .section        .rodata
-        .align          4
-p16weight:
-        .short          1,2,3,4,5,6,7,8
-
-        .text
-
-function ff_pred8x8_hor_neon, export=1
-        sub             r2,  r0,  #1
-        mov             r3,  #8
-1:      vld1.8          {d0[]},   [r2],     r1
-        vst1.8          {d0},     [r0,:64], r1
-        subs            r3,  r3,  #1
-        bne             1b
-        bx              lr
-endfunc
-
-function ff_pred8x8_vert_neon, export=1
-        sub             r0,  r0,  r1
-        vld1.8          {d0},     [r0,:64], r1
-        mov             r3,  #4
-1:      vst1.8          {d0},     [r0,:64], r1
-        vst1.8          {d0},     [r0,:64], r1
-        subs            r3,  r3,  #1
-        bne             1b
-        bx              lr
-endfunc
-
-function ff_pred8x8_plane_neon, export=1
-        sub             r3,  r0,  r1
-        add             r2,  r3,  #4
-        sub             r3,  r3,  #1
-        vld1.32         {d0[0]},  [r3]
-        vld1.32         {d2[0]},  [r2,:32], r1
-        ldcol.8         d0,  r3,  r1,  4,  hi=1
-        add             r3,  r3,  r1
-        ldcol.8         d3,  r3,  r1,  4
-        vaddl.u8        q8,  d2,  d3
-        vrev32.8        d0,  d0
-        vtrn.32         d2,  d3
-        vsubl.u8        q2,  d2,  d0
-        movrel          r3,  p16weight
-        vld1.16         {q0},     [r3,:128]
-        vmul.s16        d4,  d4,  d0
-        vmul.s16        d5,  d5,  d0
-        vpadd.i16       d4,  d4,  d5
-        vpaddl.s16      d4,  d4
-        vshl.i32        d5,  d4,  #4
-        vadd.s32        d4,  d4,  d5
-        vrshrn.s32      d4,  q2,  #5
-        mov             r3,  #0
-        vtrn.16         d4,  d5
-        vadd.i16        d2,  d4,  d5
-        vshl.i16        d3,  d2,  #2
-        vrev64.16       d16, d16
-        vsub.i16        d3,  d3,  d2
-        vadd.i16        d16, d16, d0
-        vshl.i16        d2,  d16, #4
-        vsub.i16        d2,  d2,  d3
-        vshl.i16        d3,  d4,  #3
-        vext.16         q0,  q0,  q0,  #7
-        vsub.i16        d6,  d5,  d3
-        vmov.16         d0[0], r3
-        vmul.i16        q0,  q0,  d4[0]
-        vdup.16         q1,  d2[0]
-        vdup.16         q2,  d4[0]
-        vdup.16         q3,  d6[0]
-        vshl.i16        q2,  q2,  #3
-        vadd.i16        q1,  q1,  q0
-        vadd.i16        q3,  q3,  q2
-        mov             r3,  #8
-1:
-        vqshrun.s16     d0,  q1,  #5
-        vadd.i16        q1,  q1,  q3
-        vst1.8          {d0},     [r0,:64], r1
-        subs            r3,  r3,  #1
-        bne             1b
-        bx              lr
-endfunc
-
-function ff_pred8x8_128_dc_neon, export=1
-        vmov.i8         q0,  #128
-        b               .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_top_dc_neon, export=1
-        sub             r2,  r0,  r1
-        vld1.8          {d0},     [r2,:64]
-        vpaddl.u8       d0,  d0
-        vpadd.u16       d0,  d0,  d0
-        vrshrn.u16      d0,  q0,  #2
-        vdup.8          d1,  d0[1]
-        vdup.8          d0,  d0[0]
-        vtrn.32         d0,  d1
-        b               .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_left_dc_neon, export=1
-        sub             r2,  r0,  #1
-        ldcol.8         d0,  r2,  r1
-        vpaddl.u8       d0,  d0
-        vpadd.u16       d0,  d0,  d0
-        vrshrn.u16      d0,  q0,  #2
-        vdup.8          d1,  d0[1]
-        vdup.8          d0,  d0[0]
-        b               .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_dc_neon, export=1
-        sub             r2,  r0,  r1
-        vld1.8          {d0},     [r2,:64]
-        sub             r2,  r0,  #1
-        ldcol.8         d1,  r2,  r1
-        vtrn.32         d0,  d1
-        vpaddl.u8       q0,  q0
-        vpadd.u16       d0,  d0,  d1
-        vpadd.u16       d1,  d0,  d0
-        vrshrn.u16      d2,  q0,  #3
-        vrshrn.u16      d3,  q0,  #2
-        vdup.8          d0,  d2[4]
-        vdup.8          d1,  d3[3]
-        vdup.8          d4,  d3[2]
-        vdup.8          d5,  d2[5]
-        vtrn.32         q0,  q2
-.L_pred8x8_dc_end:
-        mov             r3,  #4
-        add             r2,  r0,  r1,  lsl #2
-6:      vst1.8          {d0},     [r0,:64], r1
-        vst1.8          {d1},     [r2,:64], r1
-        subs            r3,  r3,  #1
-        bne             6b
-        bx              lr
-endfunc
-
-function ff_pred8x8_l0t_dc_neon, export=1
-        sub             r2,  r0,  r1
-        vld1.8          {d0},     [r2,:64]
-        sub             r2,  r0,  #1
-        ldcol.8         d1,  r2,  r1,  4
-        vtrn.32         d0,  d1
-        vpaddl.u8       q0,  q0
-        vpadd.u16       d0,  d0,  d1
-        vpadd.u16       d1,  d0,  d0
-        vrshrn.u16      d2,  q0,  #3
-        vrshrn.u16      d3,  q0,  #2
-        vdup.8          d0,  d2[4]
-        vdup.8          d1,  d3[0]
-        vdup.8          q2,  d3[2]
-        vtrn.32         q0,  q2
-        b               .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_l00_dc_neon, export=1
-        sub             r2,  r0,  #1
-        ldcol.8         d0,  r2,  r1,  4
-        vpaddl.u8       d0,  d0
-        vpadd.u16       d0,  d0,  d0
-        vrshrn.u16      d0,  q0,  #2
-        vmov.i8         d1,  #128
-        vdup.8          d0,  d0[0]
-        b               .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_0lt_dc_neon, export=1
-        sub             r2,  r0,  r1
-        vld1.8          {d0},     [r2,:64]
-        add             r2,  r0,  r1,  lsl #2
-        sub             r2,  r2,  #1
-        ldcol.8         d1,  r2,  r1,  4,  hi=1
-        vtrn.32         d0,  d1
-        vpaddl.u8       q0,  q0
-        vpadd.u16       d0,  d0,  d1
-        vpadd.u16       d1,  d0,  d0
-        vrshrn.u16      d3,  q0,  #2
-        vrshrn.u16      d2,  q0,  #3
-        vdup.8          d0,  d3[0]
-        vdup.8          d1,  d3[3]
-        vdup.8          d4,  d3[2]
-        vdup.8          d5,  d2[5]
-        vtrn.32         q0,  q2
-        b               .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_0l0_dc_neon, export=1
-        add             r2,  r0,  r1,  lsl #2
-        sub             r2,  r2,  #1
-        ldcol.8         d1,  r2,  r1,  4
-        vpaddl.u8       d2,  d1
-        vpadd.u16       d2,  d2,  d2
-        vrshrn.u16      d1,  q1,  #2
-        vmov.i8         d0,  #128
-        vdup.8          d1,  d1[0]
-        b               .L_pred8x8_dc_end
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/int_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/int_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,118 +0,0 @@
-/*
- * ARM NEON optimised integer operations
- * Copyright (c) 2009 Kostya Shishkov
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-        preserve8
-        .fpu neon
-        .text
-
-function ff_scalarproduct_int16_neon, export=1
-        vmov.i16        q0,  #0
-        vmov.i16        q1,  #0
-        vmov.i16        q2,  #0
-        vmov.i16        q3,  #0
-        negs            r3,  r3
-        beq             2f
-
-        vdup.s32        q12, r3
-1:      vld1.16         {d16-d17}, [r0]!
-        vld1.16         {d20-d21}, [r1,:128]!
-        vmull.s16       q12, d16,  d20
-        vld1.16         {d18-d19}, [r0]!
-        vmull.s16       q13, d17,  d21
-        vld1.16         {d22-d23}, [r1,:128]!
-        vmull.s16       q14, d18,  d22
-        vmull.s16       q15, d19,  d23
-        vshl.s32        q8,  q12,  q12
-        vshl.s32        q9,  q13,  q12
-        vadd.s32        q0,  q0,   q8
-        vshl.s32        q10, q14,  q12
-        vadd.s32        q1,  q1,   q9
-        vshl.s32        q11, q15,  q12
-        vadd.s32        q2,  q2,   q10
-        vadd.s32        q3,  q3,   q11
-        subs            r2,  r2,   #16
-        bne             1b
-        b               3f
-
-2:      vld1.16         {d16-d17}, [r0]!
-        vld1.16         {d20-d21}, [r1,:128]!
-        vmlal.s16       q0,  d16,  d20
-        vld1.16         {d18-d19}, [r0]!
-        vmlal.s16       q1,  d17,  d21
-        vld1.16         {d22-d23}, [r1,:128]!
-        vmlal.s16       q2,  d18,  d22
-        vmlal.s16       q3,  d19,  d23
-        subs            r2,  r2,   #16
-        bne             2b
-
-3:      vpadd.s32       d16, d0,   d1
-        vpadd.s32       d17, d2,   d3
-        vpadd.s32       d10, d4,   d5
-        vpadd.s32       d11, d6,   d7
-        vpadd.s32       d0,  d16,  d17
-        vpadd.s32       d1,  d10,  d11
-        vpadd.s32       d2,  d0,   d1
-        vpaddl.s32      d3,  d2
-        vmov.32         r0,  d3[0]
-        bx              lr
-endfunc
-
-@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
-function ff_scalarproduct_and_madd_int16_neon, export=1
-        vld1.16         {d28[],d29[]}, [sp]
-        vmov.i16        q0,  #0
-        vmov.i16        q1,  #0
-        vmov.i16        q2,  #0
-        vmov.i16        q3,  #0
-        mov             r12, r0
-
-1:      vld1.16         {d16-d17}, [r0,:128]!
-        vld1.16         {d18-d19}, [r1]!
-        vld1.16         {d20-d21}, [r2]!
-        vld1.16         {d22-d23}, [r0,:128]!
-        vld1.16         {d24-d25}, [r1]!
-        vld1.16         {d26-d27}, [r2]!
-        vmul.s16        q10, q10,  q14
-        vmul.s16        q13, q13,  q14
-        vmlal.s16       q0,  d16,  d18
-        vmlal.s16       q1,  d17,  d19
-        vadd.s16        q10, q8,   q10
-        vadd.s16        q13, q11,  q13
-        vmlal.s16       q2,  d22,  d24
-        vmlal.s16       q3,  d23,  d25
-        vst1.16         {q10},     [r12,:128]!
-        subs            r3,  r3,   #16
-        vst1.16         {q13},     [r12,:128]!
-        bne             1b
-
-        vpadd.s32       d16, d0,   d1
-        vpadd.s32       d17, d2,   d3
-        vpadd.s32       d10, d4,   d5
-        vpadd.s32       d11, d6,   d7
-        vpadd.s32       d0,  d16,  d17
-        vpadd.s32       d1,  d10,  d11
-        vpadd.s32       d2,  d0,   d1
-        vpaddl.s32      d3,  d2
-        vmov.32         r0,  d3[0]
-        bx              lr
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/jrevdct_arm.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/jrevdct_arm.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,388 +0,0 @@
-/*
-   C-like prototype :
-        void j_rev_dct_arm(DCTBLOCK data)
-
-   With DCTBLOCK being a pointer to an array of 64 'signed shorts'
-
-   Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in
-   all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-   COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-   IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-*/
-
-#include "asm.S"
-
-#define FIX_0_298631336 2446
-#define FIX_0_541196100 4433
-#define FIX_0_765366865 6270
-#define FIX_1_175875602 9633
-#define FIX_1_501321110 12299
-#define FIX_2_053119869 16819
-#define FIX_3_072711026 25172
-#define FIX_M_0_390180644 -3196
-#define FIX_M_0_899976223 -7373
-#define FIX_M_1_847759065 -15137
-#define FIX_M_1_961570560 -16069
-#define FIX_M_2_562915447 -20995
-#define FIX_0xFFFF 0xFFFF
-
-#define FIX_0_298631336_ID      0
-#define FIX_0_541196100_ID      4
-#define FIX_0_765366865_ID      8
-#define FIX_1_175875602_ID     12
-#define FIX_1_501321110_ID     16
-#define FIX_2_053119869_ID     20
-#define FIX_3_072711026_ID     24
-#define FIX_M_0_390180644_ID   28
-#define FIX_M_0_899976223_ID   32
-#define FIX_M_1_847759065_ID   36
-#define FIX_M_1_961570560_ID   40
-#define FIX_M_2_562915447_ID   44
-#define FIX_0xFFFF_ID          48
-        .text
-        .align
-
-function ff_j_rev_dct_arm, export=1
-        stmdb   sp!, { r4 - r12, lr }   @ all callee saved regs
-
-        sub sp, sp, #4                  @ reserve some space on the stack
-        str r0, [ sp ]                  @ save the DCT pointer to the stack
-
-        mov lr, r0                      @ lr = pointer to the current row
-        mov r12, #8                     @ r12 = row-counter
-        adr r11, const_array            @ r11 = base pointer to the constants array
-row_loop:
-        ldrsh r0, [lr, # 0]             @ r0 = 'd0'
-        ldrsh r2, [lr, # 2]             @ r2 = 'd2'
-
-        @ Optimization for row that have all items except the first set to 0
-        @ (this works as the DCTELEMS are always 4-byte aligned)
-        ldr r5, [lr, # 0]
-        ldr r6, [lr, # 4]
-        ldr r3, [lr, # 8]
-        ldr r4, [lr, #12]
-        orr r3, r3, r4
-        orr r3, r3, r6
-        orrs r5, r3, r5
-        beq end_of_row_loop             @ nothing to be done as ALL of them are '0'
-        orrs r3, r3, r2
-        beq empty_row
-
-        ldrsh r1, [lr, # 8]             @ r1 = 'd1'
-        ldrsh r4, [lr, # 4]             @ r4 = 'd4'
-        ldrsh r6, [lr, # 6]             @ r6 = 'd6'
-
-        ldr r3, [r11, #FIX_0_541196100_ID]
-        add r7, r2, r6
-        ldr r5, [r11, #FIX_M_1_847759065_ID]
-        mul r7, r3, r7                      @ r7 = z1
-        ldr r3, [r11, #FIX_0_765366865_ID]
-        mla r6, r5, r6, r7                  @ r6 = tmp2
-        add r5, r0, r4                      @ r5 = tmp0
-        mla r2, r3, r2, r7                  @ r2 = tmp3
-        sub r3, r0, r4                      @ r3 = tmp1
-
-        add r0, r2, r5, lsl #13             @ r0 = tmp10
-        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
-        add r4, r6, r3, lsl #13             @ r4 = tmp11
-        rsb r3, r6, r3, lsl #13             @ r3 = tmp12
-
-        stmdb   sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
-
-        ldrsh r3, [lr, #10]             @ r3 = 'd3'
-        ldrsh r5, [lr, #12]             @ r5 = 'd5'
-        ldrsh r7, [lr, #14]             @ r7 = 'd7'
-
-        add r0, r3, r5                        @ r0 = 'z2'
-        add r2, r1, r7                  @ r2 = 'z1'
-        add r4, r3, r7                  @ r4 = 'z3'
-        add r6, r1, r5                  @ r6 = 'z4'
-        ldr r9, [r11, #FIX_1_175875602_ID]
-        add r8, r4, r6                  @ r8 = z3 + z4
-        ldr r10, [r11, #FIX_M_0_899976223_ID]
-        mul r8, r9, r8                  @ r8 = 'z5'
-        ldr r9, [r11, #FIX_M_2_562915447_ID]
-        mul r2, r10, r2                 @ r2 = 'z1'
-        ldr r10, [r11, #FIX_M_1_961570560_ID]
-        mul r0, r9, r0                  @ r0 = 'z2'
-        ldr r9, [r11, #FIX_M_0_390180644_ID]
-        mla r4, r10, r4, r8             @ r4 = 'z3'
-        ldr r10, [r11, #FIX_0_298631336_ID]
-        mla r6, r9, r6, r8              @ r6 = 'z4'
-        ldr r9, [r11, #FIX_2_053119869_ID]
-        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
-        ldr r10, [r11, #FIX_3_072711026_ID]
-        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
-        ldr r9, [r11, #FIX_1_501321110_ID]
-        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
-        add r7, r7, r4                  @ r7 = tmp0
-        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
-        add r5,        r5, r6                  @ r5 = tmp1
-        add r3, r3, r4                  @ r3 = tmp2
-        add r1, r1, r6                  @ r1 = tmp3
-
-        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
-                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
-
-        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
-        add r8, r0, r1
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 0]
-
-        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
-        sub r8, r0, r1
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, #14]
-
-        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
-        add r8, r6, r3
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 2]
-
-        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
-        sub r8, r6, r3
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, #12]
-
-        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
-        add r8, r4, r5
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 4]
-
-        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
-        sub r8, r4, r5
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, #10]
-
-        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
-        add r8, r2, r7
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 6]
-
-        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
-        sub r8, r2, r7
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 8]
-
-        @ End of row loop
-        add lr, lr, #16
-        subs r12, r12, #1
-        bne row_loop
-        beq start_column_loop
-
-empty_row:
-        ldr r1, [r11, #FIX_0xFFFF_ID]
-        mov r0, r0, lsl #2
-        and r0, r0, r1
-        add r0, r0, r0, lsl #16
-        str r0, [lr, # 0]
-        str r0, [lr, # 4]
-        str r0, [lr, # 8]
-        str r0, [lr, #12]
-
-end_of_row_loop:
-        @ End of loop
-        add lr, lr, #16
-        subs r12, r12, #1
-        bne row_loop
-
-start_column_loop:
-        @ Start of column loop
-        ldr lr, [ sp ]
-        mov r12, #8
-column_loop:
-        ldrsh r0, [lr, #( 0*8)]             @ r0 = 'd0'
-        ldrsh r2, [lr, #( 4*8)]             @ r2 = 'd2'
-        ldrsh r4, [lr, #( 8*8)]             @ r4 = 'd4'
-        ldrsh r6, [lr, #(12*8)]             @ r6 = 'd6'
-
-        ldr r3, [r11, #FIX_0_541196100_ID]
-        add r1, r2, r6
-        ldr r5, [r11, #FIX_M_1_847759065_ID]
-        mul r1, r3, r1                      @ r1 = z1
-        ldr r3, [r11, #FIX_0_765366865_ID]
-        mla r6, r5, r6, r1                  @ r6 = tmp2
-        add r5, r0, r4                      @ r5 = tmp0
-        mla r2, r3, r2, r1                  @ r2 = tmp3
-        sub r3, r0, r4                      @ r3 = tmp1
-
-        add r0, r2, r5, lsl #13             @ r0 = tmp10
-        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
-        add r4, r6, r3, lsl #13             @ r4 = tmp11
-        rsb r6, r6, r3, lsl #13             @ r6 = tmp12
-
-        ldrsh r1, [lr, #( 2*8)]             @ r1 = 'd1'
-        ldrsh r3, [lr, #( 6*8)]             @ r3 = 'd3'
-        ldrsh r5, [lr, #(10*8)]             @ r5 = 'd5'
-        ldrsh r7, [lr, #(14*8)]             @ r7 = 'd7'
-
-        @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
-        orr r9, r1, r3
-        orr r10, r5, r7
-        orrs r10, r9, r10
-        beq empty_odd_column
-
-        stmdb   sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11
-
-        add r0, r3, r5                  @ r0 = 'z2'
-        add r2, r1, r7                  @ r2 = 'z1'
-        add r4, r3, r7                  @ r4 = 'z3'
-        add r6, r1, r5                  @ r6 = 'z4'
-        ldr r9, [r11, #FIX_1_175875602_ID]
-        add r8, r4, r6
-        ldr r10, [r11, #FIX_M_0_899976223_ID]
-        mul r8, r9, r8                  @ r8 = 'z5'
-        ldr r9, [r11, #FIX_M_2_562915447_ID]
-        mul r2, r10, r2                 @ r2 = 'z1'
-        ldr r10, [r11, #FIX_M_1_961570560_ID]
-        mul r0, r9, r0                  @ r0 = 'z2'
-        ldr r9, [r11, #FIX_M_0_390180644_ID]
-        mla r4, r10, r4, r8             @ r4 = 'z3'
-        ldr r10, [r11, #FIX_0_298631336_ID]
-        mla r6, r9, r6, r8              @ r6 = 'z4'
-        ldr r9, [r11, #FIX_2_053119869_ID]
-        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
-        ldr r10, [r11, #FIX_3_072711026_ID]
-        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
-        ldr r9, [r11, #FIX_1_501321110_ID]
-        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
-        add r7, r7, r4                  @ r7 = tmp0
-        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
-        add r5,        r5, r6                  @ r5 = tmp1
-        add r3, r3, r4                  @ r3 = tmp2
-        add r1, r1, r6                  @ r1 = tmp3
-
-        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
-                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
-
-        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
-        add r8, r0, r1
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 0*8)]
-
-        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
-        sub r8, r0, r1
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #(14*8)]
-
-        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
-        add r8, r4, r3
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 2*8)]
-
-        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
-        sub r8, r4, r3
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #(12*8)]
-
-        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
-        add r8, r6, r5
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 4*8)]
-
-        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
-        sub r8, r6, r5
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #(10*8)]
-
-        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
-        add r8, r2, r7
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 6*8)]
-
-        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
-        sub r8, r2, r7
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 8*8)]
-
-        @ End of row loop
-        add lr, lr, #2
-        subs r12, r12, #1
-        bne column_loop
-        beq the_end
-
-empty_odd_column:
-        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
-        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
-        add r0, r0, #(1<<17)
-        mov r0, r0, asr #18
-        strh r0, [lr, #( 0*8)]
-        strh r0, [lr, #(14*8)]
-
-        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
-        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
-        add r4, r4, #(1<<17)
-        mov r4, r4, asr #18
-        strh r4, [lr, #( 2*8)]
-        strh r4, [lr, #(12*8)]
-
-        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
-        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
-        add r6, r6, #(1<<17)
-        mov r6, r6, asr #18
-        strh r6, [lr, #( 4*8)]
-        strh r6, [lr, #(10*8)]
-
-        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
-        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
-        add r2, r2, #(1<<17)
-        mov r2, r2, asr #18
-        strh r2, [lr, #( 6*8)]
-        strh r2, [lr, #( 8*8)]
-
-        @ End of row loop
-        add lr, lr, #2
-        subs r12, r12, #1
-        bne column_loop
-
-the_end:
-        @ The end....
-        add sp, sp, #4
-        ldmia   sp!, { r4 - r12, pc }   @ restore callee saved regs and return
-
-const_array:
-        .align
-        .word FIX_0_298631336
-        .word FIX_0_541196100
-        .word FIX_0_765366865
-        .word FIX_1_175875602
-        .word FIX_1_501321110
-        .word FIX_2_053119869
-        .word FIX_3_072711026
-        .word FIX_M_0_390180644
-        .word FIX_M_0_899976223
-        .word FIX_M_1_847759065
-        .word FIX_M_1_961570560
-        .word FIX_M_2_562915447
-        .word FIX_0xFFFF
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mathops.h
--- a/ffmpeg_smp/h264dec/libavcodec/arm/mathops.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,116 +0,0 @@
-/*
- * simple math operations
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_MATHOPS_H
-#define AVCODEC_ARM_MATHOPS_H
-
-#include <stdint.h>
-#include "config.h"
-#include "libavutil/common.h"
-
-#if HAVE_INLINE_ASM
-
-#   define MULL MULL
-static inline av_const int MULL(int a, int b, unsigned shift)
-{
-    int lo, hi;
-    __asm__("smull %0, %1, %2, %3     \n\t"
-            "mov   %0, %0,     lsr %4 \n\t"
-            "add   %1, %0, %1, lsl %5 \n\t"
-            : "=&r"(lo), "=&r"(hi)
-            : "r"(b), "r"(a), "ir"(shift), "ir"(32-shift));
-    return hi;
-}
-
-#define MULH MULH
-#if HAVE_ARMV6
-static inline av_const int MULH(int a, int b)
-{
-    int r;
-    __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
-    return r;
-}
-#else
-static inline av_const int MULH(int a, int b)
-{
-    int lo, hi;
-    __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a));
-    return hi;
-}
-#endif
-
-static inline av_const int64_t MUL64(int a, int b)
-{
-    union { uint64_t x; unsigned hl[2]; } x;
-    __asm__ ("smull %0, %1, %2, %3"
-             : "=r"(x.hl[0]), "=r"(x.hl[1]) : "r"(a), "r"(b));
-    return x.x;
-}
-#define MUL64 MUL64
-
-static inline av_const int64_t MAC64(int64_t d, int a, int b)
-{
-    union { uint64_t x; unsigned hl[2]; } x = { d };
-    __asm__ ("smlal %0, %1, %2, %3"
-             : "+r"(x.hl[0]), "+r"(x.hl[1]) : "r"(a), "r"(b));
-    return x.x;
-}
-#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
-#define MLS64(d, a, b) MAC64(d, -(a), b)
-
-#if HAVE_ARMV5TE
-
-/* signed 16x16 -> 32 multiply add accumulate */
-#   define MAC16(rt, ra, rb)                                            \
-    __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
-
-/* signed 16x16 -> 32 multiply */
-#   define MUL16 MUL16
-static inline av_const int MUL16(int ra, int rb)
-{
-    int rt;
-    __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
-    return rt;
-}
-
-#endif
-
-#define mid_pred mid_pred
-static inline av_const int mid_pred(int a, int b, int c)
-{
-    int m;
-    __asm__ volatile (
-        "mov   %0, %2  \n\t"
-        "cmp   %1, %2  \n\t"
-        "movgt %0, %1  \n\t"
-        "movgt %1, %2  \n\t"
-        "cmp   %1, %3  \n\t"
-        "movle %1, %3  \n\t"
-        "cmp   %0, %1  \n\t"
-        "movgt %0, %1  \n\t"
-        : "=&r"(m), "+r"(a)
-        : "r"(b), "r"(c));
-    return m;
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-#endif /* AVCODEC_ARM_MATHOPS_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mdct_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/mdct_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,303 +0,0 @@
-/*
- * ARM NEON optimised MDCT
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-        preserve8
-
-        .text
-
-#define ff_fft_calc_neon X(ff_fft_calc_neon)
-
-function ff_imdct_half_neon, export=1
-        push            {r4-r8,lr}
-
-        mov             r12, #1
-        ldr             lr,  [r0, #28]          @ mdct_bits
-        ldr             r4,  [r0, #32]          @ tcos
-        ldr             r3,  [r0, #8]           @ revtab
-        lsl             r12, r12, lr            @ n  = 1 << nbits
-        lsr             lr,  r12, #2            @ n4 = n >> 2
-        add             r7,  r2,  r12,  lsl #1
-        mov             r12, #-16
-        sub             r7,  r7,  #16
-
-        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
-        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
-        vrev64.32       d17, d17
-        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s2
-        vmul.f32        d6,  d17, d2
-        vmul.f32        d7,  d0,  d2
-1:
-        subs            lr,  lr,  #2
-        ldr             r6,  [r3], #4
-        vmul.f32        d4,  d0,  d3
-        vmul.f32        d5,  d17, d3
-        vsub.f32        d4,  d6,  d4
-        vadd.f32        d5,  d5,  d7
-        uxth            r8,  r6,  ror #16
-        uxth            r6,  r6
-        add             r8,  r1,  r8,  lsl #3
-        add             r6,  r1,  r6,  lsl #3
-        beq             1f
-        vld2.32         {d16-d17},[r7,:128],r12
-        vld2.32         {d0-d1},  [r2,:128]!
-        vrev64.32       d17, d17
-        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s2
-        vmul.f32        d6,  d17, d2
-        vmul.f32        d7,  d0,  d2
-        vst2.32         {d4[0],d5[0]}, [r6,:64]
-        vst2.32         {d4[1],d5[1]}, [r8,:64]
-        b               1b
-1:
-        vst2.32         {d4[0],d5[0]}, [r6,:64]
-        vst2.32         {d4[1],d5[1]}, [r8,:64]
-
-        mov             r4,  r0
-        mov             r6,  r1
-        bl              ff_fft_calc_neon
-
-        mov             r12, #1
-        ldr             lr,  [r4, #28]          @ mdct_bits
-        ldr             r4,  [r4, #32]          @ tcos
-        lsl             r12, r12, lr            @ n  = 1 << nbits
-        lsr             lr,  r12, #3            @ n8 = n >> 3
-
-        add             r4,  r4,  lr,  lsl #3
-        add             r6,  r6,  lr,  lsl #3
-        sub             r1,  r4,  #16
-        sub             r3,  r6,  #16
-
-        mov             r7,  #-16
-        mov             r8,  r6
-        mov             r0,  r3
-
-        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
-        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
-        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
-1:
-        subs            lr,  lr,  #2
-        vmul.f32        d7,  d0,  d18
-        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
-        vmul.f32        d4,  d1,  d18
-        vmul.f32        d5,  d21, d19
-        vmul.f32        d6,  d20, d19
-        vmul.f32        d22, d1,  d16
-        vmul.f32        d23, d21, d17
-        vmul.f32        d24, d0,  d16
-        vmul.f32        d25, d20, d17
-        vadd.f32        d7,  d7,  d22
-        vadd.f32        d6,  d6,  d23
-        vsub.f32        d4,  d4,  d24
-        vsub.f32        d5,  d5,  d25
-        beq             1f
-        vld2.32         {d0-d1},  [r3,:128], r7
-        vld2.32         {d20-d21},[r6,:128]!
-        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
-        vrev64.32       q3,  q3
-        vst2.32         {d4,d6},  [r0,:128], r7
-        vst2.32         {d5,d7},  [r8,:128]!
-        b               1b
-1:
-        vrev64.32       q3,  q3
-        vst2.32         {d4,d6},  [r0,:128]
-        vst2.32         {d5,d7},  [r8,:128]
-
-        pop             {r4-r8,pc}
-endfunc
-
-function ff_imdct_calc_neon, export=1
-        push            {r4-r6,lr}
-
-        ldr             r3,  [r0, #28]
-        mov             r4,  #1
-        mov             r5,  r1
-        lsl             r4,  r4,  r3
-        add             r1,  r1,  r4
-
-        bl              ff_imdct_half_neon
-
-        add             r0,  r5,  r4,  lsl #2
-        add             r1,  r5,  r4,  lsl #1
-        sub             r0,  r0,  #8
-        sub             r2,  r1,  #16
-        mov             r3,  #-16
-        mov             r6,  #-8
-        vmov.i32        d30, #1<<31
-1:
-        vld1.32         {d0-d1},  [r2,:128], r3
-        pld             [r0, #-16]
-        vrev64.32       q0,  q0
-        vld1.32         {d2-d3},  [r1,:128]!
-        veor            d4,  d1,  d30
-        pld             [r2, #-16]
-        vrev64.32       q1,  q1
-        veor            d5,  d0,  d30
-        vst1.32         {d2},     [r0,:64], r6
-        vst1.32         {d3},     [r0,:64], r6
-        vst1.32         {d4-d5},  [r5,:128]!
-        subs            r4,  r4,  #16
-        bgt             1b
-
-        pop             {r4-r6,pc}
-endfunc
-
-function ff_mdct_calc_neon, export=1
-        push            {r4-r10,lr}
-
-        mov             r12, #1
-        ldr             lr,  [r0, #28]          @ mdct_bits
-        ldr             r4,  [r0, #32]          @ tcos
-        ldr             r3,  [r0, #8]           @ revtab
-        lsl             lr,  r12, lr            @ n  = 1 << nbits
-        add             r7,  r2,  lr            @ in4u
-        sub             r9,  r7,  #16           @ in4d
-        add             r2,  r7,  lr,  lsl #1   @ in3u
-        add             r8,  r9,  lr,  lsl #1   @ in3d
-        add             r5,  r4,  lr,  lsl #1
-        sub             r5,  r5,  #16
-        sub             r3,  r3,  #4
-        mov             r12, #-16
-
-        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
-        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
-        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
-        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
-        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
-        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
-        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
-        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
-        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
-        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
-        vsub.f32        d16, d16, d2            @ in0u-in2d      R
-        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
-1:
-        vmul.f32        d7,  d0,  d21           @  I*s
-        ldr             r10, [r3, lr, lsr #1]
-        vmul.f32        d6,  d1,  d20           @ -R*c
-        ldr             r6,  [r3, #4]!
-        vmul.f32        d4,  d1,  d21           @ -R*s
-        vmul.f32        d5,  d0,  d20           @  I*c
-        vmul.f32        d24, d16, d30           @  R*c
-        vmul.f32        d25, d17, d31           @ -I*s
-        vmul.f32        d22, d16, d31           @  R*s
-        vmul.f32        d23, d17, d30           @  I*c
-        subs            lr,  lr,  #16
-        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
-        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
-        vsub.f32        d24, d25, d24           @ I*s-R*c
-        vadd.f32        d25, d22, d23           @ R*s-I*c
-        beq             1f
-        mov             r12, #-16
-        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
-        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
-        vneg.f32        d7,  d7                 @  R*s-I*c
-        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
-        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
-        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
-        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
-        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
-        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
-        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
-        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
-        vsub.f32        d16, d16, d2            @ in0u-in2d      R
-        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
-        uxth            r12, r6,  ror #16
-        uxth            r6,  r6
-        add             r12, r1,  r12, lsl #3
-        add             r6,  r1,  r6,  lsl #3
-        vst2.32         {d6[0],d7[0]}, [r6,:64]
-        vst2.32         {d6[1],d7[1]}, [r12,:64]
-        uxth            r6,  r10, ror #16
-        uxth            r10, r10
-        add             r6 , r1,  r6,  lsl #3
-        add             r10, r1,  r10, lsl #3
-        vst2.32         {d24[0],d25[0]},[r10,:64]
-        vst2.32         {d24[1],d25[1]},[r6,:64]
-        b               1b
-1:
-        vneg.f32        d7,  d7                 @  R*s-I*c
-        uxth            r12, r6,  ror #16
-        uxth            r6,  r6
-        add             r12, r1,  r12, lsl #3
-        add             r6,  r1,  r6,  lsl #3
-        vst2.32         {d6[0],d7[0]}, [r6,:64]
-        vst2.32         {d6[1],d7[1]}, [r12,:64]
-        uxth            r6,  r10, ror #16
-        uxth            r10, r10
-        add             r6 , r1,  r6,  lsl #3
-        add             r10, r1,  r10, lsl #3
-        vst2.32         {d24[0],d25[0]},[r10,:64]
-        vst2.32         {d24[1],d25[1]},[r6,:64]
-
-        mov             r4,  r0
-        mov             r6,  r1
-        bl              ff_fft_calc_neon
-
-        mov             r12, #1
-        ldr             lr,  [r4, #28]          @ mdct_bits
-        ldr             r4,  [r4, #32]          @ tcos
-        lsl             r12, r12, lr            @ n  = 1 << nbits
-        lsr             lr,  r12, #3            @ n8 = n >> 3
-
-        add             r4,  r4,  lr,  lsl #3
-        add             r6,  r6,  lr,  lsl #3
-        sub             r1,  r4,  #16
-        sub             r3,  r6,  #16
-
-        mov             r7,  #-16
-        mov             r8,  r6
-        mov             r0,  r3
-
-        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
-        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
-        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
-1:
-        subs            lr,  lr,  #2
-        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
-        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
-        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
-        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
-        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
-        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
-        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
-        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
-        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
-        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
-        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
-        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
-        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
-        vneg.f32        q2,  q2
-        beq             1f
-        vld2.32         {d0-d1},  [r3,:128], r7
-        vld2.32         {d20-d21},[r6,:128]!
-        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
-        vrev64.32       q3,  q3
-        vst2.32         {d4,d6},  [r0,:128], r7
-        vst2.32         {d5,d7},  [r8,:128]!
-        b               1b
-1:
-        vrev64.32       q3,  q3
-        vst2.32         {d4,d6},  [r0,:128]
-        vst2.32         {d5,d7},  [r8,:128]
-
-        pop             {r4-r10,pc}
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2002 Michael Niedermayer
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/mpegvideo.h"
-#include "mpegvideo_arm.h"
-
-void MPV_common_init_arm(MpegEncContext *s)
-{
-    /* IWMMXT support is a superset of armv5te, so
-     * allow optimized functions for armv5te unless
-     * a better iwmmxt function exists
-     */
-#if HAVE_ARMV5TE
-    MPV_common_init_armv5te(s);
-#endif
-#if HAVE_IWMMXT
-    MPV_common_init_iwmmxt(s);
-#endif
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.h
--- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,27 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_MPEGVIDEO_H
-#define AVCODEC_ARM_MPEGVIDEO_H
-
-#include "libavcodec/mpegvideo.h"
-
-void MPV_common_init_iwmmxt(MpegEncContext *s);
-void MPV_common_init_armv5te(MpegEncContext *s);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,101 +0,0 @@
-/*
- * Optimization of some functions from mpegvideo.c for armv5te
- * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/mpegvideo.h"
-#include "mpegvideo_arm.h"
-
-void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count);
-
-#ifdef ENABLE_ARM_TESTS
-/**
- * h263 dequantizer supplementary function, it is performance critical and needs to
- * have optimized implementations for each architecture. Is also used as a reference
- * implementation in regression tests
- */
-static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count)
-{
-    int i, level;
-    for (i = 0; i < count; i++) {
-        level = block[i];
-        if (level) {
-            if (level < 0) {
-                level = level * qmul - qadd;
-            } else {
-                level = level * qmul + qadd;
-            }
-            block[i] = level;
-        }
-    }
-}
-#endif
-
-static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
-                                  DCTELEM *block, int n, int qscale)
-{
-    int level, qmul, qadd;
-    int nCoeffs;
-
-    assert(s->block_last_index[n]>=0);
-
-    qmul = qscale << 1;
-
-    if (!s->h263_aic) {
-        if (n < 4)
-            level = block[0] * s->y_dc_scale;
-        else
-            level = block[0] * s->c_dc_scale;
-        qadd = (qscale - 1) | 1;
-    }else{
-        qadd = 0;
-        level = block[0];
-    }
-    if(s->ac_pred)
-        nCoeffs=63;
-    else
-        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
-    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
-    block[0] = level;
-}
-
-static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
-                                  DCTELEM *block, int n, int qscale)
-{
-    int qmul, qadd;
-    int nCoeffs;
-
-    assert(s->block_last_index[n]>=0);
-
-    qadd = (qscale - 1) | 1;
-    qmul = qscale << 1;
-
-    nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
-    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
-}
-
-void MPV_common_init_armv5te(MpegEncContext *s)
-{
-    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
-    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te_s.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te_s.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,117 +0,0 @@
-/*
- * Optimization of some functions from mpegvideo.c for armv5te
- * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "asm.S"
-
-/*
- * Special optimized version of dct_unquantize_h263_helper_c, it
- * requires the block to be at least 8 bytes aligned, and may process
- * more elements than requested.  But it is guaranteed to never
- * process more than 64 elements provided that count argument is <= 64,
- * so it is safe. This function is optimized for a common distribution
- * of values for nCoeffs (they are mostly multiple of 8 plus one or
- * two extra elements). So this function processes data as 8 elements
- * per loop iteration and contains optional 2 elements processing in
- * the end.
- *
- * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
- */
-function ff_dct_unquantize_h263_armv5te, export=1
-        push            {r4-r9,lr}
-        mov             ip, #0
-        subs            r3, r3, #2
-        ble             2f
-        ldrd            r4, [r0, #0]
-1:
-        ldrd            r6, [r0, #8]
-
-        rsbs            r9, ip, r4, asr #16
-        addgt           r9, r2, #0
-        rsblt           r9, r2, #0
-        smlatbne        r9, r4, r1, r9
-
-        rsbs            lr, ip, r5, asr #16
-        addgt           lr, r2, #0
-        rsblt           lr, r2, #0
-        smlatbne        lr, r5, r1, lr
-
-        rsbs            r8, ip, r4, asl #16
-        addgt           r8, r2, #0
-        rsblt           r8, r2, #0
-        smlabbne        r4, r4, r1, r8
-
-        rsbs            r8, ip, r5, asl #16
-        addgt           r8, r2, #0
-        rsblt           r8, r2, #0
-        smlabbne        r5, r5, r1, r8
-
-        strh            r4, [r0], #2
-        strh            r9, [r0], #2
-        strh            r5, [r0], #2
-        strh            lr, [r0], #2
-
-        rsbs            r9, ip, r6, asr #16
-        addgt           r9, r2, #0
-        rsblt           r9, r2, #0
-        smlatbne        r9, r6, r1, r9
-
-        rsbs            lr, ip, r7, asr #16
-        addgt           lr, r2, #0
-        rsblt           lr, r2, #0
-        smlatbne        lr, r7, r1, lr
-
-        rsbs            r8, ip, r6, asl #16
-        addgt           r8, r2, #0
-        rsblt           r8, r2, #0
-        smlabbne        r6, r6, r1, r8
-
-        rsbs            r8, ip, r7, asl #16
-        addgt           r8, r2, #0
-        rsblt           r8, r2, #0
-        smlabbne        r7, r7, r1, r8
-
-        strh            r6, [r0], #2
-        strh            r9, [r0], #2
-        strh            r7, [r0], #2
-        strh            lr, [r0], #2
-
-        subs            r3, r3, #8
-        ldrgtd          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
-        bgt             1b
-
-        adds            r3, r3, #2
-        pople           {r4-r9,pc}
-2:
-        ldrsh           r9, [r0, #0]
-        ldrsh           lr, [r0, #2]
-        mov             r8, r2
-        cmp             r9, #0
-        rsblt           r8, r2, #0
-        smlabbne        r9, r9, r1, r8
-        mov             r8, r2
-        cmp             lr, #0
-        rsblt           r8, r2, #0
-        smlabbne        lr, lr, r1, r8
-        strh            r9, [r0], #2
-        strh            lr, [r0], #2
-        pop             {r4-r9,pc}
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_iwmmxt.c
--- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_iwmmxt.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,120 +0,0 @@
-/*
- * copyright (c) 2004 AGAWA Koji
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/mpegvideo.h"
-#include "mpegvideo_arm.h"
-
-static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
-                                             DCTELEM *block, int n, int qscale)
-{
-    int level, qmul, qadd;
-    int nCoeffs;
-    DCTELEM *block_orig = block;
-
-    assert(s->block_last_index[n]>=0);
-
-    qmul = qscale << 1;
-
-    if (!s->h263_aic) {
-        if (n < 4)
-            level = block[0] * s->y_dc_scale;
-        else
-            level = block[0] * s->c_dc_scale;
-        qadd = (qscale - 1) | 1;
-    }else{
-        qadd = 0;
-        level = block[0];
-    }
-    if(s->ac_pred)
-        nCoeffs=63;
-    else
-        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
-    __asm__ volatile (
-/*      "movd %1, %%mm6                 \n\t" //qmul */
-/*      "packssdw %%mm6, %%mm6          \n\t" */
-/*      "packssdw %%mm6, %%mm6          \n\t" */
-        "tbcsth wr6, %[qmul]            \n\t"
-/*      "movd %2, %%mm5                 \n\t" //qadd */
-/*      "packssdw %%mm5, %%mm5          \n\t" */
-/*      "packssdw %%mm5, %%mm5          \n\t" */
-        "tbcsth wr5, %[qadd]            \n\t"
-        "wzero wr7                      \n\t" /* "pxor %%mm7, %%mm7             \n\t" */
-        "wzero wr4                      \n\t" /* "pxor %%mm4, %%mm4             \n\t" */
-        "wsubh wr7, wr5, wr7            \n\t" /* "psubw %%mm5, %%mm7            \n\t" */
-        "1:                             \n\t"
-        "wldrd wr2, [%[block]]          \n\t" /* "movq (%0, %3), %%mm0          \n\t" */
-        "wldrd wr3, [%[block], #8]      \n\t" /* "movq 8(%0, %3), %%mm1         \n\t" */
-        "wmulsl wr0, wr6, wr2           \n\t" /* "pmullw %%mm6, %%mm0           \n\t" */
-        "wmulsl wr1, wr6, wr3           \n\t" /* "pmullw %%mm6, %%mm1           \n\t" */
-/*      "movq (%0, %3), %%mm2           \n\t" */
-/*      "movq 8(%0, %3), %%mm3          \n\t" */
-        "wcmpgtsh wr2, wr4, wr2         \n\t" /* "pcmpgtw %%mm4, %%mm2          \n\t" // block[i] < 0 ? -1 : 0 */
-        "wcmpgtsh wr3, wr4, wr2         \n\t" /* "pcmpgtw %%mm4, %%mm3          \n\t" // block[i] < 0 ? -1 : 0 */
-        "wxor wr0, wr2, wr0             \n\t" /* "pxor %%mm2, %%mm0             \n\t" */
-        "wxor wr1, wr3, wr1             \n\t" /* "pxor %%mm3, %%mm1             \n\t" */
-        "waddh wr0, wr7, wr0            \n\t" /* "paddw %%mm7, %%mm0            \n\t" */
-        "waddh wr1, wr7, wr1            \n\t" /* "paddw %%mm7, %%mm1            \n\t" */
-        "wxor wr2, wr0, wr2             \n\t" /* "pxor %%mm0, %%mm2             \n\t" */
-        "wxor wr3, wr1, wr3             \n\t" /* "pxor %%mm1, %%mm3             \n\t" */
-        "wcmpeqh wr0, wr7, wr0          \n\t" /* "pcmpeqw %%mm7, %%mm0          \n\t" // block[i] == 0 ? -1 : 0 */
-        "wcmpeqh wr1, wr7, wr1          \n\t" /* "pcmpeqw %%mm7, %%mm1          \n\t" // block[i] == 0 ? -1 : 0 */
-        "wandn wr0, wr2, wr0            \n\t" /* "pandn %%mm2, %%mm0            \n\t" */
-        "wandn wr1, wr3, wr1            \n\t" /* "pandn %%mm3, %%mm1            \n\t" */
-        "wstrd wr0, [%[block]]          \n\t" /* "movq %%mm0, (%0, %3)          \n\t" */
-        "wstrd wr1, [%[block], #8]      \n\t" /* "movq %%mm1, 8(%0, %3)         \n\t" */
-        "add %[block], %[block], #16    \n\t" /* "addl $16, %3                  \n\t" */
-        "subs %[i], %[i], #1            \n\t"
-        "bne 1b                         \n\t" /* "jng 1b                                \n\t" */
-        :[block]"+r"(block)
-        :[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd)
-        :"memory");
-
-    block_orig[0] = level;
-}
-
-#if 0
-static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
-                                             DCTELEM *block, int n, int qscale)
-{
-    int nCoeffs;
-
-    assert(s->block_last_index[n]>=0);
-
-    if(s->ac_pred)
-        nCoeffs=63;
-    else
-        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
-    ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale);
-}
-#endif
-
-void MPV_common_init_iwmmxt(MpegEncContext *s)
-{
-    if (!(mm_flags & FF_MM_IWMMXT)) return;
-
-    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;
-#if 0
-    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
-#endif
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/rdft_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/rdft_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,151 +0,0 @@
-/*
- * ARM NEON optimised RDFT
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-        preserve8
-
-function ff_rdft_calc_neon, export=1
-        push            {r4-r8,lr}
-
-        ldr             r6,  [r0, #4]           @ inverse
-        mov             r4,  r0
-        mov             r5,  r1
-
-        lsls            r6,  r6,  #31
-        bne             1f
-        add             r0,  r4,  #20
-        bl              X(ff_fft_permute_neon)
-        add             r0,  r4,  #20
-        mov             r1,  r5
-        bl              X(ff_fft_calc_neon)
-1:
-        ldr             r12, [r4, #0]           @ nbits
-        mov             r2,  #1
-        lsl             r12, r2,  r12
-        add             r0,  r5,  #8
-        add             r1,  r5,  r12, lsl #2
-        lsr             r12, r12, #2
-        ldr             r2,  [r4, #12]          @ tcos
-        sub             r12, r12, #2
-        ldr             r3,  [r4, #16]          @ tsin
-        mov             r7,  r0
-        sub             r1,  r1,  #8
-        mov             lr,  r1
-        mov             r8,  #-8
-        vld1.32         {d0},     [r0,:64]!     @ d1[0,1]
-        vld1.32         {d1},     [r1,:64], r8  @ d2[0,1]
-        vld1.32         {d4},     [r2,:64]!     @ tcos[i]
-        vld1.32         {d5},     [r3,:64]!     @ tsin[i]
-        vmov.f32        d18, #0.5               @ k1
-        vdup.32         d19, r6
-        pld             [r0, #32]
-        veor            d19, d18, d19           @ k2
-        vmov.i32        d16, #0
-        vmov.i32        d17, #1<<31
-        pld             [r1, #-32]
-        vtrn.32         d16, d17
-        pld             [r2, #32]
-        vrev64.32       d16, d16                @ d16=1,0 d17=0,1
-        pld             [r3, #32]
-2:
-        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
-        vld1.32         {d24},    [r0,:64]!     @  d1[0,1]
-        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
-        vld1.32         {d25},    [r1,:64], r8  @  d2[0,1]
-        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
-        veor            q3,  q12, q8            @ -d1[0],d1[1], d2[0],-d2[1]
-        pld             [r0, #32]
-        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
-        pld             [r1, #-32]
-        vadd.f32        d0,  d24, d7            @  d1[0]+d2[0], d1[1]-d2[1]
-        vadd.f32        d1,  d6,  d25           @ -d1[0]+d2[0], d1[1]+d2[1]
-        vmul.f32        q11, q0,  q9            @  ev.re, ev.im, od.im, od.re
-        veor            d7,  d21, d16           @ -od.im, od.re
-        vrev64.32       d3,  d21                @  od.re, od.im
-        veor            d6,  d20, d17           @  ev.re,-ev.im
-        veor            d2,  d3,  d16           @ -od.re, od.im
-        vmla.f32        d20, d3,  d4[1]
-        vmla.f32        d20, d7,  d5[1]
-        vmla.f32        d6,  d2,  d4[1]
-        vmla.f32        d6,  d21, d5[1]
-        vld1.32         {d4},     [r2,:64]!     @  tcos[i]
-        veor            d7,  d23, d16           @ -od.im, od.re
-        vld1.32         {d5},     [r3,:64]!     @  tsin[i]
-        veor            d24, d22, d17           @  ev.re,-ev.im
-        vrev64.32       d3,  d23                @  od.re, od.im
-        pld             [r2, #32]
-        veor            d2,  d3,  d16           @ -od.re, od.im
-        pld             [r3, #32]
-        vmla.f32        d22, d3,  d4[0]
-        vmla.f32        d22, d7,  d5[0]
-        vmla.f32        d24, d2,  d4[0]
-        vmla.f32        d24, d23, d5[0]
-        vld1.32         {d0},     [r0,:64]!     @  d1[0,1]
-        vld1.32         {d1},     [r1,:64], r8  @  d2[0,1]
-        vst1.32         {d20},    [r7,:64]!
-        vst1.32         {d6},     [lr,:64], r8
-        vst1.32         {d22},    [r7,:64]!
-        vst1.32         {d24},    [lr,:64], r8
-        subs            r12, r12, #2
-        bgt             2b
-
-        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
-        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
-        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
-        ldr             r2,  [r4, #8]           @  sign_convention
-        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
-        add             r0,  r0,  #4
-        bfc             r2,  #0,  #31
-        vld1.32         {d0[0]},  [r0,:32]
-        veor            d7,  d21, d16           @ -od.im, od.re
-        vrev64.32       d3,  d21                @  od.re, od.im
-        veor            d6,  d20, d17           @  ev.re,-ev.im
-        vld1.32         {d22},    [r5,:64]
-        vdup.32         d1,  r2
-        vmov            d23, d22
-        veor            d2,  d3,  d16           @ -od.re, od.im
-        vtrn.32         d22, d23
-        veor            d0,  d0,  d1
-        veor            d23, d23, d17
-        vmla.f32        d20, d3,  d4[1]
-        vmla.f32        d20, d7,  d5[1]
-        vmla.f32        d6,  d2,  d4[1]
-        vmla.f32        d6,  d21, d5[1]
-        vadd.f32        d22, d22, d23
-        vst1.32         {d20},    [r7,:64]
-        vst1.32         {d6},     [lr,:64]
-        vst1.32         {d0[0]},  [r0,:32]
-        vst1.32         {d22},    [r5,:64]
-
-        cmp             r6,  #0
-        popeq           {r4-r8,pc}
-
-        vmul.f32        d22, d22, d18
-        vst1.32         {d22},    [r5,:64]
-        add             r0,  r4,  #20
-        mov             r1,  r5
-        bl              X(ff_fft_permute_neon)
-        add             r0,  r4,  #20
-        mov             r1,  r5
-        pop             {r4-r8,lr}
-        b               X(ff_fft_calc_neon)
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_arm.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_arm.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,486 +0,0 @@
-/*
- * simple_idct_arm.S
- * Copyright (C) 2002 Frederic 'dilb' Boulay
- *
- * Author: Frederic Boulay <dilb@handhelds.org>
- *
- * The function defined in this file is derived from the simple_idct function
- * from the libavcodec library part of the FFmpeg project.
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-/* useful constants for the algorithm, they are save in __constant_ptr__ at */
-/* the end of the source code.*/
-#define W1  22725
-#define W2  21407
-#define W3  19266
-#define W4  16383
-#define W5  12873
-#define W6  8867
-#define W7  4520
-#define MASK_MSHW 0xFFFF0000
-
-/* offsets of the constants in the vector */
-#define offW1  0
-#define offW2  4
-#define offW3  8
-#define offW4  12
-#define offW5  16
-#define offW6  20
-#define offW7  24
-#define offMASK_MSHW 28
-
-#define ROW_SHIFT 11
-#define ROW_SHIFT2MSHW (16-11)
-#define COL_SHIFT 20
-#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
-#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
-
-
-        .text
-
-function ff_simple_idct_arm, export=1
-        @@ void simple_idct_arm(int16_t *block)
-        @@ save stack for reg needed (take all of them),
-        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
-        @@ so it must not be overwritten, if it is not saved!!
-        @@ R12 is another scratch register, so it should not be saved too
-        @@ save all registers
-        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
-        @@ at this point, R0=block, other registers are free.
-        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block.
-        adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
-        @@ add 2 temporary variables in the stack: R0 and R14
-        sub sp, sp, #8          @ allow 2 local variables
-        str r0, [sp, #0]        @ save block in sp[0]
-        @@ stack status
-        @@ sp+4   free
-        @@ sp+0   R0  (block)
-
-
-        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
-
-
-__row_loop:
-        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
-        ldr r1, [r14, #0]        @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
-        ldr r2, [r14, #4]        @ R2=(int32)(R12)[1]=ROWr32[1]
-        ldr r3, [r14, #8]        @ R3=ROWr32[2]
-        ldr r4, [r14, #12]       @ R4=ROWr32[3]
-        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
-        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
-        @@ else follow the complete algorithm.
-        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
-        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
-        orr r5, r4, r3           @ R5=R4 | R3
-        orr r5, r5, r2           @ R5=R4 | R3 | R2
-        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
-        beq __end_row_loop
-        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
-        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
-        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
-        beq __almost_empty_row
-
-__b_evaluation:
-        @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
-        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
-        @@     R12=__const_ptr_, R14=&block[n]
-        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
-
-        @@ MUL16(b0, W1, row[1]);
-        @@ MUL16(b1, W3, row[1]);
-        @@ MUL16(b2, W5, row[1]);
-        @@ MUL16(b3, W7, row[1]);
-        @@ MAC16(b0, W3, row[3]);
-        @@ MAC16(b1, -W7, row[3]);
-        @@ MAC16(b2, -W1, row[3]);
-        @@ MAC16(b3, -W5, row[3]);
-        ldr r8, [r12, #offW1]    @ R8=W1
-        mov r2, r2, asr #16      @ R2=ROWr16[3]
-        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-        ldr r9, [r12, #offW3]    @ R9=W3
-        ldr r10, [r12, #offW5]   @ R10=W5
-        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-        ldr r11, [r12, #offW7]   @ R11=W7
-        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-                teq r2, #0               @ if null avoid muls
-                mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-        rsbne r2, r2, #0         @ R2=-ROWr16[3]
-        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-
-        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
-        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
-        @@     R12=__const_ptr_, R14=&block[n]
-        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
-        @@ if (temp != 0) {}
-        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
-        beq __end_b_evaluation
-
-        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
-        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
-        @@     R12=__const_ptr_, R14=&block[n]
-        @@ MAC16(b0, W5, row[5]);
-        @@ MAC16(b2, W7, row[5]);
-        @@ MAC16(b3, W3, row[5]);
-        @@ MAC16(b1, -W1, row[5]);
-        @@ MAC16(b0, W7, row[7]);
-        @@ MAC16(b2, W3, row[7]);
-        @@ MAC16(b3, -W1, row[7]);
-        @@ MAC16(b1, -W5, row[7]);
-        mov r3, r3, asr #16      @ R3=ROWr16[5]
-                teq r3, #0               @ if null avoid muls
-        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
-        mov r4, r4, asr #16      @ R4=ROWr16[7]
-        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
-        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
-        rsbne r3, r3, #0         @ R3=-ROWr16[5]
-        mlane r1, r8, r3, r1     @ R7-=W1*ROWr16[5]=b1
-        @@ R3 is free now
-                teq r4, #0               @ if null avoid muls
-        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
-        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
-        rsbne r4, r4, #0         @ R4=-ROWr16[7]
-        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
-        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
-        @@ R4 is free now
-__end_b_evaluation:
-        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
-        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
-        @@     R12=__const_ptr_, R14=&block[n]
-
-__a_evaluation:
-        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
-        @@ a1 = a0 + W6 * row[2];
-        @@ a2 = a0 - W6 * row[2];
-        @@ a3 = a0 - W2 * row[2];
-        @@ a0 = a0 + W2 * row[2];
-        ldr r9, [r12, #offW4]    @ R9=W4
-        mul r6, r9, r6           @ R6=W4*ROWr16[0]
-        ldr r10, [r12, #offW6]   @ R10=W6
-        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
-        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
-
-        mul r11, r10, r4         @ R11=W6*ROWr16[2]
-        ldr r8, [r12, #offW2]    @ R8=W2
-        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
-        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
-        @@ if (temp != 0) {}
-        teq r2, #0
-        beq __end_bef_a_evaluation
-
-        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
-        mul r11, r8, r4          @ R11=W2*ROWr16[2]
-        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
-        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
-
-
-        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
-        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
-        @@     R12=__const_ptr_, R14=&block[n]
-
-
-        @@ a0 += W4*row[4]
-        @@ a1 -= W4*row[4]
-        @@ a2 -= W4*row[4]
-        @@ a3 += W4*row[4]
-        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
-                teq r11, #0              @ if null avoid muls
-        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
-        @@ R9 is free now
-        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
-        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
-        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
-        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
-        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
-        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
-                teq r9, #0               @ if null avoid muls
-        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
-        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
-        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
-        @@ a0 += W6*row[6];
-        @@ a3 -= W6*row[6];
-        @@ a1 -= W2*row[6];
-        @@ a2 += W2*row[6];
-        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
-        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
-        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
-
-__end_a_evaluation:
-        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
-        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
-        @@     R12=__const_ptr_, R14=&block[n]
-        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
-        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
-        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
-        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
-        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
-        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
-        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
-        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
-        add r8, r6, r0           @ R8=a0+b0
-        add r9, r2, r1           @ R9=a1+b1
-        @@ put 2 16 bits half-words in a 32bits word
-        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
-        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
-        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
-        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
-        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
-        orr r8, r8, r9
-        str r8, [r14, #0]
-
-        add r8, r3, r5           @ R8=a2+b2
-        add r9, r4, r7           @ R9=a3+b3
-        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
-        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
-        orr r8, r8, r9
-        str r8, [r14, #4]
-
-        sub r8, r4, r7           @ R8=a3-b3
-        sub r9, r3, r5           @ R9=a2-b2
-        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
-        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
-        orr r8, r8, r9
-        str r8, [r14, #8]
-
-        sub r8, r2, r1           @ R8=a1-b1
-        sub r9, r6, r0           @ R9=a0-b0
-        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
-        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
-        orr r8, r8, r9
-        str r8, [r14, #12]
-
-        bal __end_row_loop
-
-__almost_empty_row:
-        @@ the row was empty, except ROWr16[0], now, management of this special case
-        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
-        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
-        @@                R8=0xFFFF (temp), R9-R11 free
-        mov r8, #0x10000         @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
-        sub r8, r8, #1           @ R8 is now ready.
-        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
-        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
-        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
-        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
-        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
-        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
-
-__end_row_loop:
-        @@ at this point, R0-R11 (free)
-        @@     R12=__const_ptr_, R14=&block[n]
-        ldr r0, [sp, #0]         @ R0=block
-        teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
-        sub r14, r14, #16
-        bne __row_loop
-
-
-
-        @@ at this point, R0=block, R1-R11 (free)
-        @@     R12=__const_ptr_, R14=&block[n]
-        add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
-__col_loop:
-
-__b_evaluation2:
-        @@ at this point, R0=block (temp),  R1-R11 (free)
-        @@     R12=__const_ptr_, R14=&block[n]
-        @@ proceed with b0-b3 first, followed by a0-a3
-        @@ MUL16(b0, W1, col[8x1]);
-        @@ MUL16(b1, W3, col[8x1]);
-        @@ MUL16(b2, W5, col[8x1]);
-        @@ MUL16(b3, W7, col[8x1]);
-        @@ MAC16(b0, W3, col[8x3]);
-        @@ MAC16(b1, -W7, col[8x3]);
-        @@ MAC16(b2, -W1, col[8x3]);
-        @@ MAC16(b3, -W5, col[8x3]);
-        ldr r8, [r12, #offW1]    @ R8=W1
-        ldrsh r7, [r14, #16]
-        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-        ldr r9, [r12, #offW3]    @ R9=W3
-        ldr r10, [r12, #offW5]   @ R10=W5
-        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-        ldr r11, [r12, #offW7]   @ R11=W7
-        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-        ldrsh r2, [r14, #48]
-        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-        teq r2, #0               @ if 0, then avoid muls
-        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-        rsbne r2, r2, #0         @ R2=-ROWr16[3]
-        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-
-        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
-        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
-        @@     R12=__const_ptr_, R14=&block[n]
-        @@ MAC16(b0, W5, col[5x8]);
-        @@ MAC16(b2, W7, col[5x8]);
-        @@ MAC16(b3, W3, col[5x8]);
-        @@ MAC16(b1, -W1, col[5x8]);
-        @@ MAC16(b0, W7, col[7x8]);
-        @@ MAC16(b2, W3, col[7x8]);
-        @@ MAC16(b3, -W1, col[7x8]);
-        @@ MAC16(b1, -W5, col[7x8]);
-        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
-        teq r3, #0               @ if 0 then avoid muls
-        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5x8]=b0
-        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5x8]=b2
-        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5x8]=b3
-        rsbne r3, r3, #0         @ R3=-ROWr16[5x8]
-        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
-        mlane r1, r8, r3, r1     @ R7-=W1*ROWr16[5x8]=b1
-        @@ R3 is free now
-        teq r4, #0               @ if 0 then avoid muls
-        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7x8]=b0
-        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7x8]=b2
-        rsbne r4, r4, #0         @ R4=-ROWr16[7x8]
-        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7x8]=b3
-        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7x8]=b1
-        @@ R4 is free now
-__end_b_evaluation2:
-        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
-        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
-        @@     R12=__const_ptr_, R14=&block[n]
-
-__a_evaluation2:
-        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
-        @@ a1 = a0 + W6 * row[2];
-        @@ a2 = a0 - W6 * row[2];
-        @@ a3 = a0 - W2 * row[2];
-        @@ a0 = a0 + W2 * row[2];
-        ldrsh r6, [r14, #0]
-        ldr r9, [r12, #offW4]    @ R9=W4
-        mul r6, r9, r6           @ R6=W4*ROWr16[0]
-        ldr r10, [r12, #offW6]   @ R10=W6
-        ldrsh r4, [r14, #32]     @ R4=ROWr16[2] (a3 not defined yet)
-        add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
-        mul r11, r10, r4         @ R11=W6*ROWr16[2]
-        ldr r8, [r12, #offW2]    @ R8=W2
-        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
-        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
-        mul r11, r8, r4          @ R11=W2*ROWr16[2]
-        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
-        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
-
-        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
-        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
-        @@     R12=__const_ptr_, R14=&block[n]
-        @@ a0 += W4*row[4]
-        @@ a1 -= W4*row[4]
-        @@ a2 -= W4*row[4]
-        @@ a3 += W4*row[4]
-        ldrsh r11, [r14, #64]    @ R11=ROWr16[4]
-        teq r11, #0              @ if null avoid muls
-        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
-        @@ R9 is free now
-        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
-        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
-        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
-        ldrsh r9, [r14, #96]     @ R9=ROWr16[6]
-        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
-        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
-        teq r9, #0               @ if null avoid muls
-        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
-        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
-        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
-        @@ a0 += W6*row[6];
-        @@ a3 -= W6*row[6];
-        @@ a1 -= W2*row[6];
-        @@ a2 += W2*row[6];
-        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
-        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
-        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
-__end_a_evaluation2:
-        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
-        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
-        @@     R12=__const_ptr_, R14=&block[n]
-        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
-        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
-        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
-        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
-        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
-        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
-        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
-        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
-        @@@@@ no optimization here @@@@@
-        add r8, r6, r0           @ R8=a0+b0
-        add r9, r2, r1           @ R9=a1+b1
-        mov r8, r8, asr #COL_SHIFT
-        mov r9, r9, asr #COL_SHIFT
-        strh r8, [r14, #0]
-        strh r9, [r14, #16]
-        add r8, r3, r5           @ R8=a2+b2
-        add r9, r4, r7           @ R9=a3+b3
-        mov r8, r8, asr #COL_SHIFT
-        mov r9, r9, asr #COL_SHIFT
-        strh r8, [r14, #32]
-        strh r9, [r14, #48]
-        sub r8, r4, r7           @ R8=a3-b3
-        sub r9, r3, r5           @ R9=a2-b2
-        mov r8, r8, asr #COL_SHIFT
-        mov r9, r9, asr #COL_SHIFT
-        strh r8, [r14, #64]
-        strh r9, [r14, #80]
-        sub r8, r2, r1           @ R8=a1-b1
-        sub r9, r6, r0           @ R9=a0-b0
-        mov r8, r8, asr #COL_SHIFT
-        mov r9, r9, asr #COL_SHIFT
-        strh r8, [r14, #96]
-        strh r9, [r14, #112]
-
-__end_col_loop:
-        @@ at this point, R0-R11 (free)
-        @@     R12=__const_ptr_, R14=&block[n]
-        ldr r0, [sp, #0]         @ R0=block
-        teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
-        sub r14, r14, #2
-        bne __col_loop
-
-
-
-
-__end_simple_idct_arm:
-        @@ restore registers to previous status!
-        add sp, sp, #8 @@ the local variables!
-        ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
-
-
-
-@@ kind of sub-function, here not to overload the common case.
-__end_bef_a_evaluation:
-        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
-        mul r11, r8, r4          @ R11=W2*ROWr16[2]
-        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
-        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
-        bal __end_a_evaluation
-
-
-__constant_ptr__:  @@ see #defines at the beginning of the source code for values.
-        .align
-        .word   W1
-        .word   W2
-        .word   W3
-        .word   W4
-        .word   W5
-        .word   W6
-        .word   W7
-        .word   MASK_MSHW
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv5te.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv5te.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,703 +0,0 @@
-/*
- * Simple IDCT
- *
- * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
- * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-#define W1  22725   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W2  21407   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W3  19266   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W4  16383   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W5  12873   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W6  8867    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W7  4520    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define ROW_SHIFT 11
-#define COL_SHIFT 20
-
-#define W13 (W1 | (W3 << 16))
-#define W26 (W2 | (W6 << 16))
-#define W57 (W5 | (W7 << 16))
-
-        .text
-        .align
-w13:    .long W13
-w26:    .long W26
-w57:    .long W57
-
-function idct_row_armv5te
-        str    lr, [sp, #-4]!
-
-        ldrd   v1, [a1, #8]
-        ldrd   a3, [a1]              /* a3 = row[1:0], a4 = row[3:2] */
-        orrs   v1, v1, v2
-        cmpeq  v1, a4
-        cmpeq  v1, a3, lsr #16
-        beq    row_dc_only
-
-        mov    v1, #(1<<(ROW_SHIFT-1))
-        mov    ip, #16384
-        sub    ip, ip, #1            /* ip = W4 */
-        smlabb v1, ip, a3, v1        /* v1 = W4*row[0]+(1<<(RS-1)) */
-        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
-        smultb a2, ip, a4
-        smulbb lr, ip, a4
-        add    v2, v1, a2
-        sub    v3, v1, a2
-        sub    v4, v1, lr
-        add    v1, v1, lr
-
-        ldr    ip, w13               /* ip = W1 | (W3 << 16) */
-        ldr    lr, w57               /* lr = W5 | (W7 << 16) */
-        smulbt v5, ip, a3
-        smultt v6, lr, a4
-        smlatt v5, ip, a4, v5
-        smultt a2, ip, a3
-        smulbt v7, lr, a3
-        sub    v6, v6, a2
-        smulbt a2, ip, a4
-        smultt fp, lr, a3
-        sub    v7, v7, a2
-        smulbt a2, lr, a4
-        ldrd   a3, [a1, #8]          /* a3=row[5:4] a4=row[7:6] */
-        sub    fp, fp, a2
-
-        orrs   a2, a3, a4
-        beq    1f
-
-        smlabt v5, lr, a3, v5
-        smlabt v6, ip, a3, v6
-        smlatt v5, lr, a4, v5
-        smlabt v6, lr, a4, v6
-        smlatt v7, lr, a3, v7
-        smlatt fp, ip, a3, fp
-        smulbt a2, ip, a4
-        smlatt v7, ip, a4, v7
-        sub    fp, fp, a2
-
-        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
-        mov    a2, #16384
-        sub    a2, a2, #1            /* a2 =  W4 */
-        smulbb a2, a2, a3            /* a2 =  W4*row[4] */
-        smultb lr, ip, a4            /* lr =  W6*row[6] */
-        add    v1, v1, a2            /* v1 += W4*row[4] */
-        add    v1, v1, lr            /* v1 += W6*row[6] */
-        add    v4, v4, a2            /* v4 += W4*row[4] */
-        sub    v4, v4, lr            /* v4 -= W6*row[6] */
-        smulbb lr, ip, a4            /* lr =  W2*row[6] */
-        sub    v2, v2, a2            /* v2 -= W4*row[4] */
-        sub    v2, v2, lr            /* v2 -= W2*row[6] */
-        sub    v3, v3, a2            /* v3 -= W4*row[4] */
-        add    v3, v3, lr            /* v3 += W2*row[6] */
-
-1:      add    a2, v1, v5
-        mov    a3, a2, lsr #11
-        bic    a3, a3, #0x1f0000
-        sub    a2, v2, v6
-        mov    a2, a2, lsr #11
-        add    a3, a3, a2, lsl #16
-        add    a2, v3, v7
-        mov    a4, a2, lsr #11
-        bic    a4, a4, #0x1f0000
-        add    a2, v4, fp
-        mov    a2, a2, lsr #11
-        add    a4, a4, a2, lsl #16
-        strd   a3, [a1]
-
-        sub    a2, v4, fp
-        mov    a3, a2, lsr #11
-        bic    a3, a3, #0x1f0000
-        sub    a2, v3, v7
-        mov    a2, a2, lsr #11
-        add    a3, a3, a2, lsl #16
-        add    a2, v2, v6
-        mov    a4, a2, lsr #11
-        bic    a4, a4, #0x1f0000
-        sub    a2, v1, v5
-        mov    a2, a2, lsr #11
-        add    a4, a4, a2, lsl #16
-        strd   a3, [a1, #8]
-
-        ldr    pc, [sp], #4
-
-row_dc_only:
-        orr    a3, a3, a3, lsl #16
-        bic    a3, a3, #0xe000
-        mov    a3, a3, lsl #3
-        mov    a4, a3
-        strd   a3, [a1]
-        strd   a3, [a1, #8]
-
-        ldr    pc, [sp], #4
-endfunc
-
-        .macro idct_col
-        ldr    a4, [a1]              /* a4 = col[1:0] */
-        mov    ip, #16384
-        sub    ip, ip, #1            /* ip = W4 */
-#if 0
-        mov    v1, #(1<<(COL_SHIFT-1))
-        smlabt v2, ip, a4, v1        /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
-        smlabb v1, ip, a4, v1        /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
-        ldr    a4, [a1, #(16*4)]
-#else
-        mov    v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
-        add    v2, v1, a4, asr #16
-        rsb    v2, v2, v2, lsl #14
-        mov    a4, a4, lsl #16
-        add    v1, v1, a4, asr #16
-        ldr    a4, [a1, #(16*4)]
-        rsb    v1, v1, v1, lsl #14
-#endif
-
-        smulbb lr, ip, a4
-        smulbt a3, ip, a4
-        sub    v3, v1, lr
-        sub    v5, v1, lr
-        add    v7, v1, lr
-        add    v1, v1, lr
-        sub    v4, v2, a3
-        sub    v6, v2, a3
-        add    fp, v2, a3
-        ldr    ip, w26
-        ldr    a4, [a1, #(16*2)]
-        add    v2, v2, a3
-
-        smulbb lr, ip, a4
-        smultb a3, ip, a4
-        add    v1, v1, lr
-        sub    v7, v7, lr
-        add    v3, v3, a3
-        sub    v5, v5, a3
-        smulbt lr, ip, a4
-        smultt a3, ip, a4
-        add    v2, v2, lr
-        sub    fp, fp, lr
-        add    v4, v4, a3
-        ldr    a4, [a1, #(16*6)]
-        sub    v6, v6, a3
-
-        smultb lr, ip, a4
-        smulbb a3, ip, a4
-        add    v1, v1, lr
-        sub    v7, v7, lr
-        sub    v3, v3, a3
-        add    v5, v5, a3
-        smultt lr, ip, a4
-        smulbt a3, ip, a4
-        add    v2, v2, lr
-        sub    fp, fp, lr
-        sub    v4, v4, a3
-        add    v6, v6, a3
-
-        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
-
-        ldr    ip, w13
-        ldr    a4, [a1, #(16*1)]
-        ldr    lr, w57
-        smulbb v1, ip, a4
-        smultb v3, ip, a4
-        smulbb v5, lr, a4
-        smultb v7, lr, a4
-        smulbt v2, ip, a4
-        smultt v4, ip, a4
-        smulbt v6, lr, a4
-        smultt fp, lr, a4
-        rsb    v4, v4, #0
-        ldr    a4, [a1, #(16*3)]
-        rsb    v3, v3, #0
-
-        smlatb v1, ip, a4, v1
-        smlatb v3, lr, a4, v3
-        smulbb a3, ip, a4
-        smulbb a2, lr, a4
-        sub    v5, v5, a3
-        sub    v7, v7, a2
-        smlatt v2, ip, a4, v2
-        smlatt v4, lr, a4, v4
-        smulbt a3, ip, a4
-        smulbt a2, lr, a4
-        sub    v6, v6, a3
-        ldr    a4, [a1, #(16*5)]
-        sub    fp, fp, a2
-
-        smlabb v1, lr, a4, v1
-        smlabb v3, ip, a4, v3
-        smlatb v5, lr, a4, v5
-        smlatb v7, ip, a4, v7
-        smlabt v2, lr, a4, v2
-        smlabt v4, ip, a4, v4
-        smlatt v6, lr, a4, v6
-        ldr    a3, [a1, #(16*7)]
-        smlatt fp, ip, a4, fp
-
-        smlatb v1, lr, a3, v1
-        smlabb v3, lr, a3, v3
-        smlatb v5, ip, a3, v5
-        smulbb a4, ip, a3
-        smlatt v2, lr, a3, v2
-        sub    v7, v7, a4
-        smlabt v4, lr, a3, v4
-        smulbt a4, ip, a3
-        smlatt v6, ip, a3, v6
-        sub    fp, fp, a4
-        .endm
-
-function idct_col_armv5te
-        str    lr, [sp, #-4]!
-
-        idct_col
-
-        ldmfd  sp!, {a3, a4}
-        adds   a2, a3, v1
-        mov    a2, a2, lsr #20
-        orrmi  a2, a2, #0xf000
-        add    ip, a4, v2
-        mov    ip, ip, asr #20
-        orr    a2, a2, ip, lsl #16
-        str    a2, [a1]
-        subs   a3, a3, v1
-        mov    a2, a3, lsr #20
-        orrmi  a2, a2, #0xf000
-        sub    a4, a4, v2
-        mov    a4, a4, asr #20
-        orr    a2, a2, a4, lsl #16
-        ldmfd  sp!, {a3, a4}
-        str    a2, [a1, #(16*7)]
-
-        subs   a2, a3, v3
-        mov    a2, a2, lsr #20
-        orrmi  a2, a2, #0xf000
-        sub    ip, a4, v4
-        mov    ip, ip, asr #20
-        orr    a2, a2, ip, lsl #16
-        str    a2, [a1, #(16*1)]
-        adds   a3, a3, v3
-        mov    a2, a3, lsr #20
-        orrmi  a2, a2, #0xf000
-        add    a4, a4, v4
-        mov    a4, a4, asr #20
-        orr    a2, a2, a4, lsl #16
-        ldmfd  sp!, {a3, a4}
-        str    a2, [a1, #(16*6)]
-
-        adds   a2, a3, v5
-        mov    a2, a2, lsr #20
-        orrmi  a2, a2, #0xf000
-        add    ip, a4, v6
-        mov    ip, ip, asr #20
-        orr    a2, a2, ip, lsl #16
-        str    a2, [a1, #(16*2)]
-        subs   a3, a3, v5
-        mov    a2, a3, lsr #20
-        orrmi  a2, a2, #0xf000
-        sub    a4, a4, v6
-        mov    a4, a4, asr #20
-        orr    a2, a2, a4, lsl #16
-        ldmfd  sp!, {a3, a4}
-        str    a2, [a1, #(16*5)]
-
-        adds   a2, a3, v7
-        mov    a2, a2, lsr #20
-        orrmi  a2, a2, #0xf000
-        add    ip, a4, fp
-        mov    ip, ip, asr #20
-        orr    a2, a2, ip, lsl #16
-        str    a2, [a1, #(16*3)]
-        subs   a3, a3, v7
-        mov    a2, a3, lsr #20
-        orrmi  a2, a2, #0xf000
-        sub    a4, a4, fp
-        mov    a4, a4, asr #20
-        orr    a2, a2, a4, lsl #16
-        str    a2, [a1, #(16*4)]
-
-        ldr    pc, [sp], #4
-endfunc
-
-function idct_col_put_armv5te
-        str    lr, [sp, #-4]!
-
-        idct_col
-
-        ldmfd  sp!, {a3, a4}
-        ldr    lr, [sp, #32]
-        add    a2, a3, v1
-        movs   a2, a2, asr #20
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        add    ip, a4, v2
-        movs   ip, ip, asr #20
-        movmi  ip, #0
-        cmp    ip, #255
-        movgt  ip, #255
-        orr    a2, a2, ip, lsl #8
-        sub    a3, a3, v1
-        movs   a3, a3, asr #20
-        movmi  a3, #0
-        cmp    a3, #255
-        movgt  a3, #255
-        sub    a4, a4, v2
-        movs   a4, a4, asr #20
-        movmi  a4, #0
-        cmp    a4, #255
-        ldr    v1, [sp, #28]
-        movgt  a4, #255
-        strh   a2, [v1]
-        add    a2, v1, #2
-        str    a2, [sp, #28]
-        orr    a2, a3, a4, lsl #8
-        rsb    v2, lr, lr, lsl #3
-        ldmfd  sp!, {a3, a4}
-        strh   a2, [v2, v1]!
-
-        sub    a2, a3, v3
-        movs   a2, a2, asr #20
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        sub    ip, a4, v4
-        movs   ip, ip, asr #20
-        movmi  ip, #0
-        cmp    ip, #255
-        movgt  ip, #255
-        orr    a2, a2, ip, lsl #8
-        strh   a2, [v1, lr]!
-        add    a3, a3, v3
-        movs   a2, a3, asr #20
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        add    a4, a4, v4
-        movs   a4, a4, asr #20
-        movmi  a4, #0
-        cmp    a4, #255
-        movgt  a4, #255
-        orr    a2, a2, a4, lsl #8
-        ldmfd  sp!, {a3, a4}
-        strh   a2, [v2, -lr]!
-
-        add    a2, a3, v5
-        movs   a2, a2, asr #20
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        add    ip, a4, v6
-        movs   ip, ip, asr #20
-        movmi  ip, #0
-        cmp    ip, #255
-        movgt  ip, #255
-        orr    a2, a2, ip, lsl #8
-        strh   a2, [v1, lr]!
-        sub    a3, a3, v5
-        movs   a2, a3, asr #20
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        sub    a4, a4, v6
-        movs   a4, a4, asr #20
-        movmi  a4, #0
-        cmp    a4, #255
-        movgt  a4, #255
-        orr    a2, a2, a4, lsl #8
-        ldmfd  sp!, {a3, a4}
-        strh   a2, [v2, -lr]!
-
-        add    a2, a3, v7
-        movs   a2, a2, asr #20
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        add    ip, a4, fp
-        movs   ip, ip, asr #20
-        movmi  ip, #0
-        cmp    ip, #255
-        movgt  ip, #255
-        orr    a2, a2, ip, lsl #8
-        strh   a2, [v1, lr]
-        sub    a3, a3, v7
-        movs   a2, a3, asr #20
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        sub    a4, a4, fp
-        movs   a4, a4, asr #20
-        movmi  a4, #0
-        cmp    a4, #255
-        movgt  a4, #255
-        orr    a2, a2, a4, lsl #8
-        strh   a2, [v2, -lr]
-
-        ldr    pc, [sp], #4
-endfunc
-
-function idct_col_add_armv5te
-        str    lr, [sp, #-4]!
-
-        idct_col
-
-        ldr    lr, [sp, #36]
-
-        ldmfd  sp!, {a3, a4}
-        ldrh   ip, [lr]
-        add    a2, a3, v1
-        mov    a2, a2, asr #20
-        sub    a3, a3, v1
-        and    v1, ip, #255
-        adds   a2, a2, v1
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        add    v1, a4, v2
-        mov    v1, v1, asr #20
-        adds   v1, v1, ip, lsr #8
-        movmi  v1, #0
-        cmp    v1, #255
-        movgt  v1, #255
-        orr    a2, a2, v1, lsl #8
-        ldr    v1, [sp, #32]
-        sub    a4, a4, v2
-        rsb    v2, v1, v1, lsl #3
-        ldrh   ip, [v2, lr]!
-        strh   a2, [lr]
-        mov    a3, a3, asr #20
-        and    a2, ip, #255
-        adds   a3, a3, a2
-        movmi  a3, #0
-        cmp    a3, #255
-        movgt  a3, #255
-        mov    a4, a4, asr #20
-        adds   a4, a4, ip, lsr #8
-        movmi  a4, #0
-        cmp    a4, #255
-        movgt  a4, #255
-        add    a2, lr, #2
-        str    a2, [sp, #28]
-        orr    a2, a3, a4, lsl #8
-        strh   a2, [v2]
-
-        ldmfd  sp!, {a3, a4}
-        ldrh   ip, [lr, v1]!
-        sub    a2, a3, v3
-        mov    a2, a2, asr #20
-        add    a3, a3, v3
-        and    v3, ip, #255
-        adds   a2, a2, v3
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        sub    v3, a4, v4
-        mov    v3, v3, asr #20
-        adds   v3, v3, ip, lsr #8
-        movmi  v3, #0
-        cmp    v3, #255
-        movgt  v3, #255
-        orr    a2, a2, v3, lsl #8
-        add    a4, a4, v4
-        ldrh   ip, [v2, -v1]!
-        strh   a2, [lr]
-        mov    a3, a3, asr #20
-        and    a2, ip, #255
-        adds   a3, a3, a2
-        movmi  a3, #0
-        cmp    a3, #255
-        movgt  a3, #255
-        mov    a4, a4, asr #20
-        adds   a4, a4, ip, lsr #8
-        movmi  a4, #0
-        cmp    a4, #255
-        movgt  a4, #255
-        orr    a2, a3, a4, lsl #8
-        strh   a2, [v2]
-
-        ldmfd  sp!, {a3, a4}
-        ldrh   ip, [lr, v1]!
-        add    a2, a3, v5
-        mov    a2, a2, asr #20
-        sub    a3, a3, v5
-        and    v3, ip, #255
-        adds   a2, a2, v3
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        add    v3, a4, v6
-        mov    v3, v3, asr #20
-        adds   v3, v3, ip, lsr #8
-        movmi  v3, #0
-        cmp    v3, #255
-        movgt  v3, #255
-        orr    a2, a2, v3, lsl #8
-        sub    a4, a4, v6
-        ldrh   ip, [v2, -v1]!
-        strh   a2, [lr]
-        mov    a3, a3, asr #20
-        and    a2, ip, #255
-        adds   a3, a3, a2
-        movmi  a3, #0
-        cmp    a3, #255
-        movgt  a3, #255
-        mov    a4, a4, asr #20
-        adds   a4, a4, ip, lsr #8
-        movmi  a4, #0
-        cmp    a4, #255
-        movgt  a4, #255
-        orr    a2, a3, a4, lsl #8
-        strh   a2, [v2]
-
-        ldmfd  sp!, {a3, a4}
-        ldrh   ip, [lr, v1]!
-        add    a2, a3, v7
-        mov    a2, a2, asr #20
-        sub    a3, a3, v7
-        and    v3, ip, #255
-        adds   a2, a2, v3
-        movmi  a2, #0
-        cmp    a2, #255
-        movgt  a2, #255
-        add    v3, a4, fp
-        mov    v3, v3, asr #20
-        adds   v3, v3, ip, lsr #8
-        movmi  v3, #0
-        cmp    v3, #255
-        movgt  v3, #255
-        orr    a2, a2, v3, lsl #8
-        sub    a4, a4, fp
-        ldrh   ip, [v2, -v1]!
-        strh   a2, [lr]
-        mov    a3, a3, asr #20
-        and    a2, ip, #255
-        adds   a3, a3, a2
-        movmi  a3, #0
-        cmp    a3, #255
-        movgt  a3, #255
-        mov    a4, a4, asr #20
-        adds   a4, a4, ip, lsr #8
-        movmi  a4, #0
-        cmp    a4, #255
-        movgt  a4, #255
-        orr    a2, a3, a4, lsl #8
-        strh   a2, [v2]
-
-        ldr    pc, [sp], #4
-endfunc
-
-function ff_simple_idct_armv5te, export=1
-        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
-
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-
-        sub    a1, a1, #(16*7)
-
-        bl     idct_col_armv5te
-        add    a1, a1, #4
-        bl     idct_col_armv5te
-        add    a1, a1, #4
-        bl     idct_col_armv5te
-        add    a1, a1, #4
-        bl     idct_col_armv5te
-
-        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
-endfunc
-
-function ff_simple_idct_add_armv5te, export=1
-        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
-
-        mov    a1, a3
-
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-
-        sub    a1, a1, #(16*7)
-
-        bl     idct_col_add_armv5te
-        add    a1, a1, #4
-        bl     idct_col_add_armv5te
-        add    a1, a1, #4
-        bl     idct_col_add_armv5te
-        add    a1, a1, #4
-        bl     idct_col_add_armv5te
-
-        add    sp, sp, #8
-        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
-endfunc
-
-function ff_simple_idct_put_armv5te, export=1
-        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
-
-        mov    a1, a3
-
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-        add    a1, a1, #16
-        bl     idct_row_armv5te
-
-        sub    a1, a1, #(16*7)
-
-        bl     idct_col_put_armv5te
-        add    a1, a1, #4
-        bl     idct_col_put_armv5te
-        add    a1, a1, #4
-        bl     idct_col_put_armv5te
-        add    a1, a1, #4
-        bl     idct_col_put_armv5te
-
-        add    sp, sp, #8
-        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv6.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv6.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,433 +0,0 @@
-/*
- * Simple IDCT
- *
- * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
- * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-#define W1  22725   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W2  21407   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W3  19266   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W4  16383   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W5  12873   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W6  8867    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W7  4520    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define ROW_SHIFT 11
-#define COL_SHIFT 20
-
-#define W13 (W1 | (W3 << 16))
-#define W26 (W2 | (W6 << 16))
-#define W42 (W4 | (W2 << 16))
-#define W42n (-W4&0xffff | (-W2 << 16))
-#define W46 (W4 | (W6 << 16))
-#define W57 (W5 | (W7 << 16))
-
-        .text
-        .align
-w13:    .long W13
-w26:    .long W26
-w42:    .long W42
-w42n:   .long W42n
-w46:    .long W46
-w57:    .long W57
-
-/*
-  Compute partial IDCT of single row.
-  shift = left-shift amount
-  r0 = source address
-  r2 = row[2,0] <= 2 cycles
-  r3 = row[3,1]
-  ip = w42      <= 2 cycles
-
-  Output in registers r4--r11
-*/
-        .macro idct_row shift
-        ldr    lr, w46               /* lr  = W4 | (W6 << 16) */
-        mov    r1, #(1<<(\shift-1))
-        smlad  r4, r2, ip, r1
-        smlsd  r7, r2, ip, r1
-        ldr    ip, w13               /* ip  = W1 | (W3 << 16) */
-        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
-        smlad  r5, r2, lr, r1
-        smlsd  r6, r2, lr, r1
-
-        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
-        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
-        ldr    lr, [r0, #12]         /* lr  =  row[7,5] */
-        pkhtb  r2, ip, r10,asr #16   /* r3  =  W7 | (W3 << 16) */
-        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
-        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
-        smlad  r8, lr, r10,r8        /* B0  +=      W5*row[5] + W7*row[7] */
-        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
-
-        ldr    r3, w42n              /* r3 =  -W4 | (-W2 << 16) */
-        smlad  r10,lr, r2, r10       /* B2 +=  W7*row[5] + W3*row[7] */
-        ldr    r2, [r0, #4]          /* r2 =   row[6,4] */
-        smlsdx r11,lr, ip, r11       /* B3 +=  W3*row[5] - W1*row[7] */
-        ldr    ip, w46               /* ip =   W4 | (W6 << 16) */
-        smlad  r9, lr, r1, r9        /* B1 -=  W1*row[5] + W5*row[7] */
-
-        smlad  r5, r2, r3, r5        /* A1 += -W4*row[4] - W2*row[6] */
-        smlsd  r6, r2, r3, r6        /* A2 += -W4*row[4] + W2*row[6] */
-        smlad  r4, r2, ip, r4        /* A0 +=  W4*row[4] + W6*row[6] */
-        smlsd  r7, r2, ip, r7        /* A3 +=  W4*row[4] - W6*row[6] */
-        .endm
-
-/*
-  Compute partial IDCT of half row.
-  shift = left-shift amount
-  r2 = row[2,0]
-  r3 = row[3,1]
-  ip = w42
-
-  Output in registers r4--r11
-*/
-        .macro idct_row4 shift
-        ldr    lr, w46               /* lr =  W4 | (W6 << 16) */
-        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
-        mov    r1, #(1<<(\shift-1))
-        smlad  r4, r2, ip, r1
-        smlsd  r7, r2, ip, r1
-        ldr    ip, w13               /* ip =  W1 | (W3 << 16) */
-        smlad  r5, r2, lr, r1
-        smlsd  r6, r2, lr, r1
-        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
-        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
-        pkhtb  r2, ip, r10,asr #16   /* r3  =  W7 | (W3 << 16) */
-        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
-        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
-        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
-        .endm
-
-/*
-  Compute final part of IDCT single row without shift.
-  Input in registers r4--r11
-  Output in registers ip, r4--r6, lr, r8--r10
-*/
-        .macro idct_finish
-        add    ip, r4, r8            /* r1 = A0 + B0 */
-        sub    lr, r4, r8            /* r2 = A0 - B0 */
-        sub    r4, r5, r9            /* r2 = A1 + B1 */
-        add    r8, r5, r9            /* r2 = A1 - B1 */
-        add    r5, r6, r10           /* r1 = A2 + B2 */
-        sub    r9, r6, r10           /* r1 = A2 - B2 */
-        add    r6, r7, r11           /* r2 = A3 + B3 */
-        sub    r10,r7, r11           /* r2 = A3 - B3 */
-        .endm
-
-/*
-  Compute final part of IDCT single row.
-  shift = right-shift amount
-  Input/output in registers r4--r11
-*/
-        .macro idct_finish_shift shift
-        add    r3, r4, r8            /* r3 = A0 + B0 */
-        sub    r2, r4, r8            /* r2 = A0 - B0 */
-        mov    r4, r3, asr #\shift
-        mov    r8, r2, asr #\shift
-
-        sub    r3, r5, r9            /* r3 = A1 + B1 */
-        add    r2, r5, r9            /* r2 = A1 - B1 */
-        mov    r5, r3, asr #\shift
-        mov    r9, r2, asr #\shift
-
-        add    r3, r6, r10           /* r3 = A2 + B2 */
-        sub    r2, r6, r10           /* r2 = A2 - B2 */
-        mov    r6, r3, asr #\shift
-        mov    r10,r2, asr #\shift
-
-        add    r3, r7, r11           /* r3 = A3 + B3 */
-        sub    r2, r7, r11           /* r2 = A3 - B3 */
-        mov    r7, r3, asr #\shift
-        mov    r11,r2, asr #\shift
-        .endm
-
-/*
-  Compute final part of IDCT single row, saturating results at 8 bits.
-  shift = right-shift amount
-  Input/output in registers r4--r11
-*/
-        .macro idct_finish_shift_sat shift
-        add    r3, r4, r8            /* r3 = A0 + B0 */
-        sub    ip, r4, r8            /* ip = A0 - B0 */
-        usat   r4, #8, r3, asr #\shift
-        usat   r8, #8, ip, asr #\shift
-
-        sub    r3, r5, r9            /* r3 = A1 + B1 */
-        add    ip, r5, r9            /* ip = A1 - B1 */
-        usat   r5, #8, r3, asr #\shift
-        usat   r9, #8, ip, asr #\shift
-
-        add    r3, r6, r10           /* r3 = A2 + B2 */
-        sub    ip, r6, r10           /* ip = A2 - B2 */
-        usat   r6, #8, r3, asr #\shift
-        usat   r10,#8, ip, asr #\shift
-
-        add    r3, r7, r11           /* r3 = A3 + B3 */
-        sub    ip, r7, r11           /* ip = A3 - B3 */
-        usat   r7, #8, r3, asr #\shift
-        usat   r11,#8, ip, asr #\shift
-        .endm
-
-/*
-  Compute IDCT of single row, storing as column.
-  r0 = source
-  r1 = dest
-*/
-function idct_row_armv6
-        push   {lr}
-
-        ldr    lr, [r0, #12]         /* lr = row[7,5] */
-        ldr    ip, [r0, #4]          /* ip = row[6,4] */
-        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
-        ldr    r2, [r0]              /* r2 = row[2,0] */
-        orrs   lr, lr, ip
-        cmpeq  lr, r3
-        cmpeq  lr, r2, lsr #16
-        beq    1f
-        push   {r1}
-        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
-        cmp    lr, #0
-        beq    2f
-
-        idct_row   ROW_SHIFT
-        b      3f
-
-2:      idct_row4  ROW_SHIFT
-
-3:      pop    {r1}
-        idct_finish_shift ROW_SHIFT
-
-        strh   r4, [r1]
-        strh   r5, [r1, #(16*2)]
-        strh   r6, [r1, #(16*4)]
-        strh   r7, [r1, #(16*6)]
-        strh   r11,[r1, #(16*1)]
-        strh   r10,[r1, #(16*3)]
-        strh   r9, [r1, #(16*5)]
-        strh   r8, [r1, #(16*7)]
-
-        pop    {pc}
-
-1:      mov    r2, r2, lsl #3
-        strh   r2, [r1]
-        strh   r2, [r1, #(16*2)]
-        strh   r2, [r1, #(16*4)]
-        strh   r2, [r1, #(16*6)]
-        strh   r2, [r1, #(16*1)]
-        strh   r2, [r1, #(16*3)]
-        strh   r2, [r1, #(16*5)]
-        strh   r2, [r1, #(16*7)]
-        pop    {pc}
-endfunc
-
-/*
-  Compute IDCT of single column, read as row.
-  r0 = source
-  r1 = dest
-*/
-function idct_col_armv6
-        push   {r1, lr}
-
-        ldr    r2, [r0]              /* r2 = row[2,0] */
-        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
-        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
-        idct_row COL_SHIFT
-        pop    {r1}
-        idct_finish_shift COL_SHIFT
-
-        strh   r4, [r1]
-        strh   r5, [r1, #(16*1)]
-        strh   r6, [r1, #(16*2)]
-        strh   r7, [r1, #(16*3)]
-        strh   r11,[r1, #(16*4)]
-        strh   r10,[r1, #(16*5)]
-        strh   r9, [r1, #(16*6)]
-        strh   r8, [r1, #(16*7)]
-
-        pop    {pc}
-endfunc
-
-/*
-  Compute IDCT of single column, read as row, store saturated 8-bit.
-  r0 = source
-  r1 = dest
-  r2 = line size
-*/
-function idct_col_put_armv6
-        push   {r1, r2, lr}
-
-        ldr    r2, [r0]              /* r2 = row[2,0] */
-        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
-        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
-        idct_row COL_SHIFT
-        pop    {r1, r2}
-        idct_finish_shift_sat COL_SHIFT
-
-        strb   r4, [r1], r2
-        strb   r5, [r1], r2
-        strb   r6, [r1], r2
-        strb   r7, [r1], r2
-        strb   r11,[r1], r2
-        strb   r10,[r1], r2
-        strb   r9, [r1], r2
-        strb   r8, [r1], r2
-
-        sub    r1, r1, r2, lsl #3
-
-        pop    {pc}
-endfunc
-
-/*
-  Compute IDCT of single column, read as row, add/store saturated 8-bit.
-  r0 = source
-  r1 = dest
-  r2 = line size
-*/
-function idct_col_add_armv6
-        push   {r1, r2, lr}
-
-        ldr    r2, [r0]              /* r2 = row[2,0] */
-        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
-        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
-        idct_row COL_SHIFT
-        pop    {r1, r2}
-        idct_finish
-
-        ldrb   r3, [r1]
-        ldrb   r7, [r1, r2]
-        ldrb   r11,[r1, r2, lsl #2]
-        add    ip, r3, ip, asr #COL_SHIFT
-        usat   ip, #8, ip
-        add    r4, r7, r4, asr #COL_SHIFT
-        strb   ip, [r1], r2
-        ldrb   ip, [r1, r2]
-        usat   r4, #8, r4
-        ldrb   r11,[r1, r2, lsl #2]
-        add    r5, ip, r5, asr #COL_SHIFT
-        usat   r5, #8, r5
-        strb   r4, [r1], r2
-        ldrb   r3, [r1, r2]
-        ldrb   ip, [r1, r2, lsl #2]
-        strb   r5, [r1], r2
-        ldrb   r7, [r1, r2]
-        ldrb   r4, [r1, r2, lsl #2]
-        add    r6, r3, r6, asr #COL_SHIFT
-        usat   r6, #8, r6
-        add    r10,r7, r10,asr #COL_SHIFT
-        usat   r10,#8, r10
-        add    r9, r11,r9, asr #COL_SHIFT
-        usat   r9, #8, r9
-        add    r8, ip, r8, asr #COL_SHIFT
-        usat   r8, #8, r8
-        add    lr, r4, lr, asr #COL_SHIFT
-        usat   lr, #8, lr
-        strb   r6, [r1], r2
-        strb   r10,[r1], r2
-        strb   r9, [r1], r2
-        strb   r8, [r1], r2
-        strb   lr, [r1], r2
-
-        sub    r1, r1, r2, lsl #3
-
-        pop    {pc}
-endfunc
-
-/*
-  Compute 8 IDCT row transforms.
-  func = IDCT row->col function
-  width = width of columns in bytes
-*/
-        .macro idct_rows func width
-        bl     \func
-        add    r0, r0, #(16*2)
-        add    r1, r1, #\width
-        bl     \func
-        add    r0, r0, #(16*2)
-        add    r1, r1, #\width
-        bl     \func
-        add    r0, r0, #(16*2)
-        add    r1, r1, #\width
-        bl     \func
-        sub    r0, r0, #(16*5)
-        add    r1, r1, #\width
-        bl     \func
-        add    r0, r0, #(16*2)
-        add    r1, r1, #\width
-        bl     \func
-        add    r0, r0, #(16*2)
-        add    r1, r1, #\width
-        bl     \func
-        add    r0, r0, #(16*2)
-        add    r1, r1, #\width
-        bl     \func
-
-        sub    r0, r0, #(16*7)
-        .endm
-
-/* void ff_simple_idct_armv6(DCTELEM *data); */
-function ff_simple_idct_armv6, export=1
-        push   {r4-r11, lr}
-        sub    sp, sp, #128
-
-        mov    r1, sp
-        idct_rows idct_row_armv6, 2
-        mov    r1, r0
-        mov    r0, sp
-        idct_rows idct_col_armv6, 2
-
-        add    sp, sp, #128
-        pop    {r4-r11, pc}
-endfunc
-
-/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
-function ff_simple_idct_add_armv6, export=1
-        push   {r0, r1, r4-r11, lr}
-        sub    sp, sp, #128
-
-        mov    r0, r2
-        mov    r1, sp
-        idct_rows idct_row_armv6, 2
-        mov    r0, sp
-        ldr    r1, [sp, #128]
-        ldr    r2, [sp, #(128+4)]
-        idct_rows idct_col_add_armv6, 1
-
-        add    sp, sp, #(128+8)
-        pop    {r4-r11, pc}
-endfunc
-
-/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
-function ff_simple_idct_put_armv6, export=1
-        push   {r0, r1, r4-r11, lr}
-        sub    sp, sp, #128
-
-        mov    r0, r2
-        mov    r1, sp
-        idct_rows idct_row_armv6, 2
-        mov    r0, sp
-        ldr    r1, [sp, #128]
-        ldr    r2, [sp, #(128+4)]
-        idct_rows idct_col_put_armv6, 1
-
-        add    sp, sp, #(128+8)
-        pop    {r4-r11, pc}
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,373 +0,0 @@
-/*
- * ARM NEON IDCT
- *
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * Based on Simple IDCT
- * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W4c ((1<<(COL_SHIFT-1))/W4)
-#define ROW_SHIFT 11
-#define COL_SHIFT 20
-
-#define w1 d0[0]
-#define w2 d0[1]
-#define w3 d0[2]
-#define w4 d0[3]
-#define w5 d1[0]
-#define w6 d1[1]
-#define w7 d1[2]
-#define w4c d1[3]
-
-        .macro idct_col4_top
-        vmull.s16       q7,  d6,  w2    /* q9   = W2 * col[2] */
-        vmull.s16       q8,  d6,  w6    /* q10  = W6 * col[2] */
-        vmull.s16       q9,  d4,  w1    /* q9   = W1 * col[1] */
-        vadd.i32        q11, q15, q7
-        vmull.s16       q10, d4,  w3    /* q10  = W3 * col[1] */
-        vadd.i32        q12, q15, q8
-        vmull.s16       q5,  d4,  w5    /* q5   = W5 * col[1] */
-        vsub.i32        q13, q15, q8
-        vmull.s16       q6,  d4,  w7    /* q6   = W7 * col[1] */
-        vsub.i32        q14, q15, q7
-
-        vmlal.s16       q9,  d8,  w3    /* q9  += W3 * col[3] */
-        vmlsl.s16       q10, d8,  w7    /* q10 -= W7 * col[3] */
-        vmlsl.s16       q5,  d8,  w1    /* q5  -= W1 * col[3] */
-        vmlsl.s16       q6,  d8,  w5    /* q6  -= W5 * col[3] */
-        .endm
-
-        .text
-        .align 6
-
-function idct_row4_pld_neon
-        pld             [r0]
-        add             r3,  r0,  r1,  lsl #2
-        pld             [r0, r1]
-        pld             [r0, r1, lsl #1]
-        pld             [r3, -r1]
-        pld             [r3]
-        pld             [r3, r1]
-        add             r3,  r3,  r1,  lsl #1
-        pld             [r3]
-        pld             [r3, r1]
-endfunc
-
-function idct_row4_neon
-        vmov.i32        q15, #(1<<(ROW_SHIFT-1))
-        vld1.64         {d2-d5},  [r2,:128]!
-        vmlal.s16       q15, d2,  w4    /* q15  += W4 * col[0] */
-        vld1.64         {d6,d7},  [r2,:128]!
-        vorr            d10, d3,  d5
-        vld1.64         {d8,d9},  [r2,:128]!
-        add             r2,  r2,  #-64
-
-        vorr            d11, d7,  d9
-        vorr            d10, d10, d11
-        vmov            r3,  r4,  d10
-
-        idct_col4_top
-
-        orrs            r3,  r3,  r4
-        beq             1f
-
-        vmull.s16       q7,  d3,  w4    /* q7   = W4 * col[4] */
-        vmlal.s16       q9,  d5,  w5    /* q9  += W5 * col[5] */
-        vmlsl.s16       q10, d5,  w1    /* q10 -= W1 * col[5] */
-        vmull.s16       q8,  d7,  w2    /* q8   = W2 * col[6] */
-        vmlal.s16       q5,  d5,  w7    /* q5  += W7 * col[5] */
-        vadd.i32        q11, q11, q7
-        vsub.i32        q12, q12, q7
-        vsub.i32        q13, q13, q7
-        vadd.i32        q14, q14, q7
-        vmlal.s16       q6,  d5,  w3    /* q6  += W3 * col[5] */
-        vmull.s16       q7,  d7,  w6    /* q7   = W6 * col[6] */
-        vmlal.s16       q9,  d9,  w7
-        vmlsl.s16       q10, d9,  w5
-        vmlal.s16       q5,  d9,  w3
-        vmlsl.s16       q6,  d9,  w1
-        vadd.i32        q11, q11, q7
-        vsub.i32        q12, q12, q8
-        vadd.i32        q13, q13, q8
-        vsub.i32        q14, q14, q7
-
-1:      vadd.i32        q3,  q11, q9
-        vadd.i32        q4,  q12, q10
-        vshrn.i32       d2,  q3,  #ROW_SHIFT
-        vshrn.i32       d4,  q4,  #ROW_SHIFT
-        vadd.i32        q7,  q13, q5
-        vadd.i32        q8,  q14, q6
-        vtrn.16         d2,  d4
-        vshrn.i32       d6,  q7,  #ROW_SHIFT
-        vshrn.i32       d8,  q8,  #ROW_SHIFT
-        vsub.i32        q14, q14, q6
-        vsub.i32        q11, q11, q9
-        vtrn.16         d6,  d8
-        vsub.i32        q13, q13, q5
-        vshrn.i32       d3,  q14, #ROW_SHIFT
-        vtrn.32         d2,  d6
-        vsub.i32        q12, q12, q10
-        vtrn.32         d4,  d8
-        vshrn.i32       d5,  q13, #ROW_SHIFT
-        vshrn.i32       d7,  q12, #ROW_SHIFT
-        vshrn.i32       d9,  q11, #ROW_SHIFT
-
-        vtrn.16         d3,  d5
-        vtrn.16         d7,  d9
-        vtrn.32         d3,  d7
-        vtrn.32         d5,  d9
-
-        vst1.64         {d2-d5},  [r2,:128]!
-        vst1.64         {d6-d9},  [r2,:128]!
-
-        bx              lr
-endfunc
-
-function idct_col4_neon
-        mov             ip,  #16
-        vld1.64         {d2}, [r2,:64], ip /* d2 = col[0] */
-        vdup.16         d30, w4c
-        vld1.64         {d4}, [r2,:64], ip /* d3 = col[1] */
-        vadd.i16        d30, d30, d2
-        vld1.64         {d6}, [r2,:64], ip /* d4 = col[2] */
-        vmull.s16       q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/
-        vld1.64         {d8}, [r2,:64], ip /* d5 = col[3] */
-
-        ldrd            r4,  [r2]
-        ldrd            r6,  [r2, #16]
-        orrs            r4,  r4,  r5
-
-        idct_col4_top
-        addeq           r2,  r2,  #16
-        beq             1f
-
-        vld1.64         {d3}, [r2,:64], ip /* d6 = col[4] */
-        vmull.s16       q7,  d3,  w4    /* q7   = W4 * col[4] */
-        vadd.i32        q11, q11, q7
-        vsub.i32        q12, q12, q7
-        vsub.i32        q13, q13, q7
-        vadd.i32        q14, q14, q7
-
-1:      orrs            r6,  r6,  r7
-        ldrd            r4,  [r2, #16]
-        addeq           r2,  r2,  #16
-        beq             2f
-
-        vld1.64         {d5}, [r2,:64], ip /* d7 = col[5] */
-        vmlal.s16       q9,  d5,  w5    /* q9  += W5 * col[5] */
-        vmlsl.s16       q10, d5,  w1    /* q10 -= W1 * col[5] */
-        vmlal.s16       q5,  d5,  w7    /* q5  += W7 * col[5] */
-        vmlal.s16       q6,  d5,  w3    /* q6  += W3 * col[5] */
-
-2:      orrs            r4,  r4,  r5
-        ldrd            r4,  [r2, #16]
-        addeq           r2,  r2,  #16
-        beq             3f
-
-        vld1.64         {d7}, [r2,:64], ip /* d8 = col[6] */
-        vmull.s16       q7,  d7,  w6    /* q7   = W6 * col[6] */
-        vmull.s16       q8,  d7,  w2    /* q8   = W2 * col[6] */
-        vadd.i32        q11, q11, q7
-        vsub.i32        q14, q14, q7
-        vsub.i32        q12, q12, q8
-        vadd.i32        q13, q13, q8
-
-3:      orrs            r4,  r4,  r5
-        addeq           r2,  r2,  #16
-        beq             4f
-
-        vld1.64         {d9}, [r2,:64], ip /* d9 = col[7] */
-        vmlal.s16       q9,  d9,  w7
-        vmlsl.s16       q10, d9,  w5
-        vmlal.s16       q5,  d9,  w3
-        vmlsl.s16       q6,  d9,  w1
-
-4:      vaddhn.i32      d2,  q11, q9
-        vaddhn.i32      d3,  q12, q10
-        vaddhn.i32      d4,  q13, q5
-        vaddhn.i32      d5,  q14, q6
-        vsubhn.i32      d9,  q11, q9
-        vsubhn.i32      d8,  q12, q10
-        vsubhn.i32      d7,  q13, q5
-        vsubhn.i32      d6,  q14, q6
-
-        bx              lr
-endfunc
-
-        .align 6
-
-function idct_col4_st8_neon
-        vqshrun.s16     d2,  q1,  #COL_SHIFT-16
-        vqshrun.s16     d3,  q2,  #COL_SHIFT-16
-        vqshrun.s16     d4,  q3,  #COL_SHIFT-16
-        vqshrun.s16     d5,  q4,  #COL_SHIFT-16
-        vst1.32         {d2[0]}, [r0,:32], r1
-        vst1.32         {d2[1]}, [r0,:32], r1
-        vst1.32         {d3[0]}, [r0,:32], r1
-        vst1.32         {d3[1]}, [r0,:32], r1
-        vst1.32         {d4[0]}, [r0,:32], r1
-        vst1.32         {d4[1]}, [r0,:32], r1
-        vst1.32         {d5[0]}, [r0,:32], r1
-        vst1.32         {d5[1]}, [r0,:32], r1
-
-        bx              lr
-endfunc
-
-        .section .rodata
-        .align 4
-idct_coeff_neon:
-        .short W1, W2, W3, W4, W5, W6, W7, W4c
-        .previous
-
-        .macro idct_start data
-        push            {r4-r7, lr}
-        pld             [\data]
-        pld             [\data, #64]
-        vpush           {d8-d15}
-        movrel          r3,  idct_coeff_neon
-        vld1.64         {d0,d1}, [r3,:128]
-        .endm
-
-        .macro idct_end
-        vpop            {d8-d15}
-        pop             {r4-r7, pc}
-        .endm
-
-/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */
-function ff_simple_idct_put_neon, export=1
-        idct_start      r2
-
-        bl              idct_row4_pld_neon
-        bl              idct_row4_neon
-        add             r2,  r2,  #-128
-        bl              idct_col4_neon
-        bl              idct_col4_st8_neon
-        sub             r0,  r0,  r1, lsl #3
-        add             r0,  r0,  #4
-        add             r2,  r2,  #-120
-        bl              idct_col4_neon
-        bl              idct_col4_st8_neon
-
-        idct_end
-endfunc
-
-        .align 6
-
-function idct_col4_add8_neon
-        mov             ip,  r0
-
-        vld1.32         {d10[0]}, [r0,:32], r1
-        vshr.s16        q1,  q1,  #COL_SHIFT-16
-        vld1.32         {d10[1]}, [r0,:32], r1
-        vshr.s16        q2,  q2,  #COL_SHIFT-16
-        vld1.32         {d11[0]}, [r0,:32], r1
-        vshr.s16        q3,  q3,  #COL_SHIFT-16
-        vld1.32         {d11[1]}, [r0,:32], r1
-        vshr.s16        q4,  q4,  #COL_SHIFT-16
-        vld1.32         {d12[0]}, [r0,:32], r1
-        vaddw.u8        q1,  q1,  d10
-        vld1.32         {d12[1]}, [r0,:32], r1
-        vaddw.u8        q2,  q2,  d11
-        vld1.32         {d13[0]}, [r0,:32], r1
-        vqmovun.s16     d2,  q1
-        vld1.32         {d13[1]}, [r0,:32], r1
-        vaddw.u8        q3,  q3,  d12
-        vst1.32         {d2[0]},  [ip,:32], r1
-        vqmovun.s16     d3,  q2
-        vst1.32         {d2[1]},  [ip,:32], r1
-        vaddw.u8        q4,  q4,  d13
-        vst1.32         {d3[0]},  [ip,:32], r1
-        vqmovun.s16     d4,  q3
-        vst1.32         {d3[1]},  [ip,:32], r1
-        vqmovun.s16     d5,  q4
-        vst1.32         {d4[0]},  [ip,:32], r1
-        vst1.32         {d4[1]},  [ip,:32], r1
-        vst1.32         {d5[0]},  [ip,:32], r1
-        vst1.32         {d5[1]},  [ip,:32], r1
-
-        bx              lr
-endfunc
-
-/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */
-function ff_simple_idct_add_neon, export=1
-        idct_start      r2
-
-        bl              idct_row4_pld_neon
-        bl              idct_row4_neon
-        add             r2,  r2,  #-128
-        bl              idct_col4_neon
-        bl              idct_col4_add8_neon
-        sub             r0,  r0,  r1, lsl #3
-        add             r0,  r0,  #4
-        add             r2,  r2,  #-120
-        bl              idct_col4_neon
-        bl              idct_col4_add8_neon
-
-        idct_end
-endfunc
-
-        .align 6
-
-function idct_col4_st16_neon
-        mov             ip,  #16
-
-        vshr.s16        q1,  q1,  #COL_SHIFT-16
-        vshr.s16        q2,  q2,  #COL_SHIFT-16
-        vst1.64         {d2}, [r2,:64], ip
-        vshr.s16        q3,  q3,  #COL_SHIFT-16
-        vst1.64         {d3}, [r2,:64], ip
-        vshr.s16        q4,  q4,  #COL_SHIFT-16
-        vst1.64         {d4}, [r2,:64], ip
-        vst1.64         {d5}, [r2,:64], ip
-        vst1.64         {d6}, [r2,:64], ip
-        vst1.64         {d7}, [r2,:64], ip
-        vst1.64         {d8}, [r2,:64], ip
-        vst1.64         {d9}, [r2,:64], ip
-
-        bx              lr
-endfunc
-
-/* void ff_simple_idct_neon(DCTELEM *data); */
-function ff_simple_idct_neon, export=1
-        idct_start      r0
-
-        mov             r2,  r0
-        bl              idct_row4_neon
-        bl              idct_row4_neon
-        add             r2,  r2,  #-128
-        bl              idct_col4_neon
-        add             r2,  r2,  #-128
-        bl              idct_col4_st16_neon
-        add             r2,  r2,  #-120
-        bl              idct_col4_neon
-        add             r2,  r2,  #-128
-        bl              idct_col4_st16_neon
-
-        idct_end
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/synth_filter_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/synth_filter_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-        preserve8
-
-function ff_synth_filter_float_neon, export=1
-        push            {r3-r11,lr}
-
-        ldr             r4,  [r2]               @ synth_buf_offset
-        add             r1,  r1,  r4,  lsl #2   @ synth_buf
-        sub             r12, r4,  #32
-        bfc             r12, #9,  #23
-        bic             r4,  r4,  #63
-        str             r12, [r2]
-
-        ldr             r2,  [sp, #12*4]        @ in
-        mov             r9,  r1                 @ synth_buf
-
-VFP     vpush           {d0}
-        bl              ff_imdct_half_neon
-VFP     vpop            {d0}
-        pop             {r3}
-
-        ldr             r5,  [sp, #9*4]         @ window
-        ldr             r2,  [sp, #10*4]        @ out
-NOVFP   vldr            d0,  [sp, #12*4]        @ scale, bias
-        add             r8,  r9,  #12*4
-
-        mov             lr,  #64*4
-        mov             r1,  #4
-1:
-        add             r10, r9,  #16*4         @ synth_buf
-        add             r11, r8,  #16*4
-        add             r0,  r5,  #16*4         @ window
-        add             r6,  r5,  #32*4
-        add             r7,  r5,  #48*4
-
-        vld1.32         {q10},    [r3,:128]     @ a
-        add             r3,  r3,  #16*4
-        vld1.32         {q1},     [r3,:128]     @ b
-        vmov.f32        q2,  #0.0               @ c
-        vmov.f32        q3,  #0.0               @ d
-
-        mov             r12, #512
-2:
-        vld1.32         {q9},     [r8, :128], lr
-        vrev64.32       q9,  q9
-        vld1.32         {q8},     [r5, :128], lr
-        vmls.f32        d20, d16, d19
-        vld1.32         {q11},    [r0, :128], lr
-        vmls.f32        d21, d17, d18
-        vld1.32         {q12},    [r9, :128], lr
-        vmla.f32        d2,  d22, d24
-        vld1.32         {q8},     [r6, :128], lr
-        vmla.f32        d3,  d23, d25
-        vld1.32         {q9},     [r10,:128], lr
-        vmla.f32        d4,  d16, d18
-        vld1.32         {q12},    [r11,:128], lr
-        vmla.f32        d5,  d17, d19
-        vrev64.32       q12, q12
-        vld1.32         {q11},    [r7, :128], lr
-        vmla.f32        d6,  d22, d25
-        vmla.f32        d7,  d23, d24
-        subs            r12, r12, #64
-        beq             3f
-        cmp             r12, r4
-        bne             2b
-        sub             r8,  r8,  #512*4
-        sub             r9,  r9,  #512*4
-        sub             r10, r10, #512*4
-        sub             r11, r11, #512*4
-        b               2b
-3:
-        vdup.32         q8,  d0[1]
-        vdup.32         q9,  d0[1]
-        vmla.f32        q8,  q10, d0[0]
-        vmla.f32        q9,  q1,  d0[0]
-        vst1.32         {q3},     [r3,:128]
-        sub             r3,  r3,  #16*4
-        vst1.32         {q2},     [r3,:128]
-        vst1.32         {q8},     [r2,:128]
-        add             r2,  r2,  #16*4
-        vst1.32         {q9},     [r2,:128]
-
-        subs            r1,  r1,  #1
-        popeq           {r4-r11,pc}
-
-        cmp             r4,  #0
-        subeq           r8,  r8,  #512*4
-        subeq           r9,  r9,  #512*4
-        sub             r5,  r5,  #512*4
-        sub             r2,  r2,  #12*4         @ out
-        add             r3,  r3,  #4*4          @ synth_buf2
-        add             r5,  r5,  #4*4          @ window
-        add             r9,  r9,  #4*4          @ synth_buf
-        sub             r8,  r8,  #4*4          @ synth_buf
-        b               1b
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/vp3dsp_neon.S
--- a/ffmpeg_smp/h264dec/libavcodec/arm/vp3dsp_neon.S	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,420 +0,0 @@
-/*
- * Copyright (c) 2009 David Conrad
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "asm.S"
-
-.section .rodata
-.align 4
-
-vp3_idct_constants:
-.short 64277, 60547, 54491, 46341, 36410, 25080, 12785
-
-#define xC1S7 d0[0]
-#define xC2S6 d0[1]
-#define xC3S5 d0[2]
-#define xC4S4 d0[3]
-#define xC5S3 d1[0]
-#define xC6S2 d1[1]
-#define xC7S1 d1[2]
-
-.text
-
-.macro vp3_loop_filter
-    vsubl.u8        q3,  d18, d17
-    vsubl.u8        q2,  d16, d19
-    vadd.i16        q1,  q3,  q3
-    vadd.i16        q2,  q2,  q3
-    vadd.i16        q0,  q1,  q2
-    vrshr.s16       q0,  q0,  #3
-    vmovl.u8        q9,  d18
-    vdup.u16        q15, r2
-
-    vabs.s16        q1,  q0
-    vshr.s16        q0,  q0,  #15
-    vqsub.u16       q2,  q15, q1
-    vqsub.u16       q3,  q2,  q1
-    vsub.i16        q1,  q2,  q3
-    veor            q1,  q1,  q0
-    vsub.i16        q0,  q1,  q0
-
-    vaddw.u8        q2,  q0,  d17
-    vsub.i16        q3,  q9,  q0
-    vqmovun.s16     d0,  q2
-    vqmovun.s16     d1,  q3
-.endm
-
-function ff_vp3_v_loop_filter_neon, export=1
-    sub             ip,  r0,  r1
-    sub             r0,  r0,  r1,  lsl #1
-    vld1.64         {d16}, [r0,:64], r1
-    vld1.64         {d17}, [r0,:64], r1
-    vld1.64         {d18}, [r0,:64], r1
-    vld1.64         {d19}, [r0,:64], r1
-    ldrb            r2,    [r2, #129*4]
-
-    vp3_loop_filter
-
-    vst1.64         {d0},  [ip,:64], r1
-    vst1.64         {d1},  [ip,:64], r1
-    bx              lr
-endfunc
-
-function ff_vp3_h_loop_filter_neon, export=1
-    sub             ip,  r0,  #1
-    sub             r0,  r0,  #2
-    vld1.32         {d16[]},  [r0], r1
-    vld1.32         {d17[]},  [r0], r1
-    vld1.32         {d18[]},  [r0], r1
-    vld1.32         {d19[]},  [r0], r1
-    vld1.32         {d16[1]}, [r0], r1
-    vld1.32         {d17[1]}, [r0], r1
-    vld1.32         {d18[1]}, [r0], r1
-    vld1.32         {d19[1]}, [r0], r1
-    ldrb            r2,  [r2, #129*4]
-
-    vtrn.8          d16, d17
-    vtrn.8          d18, d19
-    vtrn.16         d16, d18
-    vtrn.16         d17, d19
-
-    vp3_loop_filter
-
-    vtrn.8          d0,  d1
-
-    vst1.16         {d0[0]}, [ip], r1
-    vst1.16         {d1[0]}, [ip], r1
-    vst1.16         {d0[1]}, [ip], r1
-    vst1.16         {d1[1]}, [ip], r1
-    vst1.16         {d0[2]}, [ip], r1
-    vst1.16         {d1[2]}, [ip], r1
-    vst1.16         {d0[3]}, [ip], r1
-    vst1.16         {d1[3]}, [ip], r1
-    bx              lr
-endfunc
-
-
-function vp3_idct_start_neon
-    vpush           {d8-d15}
-    movrel          r3,  vp3_idct_constants
-    vld1.64         {d0-d1},   [r3,:128]
-    vld1.64         {d16-d19}, [r2,:128]!
-    vld1.64         {d20-d23}, [r2,:128]!
-    vld1.64         {d24-d27}, [r2,:128]!
-    vadd.s16        q1,  q8,  q12
-    vsub.s16        q8,  q8,  q12
-    vld1.64         {d28-d31}, [r2,:128]!
-endfunc
-
-function vp3_idct_core_neon
-    vmull.s16       q2,  d18, xC1S7     // (ip[1] * C1) << 16
-    vmull.s16       q3,  d19, xC1S7
-    vmull.s16       q4,  d2,  xC4S4     // ((ip[0] + ip[4]) * C4) << 16
-    vmull.s16       q5,  d3,  xC4S4
-    vmull.s16       q6,  d16, xC4S4     // ((ip[0] - ip[4]) * C4) << 16
-    vmull.s16       q7,  d17, xC4S4
-    vshrn.s32       d4,  q2,  #16
-    vshrn.s32       d5,  q3,  #16
-    vshrn.s32       d6,  q4,  #16
-    vshrn.s32       d7,  q5,  #16
-    vshrn.s32       d8,  q6,  #16
-    vshrn.s32       d9,  q7,  #16
-    vadd.s16        q12, q1,  q3        // E = (ip[0] + ip[4]) * C4
-    vadd.s16        q8,  q8,  q4        // F = (ip[0] - ip[4]) * C4
-    vadd.s16        q1,  q2,  q9        // ip[1] * C1
-
-    vmull.s16       q2,  d30, xC1S7     // (ip[7] * C1) << 16
-    vmull.s16       q3,  d31, xC1S7
-    vmull.s16       q4,  d30, xC7S1     // (ip[7] * C7) << 16
-    vmull.s16       q5,  d31, xC7S1
-    vmull.s16       q6,  d18, xC7S1     // (ip[1] * C7) << 16
-    vmull.s16       q7,  d19, xC7S1
-    vshrn.s32       d4,  q2,  #16
-    vshrn.s32       d5,  q3,  #16
-    vshrn.s32       d6,  q4,  #16       // ip[7] * C7
-    vshrn.s32       d7,  q5,  #16
-    vshrn.s32       d8,  q6,  #16       // ip[1] * C7
-    vshrn.s32       d9,  q7,  #16
-    vadd.s16        q2,  q2,  q15       // ip[7] * C1
-    vadd.s16        q9,  q1,  q3        // A = ip[1] * C1 + ip[7] * C7
-    vsub.s16        q15, q4,  q2        // B = ip[1] * C7 - ip[7] * C1
-
-    vmull.s16       q2,  d22, xC5S3     // (ip[3] * C5) << 16
-    vmull.s16       q3,  d23, xC5S3
-    vmull.s16       q4,  d22, xC3S5     // (ip[3] * C3) << 16
-    vmull.s16       q5,  d23, xC3S5
-    vmull.s16       q6,  d26, xC5S3     // (ip[5] * C5) << 16
-    vmull.s16       q7,  d27, xC5S3
-    vshrn.s32       d4,  q2,  #16
-    vshrn.s32       d5,  q3,  #16
-    vshrn.s32       d6,  q4,  #16
-    vshrn.s32       d7,  q5,  #16
-    vshrn.s32       d8,  q6,  #16
-    vshrn.s32       d9,  q7,  #16
-    vadd.s16        q3,  q3,  q11       // ip[3] * C3
-    vadd.s16        q4,  q4,  q13       // ip[5] * C5
-    vadd.s16        q1,  q2,  q11       // ip[3] * C5
-    vadd.s16        q11, q3,  q4        // C = ip[3] * C3 + ip[5] * C5
-
-    vmull.s16       q2,  d26, xC3S5     // (ip[5] * C3) << 16
-    vmull.s16       q3,  d27, xC3S5
-    vmull.s16       q4,  d20, xC2S6     // (ip[2] * C2) << 16
-    vmull.s16       q5,  d21, xC2S6
-    vmull.s16       q6,  d28, xC6S2     // (ip[6] * C6) << 16
-    vmull.s16       q7,  d29, xC6S2
-    vshrn.s32       d4,  q2,  #16
-    vshrn.s32       d5,  q3,  #16
-    vshrn.s32       d6,  q4,  #16
-    vshrn.s32       d7,  q5,  #16
-    vshrn.s32       d8,  q6,  #16       // ip[6] * C6
-    vshrn.s32       d9,  q7,  #16
-    vadd.s16        q2,  q2,  q13       // ip[5] * C3
-    vadd.s16        q3,  q3,  q10       // ip[2] * C2
-    vsub.s16        q13, q2,  q1        // D = ip[5] * C3 - ip[3] * C5
-    vsub.s16        q1,  q9,  q11       // (A - C)
-    vadd.s16        q11, q9,  q11       // Cd = A + C
-    vsub.s16        q9,  q15, q13       // (B - D)
-    vadd.s16        q13, q15, q13       // Dd = B + D
-    vadd.s16        q15, q3,  q4        // G = ip[2] * C2 + ip[6] * C6
-
-    vmull.s16       q2,  d2,  xC4S4     // ((A - C) * C4) << 16
-    vmull.s16       q3,  d3,  xC4S4
-    vmull.s16       q4,  d28, xC2S6     // (ip[6] * C2) << 16
-    vmull.s16       q5,  d29, xC2S6
-    vmull.s16       q6,  d20, xC6S2     // (ip[2] * C6) << 16
-    vmull.s16       q7,  d21, xC6S2
-    vshrn.s32       d4,  q2,  #16
-    vshrn.s32       d5,  q3,  #16
-    vshrn.s32       d6,  q4,  #16
-    vshrn.s32       d7,  q5,  #16
-    vshrn.s32       d8,  q6,  #16       // ip[2] * C6
-    vmull.s16       q5,  d18, xC4S4     // ((B - D) * C4) << 16
-    vmull.s16       q6,  d19, xC4S4
-    vshrn.s32       d9,  q7,  #16
-    vadd.s16        q3,  q3,  q14       // ip[6] * C2
-    vadd.s16        q10, q1,  q2        // Ad = (A - C) * C4
-    vsub.s16        q14, q4,  q3        // H = ip[2] * C6 - ip[6] * C2
-    bx              lr
-endfunc
-
-.macro VP3_IDCT_END type
-function vp3_idct_end_\type\()_neon
-.ifc \type, col
-    vdup.16         q0,  r3
-    vadd.s16        q12, q12, q0
-    vadd.s16        q8,  q8,  q0
-.endif
-
-    vshrn.s32       d2,  q5,  #16
-    vshrn.s32       d3,  q6,  #16
-    vadd.s16        q2,  q12, q15       // Gd  = E + G
-    vadd.s16        q9,  q1,  q9        // (B - D) * C4
-    vsub.s16        q12, q12, q15       // Ed  = E - G
-    vsub.s16        q3,  q8,  q10       // Fd  = F - Ad
-    vadd.s16        q10, q8,  q10       // Add = F + Ad
-    vadd.s16        q4,  q9,  q14       // Hd  = Bd + H
-    vsub.s16        q14, q9,  q14       // Bdd = Bd - H
-    vadd.s16        q8,  q2,  q11       // [0] = Gd + Cd
-    vsub.s16        q15, q2,  q11       // [7] = Gd - Cd
-    vadd.s16        q9,  q10, q4        // [1] = Add + Hd
-    vsub.s16        q10, q10, q4        // [2] = Add - Hd
-    vadd.s16        q11, q12, q13       // [3] = Ed + Dd
-    vsub.s16        q12, q12, q13       // [4] = Ed - Dd
-.ifc \type, row
-    vtrn.16         q8,  q9
-.endif
-    vadd.s16        q13, q3,  q14       // [5] = Fd + Bdd
-    vsub.s16        q14, q3,  q14       // [6] = Fd - Bdd
-
-.ifc \type, row
-    // 8x8 transpose
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    vtrn.32         q8,  q10
-    vtrn.32         q9,  q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vswp            d17, d24
-    vswp            d19, d26
-    vadd.s16        q1,  q8,  q12
-    vswp            d21, d28
-    vsub.s16        q8,  q8,  q12
-    vswp            d23, d30
-.endif
-    bx              lr
-endfunc
-.endm
-
-VP3_IDCT_END row
-VP3_IDCT_END col
-
-function ff_vp3_idct_neon, export=1
-    mov             ip,  lr
-    mov             r2,  r0
-    bl              vp3_idct_start_neon
-    bl              vp3_idct_end_row_neon
-    mov             r3,  #8
-    bl              vp3_idct_core_neon
-    bl              vp3_idct_end_col_neon
-    mov             lr,  ip
-    vpop            {d8-d15}
-
-    vshr.s16        q8,  q8,  #4
-    vshr.s16        q9,  q9,  #4
-    vshr.s16        q10, q10, #4
-    vshr.s16        q11, q11, #4
-    vshr.s16        q12, q12, #4
-    vst1.64         {d16-d19}, [r0,:128]!
-    vshr.s16        q13, q13, #4
-    vshr.s16        q14, q14, #4
-    vst1.64         {d20-d23}, [r0,:128]!
-    vshr.s16        q15, q15, #4
-    vst1.64         {d24-d27}, [r0,:128]!
-    vst1.64         {d28-d31}, [r0,:128]!
-    bx              lr
-endfunc
-
-function ff_vp3_idct_put_neon, export=1
-    mov             ip,  lr
-    bl              vp3_idct_start_neon
-    bl              vp3_idct_end_row_neon
-    mov             r3,  #8
-    add             r3,  r3,  #2048         // convert signed pixel to unsigned
-    bl              vp3_idct_core_neon
-    bl              vp3_idct_end_col_neon
-    mov             lr,  ip
-    vpop            {d8-d15}
-
-    vqshrun.s16     d0,  q8,  #4
-    vqshrun.s16     d1,  q9,  #4
-    vqshrun.s16     d2,  q10, #4
-    vqshrun.s16     d3,  q11, #4
-    vst1.64         {d0}, [r0,:64], r1
-    vqshrun.s16     d4,  q12, #4
-    vst1.64         {d1}, [r0,:64], r1
-    vqshrun.s16     d5,  q13, #4
-    vst1.64         {d2}, [r0,:64], r1
-    vqshrun.s16     d6,  q14, #4
-    vst1.64         {d3}, [r0,:64], r1
-    vqshrun.s16     d7,  q15, #4
-    vst1.64         {d4}, [r0,:64], r1
-    vst1.64         {d5}, [r0,:64], r1
-    vst1.64         {d6}, [r0,:64], r1
-    vst1.64         {d7}, [r0,:64], r1
-    bx              lr
-endfunc
-
-function ff_vp3_idct_add_neon, export=1
-    mov             ip,  lr
-    bl              vp3_idct_start_neon
-    bl              vp3_idct_end_row_neon
-    mov             r3,  #8
-    bl              vp3_idct_core_neon
-    bl              vp3_idct_end_col_neon
-    mov             lr,  ip
-    vpop            {d8-d15}
-    mov             r2,  r0
-
-    vld1.64         {d0}, [r0,:64], r1
-    vshr.s16        q8,  q8,  #4
-    vld1.64         {d1}, [r0,:64], r1
-    vshr.s16        q9,  q9,  #4
-    vld1.64         {d2}, [r0,:64], r1
-    vaddw.u8        q8,  q8,  d0
-    vld1.64         {d3}, [r0,:64], r1
-    vaddw.u8        q9,  q9,  d1
-    vld1.64         {d4}, [r0,:64], r1
-    vshr.s16        q10, q10, #4
-    vld1.64         {d5}, [r0,:64], r1
-    vshr.s16        q11, q11, #4
-    vld1.64         {d6}, [r0,:64], r1
-    vqmovun.s16     d0,  q8
-    vld1.64         {d7}, [r0,:64], r1
-    vqmovun.s16     d1,  q9
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vshr.s16        q12, q12, #4
-    vshr.s16        q13, q13, #4
-    vqmovun.s16     d2,  q10
-    vqmovun.s16     d3,  q11
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vshr.s16        q14, q14, #4
-    vshr.s16        q15, q15, #4
-    vst1.64         {d0}, [r2,:64], r1
-    vqmovun.s16     d4,  q12
-    vst1.64         {d1}, [r2,:64], r1
-    vqmovun.s16     d5,  q13
-    vst1.64         {d2}, [r2,:64], r1
-    vaddw.u8        q14, q14, d6
-    vst1.64         {d3}, [r2,:64], r1
-    vaddw.u8        q15, q15, d7
-    vst1.64         {d4}, [r2,:64], r1
-    vqmovun.s16     d6,  q14
-    vst1.64         {d5}, [r2,:64], r1
-    vqmovun.s16     d7,  q15
-    vst1.64         {d6}, [r2,:64], r1
-    vst1.64         {d7}, [r2,:64], r1
-    bx              lr
-endfunc
-
-function ff_vp3_idct_dc_add_neon, export=1
-    ldrsh           r2,  [r2]
-    movw            r3,  #46341
-    mul             r2,  r3,  r2
-    smulwt          r2,  r3,  r2
-    mov             r3,  r0
-    vdup.16         q15, r2
-    vrshr.s16       q15, q15, #4
-
-    vld1.8          {d0}, [r0,:64], r1
-    vld1.8          {d1}, [r0,:64], r1
-    vld1.8          {d2}, [r0,:64], r1
-    vaddw.u8        q8,  q15, d0
-    vld1.8          {d3}, [r0,:64], r1
-    vaddw.u8        q9,  q15, d1
-    vld1.8          {d4}, [r0,:64], r1
-    vaddw.u8        q10, q15, d2
-    vld1.8          {d5}, [r0,:64], r1
-    vaddw.u8        q11, q15, d3
-    vld1.8          {d6}, [r0,:64], r1
-    vaddw.u8        q12, q15, d4
-    vld1.8          {d7}, [r0,:64], r1
-    vaddw.u8        q13, q15, d5
-    vqmovun.s16     d0,  q8
-    vaddw.u8        q14, q15, d6
-    vqmovun.s16     d1,  q9
-    vaddw.u8        q15, q15, d7
-    vqmovun.s16     d2,  q10
-    vst1.8          {d0}, [r3,:64], r1
-    vqmovun.s16     d3,  q11
-    vst1.8          {d1}, [r3,:64], r1
-    vqmovun.s16     d4,  q12
-    vst1.8          {d2}, [r3,:64], r1
-    vqmovun.s16     d5,  q13
-    vst1.8          {d3}, [r3,:64], r1
-    vqmovun.s16     d6,  q14
-    vst1.8          {d4}, [r3,:64], r1
-    vqmovun.s16     d7,  q15
-    vst1.8          {d5}, [r3,:64], r1
-    vst1.8          {d6}, [r3,:64], r1
-    vst1.8          {d7}, [r3,:64], r1
-    bx              lr
-endfunc
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/avcodec.h
--- a/ffmpeg_smp/h264dec/libavcodec/avcodec.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,407 +0,0 @@
-#ifndef AVCODEC_AVCODEC_H
-#define AVCODEC_AVCODEC_H
-
-#include <errno.h>
-#include <stdint.h>
-#include "config.h"
-
-#include "libavutil/mem.h"
-
-#define MAX_SPS_COUNT 32
-#define MAX_PPS_COUNT 256
-
-
-#ifndef CABAC
-#define CABAC h->pps.cabac
-#endif
-
-#define EXTENDED_SAR          255
-
-#define MB_TYPE_REF0       MB_TYPE_ACPRED //dirty but it fits in 16 bit
-#define MB_TYPE_8x8DCT     0x01000000
-#define IS_REF0(a)         ((a) & MB_TYPE_REF0)
-#define IS_8x8DCT(a)       ((a) & MB_TYPE_8x8DCT)
-
-#define LIST_NOT_USED -1
-#define PART_NOT_AVAILABLE -2
-
-/* dct code */
-typedef short DCTELEM;
-
-/**
-* Required number of additionally allocated bytes at the end of the input bitstream for decoding.
-* This is mainly needed because some optimized bitstream readers read
-* 32 or 64 bit at once and could read over the end.<br>
-* Note: If the first 23 bits of the additional bytes are not 0, then damaged
-* MPEG bitstreams could cause overread and segfault.
-*/
-#define FF_INPUT_BUFFER_PADDING_SIZE 8
-
-enum AVColorPrimaries{
-    AVCOL_PRI_BT709      =1, ///< also ITU-R BT1361 / IEC 61966-2-4 / SMPTE RP177 Annex B
-    AVCOL_PRI_UNSPECIFIED=2,
-    AVCOL_PRI_BT470M     =4,
-    AVCOL_PRI_BT470BG    =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM
-    AVCOL_PRI_SMPTE170M  =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC
-    AVCOL_PRI_SMPTE240M  =7, ///< functionally identical to above
-    AVCOL_PRI_FILM       =8,
-    AVCOL_PRI_NB           , ///< Not part of ABI
-};
-
-enum AVColorTransferCharacteristic{
-    AVCOL_TRC_BT709      =1, ///< also ITU-R BT1361
-    AVCOL_TRC_UNSPECIFIED=2,
-    AVCOL_TRC_GAMMA22    =4, ///< also ITU-R BT470M / ITU-R BT1700 625 PAL & SECAM
-    AVCOL_TRC_GAMMA28    =5, ///< also ITU-R BT470BG
-    AVCOL_TRC_NB           , ///< Not part of ABI
-};
-
-enum AVColorSpace{
-    AVCOL_SPC_RGB        =0,
-    AVCOL_SPC_BT709      =1, ///< also ITU-R BT1361 / IEC 61966-2-4 xvYCC709 / SMPTE RP177 Annex B
-    AVCOL_SPC_UNSPECIFIED=2,
-    AVCOL_SPC_FCC        =4,
-    AVCOL_SPC_BT470BG    =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM / IEC 61966-2-4 xvYCC601
-    AVCOL_SPC_SMPTE170M  =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC / functionally identical to above
-    AVCOL_SPC_SMPTE240M  =7,
-    AVCOL_SPC_NB           , ///< Not part of ABI
-};
-
-enum AVColorRange{
-    AVCOL_RANGE_UNSPECIFIED=0,
-    AVCOL_RANGE_MPEG       =1, ///< the normal 219*2^(n-8) "MPEG" YUV ranges
-    AVCOL_RANGE_JPEG       =2, ///< the normal     2^n-1   "JPEG" YUV ranges
-    AVCOL_RANGE_NB           , ///< Not part of ABI
-};
-
-#define MAX_MMCO_COUNT 66
-/**
-* Memory management control operation opcode.
-*/
-typedef enum MMCOOpcode{
-    MMCO_END=0,
-    MMCO_SHORT2UNUSED,
-    MMCO_LONG2UNUSED,
-    MMCO_SHORT2LONG,
-    MMCO_SET_MAX_LONG,
-    MMCO_RESET,
-    MMCO_LONG,
-} MMCOOpcode;
-
-/* NAL unit types */
-enum {
-    NAL_SLICE=1,
-    NAL_DPA,
-    NAL_DPB,
-    NAL_DPC,
-    NAL_IDR_SLICE,
-    NAL_SEI,
-    NAL_SPS,
-    NAL_PPS,
-    NAL_AUD,
-    NAL_END_SEQUENCE,
-    NAL_END_STREAM,
-    NAL_FILLER_DATA,
-    NAL_SPS_EXT,
-    NAL_AUXILIARY_SLICE=19
-};
-
-/**
-* SEI message types
-*/
-typedef enum {
-    SEI_BUFFERING_PERIOD             =  0, ///< buffering period (H.264, D.1.1)
-    SEI_TYPE_PIC_TIMING              =  1, ///< picture timing
-    SEI_TYPE_USER_DATA_UNREGISTERED  =  5, ///< unregistered user data
-    SEI_TYPE_RECOVERY_POINT          =  6  ///< recovery point (frame # to decoder sync)
-} SEI_Type;
-
-/**
-* pic_struct in picture timing SEI message
-*/
-typedef enum {
-    SEI_PIC_STRUCT_FRAME             = 0, ///<  0: %frame
-    SEI_PIC_STRUCT_TOP_FIELD         = 1, ///<  1: top field
-    SEI_PIC_STRUCT_BOTTOM_FIELD      = 2, ///<  2: bottom field
-    SEI_PIC_STRUCT_TOP_BOTTOM        = 3, ///<  3: top field, bottom field, in that order
-    SEI_PIC_STRUCT_BOTTOM_TOP        = 4, ///<  4: bottom field, top field, in that order
-    SEI_PIC_STRUCT_TOP_BOTTOM_TOP    = 5, ///<  5: top field, bottom field, top field repeated, in that order
-    SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM = 6, ///<  6: bottom field, top field, bottom field repeated, in that order
-    SEI_PIC_STRUCT_FRAME_DOUBLING    = 7, ///<  7: %frame doubling
-    SEI_PIC_STRUCT_FRAME_TRIPLING    = 8  ///<  8: %frame tripling
-} SEI_PicStructType;
-
-#define FF_MAX_B_FRAMES 16
-
-
-//The following defines may change, don't expect compatibility if you use them.
-#define MB_TYPE_INTRA4x4   0x0001
-#define MB_TYPE_INTRA16x16 0x0002 //FIXME H.264-specific
-#define MB_TYPE_INTRA_PCM  0x0004 //FIXME H.264-specific
-#define MB_TYPE_16x16      0x0008
-#define MB_TYPE_16x8       0x0010
-#define MB_TYPE_8x16       0x0020
-#define MB_TYPE_8x8        0x0040
-#define MB_TYPE_INTERLACED 0x0080
-#define MB_TYPE_DIRECT2    0x0100 //FIXME
-#define MB_TYPE_ACPRED     0x0200
-#define MB_TYPE_GMC        0x0400
-#define MB_TYPE_SKIP       0x0800
-#define MB_TYPE_P0L0       0x1000
-#define MB_TYPE_P1L0       0x2000
-#define MB_TYPE_P0L1       0x4000
-#define MB_TYPE_P1L1       0x8000
-#define MB_TYPE_L0         (MB_TYPE_P0L0 | MB_TYPE_P1L0)
-#define MB_TYPE_L1         (MB_TYPE_P0L1 | MB_TYPE_P1L1)
-#define MB_TYPE_L0L1       (MB_TYPE_L0   | MB_TYPE_L1)
-#define MB_TYPE_QUANT      0x00010000
-#define MB_TYPE_CBP        0x00020000
-//Note bits 24-31 are reserved for codec specific use (h264 ref0, mpeg1 0mv, ...)
-
-#define FF_BUFFER_TYPE_INTERNAL 1
-#define FF_BUFFER_TYPE_USER     2 ///< direct rendering buffers (image is (de)allocated by user)
-#define FF_BUFFER_TYPE_SHARED   4 ///< Buffer from somewhere else; don't deallocate image (data/base), all other tables are not shared.
-#define FF_BUFFER_TYPE_COPY     8 ///< Just a (modified) copy of some other buffer, don't deallocate anything.
-
-
-#define FF_I_TYPE  1 ///< Intra
-#define FF_P_TYPE  2 ///< Predicted
-#define FF_B_TYPE  3 ///< Bi-dir predicted
-#define FF_S_TYPE  4 ///< S(GMC)-VOP MPEG4
-#define FF_SI_TYPE 5 ///< Switching Intra
-#define FF_SP_TYPE 6 ///< Switching Predicted
-#define FF_BI_TYPE 7
-
-#define MB_TYPE_INTRA MB_TYPE_INTRA4x4 //default mb_type if there is just one type
-#define IS_INTRA4x4(a)   ((a)&MB_TYPE_INTRA4x4)
-#define IS_INTRA16x16(a) ((a)&MB_TYPE_INTRA16x16)
-#define IS_PCM(a)        ((a)&MB_TYPE_INTRA_PCM)
-#define IS_INTRA(a)      ((a)&7)
-#define IS_INTER(a)      ((a)&(MB_TYPE_16x16|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8))
-#define IS_SKIP(a)       ((a)&MB_TYPE_SKIP)
-#define IS_INTRA_PCM(a)  ((a)&MB_TYPE_INTRA_PCM)
-#define IS_INTERLACED(a) ((a)&MB_TYPE_INTERLACED)
-#define IS_DIRECT(a)     ((a)&MB_TYPE_DIRECT2)
-#define IS_GMC(a)        ((a)&MB_TYPE_GMC)
-#define IS_16X16(a)      ((a)&MB_TYPE_16x16)
-#define IS_16X8(a)       ((a)&MB_TYPE_16x8)
-#define IS_8X16(a)       ((a)&MB_TYPE_8x16)
-#define IS_8X8(a)        ((a)&MB_TYPE_8x8)
-#define IS_SUB_8X8(a)    ((a)&MB_TYPE_16x16) //note reused
-#define IS_SUB_8X4(a)    ((a)&MB_TYPE_16x8)  //note reused
-#define IS_SUB_4X8(a)    ((a)&MB_TYPE_8x16)  //note reused
-#define IS_SUB_4X4(a)    ((a)&MB_TYPE_8x8)   //note reused
-#define IS_ACPRED(a)     ((a)&MB_TYPE_ACPRED)
-#define IS_QUANT(a)      ((a)&MB_TYPE_QUANT)
-#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list))))
-#define USES_LIST(a, list) ((a) & ((MB_TYPE_P0L0|MB_TYPE_P1L0)<<(2*(list)))) ///< does this mb use listX, note does not work if subMBs
-#define HAS_CBP(a)        ((a)&MB_TYPE_CBP)
-
-
-#define FF_MM_FORCE    0x80000000 /* Force usage of selected flags (OR) */
-    /* lower 16 bits - CPU features */
-#define FF_MM_MMX      0x0001 ///< standard MMX
-#define FF_MM_3DNOW    0x0004 ///< AMD 3DNOW
-#define FF_MM_MMX2     0x0002 ///< SSE integer functions or AMD MMX ext
-#define FF_MM_SSE      0x0008 ///< SSE functions
-#define FF_MM_SSE2     0x0010 ///< PIV SSE2 functions
-#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt
-#define FF_MM_SSE3     0x0040 ///< Prescott SSE3 functions
-#define FF_MM_SSSE3    0x0080 ///< Conroe SSSE3 functions
-#define FF_MM_SSE4     0x0100 ///< Penryn SSE4.1 functions
-#define FF_MM_SSE42    0x0200 ///< Nehalem SSE4.2 functions
-#define FF_MM_IWMMXT   0x0100 ///< XScale IWMMXT
-#define FF_MM_ALTIVEC  0x0001 ///< standard AltiVec
-
-
-/**
-* Sequence parameter set
-*/
-typedef struct SPS{
-
-    int profile_idc;
-    int level_idc;
-    int chroma_format_idc;
-    int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
-    int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
-    int poc_type;                      ///< pic_order_cnt_type
-    int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
-    int delta_pic_order_always_zero_flag;
-    int offset_for_non_ref_pic;
-    int offset_for_top_to_bottom_field;
-    int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
-    int ref_frame_count;               ///< num_ref_frames
-    int gaps_in_frame_num_allowed_flag;
-    int mb_width;                      ///< pic_width_in_mbs_minus1 + 1
-    int mb_height;                     ///< pic_height_in_map_units_minus1 + 1
-    int frame_mbs_only_flag;
-    int mb_aff;                        ///<mb_adaptive_frame_field_flag
-    int direct_8x8_inference_flag;
-    int crop;                   ///< frame_cropping_flag
-    unsigned int crop_left;            ///< frame_cropping_rect_left_offset
-    unsigned int crop_right;           ///< frame_cropping_rect_right_offset
-    unsigned int crop_top;             ///< frame_cropping_rect_top_offset
-    unsigned int crop_bottom;          ///< frame_cropping_rect_bottom_offset
-    int vui_parameters_present_flag;
-    int num,den;
-
-    int video_signal_type_present_flag;
-    int full_range;
-    int colour_description_present_flag;
-    enum AVColorPrimaries color_primaries;
-    enum AVColorTransferCharacteristic color_trc;
-    enum AVColorSpace colorspace;
-    int timing_info_present_flag;
-    uint32_t num_units_in_tick;
-    uint32_t time_scale;
-    int fixed_frame_rate_flag;
-    short offset_for_ref_frame[256]; //FIXME dyn aloc?
-    int bitstream_restriction_flag;
-    int num_reorder_frames;
-    int scaling_matrix_present;
-    uint8_t scaling_matrix4[6][16];
-    uint8_t scaling_matrix8[2][64];
-    int nal_hrd_parameters_present_flag;
-    int vcl_hrd_parameters_present_flag;
-    int pic_struct_present_flag;
-    int time_offset_length;
-    int cpb_cnt;                       ///< See H.264 E.1.2
-    int initial_cpb_removal_delay_length; ///< initial_cpb_removal_delay_length_minus1 +1
-    int cpb_removal_delay_length;      ///< cpb_removal_delay_length_minus1 + 1
-    int dpb_output_delay_length;       ///< dpb_output_delay_length_minus1 + 1
-    int bit_depth_luma;                ///< bit_depth_luma_minus8 + 8
-    int bit_depth_chroma;              ///< bit_depth_chroma_minus8 + 8
-    int residual_color_transform_flag; ///< residual_colour_transform_flag
-}SPS;
-
-/**
-* Picture parameter set
-*/
-typedef struct PPS{
-    unsigned int sps_id;
-    int cabac;                  ///< entropy_coding_mode_flag
-    int pic_order_present;      ///< pic_order_present_flag
-    int slice_group_count;      ///< num_slice_groups_minus1 + 1
-    int mb_slice_group_map_type;
-    unsigned int ref_count[2];  ///< num_ref_idx_l0/1_active_minus1 + 1
-    int weighted_pred;          ///< weighted_pred_flag
-    int weighted_bipred_idc;
-    int init_qp;                ///< pic_init_qp_minus26 + 26
-    int init_qs;                ///< pic_init_qs_minus26 + 26
-    int chroma_qp_index_offset[2];
-    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
-    int constrained_intra_pred; ///< constrained_intra_pred_flag
-    int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
-    int transform_8x8_mode;     ///< transform_8x8_mode_flag
-    uint8_t scaling_matrix4[6][16];
-    uint8_t scaling_matrix8[2][64];
-    uint8_t chroma_qp_table[2][64];  ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
-    int chroma_qp_diff;
-}PPS;
-
-typedef struct TopBorder{
-    uint8_t unfiltered_y[16];
-    uint8_t unfiltered_cb[8];
-    uint8_t unfiltered_cr[8];
-
-    uint8_t top_borders_y[16*4];
-    uint8_t top_borders_cb[8*2];
-    uint8_t top_borders_cr[8*2];
-}TopBorder;
-
-typedef struct LeftBorder{
-    uint8_t unfiltered_y[17];
-    uint8_t unfiltered_cb[9];
-    uint8_t unfiltered_cr[9];
-}LeftBorder;
-
-typedef struct H264Mb {
-    //variables copied in after cabac decoding
-    int16_t mb_x, mb_y;
-    int32_t mb_type;
-
-    uint16_t cbp;                                               // coded block pattern, idct, deblock
-    int8_t qscale_mb_xy;                                        // qp, deblock
-    int8_t qscale_left_mb_xy; //not required
-    int8_t qscale_top_mb_xy;
-
-    DECLARE_ALIGNED(8, uint16_t, sub_mb_type[4]);
-    DECLARE_ALIGNED(8, uint8_t, non_zero_count[24]);            //idct deblock
-    DECLARE_ALIGNED(16, int16_t, mb[16*24]);                    //coeffs, idct
-
-    union{
-        struct {
-        DECLARE_ALIGNED(8, int8_t, ref_index[2][4]);            //mc, deblock
-        DECLARE_ALIGNED(16, int16_t, mvd[2][16][2]);            //mc, deblock
-        };
-        struct {
-        DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode[16]);     //intra, deblock
-        int8_t chroma_pred_mode;                                //intra
-        int8_t intra16x16_pred_mode;                            //intra, deblock
-        };
-    };
-
-#if OMPSS
-    DECLARE_ALIGNED(8, uint8_t, top_border[16+ 2*8]);
-    DECLARE_ALIGNED(8, uint8_t, top_border_next[8]);
-    DECLARE_ALIGNED(8, uint8_t, left_border[17+2*9]);
-    int8_t intra4x4_pred_mode_left[4];
-#endif
-
-} H264Mb;
-
-typedef struct RawFrame {
-    uint8_t *data;
-    int size;
-    unsigned int data_size;
-    int64_t pos;                            ///< byte position in stream, -1 if unknown
-    int state;
-} RawFrame;
-
-typedef struct PictureInfo{
-    int ref_poc[2][16];      ///< h264 POCs of the frames used as reference
-    int ref_count[2];        ///< number of entries in ref_poc
-    int poc;                    ///< h264 frame POC
-    int frame_num;              ///< h264 frame_num (raw frame_num from slice header)
-    int pic_id;
-    int long_ref;
-    int cpn;                    ///coded picture number
-    int slice_type_nos;
-//     int key_frame;
-//     int mmco_reset;             ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET.
-
-    int reference;  //Set to 4 for delayed, non-reference frames. 1-3 for reference. FIXME
-
-}PictureInfo;
-
-typedef struct DecodedPicture{
-    int16_t (*motion_val[2])[2];
-    int16_t (*motion_val_base[2])[2];
-
-    /**
-    * motion reference frame index
-    * the order in which these are stored can depend on the codec.
-    * - encoding: Set by user.
-    * - decoding: Set by libavcodec.
-    */
-    int8_t *ref_index[2];
-    uint32_t *mb_type;          //mb_type_base + mb_width + 2
-    uint32_t *mb_type_base;
-
-    int8_t *intra4x4_pred_mode;
-    int8_t *non_zero_count;
-
-    uint8_t *data[3]; //point to first pixel in the frame
-    int linesize[3];
-    uint8_t *base[3]; //base of picture planes
-
-    int cpn;                /// coded picture number
-    int poc;                    ///< h264 frame POC
-    int reference;  // 0 -> free, 1 -> needs to be displayed, 2 -> needed for reference, 3 -> 1 && 2
-    int key_frame;
-    int mmco_reset;             ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET.
-
-} DecodedPicture;
-
-
-#endif /* AVCODEC_AVCODEC_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cabac.c
--- a/ffmpeg_smp/h264dec/libavcodec/cabac.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,242 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * Context Adaptive Binary Arithmetic Coder.
- */
-
-#include <string.h>
-
-#include "libavutil/common.h"
-//#include "get_bits.h"
-#include "cabac.h"
-
-static const uint8_t lps_range[64][4]= {
-{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205},
-{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166},
-{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135},
-{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110},
-{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89},
-{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72},
-{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59},
-{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48},
-{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39},
-{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31},
-{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25},
-{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21},
-{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17},
-{ 10, 12, 14, 16}, {  9, 11, 13, 15}, {  9, 11, 12, 14}, {  8, 10, 12, 14},
-{  8,  9, 11, 13}, {  7,  9, 11, 12}, {  7,  9, 10, 12}, {  7,  8, 10, 11},
-{  6,  8,  9, 11}, {  6,  7,  9, 10}, {  6,  7,  8,  9}, {  2,  2,  2,  2},
-};
-
-uint8_t ff_h264_mlps_state[4*64];
-uint8_t ff_h264_lps_range[4*2*64];
-uint8_t ff_h264_lps_state[2*64];
-uint8_t ff_h264_mps_state[2*64];
-
-static const uint8_t mps_state[64]= {
-  1, 2, 3, 4, 5, 6, 7, 8,
-  9,10,11,12,13,14,15,16,
- 17,18,19,20,21,22,23,24,
- 25,26,27,28,29,30,31,32,
- 33,34,35,36,37,38,39,40,
- 41,42,43,44,45,46,47,48,
- 49,50,51,52,53,54,55,56,
- 57,58,59,60,61,62,62,63,
-};
-
-static const uint8_t lps_state[64]= {
-  0, 0, 1, 2, 2, 4, 4, 5,
-  6, 7, 8, 9, 9,11,11,12,
- 13,13,15,15,16,16,18,18,
- 19,19,21,21,22,22,23,24,
- 24,25,26,26,27,27,28,29,
- 29,30,30,30,31,32,32,33,
- 33,33,34,34,35,35,35,36,
- 36,36,37,37,37,38,38,63,
-};
-
-const uint8_t ff_h264_norm_shift[512]= {
- 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5,
- 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-};
-
-/**
- *
- * @param buf_size size of buf in bits
- */
-void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
-    c->bytestream_start=
-    c->bytestream= buf;
-    c->bytestream_end= buf + buf_size;
-
-#if CABAC_BITS == 16
-    c->low =  (*c->bytestream++)<<18;
-    c->low+=  (*c->bytestream++)<<10;
-#else
-    c->low =  (*c->bytestream++)<<10;
-#endif
-    c->low+= ((*c->bytestream++)<<2) + 2;
-    c->range= 0x1FE;
-}
-
-void ff_init_cabac_states(){
-    int i, j;
-
-    for(i=0; i<64; i++){
-        for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save
-            ff_h264_lps_range[j*2*64+2*i+0]=
-            ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j];
-        }
-
-        ff_h264_mlps_state[128+2*i+0]=
-        ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0;
-        ff_h264_mlps_state[128+2*i+1]=
-        ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1;
-
-        if( i ){
-#ifdef BRANCHLESS_CABAC_DECODER
-            ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0;
-            ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1;
-        }else{
-            ff_h264_mlps_state[128-2*i-1]= 1;
-            ff_h264_mlps_state[128-2*i-2]= 0;
-#else
-            ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0;
-            ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1;
-        }else{
-            ff_h264_lps_state[2*i+0]= 1;
-            ff_h264_lps_state[2*i+1]= 0;
-#endif
-        }
-    }
-}
-
-#ifdef TEST
-#define SIZE 10240
-#define START_TIMER
-#define STOP_TIMER(...)
-#define av_log(...)
-// #include "libavutil/lfg.h"
-#include "avcodec.h"
-#include "cabac.h"
-
-int main(void){
-    CABACContext c;
-    uint8_t b[9*SIZE];
-    uint8_t r[9*SIZE];
-    int i;
-    uint8_t state[10]= {0};
-//    AVLFG prng;
-
-// //     av_lfg_init(&prng, 1);
-//     ff_init_cabac_encoder(&c, b, SIZE);
-//     ff_init_cabac_states();
-//
-//     for(i=0; i<SIZE; i++){
-//         r[i] = i%7; //av_lfg_get(&prng) % 7;
-//     }
-//
-//     for(i=0; i<SIZE; i++){
-// START_TIMER
-//         put_cabac_bypass(&c, r[i]&1);
-// STOP_TIMER("put_cabac_bypass")
-//     }
-//
-//     for(i=0; i<SIZE; i++){
-// START_TIMER
-//         put_cabac(&c, state, r[i]&1);
-// STOP_TIMER("put_cabac")
-//     }
-//
-//     for(i=0; i<SIZE; i++){
-// START_TIMER
-//         put_cabac_u(&c, state, r[i], 6, 3, i&1);
-// STOP_TIMER("put_cabac_u")
-//     }
-//
-//     for(i=0; i<SIZE; i++){
-// START_TIMER
-//         put_cabac_ueg(&c, state, r[i], 3, 0, 1, 2);
-// STOP_TIMER("put_cabac_ueg")
-//     }
-//
-//     put_cabac_terminate(&c, 1);
-
-    ff_init_cabac_decoder(&c, b, SIZE);
-
-    memset(state, 0, sizeof(state));
-
-    for(i=0; i<SIZE; i++){
-START_TIMER
-        if( (r[i]&1) != get_cabac_bypass(&c) )
-            av_log(NULL, AV_LOG_ERROR, "CABAC bypass failure at %d\n", i);
-STOP_TIMER("get_cabac_bypass")
-    }
-
-    for(i=0; i<SIZE; i++){
-START_TIMER
-        if( (r[i]&1) != get_cabac(&c, state) )
-            av_log(NULL, AV_LOG_ERROR, "CABAC failure at %d\n", i);
-STOP_TIMER("get_cabac")
-    }
-#if 0
-    for(i=0; i<SIZE; i++){
-START_TIMER
-        if( r[i] != get_cabac_u(&c, state, (i&1) ? 6 : 7, 3, i&1) )
-            av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i);
-STOP_TIMER("get_cabac_u")
-    }
-
-    for(i=0; i<SIZE; i++){
-START_TIMER
-        if( r[i] != get_cabac_ueg(&c, state, 3, 0, 1, 2))
-            av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i);
-STOP_TIMER("get_cabac_ueg")
-    }
-#endif
-    if(!get_cabac_terminate(&c))
-        av_log(NULL, AV_LOG_ERROR, "where's the Terminator?\n");
-
-    return 0;
-}
-
-#endif /* TEST */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cabac.h
--- a/ffmpeg_smp/h264dec/libavcodec/cabac.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,206 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * Context Adaptive Binary Arithmetic Coder.
- */
-
-#ifndef AVCODEC_CABAC_H
-#define AVCODEC_CABAC_H
-
-//#undef NDEBUG
-#include <assert.h>
-#include "libavutil/x86_cpu.h"
-#include "libavutil/attributes.h"
-
-#define CABAC_BITS 16
-#define CABAC_MASK ((1<<CABAC_BITS)-1)
-#define BRANCHLESS_CABAC_DECODER 1
-
-typedef struct CABACContext{
-    int low;
-    int range;
-    int outstanding_count;
-#ifdef STRICT_LIMITS
-    int symCount;
-#endif
-    const uint8_t *bytestream_start;
-    const uint8_t *bytestream;
-    const uint8_t *bytestream_end;
-    uint8_t  cabac_state[460];
-}CABACContext;
-
-extern uint8_t ff_h264_mlps_state[4*64];
-extern uint8_t ff_h264_lps_range[4*2*64];  ///< rangeTabLPS
-extern uint8_t ff_h264_mps_state[2*64];     ///< transIdxMPS
-extern uint8_t ff_h264_lps_state[2*64];     ///< transIdxLPS
-extern const uint8_t ff_h264_norm_shift[512];
-
-void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
-void ff_init_cabac_states(void);
-
-static void refill(CABACContext *c){
-#if CABAC_BITS == 16
-        c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
-#else
-        c->low+= c->bytestream[0]<<1;
-#endif
-    c->low -= CABAC_MASK;
-    c->bytestream+= CABAC_BITS/8;
-}
-
-static void refill2(CABACContext *c){
-    int i, x;
-
-    x= c->low ^ (c->low-1);
-    i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
-
-    x= -CABAC_MASK;
-
-#if CABAC_BITS == 16
-        x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
-#else
-        x+= c->bytestream[0]<<1;
-#endif
-
-    c->low += x<<i;
-    c->bytestream+= CABAC_BITS/8;
-}
-
-static inline void renorm_cabac_decoder(CABACContext *c){
-    while(c->range < 0x100){
-        c->range+= c->range;
-        c->low+= c->low;
-        if(!(c->low & CABAC_MASK))
-            refill(c);
-    }
-}
-
-static inline void renorm_cabac_decoder_once(CABACContext *c){
-
-    int shift= (uint32_t)(c->range - 0x100)>>31;
-    c->range<<= shift;
-    c->low  <<= shift;
-
-    if(!(c->low & CABAC_MASK))
-        refill(c);
-}
-
-static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
-
-    int s = *state;
-    int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
-    int bit, lps_mask av_unused;
-
-    c->range -= RangeLPS;
-#ifndef BRANCHLESS_CABAC_DECODER
-    if(c->low < (c->range<<(CABAC_BITS+1))){
-        bit= s&1;
-        *state= ff_h264_mps_state[s];
-        renorm_cabac_decoder_once(c);
-    }else{
-        bit= ff_h264_norm_shift[RangeLPS];
-        c->low -= (c->range<<(CABAC_BITS+1));
-        *state= ff_h264_lps_state[s];
-        c->range = RangeLPS<<bit;
-        c->low <<= bit;
-        bit= (s&1)^1;
-
-        if(!(c->low & CABAC_MASK)){
-            refill2(c);
-        }
-    }
-#else /* BRANCHLESS_CABAC_DECODER */
-    lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31;
-
-    c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask;
-    c->range += (RangeLPS - c->range) & lps_mask;
-
-    s^=lps_mask;
-    *state= (ff_h264_mlps_state+128)[s];
-    bit= s&1;
-
-    lps_mask= ff_h264_norm_shift[c->range];
-    c->range<<= lps_mask;
-    c->low  <<= lps_mask;
-    if(!(c->low & CABAC_MASK))
-        refill2(c);
-#endif /* BRANCHLESS_CABAC_DECODER */
-
-    return bit;
-}
-
-static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){
-    return get_cabac_inline(c, state);
-}
-
-static int av_unused get_cabac(CABACContext *c, uint8_t * const state){
-    return get_cabac_inline(c, state);
-}
-
-static int av_unused get_cabac_bypass(CABACContext *c){
-
-    int range;
-    c->low += c->low;
-
-    if(!(c->low & CABAC_MASK))
-        refill(c);
-
-    range= c->range<<(CABAC_BITS+1);
-    if(c->low < range){
-        return 0;
-    }else{
-        c->low -= range;
-        return 1;
-    }
-}
-
-static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
-    int range, mask;
-    c->low += c->low;
-
-    if(!(c->low & CABAC_MASK))
-        refill(c);
-
-    range= c->range<<(CABAC_BITS+1);
-    c->low -= range;
-    mask= c->low >> 31;
-    range &= mask;
-    c->low += range;
-    return (val^mask)-mask;
-}
-
-/**
- *
- * @return the number of bytes read or 0 if no end
- */
-static int av_unused get_cabac_terminate(CABACContext *c){
-    c->range -= 2;
-    if(c->low < c->range<<(CABAC_BITS+1)){
-        renorm_cabac_decoder_once(c);
-        return 0;
-    }else{
-        return c->bytestream - c->bytestream_start;
-    }
-}
-
-#endif /* AVCODEC_CABAC_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,140 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * Context Adaptive Binary Arithmetic Coder.
- */
-
-#include <string.h>
-
-#include "libavutil/common.h"
-//#include "get_bits.h"
-#include "cabac_spu.h"
-#define av_log(...)
-
-int bytecount =0;
-static const uint8_t lps_range[64][4]= {
-{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205},
-{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166},
-{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135},
-{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110},
-{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89},
-{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72},
-{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59},
-{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48},
-{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39},
-{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31},
-{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25},
-{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21},
-{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17},
-{ 10, 12, 14, 16}, {  9, 11, 13, 15}, {  9, 11, 12, 14}, {  8, 10, 12, 14},
-{  8,  9, 11, 13}, {  7,  9, 11, 12}, {  7,  9, 10, 12}, {  7,  8, 10, 11},
-{  6,  8,  9, 11}, {  6,  7,  9, 10}, {  6,  7,  8,  9}, {  2,  2,  2,  2},
-};
-
-uint8_t ff_h264_mlps_state[4*64];
-uint8_t ff_h264_lps_range[4*2*64];
-uint8_t ff_h264_lps_state[2*64];
-uint8_t ff_h264_mps_state[2*64];
-
-static const uint8_t mps_state[64]= {
-  1, 2, 3, 4, 5, 6, 7, 8,
-  9,10,11,12,13,14,15,16,
- 17,18,19,20,21,22,23,24,
- 25,26,27,28,29,30,31,32,
- 33,34,35,36,37,38,39,40,
- 41,42,43,44,45,46,47,48,
- 49,50,51,52,53,54,55,56,
- 57,58,59,60,61,62,62,63,
-};
-
-static const uint8_t lps_state[64]= {
-  0, 0, 1, 2, 2, 4, 4, 5,
-  6, 7, 8, 9, 9,11,11,12,
- 13,13,15,15,16,16,18,18,
- 19,19,21,21,22,22,23,24,
- 24,25,26,26,27,27,28,29,
- 29,30,30,30,31,32,32,33,
- 33,33,34,34,35,35,35,36,
- 36,36,37,37,37,38,38,63,
-};
-
-const uint8_t ff_h264_norm_shift[512]= {
- 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5,
- 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-};
-
-/**
- *
- * @param buf_size size of buf in bits
- */
-
-void ff_init_cabac_states(){
-    int i, j;
-
-    for(i=0; i<64; i++){
-        for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save
-            ff_h264_lps_range[j*2*64+2*i+0]=
-            ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j];
-        }
-
-        ff_h264_mlps_state[128+2*i+0]=
-        ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0;
-        ff_h264_mlps_state[128+2*i+1]=
-        ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1;
-
-        if( i ){
-#ifdef BRANCHLESS_CABAC_DECODER
-            ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0;
-            ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1;
-        }else{
-            ff_h264_mlps_state[128-2*i-1]= 1;
-            ff_h264_mlps_state[128-2*i-2]= 0;
-#else
-            ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0;
-            ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1;
-        }else{
-            ff_h264_lps_state[2*i+0]= 1;
-            ff_h264_lps_state[2*i+1]= 0;
-#endif
-        }
-    }
-}
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,233 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * Context Adaptive Binary Arithmetic Coder.
- */
-
-#ifndef AVCODEC_CABAC_H
-#define AVCODEC_CABAC_H
-
-//#undef NDEBUG
-#include <assert.h>
-#include "h264_dma.h"
-#include "libavutil/x86_cpu.h"
-#include "libavutil/attributes.h"
-
-#define CABAC_BITS 16
-#define CABAC_MASK ((1<<CABAC_BITS)-1)
-#define BRANCHLESS_CABAC_DECODER 1
-
-typedef struct CABACContext{
-    int low;
-    int range;
-    int outstanding_count;
-#ifdef STRICT_LIMITS
-    int symCount;
-#endif
-	const uint8_t *bytestream_ea_start;
-    const uint8_t *bytestream_ea;
-	const uint8_t *bytestream_ea_end;
-	int slot;
-	int bufsize;
-
-	uint8_t *bytestream_start;
-    uint8_t *bytestream;
-    uint8_t *bytestream_end;
-    uint8_t  cabac_state[460];
-}CABACContext;
-
-extern uint8_t ff_h264_mlps_state[4*64];
-extern uint8_t ff_h264_lps_range[4*2*64];  ///< rangeTabLPS
-extern uint8_t ff_h264_mps_state[2*64];     ///< transIdxMPS
-extern uint8_t ff_h264_lps_state[2*64];     ///< transIdxLPS
-extern const uint8_t ff_h264_norm_shift[512];
-
-void ff_init_cabac_states(void);
-
-extern DECLARE_ALIGNED(128,uint8_t, bytestream_ls[4096]);
-extern int bytecount;
-static inline void dma_cabac(CABACContext *c){
-	bytecount++;
-	if (c->bytestream == c->bytestream_end){
-		if (c->bufsize>0){
-			int size = (c->bufsize > sizeof(bytestream_ls)) ?  sizeof(bytestream_ls) : c->bufsize;
-			int align = size &0xF;
-			int dma_size = size + (align? 16-align : 0);
-
-			spu_dma_get(bytestream_ls, (unsigned) c->bytestream_ea, dma_size, ED_raw);
-			wait_dma_id(ED_raw);
-			c->bytestream = bytestream_ls;
-			c->bytestream_end = &bytestream_ls[size];
-			c->bytestream_ea += dma_size;
-			c->bufsize -= size;
-		} 
-		bytecount =0;
-	}else if((unsigned)c->bytestream > (unsigned)c->bytestream_end +2){		
-		//fprintf(stderr, "Read beyond end of frame %d\n", c->bufsize);
-		bytecount =0;
-	}
-}
-
-static void refill(CABACContext *c){
-	dma_cabac(c); 
-
-	c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
-
-    c->low -= CABAC_MASK;
-    c->bytestream+= CABAC_BITS/8;
-}
-
-static void refill2(CABACContext *c){
-    int i, x;
-
-	dma_cabac(c);
-
-    x= c->low ^ (c->low-1);
-    i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
-
-    x= -CABAC_MASK;
-
-	x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
-
-    c->low += x<<i;
-    c->bytestream+= CABAC_BITS/8;
-}
-
-static inline void renorm_cabac_decoder(CABACContext *c){
-    while(c->range < 0x100){
-        c->range+= c->range;
-        c->low+= c->low;
-        if(!(c->low & CABAC_MASK))
-            refill(c);
-    }
-}
-
-static inline void renorm_cabac_decoder_once(CABACContext *c){
-
-    int shift= (uint32_t)(c->range - 0x100)>>31;
-    c->range<<= shift;
-    c->low  <<= shift;
-
-    if(!(c->low & CABAC_MASK))
-        refill(c);
-}
-
-static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
-
-    int s = *state;
-    int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
-    int bit, lps_mask av_unused;
-
-    c->range -= RangeLPS;
-#ifndef BRANCHLESS_CABAC_DECODER
-    if(c->low < (c->range<<(CABAC_BITS+1))){
-        bit= s&1;
-        *state= ff_h264_mps_state[s];
-        renorm_cabac_decoder_once(c);
-    }else{
-        bit= ff_h264_norm_shift[RangeLPS];
-        c->low -= (c->range<<(CABAC_BITS+1));
-        *state= ff_h264_lps_state[s];
-        c->range = RangeLPS<<bit;
-        c->low <<= bit;
-        bit= (s&1)^1;
-
-        if(!(c->low & CABAC_MASK)){
-            refill2(c);
-        }
-    }
-#else /* BRANCHLESS_CABAC_DECODER */
-    lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31;
-
-    c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask;
-    c->range += (RangeLPS - c->range) & lps_mask;
-
-    s^=lps_mask;
-    *state= (ff_h264_mlps_state+128)[s];
-    bit= s&1;
-
-    lps_mask= ff_h264_norm_shift[c->range];
-    c->range<<= lps_mask;
-    c->low  <<= lps_mask;
-    if(!(c->low & CABAC_MASK))
-        refill2(c);
-#endif /* BRANCHLESS_CABAC_DECODER */
-
-    return bit;
-}
-
-static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){
-    return get_cabac_inline(c, state);
-}
-
-static int av_unused get_cabac(CABACContext *c, uint8_t * const state){
-    return get_cabac_inline(c, state);
-}
-
-static int av_unused get_cabac_bypass(CABACContext *c){
-
-    int range;
-    c->low += c->low;
-
-    if(!(c->low & CABAC_MASK))
-        refill(c);
-
-    range= c->range<<(CABAC_BITS+1);
-    if(c->low < range){
-        return 0;
-    }else{
-        c->low -= range;
-        return 1;
-    }
-}
-
-static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
-    int range, mask;
-    c->low += c->low;
-
-    if(!(c->low & CABAC_MASK))
-        refill(c);
-
-    range= c->range<<(CABAC_BITS+1);
-    c->low -= range;
-    mask= c->low >> 31;
-    range &= mask;
-    c->low += range;
-    return (val^mask)-mask;
-}
-
-/**
- *
- * @return the number of bytes read or 0 if no end
- */
-static int av_unused get_cabac_terminate(CABACContext *c){
-    c->range -= 2;
-    if(c->low < c->range<<(CABAC_BITS+1)){
-        renorm_cabac_decoder_once(c);
-        return 0;
-    }else{
-        return c->bytestream - c->bytestream_start;
-    }
-}
-
-#endif /* AVCODEC_CABAC_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1147 +0,0 @@
-/*
- * Copyright (c) 2009 TUDelft 
- * 
- * Cell Parallel SPU - 2DWave Macroblock Decoding. 
- */
-
-/**
- * @file libavcodec/cell/spu/h264_main_spu.c
- * Cell Parallel SPU - 2DWave Macroblock Decoding
- * @author C C Chi <c.c.chi@student.tudelft.nl>
- * 
- * SIMD SPU kernels 
- * H.264/AVC motion compensation
- * @author Mauricio Alvarez <alvarez@ac.upc.edu>
- * @author Albert Paradis <apar7632@hotmail.com>
- */ 
-
-
-#include "dsputil_spu.h"
-#include "h264_idct_spu.h"
-#include "h264_deblock_spu.h"
-#include "types_spu.h"
-#include "libavutil/intreadwrite.h"
-
-#include <stdio.h>
-#include <spu_intrinsics.h>
-#include <spu_mfcio.h>
-#include <assert.h>
-
-//Luma interpolation
-#define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s
-#define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s)
-
-#define OP_U8_SPU                          PUT_OP_U8_SPU
-#define PREFIX_h264_qpel16_h_lowpass_spu   put_h264_qpel16_h_lowpass_spu
-#define PREFIX_h264_qpel16_v_lowpass_spu   put_h264_qpel16_v_lowpass_spu
-#define PREFIX_h264_qpel16_hv_lowpass_spu  put_h264_qpel16_hv_lowpass_spu
-#define PREFIX_h264_qpel8_h_lowpass_spu    put_h264_qpel8_h_lowpass_spu
-#define PREFIX_h264_qpel8_v_lowpass_spu    put_h264_qpel8_v_lowpass_spu
-#define PREFIX_h264_qpel8_hv_lowpass_spu   put_h264_qpel8_hv_lowpass_spu
-#define PREFIX_h264_qpel4_h_lowpass_spu    put_h264_qpel4_h_lowpass_spu
-#define PREFIX_h264_qpel4_v_lowpass_spu    put_h264_qpel4_v_lowpass_spu
-#define PREFIX_h264_qpel4_hv_lowpass_spu   put_h264_qpel4_hv_lowpass_spu
-#include "h264_luma_template_spu.c"
-#undef OP_U8_SPU                          
-#undef PREFIX_h264_qpel16_h_lowpass_spu
-#undef PREFIX_h264_qpel16_v_lowpass_spu
-#undef PREFIX_h264_qpel16_hv_lowpass_spu
-#undef PREFIX_h264_qpel8_h_lowpass_spu
-#undef PREFIX_h264_qpel8_v_lowpass_spu
-#undef PREFIX_h264_qpel8_hv_lowpass_spu
-#undef PREFIX_h264_qpel4_h_lowpass_spu
-#undef PREFIX_h264_qpel4_v_lowpass_spu
-#undef PREFIX_h264_qpel4_hv_lowpass_spu
-
-#define OP_U8_SPU                          AVG_OP_U8_SPU
-#define PREFIX_h264_qpel16_h_lowpass_spu   avg_h264_qpel16_h_lowpass_spu
-#define PREFIX_h264_qpel16_v_lowpass_spu   avg_h264_qpel16_v_lowpass_spu
-#define PREFIX_h264_qpel16_hv_lowpass_spu  avg_h264_qpel16_hv_lowpass_spu
-#define PREFIX_h264_qpel8_h_lowpass_spu    avg_h264_qpel8_h_lowpass_spu
-#define PREFIX_h264_qpel8_v_lowpass_spu    avg_h264_qpel8_v_lowpass_spu
-#define PREFIX_h264_qpel8_hv_lowpass_spu   avg_h264_qpel8_hv_lowpass_spu
-#define PREFIX_h264_qpel4_h_lowpass_spu    avg_h264_qpel4_h_lowpass_spu
-#define PREFIX_h264_qpel4_v_lowpass_spu    avg_h264_qpel4_v_lowpass_spu
-#define PREFIX_h264_qpel4_hv_lowpass_spu   avg_h264_qpel4_hv_lowpass_spu
-#include "h264_luma_template_spu.c"
-#undef OP_U8_SPU                          
-#undef PREFIX_h264_qpel16_h_lowpass_spu
-#undef PREFIX_h264_qpel16_v_lowpass_spu
-#undef PREFIX_h264_qpel16_hv_lowpass_spu
-#undef PREFIX_h264_qpel8_h_lowpass_spu
-#undef PREFIX_h264_qpel8_v_lowpass_spu
-#undef PREFIX_h264_qpel8_hv_lowpass_spu
-#undef PREFIX_h264_qpel4_h_lowpass_spu
-#undef PREFIX_h264_qpel4_v_lowpass_spu
-#undef PREFIX_h264_qpel4_hv_lowpass_spu
-
-#define H264_MC(OPNAME, SIZE, CODETYPE) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){ \
-    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
-    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
-    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
-    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
-    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
-    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
-    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
-    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
-    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
-    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
-    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
-    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
-    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
-    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
-    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
-}\
-
-
-/**************************/
-/* put pixels functions   */
-/*************************/
-
-static void put_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,
-                                    const uint8_t * src2, int dst_stride,
-                                    int src_stride1, int h)
-{
-  int i;
-
-  const int perm_src1 = (unsigned int) src1 & 15;
-
-  for (i=0; i<h; i++){
-      //unaligned load of src1
-      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
-      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
-      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
-
-      //aligned load of src2
-      const vuint8_t srcb = *(vuint8_t *)(src2);
-
-      //average and rounding
-      const vuint8_t avgc = spu_avg(srca,srcb);
-
-      // 16x16 dest luma blocks are always aligned
-      *(vuint8_t *)dst=avgc;
-
-      src1 +=src_stride1;
-      src2 +=16;
-      dst  +=dst_stride;
-  }
-}
-
-static void avg_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,
-                                    const uint8_t * src2, int dst_stride,
-                                    int src_stride1, int h)
-{
-  int i;
-
-  const int perm_src1 = (unsigned int) src1 & 15;
-
-  for (i=0; i<h; i++){
-      //unaligned load of src1
-      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
-      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
-      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
-
-      //aligned load of src2
-      const vuint8_t srcb = *(vuint8_t *)(src2);
-
-      //average and rounding
-      const vuint8_t avgc = spu_avg(spu_avg(srca,srcb), *(vuint8_t *)dst);
-
-      // 16x16 dest luma blocks are always aligned
-      *(vuint8_t *)dst=avgc;
-
-      src1 +=src_stride1;
-      src2 +=16;
-      dst  +=dst_stride;
-  }
-}
-
-// next one assumes that ((line_size % 16) == 0)
-void put_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
-{
-    register vector unsigned char pixelsv1, pixelsv2;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    const int perm = (unsigned int) src & 15;
-    int i;
-	register int line_size   = src_stride;
-    register int line_size_2 = line_size << 1;
-    register int line_size_3 = line_size + line_size_2;
-    register int line_size_4 = line_size << 2;
-
-    register int dst_stride_2 = dst_stride << 1;
-    register int dst_stride_3 = dst_stride_2 + dst_stride;
-    register int dst_stride_4 = dst_stride << 2;
-
-    for(i=0; i<h; i+=4) {
-      pixelsv1 = *(vuint8_t *)(src);
-      pixelsv2 = *(vuint8_t *)(src+16);
-      pixelsv1B = *(vuint8_t *)(src + line_size);
-      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
-      pixelsv1C = *(vuint8_t *)(src + line_size_2);
-      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
-      pixelsv1D = *(vuint8_t *)(src + line_size_3);
-      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
-
-      *(vuint8_t *) dst                 = spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16));
-      *(vuint8_t *)(dst +   dst_stride) = spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16));
-      *(vuint8_t *)(dst + dst_stride_2) = spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16));
-      *(vuint8_t *)(dst + dst_stride_3) = spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16));
-
-      src+= line_size_4;
-      dst+= dst_stride_4;
-    }
-}
-
-// next one assumes that ((line_size % 16) == 0)
-void avg_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
-{
-    register vector unsigned char pixelsv1, pixelsv2;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    const int perm = (unsigned int) src & 15;
-    int i;
-	register int line_size   = src_stride;
-    register int line_size_2 = line_size << 1;
-    register int line_size_3 = line_size + line_size_2;
-    register int line_size_4 = line_size << 2;
-
-    register int dst_stride_2 = dst_stride << 1;
-    register int dst_stride_3 = dst_stride_2 + dst_stride;
-    register int dst_stride_4 = dst_stride << 2;
-
-
-    for(i=0; i<h; i+=4) {
-      pixelsv1 = *(vuint8_t *)(src);
-      pixelsv2 = *(vuint8_t *)(src+16);
-      pixelsv1B = *(vuint8_t *)(src + line_size);
-      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
-      pixelsv1C = *(vuint8_t *)(src + line_size_2);
-      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
-      pixelsv1D = *(vuint8_t *)(src + line_size_3);
-      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
-
-      *(vuint8_t *)dst = spu_avg(spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16)), *(vuint8_t *)dst);
-      *(vuint8_t *)(dst + dst_stride) = spu_avg(spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), *(vuint8_t *)(dst+dst_stride));
-      *(vuint8_t *)(dst + dst_stride_2) = spu_avg(spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), *(vuint8_t *)(dst+dst_stride_2));
-      *(vuint8_t *)(dst + dst_stride_3) = spu_avg(spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), *(vuint8_t *)(dst+dst_stride_3));
-
-      src+= line_size_4;
-      dst+= dst_stride_4;
-    }
-}
-
-void put_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
-				   int dst_stride, int src_stride1, int h)
-{
-  int i;
-
-  const int perm_src1 = (unsigned int) src1 & 15;
-  const int shift_dst = (unsigned int) dst & 15;
-
-  // 8x dest luma blocks are aligned or desaligned by 8
-  vuint8_t dstmask;
-  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
-
-  if(shift_dst==0){
-    dstmask = dst8mask1;
-  }
-  else{
-    dstmask = dst8mask2;
-  }
-
-  for (i=0; i<h; i++){
-      //unaligned load of src1
-      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
-      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
-      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
-
-      //aligned load of src2
-      const vuint8_t srcb = *(vuint8_t *)(src2);
-
-      //average and rounding
-      const vuint8_t avgc = spu_avg(srca,srcb);
-
-      const vuint8_t dst1 = *(vuint8_t *)dst;
-
-      const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask);
-
-      *(vuint8_t *)dst=davgc;
-
-      src1 +=src_stride1;
-      src2 +=16;
-      dst  +=dst_stride;
-  }
-}
-
-void avg_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
-				   int dst_stride, int src_stride1, int h)
-{
-  int i;
-
-  const int perm_src1 = (unsigned int) src1 & 15;
-  const int shift_dst = (unsigned int) dst & 15;
-
-  // 8x dest luma blocks are aligned or desaligned by 8
-  vuint8_t dstmask;
-  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
-
-  if(shift_dst==0){
-    dstmask = dst8mask1;
-  }
-  else{
-    dstmask = dst8mask2;
-  }
-
-  for (i=0; i<h; i++){
-      //unaligned load of src1
-      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
-      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
-      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
-
-      //aligned load of src2
-      const vuint8_t srcb = *(vuint8_t *)(src2);
-
-      //average and rounding
-      const vuint8_t avgc = spu_avg(srca,srcb);
-
-      const vuint8_t dst1 = *(vuint8_t *)dst;
-
-      const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask);
-
-      const vuint8_t davgc = spu_avg(dst1,davgc1);
-
-      *(vuint8_t *)dst=davgc;
-
-      src1 +=src_stride1;
-      src2 +=16;
-      dst  +=dst_stride;
-  }
-}
-
-// next one assumes that ((line_size % 16) == 0)
-void put_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
-{
-    register vector unsigned char pixelsv1A, pixelsv2A;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    const int perm = (unsigned int) src & 15;
-    const int shift_dst = (unsigned int) dst & 15;
-
-    // 8x dest luma blocks are aligned or desaligned by 8
-    vuint8_t dstmask;
-    const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-    const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
-
-    if(shift_dst==0){
-      dstmask = dst8mask1;
-    }
-    else{
-      dstmask = dst8mask2;
-    }
-
-    int i;
-	register int line_size   = src_stride;
-    register int line_size_2 = line_size << 1;
-    register int line_size_3 = line_size + line_size_2;
-    register int line_size_4 = line_size << 2;
-
-    register int dst_stride_2 = dst_stride << 1;
-    register int dst_stride_3 = dst_stride_2 + dst_stride;
-    register int dst_stride_4 = dst_stride << 2;
-
-    for(i=0; i<h; i+=4) {
-      pixelsv1A = *(vuint8_t *)(src);
-      pixelsv2A = *(vuint8_t *)(src+16);
-      pixelsv1B = *(vuint8_t *)(src + line_size);
-      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
-      pixelsv1C = *(vuint8_t *)(src + line_size_2);
-      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
-      pixelsv1D = *(vuint8_t *)(src + line_size_3);
-      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
-
-      const vuint8_t block1 = *(vuint8_t *)dst;
-      const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
-      const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride);
-      const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
-      const vuint8_t block3 = *(vuint8_t *)(dst+2*dst_stride);
-      const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
-      const vuint8_t block4 = *(vuint8_t *)(dst+3*dst_stride);
-      const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
-
-      *(vuint8_t *) dst = put1;
-      *(vuint8_t *)(dst + dst_stride) = put2;
-      *(vuint8_t *)(dst + dst_stride_2) = put3;
-      *(vuint8_t *)(dst + dst_stride_3) = put4;
-
-      src += line_size_4;
-      dst += dst_stride_4;
-    }
-}
-
-// next one assumes that ((line_size % 16) == 0)
-void avg_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
-{
-    register vector unsigned char pixelsv1A, pixelsv2A;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    const int perm = (unsigned int) src & 15;
-    const int shift_dst = (unsigned int) dst & 15;
-
-    // 8x dest luma blocks are aligned or desaligned by 8
-    vuint8_t dstmask;
-    const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-    const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
-
-    if(shift_dst==0){
-      dstmask = dst8mask1;
-    }
-    else{
-      dstmask = dst8mask2;
-    }
-
-    int i;
-	register int line_size   = src_stride;
-    register int line_size_2 = line_size << 1;
-    register int line_size_3 = line_size + line_size_2;
-    register int line_size_4 = line_size << 2;
-
-	register int dst_stride_2 = dst_stride << 1;
-    register int dst_stride_3 = dst_stride_2 + dst_stride;
-    register int dst_stride_4 = dst_stride << 2;
-
-    for(i=0; i<h; i+=4) {
-      pixelsv1A = *(vuint8_t *)(src);
-      pixelsv2A = *(vuint8_t *)(src+16);
-      pixelsv1B = *(vuint8_t *)(src + line_size);
-      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
-      pixelsv1C = *(vuint8_t *)(src + line_size_2);
-      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
-      pixelsv1D = *(vuint8_t *)(src + line_size_3);
-      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
-
-      const vuint8_t block1 = *(vuint8_t *) dst;
-      const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
-      const vuint8_t put1 = spu_avg(block1,put1a);
-
-      const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride);
-      const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
-      const vuint8_t put2 = spu_avg(block2,put2a);
-
-      const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2);
-      const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
-      const vuint8_t put3 = spu_avg(block3,put3a);
-
-      const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3);
-      const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
-      const vuint8_t put4 = spu_avg(block4,put4a);
-
-      *(vuint8_t *) dst = put1;
-      *(vuint8_t *)(dst + dst_stride) = put2;
-      *(vuint8_t *)(dst + dst_stride_2) = put3;
-      *(vuint8_t *)(dst + dst_stride_3) = put4;
-
-      src+= line_size_4;
-      dst+= dst_stride_4;
-    }
-}
-
-void put_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
-				   int dst_stride, int src_stride1, int h)
-{
-  int i;
-
-  const int perm_src1 = (unsigned int) src1 & 15;
-  const int shift_dst = (unsigned int) dst & 15;
-
-  // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
-  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
-
-  switch(shift_dst){
-    case 0:  dstmask = dstmask0;
-             break;
-    case 4:  dstmask = dstmask4;
-             break;
-    case 8:  dstmask = dstmask8;
-             break;
-    case 12: dstmask = dstmask12;
-             break;
-  }
-
-  for (i=0; i<h; i++){
-      //unaligned load of src1
-      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
-      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
-      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
-
-      //aligned load of src2
-      const vuint8_t srcb = *(vuint8_t *)(src2);
-
-      //average and rounding
-      const vuint8_t avgc = spu_avg(srca,srcb);
-
-      const vuint8_t dst1 = *(vuint8_t *)dst;
-
-      const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask);
-
-      *(vuint8_t *)dst=davgc;
-
-      src1 +=src_stride1;
-      src2 +=16;
-      dst  +=dst_stride;
-  }
-}
-
-void avg_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
-				   int dst_stride, int src_stride1, int h)
-{
-  int i;
-
-  const int perm_src1 = (unsigned int) src1 & 15;
-  const int shift_dst = (unsigned int) dst & 15;
-
-  // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
-  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
-
-  switch(shift_dst){
-    case 0:  dstmask = dstmask0;
-             break;
-    case 4:  dstmask = dstmask4;
-             break;
-    case 8:  dstmask = dstmask8;
-             break;
-    case 12: dstmask = dstmask12;
-             break;
-  }
-
-  for (i=0; i<h; i++){
-      //unaligned load of src1
-      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
-      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
-      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
-
-      //aligned load of src2
-      const vuint8_t srcb = *(vuint8_t *)(src2);
-
-      //average and rounding
-      const vuint8_t avgc = spu_avg(srca,srcb);
-
-      const vuint8_t dst1 = *(vuint8_t *)dst;
-
-      const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask);
-
-      const vuint8_t davgc = spu_avg(dst1,davgc1);
-
-      *(vuint8_t *)dst=davgc;
-
-      src1 +=src_stride1;
-      src2 +=16;
-      dst  +=dst_stride;
-  }
-}
-
-// next one assumes that ((line_size % 16) == 0)
-void put_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
-{
-    register vector unsigned char pixelsv1A, pixelsv2A;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    const int perm = (unsigned int) src & 15;
-    const int shift_dst = (unsigned int) dst & 15;
-
-    // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
-    vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-    const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-    const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
-    const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
-
-    switch(shift_dst){
-      case 0:  dstmask = dstmask0;
-               break;
-      case 4:  dstmask = dstmask4;
-               break;
-      case 8:  dstmask = dstmask8;
-               break;
-      case 12: dstmask = dstmask12;
-               break;
-    }
-
-    int i;
-	register int line_size   = src_stride;
-    register int line_size_2 = line_size << 1;
-    register int line_size_3 = line_size + line_size_2;
-    register int line_size_4 = line_size << 2;
-
-	register int dst_stride_2 = dst_stride << 1;
-    register int dst_stride_3 = dst_stride_2 + dst_stride;
-    register int dst_stride_4 = dst_stride << 2;
-
-    for(i=0; i<h; i+=4) {
-	  pixelsv1A = *(vuint8_t *)(src);
-      pixelsv2A = *(vuint8_t *)(src+16);
-      pixelsv1B = *(vuint8_t *)(src + line_size);
-      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
-      pixelsv1C = *(vuint8_t *)(src + line_size_2);
-      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
-      pixelsv1D = *(vuint8_t *)(src + line_size_3);
-      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
-
-      const vuint8_t block1 = *(vuint8_t *)dst;
-      const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
-      const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride);
-      const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
-      const vuint8_t block3 = *(vuint8_t *)(dst+dst_stride_2);
-      const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
-      const vuint8_t block4 = *(vuint8_t *)(dst+dst_stride_3);
-      const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
-
-      *(vuint8_t *) dst = put1;
-      *(vuint8_t *)(dst + dst_stride) = put2;
-      *(vuint8_t *)(dst + dst_stride_2) = put3;
-      *(vuint8_t *)(dst + dst_stride_3) = put4;
-
-      src += line_size_4;
-      dst += dst_stride_4;
-    }
-}
-
-// next one assumes that ((line_size % 16) == 0)
-void avg_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
-{
-    register vector unsigned char pixelsv1A, pixelsv2A;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    const int perm = (unsigned int) src & 15;
-    const int shift_dst = (unsigned int) dst & 15;
-
-    // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
-    vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-    const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-    const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
-    const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
-
-    switch(shift_dst){
-      case 0:  dstmask = dstmask0;
-               break;
-      case 4:  dstmask = dstmask4;
-               break;
-      case 8:  dstmask = dstmask8;
-               break;
-      case 12: dstmask = dstmask12;
-               break;
-    }
-
-    int i;
-	register int line_size   = src_stride;
-    register int line_size_2 = line_size << 1;
-    register int line_size_3 = line_size + line_size_2;
-    register int line_size_4 = line_size << 2;
-
-	register int dst_stride_2 = dst_stride << 1;
-    register int dst_stride_3 = dst_stride_2 + dst_stride;
-    register int dst_stride_4 = dst_stride << 2;
-
-    for(i=0; i<h; i+=4) {
-	  pixelsv1A = *(vuint8_t *)(src);
-      pixelsv2A = *(vuint8_t *)(src+16);
-      pixelsv1B = *(vuint8_t *)(src + line_size);
-      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
-      pixelsv1C = *(vuint8_t *)(src + line_size_2);
-      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
-      pixelsv1D = *(vuint8_t *)(src + line_size_3);
-      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
-
-      const vuint8_t block1 = *(vuint8_t *) dst;
-      const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
-      const vuint8_t put1 = spu_avg(block1,put1a);
-
-      const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride);
-      const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
-      const vuint8_t put2 = spu_avg(block2,put2a);
-
-      const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2);
-      const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
-      const vuint8_t put3 = spu_avg(block3,put3a);
-
-      const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3);
-      const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
-      const vuint8_t put4 = spu_avg(block4,put4a);
-
-      *(vuint8_t *) dst = put1;
-      *(vuint8_t *)(dst + dst_stride) = put2;
-      *(vuint8_t *)(dst + dst_stride_2) = put3;
-      *(vuint8_t *)(dst + dst_stride_3) = put4;
-
-      src+= line_size_4;
-      dst+= dst_stride_4;
-    }
-}
-
-/* Here we create all the interpolation modes H.264 motion compensation stage for luma */
-  H264_MC(put_, 16, spu)
-  H264_MC(put_, 8, spu)
-  H264_MC(put_, 4, spu)
-
-  H264_MC(avg_, 16, spu)
-  H264_MC(avg_, 8, spu)
-  H264_MC(avg_, 4, spu)
-
-
-//Chroma interpolation:
-
-#define OP_U8_SPU                          PUT_OP_U8_SPU
-#define PREFIX_h264_chroma_mc8_spu         put_h264_chroma_mc8_spu
-#define PREFIX_h264_chroma_mc4_spu         put_h264_chroma_mc4_spu
-#define PREFIX_h264_chroma_mc2_spu         put_h264_chroma_mc2_spu
-#include "h264_chroma_template_spu.c"
-#undef OP_U8_SPU
-#undef PREFIX_h264_chroma_mc8_spu
-#undef PREFIX_h264_chroma_mc4_spu
-#undef PREFIX_h264_chroma_mc2_spu
-
-#define OP_U8_SPU                          AVG_OP_U8_SPU
-#define PREFIX_h264_chroma_mc8_spu         avg_h264_chroma_mc8_spu
-#define PREFIX_h264_chroma_mc4_spu         avg_h264_chroma_mc4_spu
-#define PREFIX_h264_chroma_mc2_spu         avg_h264_chroma_mc2_spu
-#include "h264_chroma_template_spu.c"
-#undef OP_U8_SPU
-#undef PREFIX_h264_chroma_mc8_spu
-#undef PREFIX_h264_chroma_mc4_spu
-#undef PREFIX_h264_chroma_mc2_spu
-
-// Weight and Biweight functions
-
-#define op_scale1(x)  dst[x] = av_clip_uint8( (dst[x]*weight + offset) >> log2_denom )
-#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
-#define H264_WEIGHT(W,H) \
-static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
-    int y; \
-    offset <<= log2_denom; \
-    if(log2_denom) offset += 1<<(log2_denom-1); \
-    for(y=0; y<H; y++, dst += stride){ \
-        op_scale1(0); \
-        op_scale1(1); \
-        if(W==2) continue; \
-        op_scale1(2); \
-        op_scale1(3); \
-        if(W==4) continue; \
-        op_scale1(4); \
-        op_scale1(5); \
-        op_scale1(6); \
-        op_scale1(7); \
-        if(W==8) continue; \
-        op_scale1(8); \
-        op_scale1(9); \
-        op_scale1(10); \
-        op_scale1(11); \
-        op_scale1(12); \
-        op_scale1(13); \
-        op_scale1(14); \
-        op_scale1(15); \
-    } \
-} \
-static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, int weights, int offset){ \
-    int y; \
-    offset = ((offset + 1) | 1) << log2_denom; \
-    for(y=0; y<H; y++, dst += dst_stride, src += src_stride){ \
-        op_scale2(0); \
-        op_scale2(1); \
-        if(W==2) continue; \
-        op_scale2(2); \
-        op_scale2(3); \
-        if(W==4) continue; \
-        op_scale2(4); \
-        op_scale2(5); \
-        op_scale2(6); \
-        op_scale2(7); \
-        if(W==8) continue; \
-        op_scale2(8); \
-        op_scale2(9); \
-        op_scale2(10); \
-        op_scale2(11); \
-        op_scale2(12); \
-        op_scale2(13); \
-        op_scale2(14); \
-        op_scale2(15); \
-    } \
-}
-
-H264_WEIGHT(16,16)
-H264_WEIGHT(16,8)
-H264_WEIGHT(8,16)
-H264_WEIGHT(8,8)
-H264_WEIGHT(8,4)
-H264_WEIGHT(4,8)
-H264_WEIGHT(4,4)
-H264_WEIGHT(4,2)
-H264_WEIGHT(2,4)
-H264_WEIGHT(2,2)
-
-#undef op_scale1
-#undef op_scale2
-#undef H264_WEIGHT
-
-/////////////////////////////////////////////////////////////////////////////////////////
-
-static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
-{
-    int i, d;
-    for( i = 0; i < 4; i++ ) {
-        if( tc0[i] < 0 ) {
-            pix += 4*ystride;
-            continue;
-        }
-        for( d = 0; d < 4; d++ ) {
-            const int p0 = pix[-1*xstride];
-            const int p1 = pix[-2*xstride];
-            const int p2 = pix[-3*xstride];
-            const int q0 = pix[0];
-            const int q1 = pix[1*xstride];
-            const int q2 = pix[2*xstride];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-
-                int tc = tc0[i];
-                int i_delta;
-
-                if( FFABS( p2 - p0 ) < beta ) {
-                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
-                    tc++;
-                }
-                if( FFABS( q2 - q0 ) < beta ) {
-                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
-                    tc++;
-                }
-
-                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
-                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
-            }
-            pix += ystride;
-        }
-    }
-}
-static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
-}
-static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
-}
-
-static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
-{
-    int d;
-    for( d = 0; d < 16; d++ ) {
-        const int p2 = pix[-3*xstride];
-        const int p1 = pix[-2*xstride];
-        const int p0 = pix[-1*xstride];
-
-        const int q0 = pix[ 0*xstride];
-        const int q1 = pix[ 1*xstride];
-        const int q2 = pix[ 2*xstride];
-
-        if( FFABS( p0 - q0 ) < alpha &&
-            FFABS( p1 - p0 ) < beta &&
-            FFABS( q1 - q0 ) < beta ) {
-
-            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
-                if( FFABS( p2 - p0 ) < beta)
-                {
-                    const int p3 = pix[-4*xstride];
-                    /* p0', p1', p2' */
-                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
-                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
-                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
-                } else {
-                    /* p0' */
-                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-                }
-                if( FFABS( q2 - q0 ) < beta)
-                {
-                    const int q3 = pix[3*xstride];
-                    /* q0', q1', q2' */
-                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
-                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
-                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
-                } else {
-                    /* q0' */
-                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-                }
-            }else{
-                /* p0', q0' */
-                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-            }
-        }
-        pix += ystride;
-    }
-}
-static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
-}
-static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
-}
-
-static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
-{
-    int i, d;
-    for( i = 0; i < 4; i++ ) {
-        const int tc = tc0[i];
-        if( tc <= 0 ) {
-            pix += 2*ystride;
-            continue;
-        }
-        for( d = 0; d < 2; d++ ) {
-            const int p0 = pix[-1*xstride];
-            const int p1 = pix[-2*xstride];
-            const int q0 = pix[0];
-            const int q1 = pix[1*xstride];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-
-                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-
-                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
-                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
-            }
-            pix += ystride;
-        }
-    }
-}
-static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
-}
-static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
-}
-
-static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
-{
-    int d;
-    for( d = 0; d < 8; d++ ) {
-        const int p0 = pix[-1*xstride];
-        const int p1 = pix[-2*xstride];
-        const int q0 = pix[0];
-        const int q1 = pix[1*xstride];
-
-        if( FFABS( p0 - q0 ) < alpha &&
-            FFABS( p1 - p0 ) < beta &&
-            FFABS( q1 - q0 ) < beta ) {
-
-            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
-            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
-        }
-        pix += ystride;
-    }
-}
-static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
-}
-static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
-}
-
-
-void dsputil_h264_init_cell(DSPContext_spu* c) {
-
-	c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
-    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
-    c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
-    c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
-    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
-    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
-    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
-    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
-
-    c->h264_idct_add[0] = h264_idct8_add_spu;
-    c->h264_idct_add[1] = h264_idct4_add_spu;
-
-
-    c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_spu;
-    c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_spu;
-    c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_spu;
-    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_spu;
-    c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_spu;
-    c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_spu;
-
-    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
-    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
-    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
-    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
-    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
-    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
-    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
-    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
-    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
-    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
-    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
-    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
-    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
-    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
-    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
-    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
-    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
-    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
-    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
-    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
-
-
-#define dspfunc(PFX, IDX, NUM) \
-    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_spu; \
-    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_spu; \
-    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_spu; \
-    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_spu; \
-    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_spu; \
-    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_spu; \
-    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_spu; \
-    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_spu; \
-    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_spu; \
-    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_spu; \
-    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_spu; \
-    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_spu; \
-    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_spu; \
-    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_spu; \
-    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_spu; \
-    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_spu
-
-    dspfunc(put_h264_qpel, 0, 16);
-    dspfunc(put_h264_qpel, 1, 8);
-    dspfunc(put_h264_qpel, 2, 4);
-
-    dspfunc(avg_h264_qpel, 0, 16);
-    dspfunc(avg_h264_qpel, 1, 8);
-    dspfunc(avg_h264_qpel, 2, 4);
-
-#undef dspfunc
-
-
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,34 +0,0 @@
-#ifndef DSPUTIL_CELL_H
-#define DSPUTIL_CELL_H
-
-#include "types_spu.h"
-
-typedef struct DSPContext_spu {
-	
-	void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
-    void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
-    /* v/h_loop_filter_luma_intra: align 16 */
-    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
-    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
-    void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
-    void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
-    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
-    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
-	
-	qpel_mc_func put_h264_qpel_pixels_tab[3][16];
-	qpel_mc_func avg_h264_qpel_pixels_tab[3][16];
-
-	h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
-	h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
-
-	h264_idct_func h264_idct_add[2];
-
-	h264_weight_func weight_h264_pixels_tab[10];
-	h264_biweight_func biweight_h264_pixels_tab[10];
-
-} DSPContext_spu;
-
-
-void dsputil_h264_init_cell(DSPContext_spu* c);
- 
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2633 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 cabac decoding.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-#define CELL_SPE
-#include <limits.h>
-#include <stdlib.h>
-#include "libavutil/intreadwrite.h"
-#include "libavutil/mem.h"
-#include "libavcodec/avcodec.h"
-#include "h264_deblock_spu.h"
-#include "h264_pred_spu.h"
-#include "h264_direct_spu.h"
-#include "h264_tables.h"
-#include "mathops_spu.h"
-//#include "libavcodec/h264_data.h"
-#include "cabac_spu.h"
-#include "rectangle_spu.h"
-#include "libavutil/log.h"
-
-//#undef NDEBUG
-#include <assert.h>
-#define INT_BIT (sizeof(int) * 8)
-/* Cabac pre state table */
-typedef struct IMbInfo{
-    uint16_t type;
-    uint8_t pred_mode;
-    uint8_t cbp;
-} IMbInfo;
-
-extern int bytecount;
-
-static const IMbInfo i_mb_type_info[26]={
-{MB_TYPE_INTRA4x4  , -1, -1},
-{MB_TYPE_INTRA16x16,  2,  0},
-{MB_TYPE_INTRA16x16,  1,  0},
-{MB_TYPE_INTRA16x16,  0,  0},
-{MB_TYPE_INTRA16x16,  3,  0},
-{MB_TYPE_INTRA16x16,  2,  16},
-{MB_TYPE_INTRA16x16,  1,  16},
-{MB_TYPE_INTRA16x16,  0,  16},
-{MB_TYPE_INTRA16x16,  3,  16},
-{MB_TYPE_INTRA16x16,  2,  32},
-{MB_TYPE_INTRA16x16,  1,  32},
-{MB_TYPE_INTRA16x16,  0,  32},
-{MB_TYPE_INTRA16x16,  3,  32},
-{MB_TYPE_INTRA16x16,  2,  15+0},
-{MB_TYPE_INTRA16x16,  1,  15+0},
-{MB_TYPE_INTRA16x16,  0,  15+0},
-{MB_TYPE_INTRA16x16,  3,  15+0},
-{MB_TYPE_INTRA16x16,  2,  15+16},
-{MB_TYPE_INTRA16x16,  1,  15+16},
-{MB_TYPE_INTRA16x16,  0,  15+16},
-{MB_TYPE_INTRA16x16,  3,  15+16},
-{MB_TYPE_INTRA16x16,  2,  15+32},
-{MB_TYPE_INTRA16x16,  1,  15+32},
-{MB_TYPE_INTRA16x16,  0,  15+32},
-{MB_TYPE_INTRA16x16,  3,  15+32},
-{MB_TYPE_INTRA_PCM , -1, -1},
-};
-
-typedef struct PMbInfo{
-    uint16_t type;
-    uint8_t partition_count;
-} PMbInfo;
-
-static const PMbInfo p_mb_type_info[5]={
-{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
-{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0, 4},
-{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4},
-};
-
-static const PMbInfo p_sub_mb_type_info[4]={
-{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
-{MB_TYPE_16x8 |MB_TYPE_P0L0             , 2},
-{MB_TYPE_8x16 |MB_TYPE_P0L0             , 2},
-{MB_TYPE_8x8  |MB_TYPE_P0L0             , 4},
-};
-
-static const PMbInfo b_mb_type_info[23]={
-{MB_TYPE_DIRECT2|MB_TYPE_L0L1                                      , 1, },
-{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
-{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
-{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
-{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
-{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
-};
-
-static const PMbInfo b_sub_mb_type_info[13]={
-{MB_TYPE_DIRECT2                                                   , 1, },
-{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
-{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
-{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
-{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x8  |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 4, },
-{MB_TYPE_8x8               |MB_TYPE_P0L1             |MB_TYPE_P1L1, 4, },
-{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
-};
-
-static const int8_t cabac_context_init_I[460][2] =
-{
-    /* 0 - 10 */
-    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
-    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
-    { -6,  53 }, { -1, 54 },  {  7,  51 },
-
-    /* 11 - 23 unsused for I */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },
-
-    /* 24- 39 */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-
-    /* 40 - 53 */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },
-
-    /* 54 - 59 */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },
-
-    /* 60 - 69 */
-    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
-    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
-    { 13, 41 },  { 3, 62 },
-
-    /* 70 -> 87 */
-    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
-    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
-    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
-    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
-    { -12, 115 },{ -16, 122 },
-
-    /* 88 -> 104 */
-    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
-    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
-    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
-    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
-    { -22, 125 },
-
-    /* 105 -> 135 */
-    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
-    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
-    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
-    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
-    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
-    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
-    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
-    { 14, 62 },  { -13, 108 },{ -15, 100 },
-
-    /* 136 -> 165 */
-    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
-    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
-    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
-    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
-    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
-    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
-    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
-    { 0, 62 },   { 12, 72 },
-
-    /* 166 -> 196 */
-    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
-    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
-    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
-    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
-    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
-    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
-    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
-    { 0, 89 },   { 26, -19 }, { 22, -17 },
-
-    /* 197 -> 226 */
-    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
-    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
-    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
-    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
-    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
-    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
-    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
-    { 12, 68 },  { 2, 97 },
-
-    /* 227 -> 251 */
-    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
-    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
-    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
-    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
-    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
-    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
-    { -4, 65 },
-
-    /* 252 -> 275 */
-    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
-    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
-    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
-    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
-    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
-    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
-
-    /* 276 a bit special (not used, bypass is used instead) */
-    { 0, 0 },
-
-    /* 277 -> 307 */
-    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
-    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
-    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
-    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
-    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
-    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
-    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
-    { 9, 64 },   { -12, 104 },{ -11, 97 },
-
-    /* 308 -> 337 */
-    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
-    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
-    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
-    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
-    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
-    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
-    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
-    { 5, 64 },   { 12, 70 },
-
-    /* 338 -> 368 */
-    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
-    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
-    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
-    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
-    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
-    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
-    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
-    { -12, 109 },{ 36, -35 }, { 36, -34 },
-
-    /* 369 -> 398 */
-    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
-    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
-    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
-    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
-    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
-    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
-    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
-    { 29, 39 },  { 19, 66 },
-
-    /* 399 -> 435 */
-    {  31,  21 }, {  31,  31 }, {  25,  50 },
-    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
-    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
-    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
-    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
-    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
-    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
-    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
-    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
-    {   0,  68 }, {  -9,  92 },
-
-    /* 436 -> 459 */
-    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
-    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
-    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
-    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
-    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
-    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
-};
-
-static const int8_t cabac_context_init_PB[3][460][2] =
-{
-    /* i_cabac_init_idc == 0 */
-    {
-        /* 0 - 10 */
-        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
-        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
-        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
-
-        /* 11 - 23 */
-        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
-        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
-        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
-        {  17,  50 },
-
-        /* 24 - 39 */
-        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
-        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
-        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
-        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
-
-        /* 40 - 53 */
-        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
-        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
-        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
-        {  -3,  81 }, {   0,  88 },
-
-        /* 54 - 59 */
-        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
-        {  -7,  72 }, {   1,  58 },
-
-        /* 60 - 69 */
-        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
-        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
-        {  13,  41 }, {   3,  62 },
-
-        /* 70 - 87 */
-        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
-        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
-        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
-        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
-        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
-        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
-        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
-        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
-        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
-
-        /* 105 -> 165 */
-        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
-        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
-        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
-        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
-        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
-        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
-        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
-        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
-        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
-        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
-        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
-        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
-        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
-        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
-        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
-        {   9,  69 },
-
-        /* 166 - 226 */
-        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
-        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
-        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
-        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
-        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
-        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
-        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
-        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
-        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
-        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
-        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
-        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
-        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
-        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
-        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
-        {  -9, 108 },
-
-        /* 227 - 275 */
-        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
-        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
-        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
-        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
-        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
-        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
-        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
-        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
-        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
-        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
-        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
-        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
-        {  -8,  85 },
-
-        /* 276 a bit special (not used, bypass is used instead) */
-        { 0, 0 },
-
-        /* 277 - 337 */
-        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
-        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
-        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
-        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
-        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
-        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
-        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
-        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
-        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
-        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
-        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
-        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
-        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
-        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
-        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
-        {  26,  43 },
-
-        /* 338 - 398 */
-        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
-        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
-        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
-        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
-        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
-        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
-        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
-        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
-        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
-        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
-        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
-        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
-        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
-        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
-        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
-        {  11,  86 },
-
-        /* 399 - 435 */
-        {  12,  40 }, {  11,  51 }, {  14,  59 },
-        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
-        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
-        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
-        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
-        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
-        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
-        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
-        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
-        {  -8,  66 }, {  -8,  76 },
-
-        /* 436 - 459 */
-        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
-        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
-        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
-        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
-        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
-        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
-    },
-
-    /* i_cabac_init_idc == 1 */
-    {
-        /* 0 - 10 */
-        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
-        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
-        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
-
-        /* 11 - 23 */
-        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
-        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
-        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
-        {  10,  54 },
-
-        /* 24 - 39 */
-        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
-        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
-        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
-        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
-
-        /* 40 - 53 */
-        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
-        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
-        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
-        {  -7,  86 },{  -5,  95 },
-
-        /* 54 - 59 */
-        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
-        {  -5,  72 },{   0,  61 },
-
-        /* 60 - 69 */
-        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
-        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
-        { 13, 41 },  { 3, 62 },
-
-        /* 70 - 104 */
-        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
-        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
-        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
-        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
-        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
-        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
-        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
-        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
-        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
-
-        /* 105 -> 165 */
-        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
-        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
-        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
-        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
-        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
-        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
-        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
-        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
-        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
-        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
-        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
-        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
-        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
-        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
-        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
-        {   0,  89 },
-
-        /* 166 - 226 */
-        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
-        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
-        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
-        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
-        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
-        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
-        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
-        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
-        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
-        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
-        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
-        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
-        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
-        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
-        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
-        { -10, 116 },
-
-        /* 227 - 275 */
-        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
-        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
-        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
-        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
-        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
-        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
-        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
-        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
-        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
-        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
-        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
-        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
-        {  -4,  78 },
-
-        /* 276 a bit special (not used, bypass is used instead) */
-        { 0, 0 },
-
-        /* 277 - 337 */
-        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
-        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
-        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
-        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
-        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
-        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
-        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
-        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
-        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
-        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
-        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
-        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
-        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
-        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
-        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
-        {  18,  50 },
-
-        /* 338 - 398 */
-        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
-        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
-        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
-        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
-        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
-        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
-        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
-        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
-        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
-        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
-        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
-        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
-        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
-        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
-        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
-        {  11,  83 },
-
-        /* 399 - 435 */
-        {  25,  32 }, {  21,  49 }, {  21,  54 },
-        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
-        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
-        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
-        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
-        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
-        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
-        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
-        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
-        {  -4,  67 }, {  -7,  82 },
-
-        /* 436 - 459 */
-        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
-        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
-        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
-        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
-        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
-        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
-    },
-
-    /* i_cabac_init_idc == 2 */
-    {
-        /* 0 - 10 */
-        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
-        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
-        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
-
-        /* 11 - 23 */
-        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
-        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
-        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
-        {  14,  57 },
-
-        /* 24 - 39 */
-        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
-        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
-        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
-        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
-
-        /* 40 - 53 */
-        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
-        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
-        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
-        {  -3,  90 },{  -1,  101 },
-
-        /* 54 - 59 */
-        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
-        {  -7,  50 },{   1,  60 },
-
-        /* 60 - 69 */
-        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
-        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
-        { 13, 41 },  { 3, 62 },
-
-        /* 70 - 104 */
-        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
-        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
-        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
-        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
-        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
-        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
-        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
-        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
-        {   3,  68 }, {  -8,  71 }, { -13,  98 },
-
-        /* 105 -> 165 */
-        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
-        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
-        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
-        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
-        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
-        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
-        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
-        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
-        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
-        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
-        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
-        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
-        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
-        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
-        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
-        { -22, 127 },
-
-        /* 166 - 226 */
-        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
-        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
-        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
-        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
-        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
-        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
-        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
-        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
-        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
-        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
-        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
-        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
-        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
-        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
-        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
-        { -24, 127 },
-
-        /* 227 - 275 */
-        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
-        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
-        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
-        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
-        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
-        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
-        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
-        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
-        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
-        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
-        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
-        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
-        { -10,  87 },
-
-        /* 276 a bit special (not used, bypass is used instead) */
-        { 0, 0 },
-
-        /* 277 - 337 */
-        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
-        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
-        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
-        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
-        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
-        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
-        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
-        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
-        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
-        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
-        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
-        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
-        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
-        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
-        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
-        {  25,  42 },
-
-        /* 338 - 398 */
-        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
-        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
-        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
-        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
-        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
-        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
-        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
-        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
-        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
-        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
-        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
-        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
-        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
-        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
-        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
-        {  25,  61 },
-
-        /* 399 - 435 */
-        {  21,  33 }, {  19,  50 }, {  17,  61 },
-        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
-        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
-        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
-        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
-        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
-        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
-        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
-        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
-        {  -6,  68 }, { -10,  79 },
-
-        /* 436 - 459 */
-        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
-        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
-        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
-        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
-        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
-        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
-    }
-};
-
-static const uint8_t left_block_options[4][16]={
-    {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
-    {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
-    {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
-    {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
-};
-
-void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c) {
-    int i;
-    const int8_t (*tab)[2];
-
-    if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I;
-    else                                 tab = cabac_context_init_PB[s->cabac_init_idc];
-
-    /* calculate pre-state */
-    for( i= 0; i < 460; i++ ) {
-        int pre = 2*(((tab[i][0] * s->qscale) >>4 ) + tab[i][1]) - 127;
-
-        pre^= pre>>31;
-        if(pre > 124)
-            pre= 124 + (pre&1);
-
-        c->cabac_state[i] =  pre;
-    }
-}
-
-static void fill_decode_neighbors(H264Cabac_spu *hc, EDSlice_spu *s){
-    H264Mb *m = s->m;
-	const int mb_x = m->mb_x;
-	const int mb_y = m->mb_y;
-
-    m->top_type     = hc->mb_type_top[mb_x];
-    m->left_type    = hc->mb_type[mb_x-1] ;
-
-}
-
-static void fill_decode_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){
-    H264Mb *m = s->m;
-    int topleft_xy, top_xy, topright_xy, left_xy;
-    int topleft_type, top_type, topright_type, left_type;
-    const uint8_t * left_block= left_block_options[0];
-	const int mb_x = m->mb_x;
-	const int mb_y = m->mb_y;
-	const int b_stride = hc->b_stride;
-    int i;
-
-    topleft_type = hc->mb_type_top[mb_x-1] ;
-    top_type     = m->top_type      ;
-	topright_type= hc->mb_type_top[mb_x+1] ;
-    left_type    = m->left_type     ;
-	
-	if (s->slice_type_nos == FF_B_TYPE){
-		get_list = get_list_buf;
-		for(int i=0; i<2; i++){
-			get_dma_list(hc->list1_motion_val[i], s->list1.motion_val[i][4*mb_x + 4*mb_y*b_stride], 16, 4, b_stride*2*sizeof(int16_t), ED_get_mv, 0);
-		}
-		if (hc->blocking) wait_dma_id(ED_get_mv);
-	}
-	
-    if(!IS_SKIP(mb_type)){
-        if(IS_INTRA(mb_type)){
-            int type_mask= s->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
-            m->topleft_samples_available=
-            m->top_samples_available=
-            m->left_samples_available= 0xFFFF;
-            m->topright_samples_available= 0xEEEA;
-
-            if(!(top_type & type_mask)){
-                m->topleft_samples_available= 0xB3FF;
-                m->top_samples_available= 0x33FF;
-                m->topright_samples_available= 0x26EA;
-            }
-            if(!(left_type & type_mask)){
-                m->topleft_samples_available&= 0xDF5F;
-                m->left_samples_available&= 0x5F5F;
-            }
-
-            if(!(topleft_type & type_mask))
-                m->topleft_samples_available&= 0x7FFF;
-
-            if(!(topright_type & type_mask))
-                m->topright_samples_available&= 0xFBFF;
-
-            if(IS_INTRA4x4(mb_type)){
-                if(IS_INTRA4x4(top_type)){
-                    AV_COPY32(m->intra4x4_pred_mode_cache+4+8*0, &hc->intra4x4_pred_mode_top[8*mb_x]);
-                }else{
-                    m->intra4x4_pred_mode_cache[4+8*0]=
-                    m->intra4x4_pred_mode_cache[5+8*0]=
-                    m->intra4x4_pred_mode_cache[6+8*0]=
-                    m->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask);
-                }
-                for(i=0; i<2; i++){
-                    if(IS_INTRA4x4(left_type)){
-                        int8_t *mode= &hc->intra4x4_pred_mode[8*(mb_x-1)];
-                        m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= mode[6-left_block[0+2*i]];
-                        m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= mode[6-left_block[1+2*i]];
-                    }else{
-                        m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
-                        m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= 2 - 3*!(left_type & type_mask);
-                    }
-                }
-            }
-        }
-        if(top_type){
-			AV_COPY32(&m->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]);
-            m->non_zero_count_cache[1+8*0]= hc->non_zero_count_top[mb_x][1+1*8];
-			m->non_zero_count_cache[2+8*0]= hc->non_zero_count_top[mb_x][2+1*8];
-			m->non_zero_count_cache[1+8*3]= hc->non_zero_count_top[mb_x][1+2*8];
-			m->non_zero_count_cache[2+8*3]= hc->non_zero_count_top[mb_x][2+2*8];
-        }else {
-            m->non_zero_count_cache[1+8*0]=
-            m->non_zero_count_cache[2+8*0]=
-            m->non_zero_count_cache[1+8*3]=
-            m->non_zero_count_cache[2+8*3]=
-            AV_WN32A(&m->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 0 : 0x40404040);
-        }
-
-        for (i=0; i<2; i++) {
-            if(left_type){
-                m->non_zero_count_cache[3+8*1 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+0+2*i]];
-				m->non_zero_count_cache[3+8*2 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+1+2*i]];
-				m->non_zero_count_cache[0+8*1 +   8*i]= hc->non_zero_count[mb_x-1][left_block[8+4+2*i]];
-				m->non_zero_count_cache[0+8*4 +   8*i]= hc->non_zero_count[mb_x-1][left_block[8+5+2*i]];
-            }else{
-                    m->non_zero_count_cache[3+8*1 + 2*8*i]=
-                    m->non_zero_count_cache[3+8*2 + 2*8*i]=
-                    m->non_zero_count_cache[0+8*1 +   8*i]=
-                    m->non_zero_count_cache[0+8*4 +   8*i]= !IS_INTRA(mb_type) ? 0 : 64;
-            }
-        }
-
-
-		// top_cbp
-		if(top_type) {
-			hc->top_cbp = hc->cbp_top[mb_x];
-		} else {
-			hc->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
-		}
-		// left_cbp
-		if (left_type) {
-			hc->left_cbp = (hc->cbp[mb_x-1] & 0x1f0)
-			|  ((hc->cbp[mb_x-1]>>(left_block[0]&(~1)))&2)
-			| (((hc->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2);
-		} else {
-			hc->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
-		}
-    }
-
-    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
-        int list;
-
-        m->ref_cache[0][scan8[5 ]+1] = m->ref_cache[0][scan8[7 ]+1] = m->ref_cache[0][scan8[13]+1] =
-        m->ref_cache[1][scan8[5 ]+1] = m->ref_cache[1][scan8[7 ]+1] = m->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
-
-        for(list=0; list<s->list_count; list++){
-            if(!USES_LIST(mb_type, list)){
-                continue;
-            }
-            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
-
-            if(USES_LIST(top_type, list)){
-                const int b_xy= 4*mb_x + 3*hc->b_stride;
-                AV_COPY128(m->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]);
-                    m->ref_cache[list][scan8[0] + 0 - 1*8]=
-                    m->ref_cache[list][scan8[0] + 1 - 1*8]= hc->ref_index_top[list][4*mb_x + 2];
-                    m->ref_cache[list][scan8[0] + 2 - 1*8]=
-					m->ref_cache[list][scan8[0] + 3 - 1*8]= hc->ref_index_top[list][4*mb_x + 3];
-            }else{
-                AV_ZERO128(m->mv_cache[list][scan8[0] + 0 - 1*8]);
-                AV_WN32A(&m->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
-            }
-
-            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
-                for(i=0; i<2; i++){
-                    int cache_idx = scan8[0] - 1 + i*2*8;
-                    if(USES_LIST(left_type, list)){
-                        const int b_xy= 4*(mb_x-1) + 3;
-                        const int b8_x= 4*(mb_x-1) + 1;
-                        AV_COPY32(m->mv_cache[list][cache_idx  ], hc->motion_val[list][b_xy + hc->b_stride*left_block[0+i*2]]);
-                        AV_COPY32(m->mv_cache[list][cache_idx+8], hc->motion_val[list][b_xy + hc->b_stride*left_block[1+i*2]]);
-                        m->ref_cache[list][cache_idx  ]= hc->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
-                        m->ref_cache[list][cache_idx+8]= hc->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
-                    }else{
-                        AV_ZERO32(m->mv_cache [list][cache_idx  ]);
-                        AV_ZERO32(m->mv_cache [list][cache_idx+8]);
-                        m->ref_cache[list][cache_idx  ]=
-                        m->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
-                    }
-                }
-            }else{
-                if(USES_LIST(left_type, list)){
-					const int b_x = 4*(mb_x-1) + 3;
-                    const int b8_x= 4*(mb_x-1) + 1;
-                    AV_COPY32(m->mv_cache[list][scan8[0] - 1], hc->motion_val[list][b_x + hc->b_stride*left_block[0]]);
-                    m->ref_cache[list][scan8[0] - 1]= hc->ref_index[list][b8_x + (left_block[0]&~1)];
-                }else{
-                    AV_ZERO32(m->mv_cache [list][scan8[0] - 1]);
-                    m->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-                }
-            }
-
-            if(USES_LIST(topright_type, list)){
-                const int b_xy= 4*(mb_x+1) + 3*hc->b_stride;
-                AV_COPY32(m->mv_cache[list][scan8[0] + 4 - 1*8], hc->motion_val_top[list][b_xy]);
-                m->ref_cache[list][scan8[0] + 4 - 1*8]= hc->ref_index_top[list][4*(mb_x+1) + 2];
-            }else{
-                AV_ZERO32(m->mv_cache [list][scan8[0] + 4 - 1*8]);
-                m->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-            }
-            if(m->ref_cache[list][scan8[0] + 4 - 1*8] < 0){
-                int topleft_partition= -1;
-                if(USES_LIST(topleft_type, list)){
-                    const int b_xy = 4*(mb_x-1) + 3 + hc->b_stride + (topleft_partition & 2*hc->b_stride);
-                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
-                    AV_COPY32(m->mv_cache[list][scan8[0] - 1 - 1*8], hc->motion_val_top[list][b_xy]);
-                    m->ref_cache[list][scan8[0] - 1 - 1*8]= hc->ref_index_top[list][b8_x];
-                }else{
-                    AV_ZERO32(m->mv_cache[list][scan8[0] - 1 - 1*8]);
-                    m->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-                }
-            }
-
-            if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)))
-                continue;
-
-            if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) {
-                m->ref_cache[list][scan8[4 ]] =
-                m->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
-                AV_ZERO32(m->mv_cache [list][scan8[4 ]]);
-                AV_ZERO32(m->mv_cache [list][scan8[12]]);
-
-
-				/* XXX beurk, Load mvd */
-				if(USES_LIST(top_type, list)){
-// 					const int b_xy= hc->mb2br_top_xy;
-					AV_COPY64(hc->mvd_cache[list][scan8[0] + 0 - 1*8], hc->mvd_top[list][8*mb_x + 0]);
-				}else{
-					AV_ZERO64(hc->mvd_cache[list][scan8[0] + 0 - 1*8]);
-				}
-				if(USES_LIST(left_type, list)){
-// 					const int b_xy= hc->mb2br_left_xy + 6;
-					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 0*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[0]]);
-					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 1*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[1]]);
-				}else{
-					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 0*8]);
-					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 1*8]);
-				}
-				if(USES_LIST(left_type, list)){
-// 					const int b_xy= hc->mb2br_left_xy + 6;
-					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 2*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[2]]);
-					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 3*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[3]]);
-				}else{
-					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 2*8]);
-					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 3*8]);
-				}
-				AV_ZERO16(hc->mvd_cache [list][scan8[4 ]]);
-				AV_ZERO16(hc->mvd_cache [list][scan8[12]]);
-				if(s->slice_type_nos == FF_B_TYPE){
-					fill_rectangle(&hc->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);
-
-					if(IS_DIRECT(top_type)){
-						AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1));
-					}else if(IS_8X8(top_type)){
-						int b8_x = 4*mb_x;
-						hc->direct_cache[scan8[0] + 0 - 1*8]= hc->direct_top[b8_x + 2];
-						hc->direct_cache[scan8[0] + 2 - 1*8]= hc->direct_top[b8_x + 3];
-					}else{
-						AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1));
-					}
-
-					if(IS_DIRECT(left_type))
-						hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1;
-					else if(IS_8X8(left_type))
-						hc->direct_cache[scan8[0] - 1 + 0*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)];
-					else
-						hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1;
-
-					if(IS_DIRECT(left_type))
-						hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1;
-					else if(IS_8X8(left_type))
-						hc->direct_cache[scan8[0] - 1 + 2*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)];
-					else
-						hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1;
-				}
-            }
-        }
-    }
-    hc->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type);
-
-	if (s->slice_type_nos == FF_B_TYPE){
-		wait_dma_id(ED_get_mv);
-	}
-}
-
-static int check_mv(H264Cabac_spu *hc, EDSlice_spu *s, long b_idx, long bn_idx, int mvy_limit){
-	int v;
-
-	v= hc->ref_cache[0][b_idx] != hc->ref_cache[0][bn_idx];
-	if(!v && hc->ref_cache[0][b_idx]!=-1)
-		// absolute value >= 7 | ...
-		v= ((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) |
-		((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit);
-
-	if(s->list_count==2){
-		if(!v)
-			v = (hc->ref_cache[1][b_idx] != hc->ref_cache[1][bn_idx]) |
-			((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) |
-			((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit);
-
-		if(v){
-			if((hc->ref_cache[0][b_idx] != hc->ref_cache[1][bn_idx]) |
-				(hc->ref_cache[1][b_idx] != hc->ref_cache[0][bn_idx]))
-				return 1;
-			return
-			((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) |
-			((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit) |
-			((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) |
-			((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit);
-		}
-	}
-
-	return v;
-}
-
-static void calc_bS_values(H264Cabac_spu *hc, EDSlice_spu *s, int mvy_limit, int dir) {
-	H264Mb *m = s->m;
-	int mb_type = m->mb_type;
-	int edge;
-	const int mbm_type = dir == 0 ? m->left_type : m->top_type;
-
-	// how often to recheck mv-based bS when iterating between edges
-	static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1},
-	{0,3,1,1,3,3,3,3}};
-	const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7];
-	const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4;
-	// how often to recheck mv-based bS when iterating along each edge
-	const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
-
-	m->edges[dir]= edges;
-
-	if(mbm_type){
-		int16_t* bS=m->bS[dir][0];
-		if( IS_INTRA(mb_type|mbm_type)) {
-			AV_WN64A(bS, 0x0004000400040004ULL);
-		} else {
-			int i;
-			int mv_done;
-			if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
-				int b_idx= 8 + 4;
-				int bn_idx= b_idx - (dir ? 8:1);
-
-				bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, 8 + 4, bn_idx, mvy_limit);
-				mv_done = 1;
-			}
-			else
-				mv_done = 0;
-
-			for( i = 0; i < 4; i++ ) {
-				int x = dir == 0 ? 0 : i;
-				int y = dir == 0 ? i    : 0;
-				int b_idx= 8 + 4 + x + 8*y;
-				int bn_idx= b_idx - (dir ? 8:1);
-
-				if( hc->non_zero_count_cache[b_idx] |
-					hc->non_zero_count_cache[bn_idx] ) {
-					bS[i] = 2;
-				}
-				else if(!mv_done)
-				{
-					bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
-				}
-			}
-		}
-	}
-
-	/* Calculate bS */
-	for( edge = 1; edge < edges; edge++ ) {
-		int16_t* bS=m->bS[dir][edge];
-
-		if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
-			continue;
-
-		if( IS_INTRA(mb_type)) {
-			AV_WN64A(bS, 0x0003000300030003ULL);
-		} else {
-			int i;
-			int mv_done;
-
-			if( edge & mask_edge ) {
-				AV_ZERO64(bS);
-				mv_done = 1;
-			}
-			else if( mask_par0 ) {
-				int b_idx= 8 + 4 + edge * (dir ? 8:1);
-				int bn_idx= b_idx - (dir ? 8:1);
-
-				bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
-				mv_done = 1;
-			}
-			else
-				mv_done = 0;
-
-			for( i = 0; i < 4; i++ ) {
-				int x = dir == 0 ? edge : i;
-				int y = dir == 0 ? i    : edge;
-				int b_idx= 8 + 4 + x + 8*y;
-				int bn_idx= b_idx - (dir ? 8:1);
-
-				if( hc->non_zero_count_cache[b_idx] |
-					hc->non_zero_count_cache[bn_idx] ) {
-					bS[i] = 2;
-				}
-				else if(!mv_done)
-				{
-					bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
-				}
-			}
-
-			if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
-				continue;
-		}
-
-	}
-}
-
-/**
-*
-* @return zero if the loop filter can be skiped
-*/
-static int fill_filter_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){
-    H264Mb *m = s->m;
-	const int mb_x = m->mb_x;
-    const int mb_y = m->mb_y;
-    int top_type, left_type;
-    int qp, top_qp, left_qp;
-    int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice
-
-    m->dequant4_coeff_y  = hc->dequant4_coeff[0][s->qscale][0];
-    m->dequant4_coeff_cb = hc->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][s->chroma_qp[0]][0];
-    m->dequant4_coeff_cr = hc->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][s->chroma_qp[1]][0];
-
-    m->qscale_mb_xy = qp = hc->qscale[mb_x];
-    m->qscale_left_mb_xy = left_qp = hc->qscale[mb_x-1];
-    m->qscale_top_mb_xy = top_qp = hc->qscale_top[mb_x];
-
-    //for sufficiently low qp, filtering wouldn't do anything
-    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
-	if(qp <= qp_thresh
-		&& (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh)
-		&& ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){
-		m->deblock_mb = 0;
-		return 0;
-	}
-    
-
-    m->deblock_mb = 1;
-
-	top_type     = hc->mb_type_top[mb_x] ;
-	left_type    = hc->mb_type[mb_x -1];
-
-    m->top_type     = top_type ;
-    m->left_type    = left_type;
-
-    if(IS_INTRA(mb_type)){
-        calc_bS_values(hc, s, 4, 0);
-        calc_bS_values(hc, s, 4, 1);
-        return 1;
-    }
-
-    AV_COPY64(&hc->non_zero_count_cache[0+8*1], &hc->non_zero_count[mb_x][ 0]);
-    AV_COPY64(&hc->non_zero_count_cache[0+8*2], &hc->non_zero_count[mb_x][ 8]);
-    AV_COPY32(&hc->non_zero_count_cache[0+8*5], &hc->non_zero_count[mb_x][16]);
-    AV_COPY32(&hc->non_zero_count_cache[4+8*3], &hc->non_zero_count[mb_x][20]);
-    AV_COPY64(&hc->non_zero_count_cache[0+8*4], &hc->non_zero_count[mb_x][24]);
-
-    m->cbp= hc->cbp[mb_x];
-
-    {
-        int list;
-        for(list=0; list<s->list_count; list++){
-            int8_t *ref;
-            int y, b_stride;
-            int16_t (*mv_dst)[2];
-            int16_t (*mv_src)[2];
-
-            if(!USES_LIST(mb_type, list)){
-                fill_rectangle( hc->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
-                AV_WN32A(&hc->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-                AV_WN32A(&hc->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-                AV_WN32A(&hc->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-                AV_WN32A(&hc->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-                continue;
-            }
-
-            ref = &hc->ref_index[list][4*mb_x];
-            {
-                int (*ref2frm)[64] =(void *) (s->ref2frm[0] +  2);
-                AV_WN32A(&hc->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-                AV_WN32A(&hc->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-                ref += 2;
-                AV_WN32A(&hc->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-                AV_WN32A(&hc->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-            }
-            b_stride = hc->b_stride;
-            mv_dst   = &hc->mv_cache[list][scan8[0]];
-            mv_src   = &hc->motion_val[list][4*mb_x];
-            for(y=0; y<4; y++){
-                AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride);
-            }
-
-        }
-    }
-
-    /*
-    0 . T T. T T T T
-    1 L . .L . . . .
-    2 L . .L . . . .
-    3 . T TL . . . .
-    4 L . .L . . . .
-    5 L . .. . . . .
-    */
-    //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
-    if(top_type){
-        AV_COPY32(&hc->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]);
-    }
-
-    if(left_type){
-        hc->non_zero_count_cache[3+8*1]= hc->non_zero_count[mb_x-1][7+0*8];
-		hc->non_zero_count_cache[3+8*2]= hc->non_zero_count[mb_x-1][7+1*8];
-		hc->non_zero_count_cache[3+8*3]= hc->non_zero_count[mb_x-1][7+2*8];
-		hc->non_zero_count_cache[3+8*4]= hc->non_zero_count[mb_x-1][7+3*8];
-    }
-
-    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
-        int list;
-        for(list=0; list<s->list_count; list++){
-            if(USES_LIST(top_type, list)){
-                const int b_xy= 4*mb_x + 3*hc->b_stride;
-                const int b8_x= 4*mb_x + 2;
-                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
-                AV_COPY128(hc->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]);
-                hc->ref_cache[list][scan8[0] + 0 - 1*8]=
-                hc->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 0]];
-                hc->ref_cache[list][scan8[0] + 2 - 1*8]=
-                hc->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 1]];
-            }else{
-                AV_ZERO128(hc->mv_cache[list][scan8[0] + 0 - 1*8]);
-                AV_WN32A(&hc->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-            }
-
-            if(USES_LIST(left_type, list)){
-				const int b_x = 4*(mb_x-1) + 3;
-                const int b8_x= 4*(mb_x-1) + 1;
-                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
-                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 0 ], hc->motion_val[list][b_x + hc->b_stride*0]);
-                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 8 ], hc->motion_val[list][b_x + hc->b_stride*1]);
-                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +16 ], hc->motion_val[list][b_x + hc->b_stride*2]);
-                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +24 ], hc->motion_val[list][b_x + hc->b_stride*3]);
-                hc->ref_cache[list][scan8[0] - 1 + 0 ]=
-                hc->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*0]];
-                hc->ref_cache[list][scan8[0] - 1 +16 ]=
-                hc->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*1]];
-            }else{
-                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 0 ]);
-                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 8 ]);
-                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +16 ]);
-                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +24 ]);
-                hc->ref_cache[list][scan8[0] - 1 + 0  ]=
-                hc->ref_cache[list][scan8[0] - 1 + 8  ]=
-                hc->ref_cache[list][scan8[0] - 1 + 16 ]=
-                hc->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED;
-            }
-        }
-    }
-    calc_bS_values(hc, s, 4, 0);
-    calc_bS_values(hc, s, 4, 1);
-    return 1;
-}
-
-
-/**
-* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
-*/
-static int check_intra4x4_pred_mode(EDSlice_spu *s){
-    H264Mb *m = s->m;
-    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
-    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
-    int i;
-
-    if(!(m->top_samples_available&0x8000)){
-        for(i=0; i<4; i++){
-            int status= top[ m->intra4x4_pred_mode_cache[scan8[0] + i] ];
-            if(status<0){
-                fprintf(stderr, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y);
-                return -1;
-            } else if(status){
-                m->intra4x4_pred_mode_cache[scan8[0] + i]= status;
-            }
-        }
-    }
-
-    if((m->left_samples_available&0x8888)!=0x8888){
-        static const int mask[4]={0x8000,0x2000,0x80,0x20};
-        for(i=0; i<4; i++){
-            if(!(m->left_samples_available&mask[i])){
-                int status= left[ m->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
-                if(status<0){
-                    fprintf(stderr, "left block unavailable for requested intra4x4 mode %d at %d %d, %x\n", status, m->mb_x, m->mb_y, m->left_samples_available);
-                    return -1;
-                } else if(status){
-                    m->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
-* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
-*/
-static int check_intra_pred_mode(EDSlice_spu *s, int mode){
-    H264Mb *m = s->m;
-    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
-    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
-
-    if(mode > 6) {
-        fprintf(stderr, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y);
-        return -1;
-    }
-
-    if(!(m->top_samples_available&0x8000)){
-        mode= top[ mode ];
-        if(mode<0){
-            fprintf(stderr, "top block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y);
-            return -1;
-        }
-    }
-
-    if((m->left_samples_available&0x8080) != 0x8080){
-        mode= left[ mode ];
-        if(m->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
-            mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(m->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
-        }
-        if(mode<0){
-            fprintf(stderr, "left block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y);
-            return -1;
-        }
-    }
-    return mode;
-}
-
-/**
- * gets the predicted intra4x4 prediction mode.
- */
-static inline int pred_intra_mode(EDSlice_spu *s, int n){
-    H264Mb *m = s->m;
-    const int index8= scan8[n];
-    const int left= m->intra4x4_pred_mode_cache[index8 - 1];
-    const int top = m->intra4x4_pred_mode_cache[index8 - 8];
-    const int min= FFMIN(left, top);
-
-    if(min<0) return DC_PRED;
-    else      return min;
-}
-
-static void write_back_intra_pred_mode(H264Cabac_spu *hc, EDSlice_spu *s){
-    H264Mb *m = s->m;
-	const int mb_x = m->mb_x;
-    int8_t *mode= &hc->intra4x4_pred_mode[8*mb_x];
-
-    AV_COPY32(mode, m->intra4x4_pred_mode_cache + 4 + 8*4);
-    mode[4]= m->intra4x4_pred_mode_cache[7+8*3];
-    mode[5]= m->intra4x4_pred_mode_cache[7+8*2];
-    mode[6]= m->intra4x4_pred_mode_cache[7+8*1];
-}
-
-static inline void write_back_non_zero_count(H264Cabac_spu *hc, EDSlice_spu *s){
-    H264Mb *m = s->m;
-    const int mb_x= m->mb_x;
-
-    AV_COPY64(&hc->non_zero_count[mb_x][ 0], &m->non_zero_count_cache[0+8*1]);
-    AV_COPY64(&hc->non_zero_count[mb_x][ 8], &m->non_zero_count_cache[0+8*2]);
-    AV_COPY32(&hc->non_zero_count[mb_x][16], &m->non_zero_count_cache[0+8*5]);
-    AV_COPY32(&hc->non_zero_count[mb_x][20], &m->non_zero_count_cache[4+8*3]);
-    AV_COPY64(&hc->non_zero_count[mb_x][24], &m->non_zero_count_cache[0+8*4]);
-}
-
-static inline void write_back_motion(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){
-    H264Mb *m = s->m;
-	const int mb_x = m->mb_x;
-    int b_stride = hc->b_stride;
-    const int b_x = 4*m->mb_x; //try mb2b(8)_xy
-    const int b8_x= 4*m->mb_x;
-    int list;
-
-    if(!USES_LIST(mb_type, 0))
-        fill_rectangle(&hc->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
-
-    for(list=0; list<s->list_count; list++){
-        int y;
-        int16_t (*mv_dst)[2];
-        int16_t (*mv_src)[2];
-
-        if(!USES_LIST(mb_type, list))
-            continue;
-
-        mv_dst   = &hc->motion_val[list][b_x];
-        mv_src   = &m->mv_cache[list][scan8[0]];
-        for(y=0; y<4; y++){
-            AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
-        }
-        {
-            uint8_t (*mvd_dst)[2] = (void *) hc->mvd[list][8*mb_x];
-            uint8_t (*mvd_src)[2] = &hc->mvd_cache[list][scan8[0]];
-            if(IS_SKIP(mb_type))
-                AV_ZERO128(mvd_dst);
-            else{
-				AV_COPY64(mvd_dst, mvd_src + 8*3);
-                AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0);
-                AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1);
-                AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2);
-            }
-        }
-
-        {
-            int8_t *ref_index = &hc->ref_index[list][b8_x];
-            ref_index[0+0*2]= m->ref_cache[list][scan8[0]];
-            ref_index[1+0*2]= m->ref_cache[list][scan8[4]];
-            ref_index[0+1*2]= m->ref_cache[list][scan8[8]];
-            ref_index[1+1*2]= m->ref_cache[list][scan8[12]];
-        }
-    }
-
-    if(s->slice_type_nos == FF_B_TYPE){
-        if(IS_8X8(mb_type)){
-            uint8_t *direct = &hc->direct[4*mb_x];
-            direct[1] = m->sub_mb_type[1]>>1;
-            direct[2] = m->sub_mb_type[2]>>1;
-            direct[3] = m->sub_mb_type[3]>>1;
-        }
-    }
-}
-
-static inline int get_dct8x8_allowed(EDSlice_spu *s){
-    H264Mb *m = s->m;
-    if(s->direct_8x8_inference_flag)
-        return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
-    else
-        return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
-}
-
-static inline int fetch_diagonal_mv(EDSlice_spu *s, const int16_t **C, int i, int list, int part_width){
-    H264Mb *m = s->m;
-    const int topright_ref= m->ref_cache[list][ i - 8 + part_width ];
-
-    if(topright_ref != PART_NOT_AVAILABLE){
-        *C= m->mv_cache[list][ i - 8 + part_width ];
-        return topright_ref;
-    }else{
-        *C= m->mv_cache[list][ i - 8 - 1 ];
-        return m->ref_cache[list][ i - 8 - 1 ];
-    }
-}
-
-/**
- * gets the predicted MV.
- * @param n the block index
- * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
- * @param mx the x component of the predicted motion vector
- * @param my the y component of the predicted motion vector
- */
-static inline void pred_motion(EDSlice_spu *s, int n, int part_width, int list, int ref, int * const mx, int * const my){
-    H264Mb *m = s->m;
-    const int index8= scan8[n];
-    const int top_ref=      m->ref_cache[list][ index8 - 8 ];
-    const int left_ref=     m->ref_cache[list][ index8 - 1 ];
-    const int16_t * const A= m->mv_cache[list][ index8 - 1 ];
-    const int16_t * const B= m->mv_cache[list][ index8 - 8 ];
-    const int16_t * C;
-    int diagonal_ref, match_count;
-
-    assert(part_width==1 || part_width==2 || part_width==4);
-
-/* mv_cache
-  B . . A T T T T
-  U . . L . . , .
-  U . . L . . . .
-  U . . L . . , .
-  . . . L . . . .
-*/
-
-    diagonal_ref= fetch_diagonal_mv(s, &C, index8, list, part_width);
-    match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
-
-    if(match_count > 1){ //most common
-        *mx= mid_pred(A[0], B[0], C[0]);
-        *my= mid_pred(A[1], B[1], C[1]);
-    }else if(match_count==1){
-        if(left_ref==ref){
-            *mx= A[0];
-            *my= A[1];
-        }else if(top_ref==ref){
-            *mx= B[0];
-            *my= B[1];
-        }else{
-            *mx= C[0];
-            *my= C[1];
-        }
-    }else{
-        if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
-            *mx= A[0];
-            *my= A[1];
-        }else{
-            *mx= mid_pred(A[0], B[0], C[0]);
-            *my= mid_pred(A[1], B[1], C[1]);
-        }
-    }
-
-}
-
-/**
- * gets the directionally predicted 16x8 MV.
- * @param n the block index
- * @param mx the x component of the predicted motion vector
- * @param my the y component of the predicted motion vector
- */
-static inline void pred_16x8_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){
-    H264Mb *m = s->m;
-    if(n==0){
-        const int top_ref=      m->ref_cache[list][ scan8[0] - 8 ];
-        const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ];
-
-        if(top_ref == ref){
-            *mx= B[0];
-            *my= B[1];
-            return;
-        }
-    }else{
-        const int left_ref=     m->ref_cache[list][ scan8[8] - 1 ];
-        const int16_t * const A= m->mv_cache[list][ scan8[8] - 1 ];
-
-        if(left_ref == ref){
-            *mx= A[0];
-            *my= A[1];
-            return;
-        }
-    }
-
-    //RARE
-    pred_motion(s, n, 4, list, ref, mx, my);
-}
-
-/**
- * gets the directionally predicted 8x16 MV.
- * @param n the block index
- * @param mx the x component of the predicted motion vector
- * @param my the y component of the predicted motion vector
- */
-static inline void pred_8x16_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){
-    H264Mb *m = s->m;
-    if(n==0){
-        const int left_ref=      m->ref_cache[list][ scan8[0] - 1 ];
-        const int16_t * const A=  m->mv_cache[list][ scan8[0] - 1 ];
-
-        if(left_ref == ref){
-            *mx= A[0];
-            *my= A[1];
-            return;
-        }
-    }else{
-        const int16_t * C;
-        int diagonal_ref;
-
-        diagonal_ref= fetch_diagonal_mv(s, &C, scan8[4], list, 2);
-        if(diagonal_ref == ref){
-            *mx= C[0];
-            *my= C[1];
-            return;
-        }
-    }
-
-    //RARE
-    pred_motion(s, n, 2, list, ref, mx, my);
-}
-
-static inline void pred_pskip_motion(EDSlice_spu *s, int * const mx, int * const my){
-    H264Mb *m = s->m;
-    const int top_ref = m->ref_cache[0][ scan8[0] - 8 ];
-    const int left_ref= m->ref_cache[0][ scan8[0] - 1 ];
-
-    if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
-       || !( top_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 8 ]))
-       || !(left_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 1 ]))){
-
-        *mx = *my = 0;
-        return;
-    }
-
-    pred_motion(s, 0, 4, 0, 0, mx, my);
-
-    return;
-}
-
-/**
- * decodes a P_SKIP or B_SKIP macroblock
- */
-static void decode_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s){
-    H264Mb *m = s->m;
-	const int mb_x = m->mb_x;    
-    int mb_type=0;
-
-    memset(hc->non_zero_count[mb_x], 0, 32);
-    memset(m->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
-
-    if( s->slice_type_nos == FF_B_TYPE )
-    {
-        // just for fill_caches. pred_direct_motion will set the real mb_type
-        mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
-		fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ...
-
-        ff_h264_pred_direct_motion(hc, s, &mb_type);
-        mb_type|= MB_TYPE_SKIP;
-    }
-    else
-    {
-        int mx, my;
-        mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
-
-        fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ...
-        pred_pskip_motion(s, &mx, &my);
-        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
-        fill_rectangle(  m->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
-    }
-
-    write_back_motion(hc, s, mb_type);
-	hc->mb_type[mb_x]= mb_type;
-    m->mb_type = mb_type;
-    hc->qscale[mb_x]= s->qscale;
-    fill_filter_caches(hc, s, mb_type);
-}
-
-static int decode_cabac_intra_mb_type(EDSlice_spu *s, CABACContext *c, int ctx_base, int intra_slice) {
-    H264Mb *m =s->m;
-    uint8_t *state= &c->cabac_state[ctx_base];
-    int mb_type;
-
-    if(intra_slice){
-        int ctx=0;
-        if( m->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
-            ctx++;
-        if( m->top_type     & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
-            ctx++;
-        if( get_cabac_noinline( c, &state[ctx] ) == 0 )
-            return 0;   /* I4x4 */
-        state += 2;
-    }else{
-        if( get_cabac_noinline( c, state ) == 0 )
-            return 0;   /* I4x4 */
-    }
-
-    if( get_cabac_terminate( c ) )
-        return 25;  /* PCM */
-
-    mb_type = 1; /* I16x16 */
-    mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */
-    if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */
-        mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] );
-    mb_type += 2 * get_cabac_noinline(c, &state[3+intra_slice] );
-    mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] );
-    return mb_type;
-}
-
-static int decode_cabac_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s, H264Mb *m, CABACContext *c) {
-    int ctx = 0;
-    const int mb_x = m->mb_x;
-
-	if( m->mb_x>0 && !IS_SKIP( hc->mb_type[mb_x-1] ))
-        ctx++;
-	if( m->mb_y>0 && !IS_SKIP( hc->mb_type_top[mb_x] ))
-        ctx++;
-
-    if( s->slice_type_nos == FF_B_TYPE )
-        ctx += 13;
-    return get_cabac_noinline(c, &c->cabac_state[11+ctx] );
-}
-
-static int decode_cabac_mb_intra4x4_pred_mode( CABACContext *c, int pred_mode ) {
-    int mode = 0;
-
-    if( get_cabac(c, &c->cabac_state[68] ) )
-        return pred_mode;
-
-    mode += 1 * get_cabac(c, &c->cabac_state[69] );
-    mode += 2 * get_cabac(c, &c->cabac_state[69] );
-    mode += 4 * get_cabac(c, &c->cabac_state[69] );
-
-    return mode + ( mode >= pred_mode );
-}
-
-static int decode_cabac_mb_chroma_pre_mode(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) {
-    H264Mb *m = s->m;
-	const int mb_x = m->mb_x;
-
-    int ctx = 0;
-
-    /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */
-    if( m->left_type && hc->chroma_pred_mode[mb_x-1] != 0 )
-        ctx++;
-
-    if( m->top_type     && hc->chroma_pred_mode_top[mb_x] != 0 )
-        ctx++;
-
-    if( get_cabac_noinline(c, &c->cabac_state[64+ctx] ) == 0 )
-        return 0;
-
-    if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 )
-        return 1;
-    if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 )
-        return 2;
-    else
-        return 3;
-}
-
-static int decode_cabac_mb_cbp_luma(H264Cabac_spu *hc, CABACContext *c) {
-    int cbp_b, cbp_a, ctx, cbp = 0;
-
-    cbp_a = hc->left_cbp;
-    cbp_b = hc->top_cbp;
-
-    ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
-    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]);
-    ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
-    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1;
-    ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
-    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 2;
-    ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
-    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3;
-    return cbp;
-}
-static int decode_cabac_mb_cbp_chroma(H264Cabac_spu *hc, CABACContext *c) {
-    int ctx;
-    int cbp_a, cbp_b;
-
-    cbp_a = (hc->left_cbp>>4)&0x03;
-    cbp_b = (hc-> top_cbp>>4)&0x03;
-
-    ctx = 0;
-    if( cbp_a > 0 ) ctx++;
-    if( cbp_b > 0 ) ctx += 2;
-    if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 )
-        return 0;
-
-    ctx = 4;
-    if( cbp_a == 2 ) ctx++;
-    if( cbp_b == 2 ) ctx += 2;
-    return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] );
-}
-
-static int decode_cabac_p_mb_sub_type( CABACContext *c) {
-    if( get_cabac(c, &c->cabac_state[21] ) )
-        return 0;   /* 8x8 */
-    if( !get_cabac(c, &c->cabac_state[22] ) )
-        return 1;   /* 8x4 */
-    if( get_cabac(c, &c->cabac_state[23] ) )
-        return 2;   /* 4x8 */
-    return 3;       /* 4x4 */
-}
-static int decode_cabac_b_mb_sub_type(CABACContext *c) {
-    int type;
-    if( !get_cabac(c, &c->cabac_state[36] ) )
-        return 0;   /* B_Direct_8x8 */
-    if( !get_cabac(c, &c->cabac_state[37] ) )
-        return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
-    type = 3;
-    if( get_cabac(c, &c->cabac_state[38] ) ) {
-        if( get_cabac(c, &c->cabac_state[39] ) )
-            return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
-        type += 4;
-    }
-    type += 2*get_cabac(c, &c->cabac_state[39] );
-    type +=   get_cabac(c, &c->cabac_state[39] );
-    return type;
-}
-
-static int decode_cabac_mb_ref(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, int list, int n ) {
-    H264Mb *m = s->m;
-    int refa = m->ref_cache[list][scan8[n] - 1];
-    int refb = m->ref_cache[list][scan8[n] - 8];
-    int ref  = 0;
-    int ctx  = 0;
-
-    if( s->slice_type_nos == FF_B_TYPE) {
-        if( refa > 0 && !(hc->direct_cache[scan8[n] - 1]&(MB_TYPE_DIRECT2>>1)) )
-            ctx++;
-        if( refb > 0 && !(hc->direct_cache[scan8[n] - 8]&(MB_TYPE_DIRECT2>>1)) )
-            ctx += 2;
-    } else {
-        if( refa > 0 )
-            ctx++;
-        if( refb > 0 )
-            ctx += 2;
-    }
-
-    while( get_cabac(c, &c->cabac_state[54+ctx] ) ) {
-        ref++;
-        ctx = (ctx>>2)+4;
-        if(ref >= 32 /*h->ref_list[list]*/){
-			fprintf(stderr, "refcount %d\n", ref);
-            return -1;
-        }
-    }
-    return ref;
-}
-
-static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) {
-    int mvd;
-
-    if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){
-//    if(!get_cabac(&h->cabac, &c->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){
-        *mvda= 0;
-        return 0;
-    }
-
-    mvd= 1;
-    ctxbase+= 3;
-    while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) {
-        if( mvd < 4 )
-            ctxbase++;
-        mvd++;
-    }
-
-    if( mvd >= 9 ) {
-        int k = 3;
-        while( get_cabac_bypass(c ) ) {
-            mvd += 1 << k;
-            k++;
-            if(k>24){
-                fprintf(stderr, "overflow in decode_cabac_mb_mvd\n");
-                return INT_MIN;
-            }
-        }
-        while( k-- ) {
-            mvd += get_cabac_bypass(c )<<k;
-        }
-        *mvda=mvd < 70 ? mvd : 70;
-    }else
-        *mvda=mvd;
-    return get_cabac_bypass_sign(c, -mvd );
-}
-
-#define DECODE_CABAC_MB_MVD( hc, c, list,  n )\
-{\
-    int amvd0 = hc->mvd_cache[list][scan8[n] - 1][0] +\
-                hc->mvd_cache[list][scan8[n] - 8][0];\
-    int amvd1 = hc->mvd_cache[list][scan8[n] - 1][1] +\
-                hc->mvd_cache[list][scan8[n] - 8][1];\
-\
-    mx += decode_cabac_mb_mvd( c, 40, amvd0, &mpx );\
-    my += decode_cabac_mb_mvd( c, 47, amvd1, &mpy );\
-}
-
-static av_always_inline int get_cabac_cbf_ctx(H264Cabac_spu *hc, EDSlice_spu *s, int cat, int idx, int is_dc ) {
-    H264Mb *m = s->m;
-    int nza, nzb;
-    int ctx = 0;
-
-    if( is_dc ) {
-        if( cat == 0 ) {
-            nza = hc->left_cbp&0x100;
-            nzb = hc-> top_cbp&0x100;
-        } else {
-            nza = (hc->left_cbp>>(6+idx))&0x01;
-            nzb = (hc-> top_cbp>>(6+idx))&0x01;
-        }
-    } else {
-        assert(cat == 1 || cat == 2 || cat == 4);
-        nza = m->non_zero_count_cache[scan8[idx] - 1];
-        nzb = m->non_zero_count_cache[scan8[idx] - 8];
-    }
-
-    if( nza > 0 )
-        ctx++;
-
-    if( nzb > 0 )
-        ctx += 2;
-
-    return ctx + 4 * cat;
-}
-
- uint8_t last_coeff_flag_offset_8x8[63] = {
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
-};
-
-static const int significant_coeff_flag_offset[2][6] = {
-    { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
-    { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
-};
-static const int last_coeff_flag_offset[2][6] = {
-    { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
-    { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
-};
-static const int coeff_abs_level_m1_offset[6] = {
-    227+0, 227+10, 227+20, 227+30, 227+39, 426
-};
-static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
-    { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
-    4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
-    7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
-    12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
-    { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
-    6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
-    9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
-    9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
-};
-/* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
-* 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
-* map node ctx => cabac ctx for level=1 */
-static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
-/* map node ctx => cabac ctx for level>1 */
-static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
-static const uint8_t coeff_abs_level_transition[2][8] = {
-    /* update node ctx after decoding a level=1 */
-    { 1, 2, 3, 3, 4, 5, 6, 7 },
-    /* update node ctx after decoding a level>1 */
-    { 4, 4, 4, 4, 5, 6, 7, 7 }
-};
-
-static av_always_inline void decode_cabac_residual_internal(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
-    H264Mb *m = s->m;
-	const int mb_x = m->mb_x;
-    int index[64];
-
-    int av_unused last;
-    int coeff_count = 0;
-    int node_ctx = 0;
-
-    uint8_t *significant_coeff_ctx_base;
-    uint8_t *last_coeff_ctx_base;
-    uint8_t *abs_level_m1_ctx_base;
-
-    /* read coded block flag */
-    if( is_dc || cat != 5 ) {
-        if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( hc, s, cat, n, is_dc ) ] ) == 0 ) {
-            if( !is_dc )
-                m->non_zero_count_cache[scan8[n]] = 0;
-            return;
-        }
-    }
-
-    significant_coeff_ctx_base = c->cabac_state
-        + significant_coeff_flag_offset[0][cat];
-    last_coeff_ctx_base = c->cabac_state
-        + last_coeff_flag_offset[0][cat];
-    abs_level_m1_ctx_base = c->cabac_state
-        + coeff_abs_level_m1_offset[cat];
-
-    if( !is_dc && cat == 5 ) {
-#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
-        for(last= 0; last < coefs; last++) { \
-            uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
-            if( get_cabac( c, sig_ctx )) { \
-                uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
-                index[coeff_count++] = last; \
-                if( get_cabac( c, last_ctx ) ) { \
-                    last= max_coeff; \
-                    break; \
-                } \
-            } \
-        }\
-        if( last == max_coeff -1 ) {\
-            index[coeff_count++] = last;\
-        }\
-		
-        const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0];
-        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
-    } else {
-        DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
-    }
-    assert(coeff_count > 0);
-
-    if( is_dc ) {
-        if( cat == 0 )
-            hc->cbp[mb_x] |= 0x100;
-        else
-            hc->cbp[mb_x] |= 0x40 << n;
-    } else {
-        if( cat == 5 )
-            fill_rectangle(&m->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
-        else {
-            assert( cat == 1 || cat == 2 || cat == 4 );
-            m->non_zero_count_cache[scan8[n]] = coeff_count;
-        }
-    }
-
-    do {
-        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
-        int j= scantable[index[--coeff_count]];
-
-        if( get_cabac( c, ctx ) == 0 ) {
-            node_ctx = coeff_abs_level_transition[0][node_ctx];
-            if( is_dc ) {
-                block[j] = get_cabac_bypass_sign( c, -1);
-            }else{
-                block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6;
-            }
-        } else {
-            int coeff_abs = 2;
-            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
-            node_ctx = coeff_abs_level_transition[1][node_ctx];
-
-            while( coeff_abs < 15 && get_cabac( c, ctx ) ) {
-                coeff_abs++;
-            }
-
-            if( coeff_abs >= 15 ) {
-                int j = 0;
-                while( get_cabac_bypass( c ) ) {
-                    j++;
-                }
-
-                coeff_abs=1;
-                while( j-- ) {
-                    coeff_abs += coeff_abs + get_cabac_bypass( c );
-                }
-                coeff_abs+= 14;
-            }
-
-            if( is_dc ) {
-                block[j] = get_cabac_bypass_sign( c, -coeff_abs );
-            }else{
-                block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6;
-            }
-        }
-    } while( coeff_count );
-
-}
-
-static void decode_cabac_residual_dc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
-    decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, NULL, max_coeff, 1);
-}
-
-static void decode_cabac_residual_nondc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
-    decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, qmul, max_coeff, 0);
-}
-
-/**
- * decodes a macroblock
- * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
- */
-int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) {
-    H264Mb *m = s->m;
-	int mb_x = m->mb_x;
-    int mb_type, partition_count, cbp = 0;
-    int dct8x8_allowed= s->pps.transform_8x8_mode;
-
-    fill_decode_neighbors(hc, s);
-	memset(m->mb, 0 , sizeof(m->mb));
-
-    if( s->slice_type_nos != FF_I_TYPE ) {
-        int skip;
-        /* a skipped mb needs the aff flag from the following mb */
-        skip = decode_cabac_mb_skip( hc, s, m, c);
-		
-        /* read skip flags */
-        if( skip ) {
-            decode_mb_skip(hc, s);
-            hc->cbp[mb_x] = m->cbp = 0;
-            hc->chroma_pred_mode[mb_x] = 0;
-            s->last_qscale_diff = 0;
-            return 0;
-        }
-    }
-
-    if( s->slice_type_nos == FF_B_TYPE ) {
-        int ctx = 0;
-
-        if( !IS_DIRECT( m->left_type-1 ) )
-            ctx++;
-        if( !IS_DIRECT( m->top_type-1 ) )
-            ctx++;
-
-        if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){
-            mb_type= 0; /* B_Direct_16x16 */
-        }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) {
-            mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */
-        }else{
-            int bits;
-            bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3;
-            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2;
-            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1;
-            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] );
-            if( bits < 8 ){
-                mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
-            }else if( bits == 13 ){
-                mb_type= decode_cabac_intra_mb_type(s, c, 32, 0);
-                goto decode_intra_mb;
-            }else if( bits == 14 ){
-                mb_type= 11; /* B_L1_L0_8x16 */
-            }else if( bits == 15 ){
-                mb_type= 22; /* B_8x8 */
-            }else{
-                bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] );
-                mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
-            }
-        }
-            partition_count= b_mb_type_info[mb_type].partition_count;
-            mb_type=         b_mb_type_info[mb_type].type;
-    } else if( s->slice_type_nos == FF_P_TYPE ) {
-        if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) {
-            /* P-type */
-            if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) {
-                /* P_L0_D16x16, P_8x8 */
-                mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] );
-            } else {
-                /* P_L0_D8x16, P_L0_D16x8 */
-                mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] );
-            }
-            partition_count= p_mb_type_info[mb_type].partition_count;
-            mb_type=         p_mb_type_info[mb_type].type;
-        } else {
-            mb_type= decode_cabac_intra_mb_type(s, c, 17, 0);
-            goto decode_intra_mb;
-        }
-    } else {
-        mb_type= decode_cabac_intra_mb_type(s ,c, 3, 1);
-        if(s->slice_type == FF_SI_TYPE && mb_type)
-            mb_type--;
-        assert(s->slice_type_nos == FF_I_TYPE);
-decode_intra_mb:
-        partition_count = 0;
-        cbp= i_mb_type_info[mb_type].cbp;
-        m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
-        mb_type= i_mb_type_info[mb_type].type;
-    }
-	
-    if(IS_INTRA_PCM(mb_type)) {
-        uint8_t *ptr;
-        // We assume these blocks are very rare so we do not optimize it.
-        // FIXME The two following lines get the bitstream position in the cabac
-        // decode, I think it should be done by a function in cabac.h (or cabac.c).
-        ptr=c->bytestream;
-        if(c->low&0x1) ptr--;
-        if(CABAC_BITS==16){
-            if(c->low&0x1FF) ptr--;
-        }
-		if ((unsigned) (ptr + 384) >= (unsigned) c->bytestream_end){
-			fprintf(stderr, "Intra PCM mb crossed bytestream buffer\n Known issue.");
-		}		
-		
-        // The pixels are stored in the same order as levels in h->mb array.
-        memcpy(m->mb, ptr, 256); ptr+=256;        
-		memcpy(m->mb+128, ptr, 128); ptr+=128;
-        
-		c->bytestream = ptr;
-		#if CABAC_BITS == 16
-		c->low =  (*c->bytestream++)<<18;
-		c->low+=  (*c->bytestream++)<<10;
-		#else
-		c->low =  (*c->bytestream++)<<10;
-		#endif
-		c->low+= ((*c->bytestream++)<<2) + 2;
-		c->range= 0x1FE;
-
-        // All blocks are present
-        hc->cbp[mb_x] = 0x1ef;
-        hc->chroma_pred_mode[mb_x] = 0;
-        // In deblocking, the quantizer is 0
-        hc->qscale[mb_x]= 0;
-        // All coeffs are present
-        memset(hc->non_zero_count[mb_x], 16, 32);
-		hc->mb_type[mb_x]= m->mb_type = mb_type;
-        s->last_qscale_diff = 0;
-        fill_filter_caches(hc, s, mb_type);
-        return 0;
-    }
-    fill_decode_caches(hc, s, mb_type);
-
-    if( IS_INTRA( mb_type ) ) {
-        int i, pred_mode;
-        if( IS_INTRA4x4( mb_type ) ) {
-            if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] ) ) {
-                mb_type |= MB_TYPE_8x8DCT;
-                for( i = 0; i < 16; i+=4 ) {
-                    int pred = pred_intra_mode( s, i );
-                    int mode = decode_cabac_mb_intra4x4_pred_mode(c, pred );
-                    fill_rectangle( &m->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
-                }
-            } else {
-                for( i = 0; i < 16; i++ ) {
-                    int pred = pred_intra_mode( s, i );
-                    m->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode(c, pred );
-                }
-            }
-            write_back_intra_pred_mode(hc, s);
-            if( check_intra4x4_pred_mode(s) < 0 ) return -1;
-        } else {
-            m->intra16x16_pred_mode= check_intra_pred_mode(s, m->intra16x16_pred_mode );
-            if( m->intra16x16_pred_mode < 0 ) return -1;
-        }
-
-		hc->chroma_pred_mode[mb_x] =
-		pred_mode                        = decode_cabac_mb_chroma_pre_mode( hc, s, c );
-
-		pred_mode= check_intra_pred_mode( s, pred_mode );
-		if( pred_mode < 0 ) return -1;
-		m->chroma_pred_mode= pred_mode;
-	
-    } else if( partition_count == 4 ) {
-        int i, j, sub_partition_count[4], list, ref[2][4];
-
-        if( s->slice_type_nos == FF_B_TYPE ) {
-            for( i = 0; i < 4; i++ ) {
-                m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c );
-                sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
-                m->sub_mb_type[i]=      b_sub_mb_type_info[ m->sub_mb_type[i] ].type;
-            }
-            if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] |
-                          m->sub_mb_type[2] | m->sub_mb_type[3]) ) {
-                ff_h264_pred_direct_motion(hc, s, &mb_type);
-                m->ref_cache[0][scan8[4]] =
-                m->ref_cache[1][scan8[4]] =
-                m->ref_cache[0][scan8[12]] =
-                m->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
-                    for( i = 0; i < 4; i++ )
-                        fill_rectangle( &hc->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 );
-            }
-        } else {
-            for( i = 0; i < 4; i++ ) {
-                m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c );
-                sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
-                m->sub_mb_type[i]=      p_sub_mb_type_info[ m->sub_mb_type[i] ].type;
-            }
-        }
-
-        for( list = 0; list < s->list_count; list++ ) {
-            for( i = 0; i < 4; i++ ) {
-                if(IS_DIRECT(m->sub_mb_type[i])) continue;
-                if(IS_DIR(m->sub_mb_type[i], 0, list)){
-                    if( s->ref_count[list] > 1 ){
-                        ref[list][i] = decode_cabac_mb_ref(hc, s, c, list, 4*i );
-                        if(ref[list][i] >= s->ref_count[list]){
-                            fprintf(stderr, "Reference %d >= %d\n", ref[list][i], s->ref_count[list]);
-                            return -1;
-                        }
-                    }else
-                        ref[list][i] = 0;
-                } else {
-                    ref[list][i] = -1;
-                }
-                                                    m->ref_cache[list][ scan8[4*i]+1 ]=
-                m->ref_cache[list][ scan8[4*i]+8 ]=m->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
-            }
-        }
-
-        if(dct8x8_allowed)
-            dct8x8_allowed = get_dct8x8_allowed(s);
-
-        for(list=0; list<s->list_count; list++){
-            for(i=0; i<4; i++){
-                m->ref_cache[list][ scan8[4*i]   ]=m->ref_cache[list][ scan8[4*i]+1 ];
-                if(IS_DIRECT(m->sub_mb_type[i])){
-                    fill_rectangle(hc->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2);
-                    continue;
-                }
-
-                if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){
-                    const int sub_mb_type= m->sub_mb_type[i];
-                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
-                    for(j=0; j<sub_partition_count[i]; j++){
-                        int mpx, mpy;
-                        int mx, my;
-                        const int index= 4*i + block_width*j;
-                        int16_t (* mv_cache)[2]= &m->mv_cache[list][ scan8[index]];
-                        uint8_t (* mvd_cache)[2]= &hc->mvd_cache[list][ scan8[index]];
-                        pred_motion(s, index, block_width, list, m->ref_cache[list][ scan8[index] ], &mx, &my);
-                        DECODE_CABAC_MB_MVD( hc, c, list, index)
-
-                        if(IS_SUB_8X8(sub_mb_type)){
-                            mv_cache[ 1 ][0]=
-                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
-                            mv_cache[ 1 ][1]=
-                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
-
-                            mvd_cache[ 1 ][0]=
-                            mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx;
-                            mvd_cache[ 1 ][1]=
-                            mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy;
-                        }else if(IS_SUB_8X4(sub_mb_type)){
-                            mv_cache[ 1 ][0]= mx;
-                            mv_cache[ 1 ][1]= my;
-
-                            mvd_cache[ 1 ][0]=  mpx;
-                            mvd_cache[ 1 ][1]= mpy;
-                        }else if(IS_SUB_4X8(sub_mb_type)){
-                            mv_cache[ 8 ][0]= mx;
-                            mv_cache[ 8 ][1]= my;
-
-                            mvd_cache[ 8 ][0]= mpx;
-                            mvd_cache[ 8 ][1]= mpy;
-                        }
-                        mv_cache[ 0 ][0]= mx;
-                        mv_cache[ 0 ][1]= my;
-
-                        mvd_cache[ 0 ][0]= mpx;
-                        mvd_cache[ 0 ][1]= mpy;
-                    }
-                }else{
-                    fill_rectangle(m->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4);
-                    fill_rectangle(hc->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2);
-                }
-            }
-        }
-    } else if( IS_DIRECT(mb_type) ) {
-		ff_h264_pred_direct_motion(hc, s, &mb_type);
-        fill_rectangle(hc->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2);
-        fill_rectangle(hc->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2);
-        dct8x8_allowed &= s->direct_8x8_inference_flag;
-    } else {
-        int list, i;
-        if(IS_16X16(mb_type)){
-            for(list=0; list<s->list_count; list++){
-                if(IS_DIR(mb_type, 0, list)){
-                    int ref;
-                    if(s->ref_count[list] > 1){
-                        ref= decode_cabac_mb_ref(hc, s, c, list, 0);
-                        if(ref >= s->ref_count[list]){
-                            fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]);
-                            return -1;
-                        }
-                    }else
-                        ref=0;
-                        fill_rectangle(&m->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
-                }
-            }
-            for(list=0; list<s->list_count; list++){
-                if(IS_DIR(mb_type, 0, list)){
-                    int mx,my,mpx,mpy;
-                    pred_motion(s, 0, 4, list, m->ref_cache[list][ scan8[0] ], &mx, &my);
-                    DECODE_CABAC_MB_MVD( hc, c, list, 0)
-
-                    fill_rectangle(hc->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2);
-                    fill_rectangle(m->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
-                }
-
-            }
-        }
-        else if(IS_16X8(mb_type)){
-            for(list=0; list<s->list_count; list++){
-                    for(i=0; i<2; i++){
-                        if(IS_DIR(mb_type, i, list)){
-                            int ref;
-                            if(s->ref_count[list] > 1){
-                                ref= decode_cabac_mb_ref(hc, s, c, list, 8*i );
-                                if(ref >= s->ref_count[list]){
-                                    fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]);
-                                    return -1;
-                                }
-                            }else
-                                ref=0;
-                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
-                        }else
-                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
-                    }
-            }
-            for(list=0; list<s->list_count; list++){
-                for(i=0; i<2; i++){
-                    if(IS_DIR(mb_type, i, list)){
-                        int mx,my,mpx,mpy;
-                        pred_16x8_motion(s, 8*i, list, m->ref_cache[list][scan8[0] + 16*i], &mx, &my);
-                        DECODE_CABAC_MB_MVD( hc, c, list, 8*i)
-
-                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2);
-                        fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
-                    }else{
-                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2);
-                        fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
-                    }
-                }
-            }
-        }else{
-            assert(IS_8X16(mb_type));
-            for(list=0; list<s->list_count; list++){
-                    for(i=0; i<2; i++){
-                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                            int ref;
-                            if(s->ref_count[list] > 1){
-                                ref= decode_cabac_mb_ref(hc, s, c, list, 4*i );
-                                if(ref >= s->ref_count[list]){
-                                    fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]);
-                                    return -1;
-                                }
-                            }else
-                                ref=0;
-                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
-                        }else
-                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
-                    }
-            }
-            for(list=0; list<s->list_count; list++){
-                for(i=0; i<2; i++){
-                    if(IS_DIR(mb_type, i, list)){
-                        int mx,my,mpx,mpy;
-                        pred_8x16_motion( s, i*4, list, m->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
-                        DECODE_CABAC_MB_MVD( hc, c, list, 4*i)
-
-                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2);
-                        fill_rectangle(m->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
-                    }else{
-                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2);
-                        fill_rectangle(m-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
-                    }
-                }
-            }
-        }
-    }
-	
-	if( IS_INTER( mb_type ) ) {
-			hc->chroma_pred_mode[mb_x] = 0;
-			write_back_motion( hc, s, mb_type );
-	}
-
-    if( !IS_INTRA16x16( mb_type ) ) {
-        cbp  = decode_cabac_mb_cbp_luma( hc, c);
-		cbp |= decode_cabac_mb_cbp_chroma( hc, c ) << 4;
-    }
-	
-    hc->cbp[mb_x] = m->cbp = cbp;
-    if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
-        mb_type |= MB_TYPE_8x8DCT * get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] );
-    }
-
-    if( cbp || IS_INTRA16x16( mb_type ) ) {
-        const uint8_t *scan, *scan8x8, *dc_scan;
-        const uint32_t *qmul;
-
-        if (s->transform_bypass && s->qscale){
-            scan8x8= ff_zigzag_direct;
-            scan= zigzag_scan;
-        }else{
-            scan8x8= hc->zigzag_scan8x8;
-            scan= hc->zigzag_scan;
-        }
-        dc_scan= luma_dc_zigzag_scan;
-
-        // decode_cabac_mb_dqp
-        if(get_cabac_noinline(c, &c->cabac_state[60 + (s->last_qscale_diff != 0)])){
-            int val = 1;
-            int ctx= 2;
-
-            while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) {
-                ctx= 3;
-                val++;
-                if(val > 102){ //prevent infinite loop
-                    fprintf(stderr, "cabac decode of qscale diff failed at %d %d (%d)\n", m->mb_x, m->mb_y, val);
-                    return -1;
-                }
-            }
-
-            if( val&0x01 )
-                val=   (val + 1)>>1 ;
-            else
-                val= -((val + 1)>>1);
-            s->last_qscale_diff = val;
-            s->qscale += val;
-            if(((unsigned)s->qscale) > 51){
-                if(s->qscale<0) s->qscale+= 52;
-                else            s->qscale-= 52;
-            }
-            s->chroma_qp[0] = s->pps.chroma_qp_table[0][s->qscale];
-            s->chroma_qp[1] = s->pps.chroma_qp_table[1][s->qscale];
-        }else
-            s->last_qscale_diff=0;
-
-        if( IS_INTRA16x16( mb_type ) ) {
-            int i;            
-            decode_cabac_residual_dc( hc, s, c, m->mb, 0, 0, dc_scan, 16);
-
-            if( cbp&15 ) {
-                qmul = hc->dequant4_coeff[0][s->qscale];
-                for( i = 0; i < 16; i++ ) {                    
-                    decode_cabac_residual_nondc( hc, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15);
-                }
-            } else {
-                fill_rectangle(&m->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
-            }
-        } else {
-            int i8x8, i4x4;
-            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
-                if( cbp & (1<<i8x8) ) {
-                    if( IS_8x8DCT(mb_type) ) {
-                        decode_cabac_residual_nondc(hc, s, c, m->mb + 64*i8x8, 5, 4*i8x8,
-                            scan8x8, hc->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
-                    } else {
-                        qmul = hc->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
-                        for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
-                            const int index = 4*i8x8 + i4x4;                            
-//START_TIMER
-                            decode_cabac_residual_nondc(hc, s, c, m->mb + 16*index, 2, index, scan, qmul, 16);
-//STOP_TIMER("decode_residual")
-                        }
-                    }
-                } else {
-                    uint8_t * const nnz= &m->non_zero_count_cache[ scan8[4*i8x8] ];
-                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
-                }
-            }
-        }
-
-        if( cbp&0x30 ){
-            int i;
-            for( i = 0; i < 2; i++ ) {                
-                decode_cabac_residual_dc(hc, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4);
-            }
-        }
-
-        if( cbp&0x20 ) {
-            int i, j;
-            for( i = 0; i < 2; i++ ) {
-                qmul = hc->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 0:3)][s->chroma_qp[i]];
-                for( j = 0; j < 4; j++ ) {
-                    const int index = 16 + 4 * i + j;                    
-                    decode_cabac_residual_nondc( hc, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15);
-                }
-            }
-        } else {
-            uint8_t * const nnz= &m->non_zero_count_cache[0];
-            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
-        }
-    } else {
-        uint8_t * const nnz= &m->non_zero_count_cache[0];
-        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
-        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
-        s->last_qscale_diff = 0;
-    }
-	hc->mb_type[mb_x]= m->mb_type = mb_type;
-    hc->qscale[mb_x]= s->qscale;	
-    write_back_non_zero_count(hc, s);
-    fill_filter_caches(hc, s, mb_type);
-
-    return 0;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,17 +0,0 @@
-#ifndef H264_CABAC_H
-#define H264_CABAC_H
-
-#define CELL_SPE
-#include "libavcodec/avcodec.h"
-#include "h264_types_spu.h"
-#include "cabac_spu.h"
-
-
-/**
- * decodes a CABAC coded macroblock
- * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
- */
-int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c);
-void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_chroma_template_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_chroma_template_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,355 +0,0 @@
-static void PREFIX_h264_chroma_mc8_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) {
-
-  register int i;
-
-  const int16_t i32ss= 32;
-  const int16_t imax = 255;
-  const int16_t iABCD1 = ((8 - x) * (8 - y));
-  const int16_t iABCD2 = ((x) * (8 - y));
-  const int16_t iABCD3 = ((8 - x) * (y));
-  const int16_t iABCD4 = ((x) * (y));
-
-  const vsint16_t vA = spu_splats(iABCD1);
-  const vsint16_t vB = spu_splats(iABCD2);
-  const vsint16_t vC = spu_splats(iABCD3);
-  const vsint16_t vD = spu_splats(iABCD4);
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v32ss = spu_splats(i32ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const int shift_src =(unsigned int) src & 15;
-  const int shift_dst =(unsigned int) dst & 15;
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-  const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
-  vuint8_t dstmask;
-
-  if(shift_dst==0)
-    dstmask=dstmask0;
-  else
-    dstmask=dstmask8;
-
-  vuint8_t vsrc0uc1;
-  vuint8_t vsrc0uc2;
-  vuint8_t vsrc0uc;
-  vuint8_t vsrc1uc;
-  vsrc0uc1 = *(vuint8_t *)(src);
-  vsrc0uc2 = *(vuint8_t *)(src+16);
-  vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16));
-  vsrc1uc = spu_slqwbyte(vsrc0uc, 1);
-
-  vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh);
-  vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh);
-
-  for (i = 0 ; i < h ; i++) {
-        
-    vuint8_t vsrc2uc1;
-    vuint8_t vsrc2uc2;
-    vuint8_t vsrc2uc;
-    vuint8_t vsrc3uc;
-    vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C);
-    vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16);
-    vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16));
-    vsrc3uc = spu_slqwbyte(vsrc2uc, 1);
-        
-    vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh);
-    vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh);
-        
-    vsint16_t psum;
-        
-    vsint32_t psum1 = spu_mule(vsrc0ssH, vA);
-    vsint32_t psum2 = spu_mulo(vsrc0ssH, vA);
-    psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-
-    psum1 = spu_mule(vsrc1ssH, vB);
-    psum2 = spu_mulo(vsrc1ssH, vB);
-    vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-    psum = spu_add(psum3, psum);
-
-    psum1 = spu_mule(vsrc2ssH, vC);
-    psum2 = spu_mulo(vsrc2ssH, vC);
-    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-    psum = spu_add(psum3, psum);
-
-    psum1 = spu_mule(vsrc3ssH, vD);
-    psum2 = spu_mulo(vsrc3ssH, vD);
-    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-    psum = spu_add(psum3, psum);
-
-    psum = spu_add(v32ss, psum);
-    psum = spu_rlmask(psum, -6);
-
-    //Saturation from 0 to 255
-    sat = spu_cmpgt(psum,(vsint16_t)vzero);
-    psum = spu_and(psum,(vsint16_t)sat);
-    sat = spu_cmpgt(psum,vmax);
-    psum = spu_sel(psum,vmax,sat);
-
-    const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu);
-
-    const vuint8_t dst1 = *(vuint8_t *)dst;
-
-    const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask);
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, dsum, dst1);
-
-    *(vuint8_t *)dst=fsum;
-
-    vsrc0ssH = vsrc2ssH;
-    vsrc1ssH = vsrc3ssH;
-        
-    dst += dst_stride;
-    //src += src_stride;
-	src += STRIDE_C;
-  }
-}
-
-static void PREFIX_h264_chroma_mc4_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) {
-
-  register int i;
-
-  const int16_t i32ss= 32;
-  const int16_t imax = 255;
-  const int16_t iABCD1 = ((8 - x) * (8 - y));
-  const int16_t iABCD2 = ((x) * (8 - y));
-  const int16_t iABCD3 = ((8 - x) * (y));
-  const int16_t iABCD4 = ((x) * (y));
-
-  const vsint16_t vA = spu_splats(iABCD1);
-  const vsint16_t vB = spu_splats(iABCD2);
-  const vsint16_t vC = spu_splats(iABCD3);
-  const vsint16_t vD = spu_splats(iABCD4);
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v32ss = spu_splats(i32ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-    
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  const int shift_src = (unsigned int) src & 15;
-  const int shift_dst = (unsigned int) dst & 15;
-  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
-
-  switch(shift_dst){
-    case 0:  dstmask = dstmask0;
-             break;
-    case 4:  dstmask = dstmask4;
-             break;
-    case 8:  dstmask = dstmask8;
-             break;
-    case 12: dstmask = dstmask12;
-             break;
-  }
-
-  vuint8_t vsrc0uc1;
-  vuint8_t vsrc0uc2;
-  vuint8_t vsrc0uc;
-  vuint8_t vsrc1uc;
-  vsrc0uc1 = *(vuint8_t *)(src);
-  vsrc0uc2 = *(vuint8_t *)(src+16);
-  vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16));
-  vsrc1uc = spu_slqwbyte(vsrc0uc, 1);
-    
-  vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh);
-  vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh);
-
-  for (i = 0 ; i < h ; i++) {
-
-    vuint8_t vsrc2uc1;
-    vuint8_t vsrc2uc2;
-    vuint8_t vsrc2uc;
-    vuint8_t vsrc3uc;
-    vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C);
-    vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16);
-    vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16));
-    vsrc3uc = spu_slqwbyte(vsrc2uc, 1);
-        
-    vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh);
-    vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh);
-        
-    vsint16_t psum;
-        
-    vsint32_t psum1 = spu_mule(vsrc0ssH, vA);
-    vsint32_t psum2 = spu_mulo(vsrc0ssH, vA);
-    psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-
-    psum1 = spu_mule(vsrc1ssH, vB);
-    psum2 = spu_mulo(vsrc1ssH, vB);
-    vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-    psum = spu_add(psum3, psum);
-
-    psum1 = spu_mule(vsrc2ssH, vC);
-    psum2 = spu_mulo(vsrc2ssH, vC);
-    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-    psum = spu_add(psum3, psum);
-
-    psum1 = spu_mule(vsrc3ssH, vD);
-    psum2 = spu_mulo(vsrc3ssH, vD);
-    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-    psum = spu_add(psum3, psum);
-
-    psum = spu_add(v32ss, psum);
-    psum = spu_rlmask(psum, -6);
-
-    //Saturation from 0 to 255
-    sat = spu_cmpgt(psum,(vsint16_t)vzero);
-    psum = spu_and(psum,(vsint16_t)sat);
-    sat = spu_cmpgt(psum,vmax);
-    psum = spu_sel(psum,vmax,sat);
-
-    const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu);
-
-    const vuint8_t dst1 = *(vuint8_t *)dst;
-
-    const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask);
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, dsum, dst1);
-
-    *(vuint8_t *)dst=fsum;
-
-    vsrc0ssH = vsrc2ssH;
-    vsrc1ssH = vsrc3ssH;
-        
-    dst += dst_stride;
-    src += STRIDE_C;
-  }
-}
-
-static void PREFIX_h264_chroma_mc2_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) {
-
-  register int i;
-
-  const int16_t i32ss= 32;
-  const int16_t imax = 255;
-  const int16_t iABCD1 = ((8 - x) * (8 - y));
-  const int16_t iABCD2 = ((x) * (8 - y));
-  const int16_t iABCD3 = ((8 - x) * (y));
-  const int16_t iABCD4 = ((x) * (y));
-
-  const vsint16_t vA = spu_splats(iABCD1);
-  const vsint16_t vB = spu_splats(iABCD2);
-  const vsint16_t vC = spu_splats(iABCD3);
-  const vsint16_t vD = spu_splats(iABCD4);
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v32ss = spu_splats(i32ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-    
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  const int shift_src = (unsigned int) src & 15;
-  const int shift_dst = (unsigned int) dst & 15;
-  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  const vuint8_t dstmask0=  {0x10,0x11,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask2=  {0x00,0x01,0x10,0x11,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask6=  {0x00,0x01,0x02,0x03,0x04,0x05,0x10,0x11,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask10= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x10,0x11,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x0E,0x0F};
-  const vuint8_t dstmask14= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x10,0x11};
-  
-  switch(shift_dst){
-    case 0:  dstmask = dstmask0;
-             break;
-    case 2:  dstmask = dstmask2;
-             break;
-    case 4:  dstmask = dstmask4;
-             break;
-    case 6:  dstmask = dstmask6;
-             break;
-    case 8:  dstmask = dstmask8;
-             break;
-    case 10: dstmask = dstmask10;
-             break;
-    case 12: dstmask = dstmask12;
-             break;
-    case 14: dstmask = dstmask14;
-             break;
-  }
-
-  vuint8_t vsrc0uc1;
-  vuint8_t vsrc0uc2;
-  vuint8_t vsrc0uc;
-  vuint8_t vsrc1uc;
-  vsrc0uc1 = *(vuint8_t *)(src);
-  vsrc0uc2 = *(vuint8_t *)(src+16);
-  vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16));
-  vsrc1uc = spu_slqwbyte(vsrc0uc, 1);
-    
-  vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh);
-  vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh);
-
-  for (i = 0 ; i < h ; i++) {
-
-    vuint8_t vsrc2uc1;
-    vuint8_t vsrc2uc2;
-    vuint8_t vsrc2uc;
-    vuint8_t vsrc3uc;
-    vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C);
-    vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16);
-    vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16));
-    vsrc3uc = spu_slqwbyte(vsrc2uc, 1);
-        
-    vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh);
-    vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh);
-        
-    vsint16_t psum;
-        
-    vsint32_t psum1 = spu_mule(vsrc0ssH, vA);
-    vsint32_t psum2 = spu_mulo(vsrc0ssH, vA);
-    psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-
-    psum1 = spu_mule(vsrc1ssH, vB);
-    psum2 = spu_mulo(vsrc1ssH, vB);
-    vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-    psum = spu_add(psum3, psum);
-
-    psum1 = spu_mule(vsrc2ssH, vC);
-    psum2 = spu_mulo(vsrc2ssH, vC);
-    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-    psum = spu_add(psum3, psum);
-
-    psum1 = spu_mule(vsrc3ssH, vD);
-    psum2 = spu_mulo(vsrc3ssH, vD);
-    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
-    psum = spu_add(psum3, psum);
-
-    psum = spu_add(v32ss, psum);
-    psum = spu_rlmask(psum, -6);
-
-    //Saturation from 0 to 255
-    sat = spu_cmpgt(psum,(vsint16_t)vzero);
-    psum = spu_and(psum,(vsint16_t)sat);
-    sat = spu_cmpgt(psum,vmax);
-    psum = spu_sel(psum,vmax,sat);
-
-    const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu);
-
-    const vuint8_t dst1 = *(vuint8_t *)dst;
-
-    const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask);
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, dsum, dst1);
-
-    *(vuint8_t *)dst=fsum;
-
-    vsrc0ssH = vsrc2ssH;
-    vsrc1ssH = vsrc3ssH;
-        
-    dst += dst_stride;
-    src += STRIDE_C;
-  }
-}
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2009 TUDelft 
- * 
- * Cell Parallel SPU - 2DWave Macroblock Decoding. 
- */
-
-/**
- * @file libavcodec/cell/spu/h264_main_spu.c
- * Cell Parallel SPU - 2DWave Macroblock Decoding
- * @author C C Chi <c.c.chi@student.tudelft.nl>
- * 
- * SIMD kernels 
- * H.264/AVC motion compensation
- * @author Mauricio Alvarez <alvarez@ac.upc.edu>
- * @author Albert Paradis <apar7632@hotmail.com>
- */ 
-
-#include "h264_deblock_spu.h"
-#include "h264_decode_mb_spu.h"
-
-extern int print_debug;
-
-static void filter_mb_edgev( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
-	H264slice *s= h->s;
-    const int index_a = qp + s->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a];
-    const int beta  = beta_table[qp + s->slice_beta_offset];
-    if (alpha ==0 || beta == 0) return;
-
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-        tc[0] = tc0_table[index_a][bS[0]];
-        tc[1] = tc0_table[index_a][bS[1]];
-        tc[2] = tc0_table[index_a][bS[2]];
-        tc[3] = tc0_table[index_a][bS[3]];
-		
-        h->dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
-    } else {
-        h->dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
-    }
-}
-
-static void filter_mb_edgecv( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
-	H264slice *s= h->s;
-    const int index_a = qp + s->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a];
-    const int beta  = beta_table[qp + s->slice_beta_offset];
-	if (alpha ==0 || beta == 0) return;
-	
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-		
-        tc[0] = tc0_table[index_a][bS[0]]+1;
-        tc[1] = tc0_table[index_a][bS[1]]+1;
-        tc[2] = tc0_table[index_a][bS[2]]+1;
-        tc[3] = tc0_table[index_a][bS[3]]+1;
-		
-		h->dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
-    } else {
-        h->dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
-    }
-}
-
-static void filter_mb_edgeh( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
-	H264slice *s= h->s;
-    const int index_a = qp + s->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a];
-    const int beta  = beta_table[qp + s->slice_beta_offset];
-    if (alpha ==0 || beta == 0) return;
-
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-		
-        tc[0] = tc0_table[index_a][bS[0]];
-        tc[1] = tc0_table[index_a][bS[1]];
-        tc[2] = tc0_table[index_a][bS[2]];
-        tc[3] = tc0_table[index_a][bS[3]];
-		
-        h->dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
-    } else {
-        h->dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
-    }
-}
-
-static void filter_mb_edgech( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
-	H264slice *s= h->s;
-    const int index_a = qp + s->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a];
-    const int beta  = beta_table[qp + s->slice_beta_offset];
-    if (alpha ==0 || beta == 0) return;
-
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-		
-		tc[0] = tc0_table[index_a][bS[0]]+1;
-        tc[1] = tc0_table[index_a][bS[1]]+1;
-        tc[2] = tc0_table[index_a][bS[2]]+1;
-        tc[3] = tc0_table[index_a][bS[3]]+1;
-		
-        h->dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
-    } else {
-        h->dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
-    }
-}
-
-static void filter_mb_dir(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int dir) {
-    H264Mb *mb = h->mb;
-	H264slice *s = h->s;
-	const int qp_xy= mb->qscale_mb_xy;
-    const int qp_dir = dir == 0 ? mb->qscale_left_mb_xy : mb->qscale_top_mb_xy;
-	const int mbm_type = dir == 0 ? mb->left_type : mb->top_type;
-	const int mb_type = mb->mb_type;
-	int edge;
-	const int edges = mb->edges[dir];
-    //int (*ref2frm)[64] = s->ref2frm;
-
-//     int start;//= h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
-// 
-//     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
-//                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
-//     // how often to recheck mv-based bS when iterating between edges
-//     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
-//                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
-//     // how often to recheck mv-based bS when iterating along each edge
-//     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
-
-// 	if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0))
-// 		start =1;
-// 	else
-// 		start =0;
-// 
-//     /* Calculate bS */
-//     for( edge = start; edge < edges; edge++ ) {
-// 		const int mbn_type = edge > 0 ? mb_type : mbm_type;
-// 		const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm;
-//         int (*ref2frmn)[64] = ref2frm;//edge > 0 ? ref2frm : ref2frmm;
-//         int16_t bS[4];
-//         int qp;
-// 
-//         if( (edge&1) && IS_8x8DCT(mb_type) )
-//             continue;
-// 
-//         if( IS_INTRA(mb_type) ||
-//             IS_INTRA(mbn_type) ) {
-//             int value;
-// 
-//             if (edge == 0) {
-//                 value = 4;
-//             } else {
-//                 value = 3;
-//             }
-//             bS[0] = bS[1] = bS[2] = bS[3] = value;
-//         } else {
-//             int i, l;
-//             int mv_done;
-// 
-//             if( edge & mask_edge ) {
-// 
-//                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
-//                 mv_done = 1;
-//             }
-//             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
-//                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
-//                 int bn_idx= b_idx - (dir ? 8:1);
-//                 int v = 0;
-// 
-// 				for( l = 0; !v && l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) {
-//                     v |= ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] ||
-//                          FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
-//                          FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit;
-//                 }
-//                 bS[0] = bS[1] = bS[2] = bS[3] = v;
-// 
-//                 mv_done = 1;
-//             }
-//             else
-//                 mv_done = 0;
-// 
-// 			for( i = 0; i < 4; i++ ) {
-//                 int x = dir == 0 ? edge : i;
-//                 int y = dir == 0 ? i    : edge;
-//                 int b_idx= 8 + 4 + x + 8*y;
-//                 int bn_idx= b_idx - (dir ? 8:1);
-// 
-//                 if( mb->non_zero_count_cache[b_idx] |
-//                     mb->non_zero_count_cache[bn_idx] ) {
-//                     bS[i] = 2;
-//                 }
-//                 else if(!mv_done)
-//                 {
-//                     bS[i] = 0;
-//                     for( l = 0; l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) {
-//                         if( ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] ||
-//                             FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
-//                             FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
-//                             bS[i] = 1;
-//                             break;
-//                         }
-//                     }
-//                 }
-//             }
-// 
-//             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
-//                 continue;
-//         }
-// 		qp = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1;
-
-    if(mbm_type){
-        int16_t* bS=mb->bS[dir][0];
-        /* Filter edge */
-        // Do not use s->qscale as luma quantizer because it has not the same
-        // value in IPCM macroblocks.
-        if(bS[0]+bS[1]+bS[2]+bS[3]){
-            int qp = ( qp_xy + qp_dir + 1 ) >> 1;
-            if( dir == 0 ) {
-                filter_mb_edgev(h, &img_y[0], linesize, bS, qp);
-                {
-                    int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
-                    filter_mb_edgecv(h, &img_cb[0], uvlinesize, bS, qp);
-                    filter_mb_edgecv(h, &img_cr[0], uvlinesize, bS, qp);
-                }
-            } else {
-                filter_mb_edgeh(h, &img_y[0], linesize, bS, qp);
-                {
-                    int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
-                    filter_mb_edgech(h, &img_cb[0], uvlinesize, bS, qp);
-                    filter_mb_edgech(h, &img_cr[0], uvlinesize, bS, qp);
-                }
-            }
-        }
-    }
-
-    for( edge = 1; edge < edges; edge++ ) {
-        int16_t* bS=mb->bS[dir][edge];
-        int qp = qp_xy;
-
-        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
-            continue;
-
-        /* Filter edge */
-        // Do not use s->qscale as luma quantizer because it has not the same
-        // value in IPCM macroblocks.
-
-        if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
-            continue;
-
-		if( dir == 0 ) {
-            filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
-            if( (edge&1) == 0 ) {
-                filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) );
-                filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) );
-            }
-        } else {
-            filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
-            if( (edge&1) == 0 ) {
-                filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) );
-                filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) );
-            }
-        }
-    }
-}
-
-void filter_mb( H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
-    filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 0);
-    filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 1);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,80 +0,0 @@
-#ifndef H264_FILTER_SPU_H
-#define H264_FILTER_SPU_H
-
-#include "types_spu.h"
-#include "h264_decode_mb_spu.h"
-
-#define FFABS(a)           ((a) >= 0 ? (a) : (-(a)))
-
-void filter_mb(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
-
-/* Deblocking filter (p153) */
-static const uint8_t alpha_table[52*3] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
-     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
-    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
-    80, 90,101,113,127,144,162,182,203,226,
-   255,255,
-   255,255,255,255,255,255,255,255,255,255,255,255,255,
-   255,255,255,255,255,255,255,255,255,255,255,255,255,
-   255,255,255,255,255,255,255,255,255,255,255,255,255,
-   255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
-
-static const uint8_t beta_table[52*3] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
-     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
-     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
-    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
-    18, 18,
-    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
-    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
-    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
-    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
-};
-
-static const uint8_t tc0_table[52*3][4] = {
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
-    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
-    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
-    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
-    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
-    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
-    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-};
-
-static inline int get_chroma_qp(H264slice *s, int t, int qscale){
-    return s->chroma_qp_table[t][qscale];
-}
-
-#endif 
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,725 +0,0 @@
-/*
- * Copyright (c) 2009 TUDelft 
- * 
- * Cell Parallel SPU - 2DWave Macroblock Decoding.
- */
-
-/**
- * @file libavcodec/cell/spu/h264_main_spu.c
- * Cell Parallel SPU - 2DWave Macroblock Decoding
- * @author C C Chi <c.c.chi@student.tudelft.nl>
- * 
- * SIMD kernels 
- * H.264/AVC motion compensation
- * @author Mauricio Alvarez <alvarez@ac.upc.edu>
- * @author Albert Paradis <apar7632@hotmail.com>
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <spu_intrinsics.h>
-//#include "dsputil_cell.h"
-#include "types_spu.h"
-#include "h264_tables.h"
-#include "h264_dma.h"
-#include "h264_mc_spu.h"
-#include "h264_intra_spu.h"
-#include "h264_decode_mb_spu.h"
-#include "h264_deblock_spu.h"
-
-//border buffers
-DECLARE_ALIGNED_16(TopBorder, top_ls[240]);
-LeftBorder left_ls;
-
-//mb line buffer - statically allocated for up to 1920 width video
-DECLARE_ALIGNED_16(uint8_t, dest_y_ls[2*16*20]);
-DECLARE_ALIGNED_16(uint8_t, dest_cb_ls[2*8*10]);
-DECLARE_ALIGNED_16(uint8_t, dest_cr_ls[2*8*10]);
-
-//dma transfer buffer
-DECLARE_ALIGNED_16(uint8_t, dma_y_ls [64*(32+20)]); //EDGE_WIDTH = 32
-DECLARE_ALIGNED_16(uint8_t, dma_cb_ls[32*(16+10)]);
-DECLARE_ALIGNED_16(uint8_t, dma_cr_ls[32*(16+10)]);
-
-DECLARE_ALIGNED_16(uint8_t, extra_edge_y [32*(32+20)]); //EDGE_WIDTH = 32
-DECLARE_ALIGNED_16(uint8_t, extra_edge_cr[16*(16+10)]);
-DECLARE_ALIGNED_16(uint8_t, extra_edge_cb[16*(16+10)]);
-
-
-// For intra mode
-/// for now do the extra copy before dma, but it's better to skip this and do the dma right away
-static void backup_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
-	H264Mb* mb= h->mb;
-	
-    int i;
-	uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y;
-	uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb;
-	uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr;
-	
-	uint8_t* left_border_y = left_ls.unfiltered_y;
-	uint8_t* left_border_cb = left_ls.unfiltered_cb;
-	uint8_t* left_border_cr = left_ls.unfiltered_cr;
-		
-    src_y  -=   linesize;
-    src_cb -= uvlinesize;
-    src_cr -= uvlinesize;
-
-    // There are two lines saved, the line above the top macroblock of a pair,
-    // and the line above the bottom macroblock
-    left_border_y[0] = top_border_y[15];
-    for(i=1; i<17; i++){
-        left_border_y[i] = src_y[15+i*  linesize];
-    }
-
-   *(qword*)(top_border_y)= *(qword*)(src_y +  16*linesize);
-
-    left_border_cb[0] = top_border_cb[7];
-    left_border_cr[0] = top_border_cr[7];
-    for(i=1; i<9; i++){
-        left_border_cb[i] = src_cb[7+i*uvlinesize];
-        left_border_cr[i] = src_cr[7+i*uvlinesize];
-    }
-    *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize);
-    *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize);
-}
-
-static void xchg_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
-	H264Mb* mb= h->mb;
-	H264slice* s = h->s;
-	
-	int temp8, i;
-	uint64_t temp64;
-	int deblock_left;
-	int deblock_top;
-	
-	uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y;	
-	uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb;
-	uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr;
-	uint8_t* top_border_y_next = top_ls[mb->mb_x +1].unfiltered_y;
-	
-	uint8_t* left_border_y = left_ls.unfiltered_y;
-	uint8_t* left_border_cb = left_ls.unfiltered_cb;
-	uint8_t* left_border_cr = left_ls.unfiltered_cr;
-	
-	deblock_left = (mb->mb_x > 0);
-	deblock_top =  (mb->mb_y > 0);
-	
-	src_y  -= (  linesize + 1);
-	src_cb -= (uvlinesize + 1);
-	src_cr -= (uvlinesize + 1);
-	
-	#define XCHG(a,b,t,xchg)\
-	t= a;\
-	if(xchg)\
-		a= b;\
-	b= t;
-	
-	if(deblock_left){
-		for(i = !deblock_top; i<16; i++){
-			XCHG(left_border_y[i], src_y [i*  linesize], temp8, xchg);
-		}
-		XCHG(left_border_y[i], src_y [i*  linesize], temp8, 1);
-		
-		for(i = !deblock_top; i<8; i++){
-			XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg);
-			XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg);
-		}
-		XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1);
-		XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1);
-	}
-	
-	if(deblock_top){
-		XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg);
-		XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1);
-		if(mb->mb_x+1 < s->mb_width){
-			XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1);
-		}
-		XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1);
-		XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1);
-	}
-}
-
-void copy_top_borders(int mb_x, uint8_t *dst_y, uint8_t *dst_cb, uint8_t *dst_cr, int stride_y, int stride_c){			
-	qword *qsrc_y = (qword *) (top_ls[mb_x].top_borders_y);
-	dst_y-= 4*stride_y;
-	
-	*((qword *) (dst_y + 0*stride_y)) = *qsrc_y++;
-	*((qword *) (dst_y + 1*stride_y)) = *qsrc_y++;
-	*((qword *) (dst_y + 2*stride_y)) = *qsrc_y++;
-	*((qword *) (dst_y + 3*stride_y)) = *qsrc_y++;
-
-	dst_cb-=2*stride_c;	
-	uint64_t *dsrc_cb = (uint64_t *) (top_ls[mb_x].top_borders_cb);
-	*((uint64_t *) (dst_cb + 0*stride_c)) = *dsrc_cb++; 
-	*((uint64_t *) (dst_cb + 1*stride_c)) = *dsrc_cb++;
-
-	dst_cr-=2*stride_c;	
-	uint64_t *dsrc_cr = (uint64_t *) (top_ls[mb_x].top_borders_cr);
-	*((uint64_t *) (dst_cr + 0*stride_c)) = *dsrc_cr++;
-	*((uint64_t *) (dst_cr + 1*stride_c)) = *dsrc_cr++;
-}
-
-static void send_top_borders(H264Context_spu *h, int mb_x, uint8_t* dest_y, uint8_t* dest_cb, uint8_t* dest_cr, int stride_y, int stride_c){
-	H264spe *spe= &h->spe;
-	//fill borders (unfiltered borders already filled in backup_mb_border)
-	dest_y+= 12*stride_y;
-	qword *qtop_y = (qword *) top_ls[mb_x].top_borders_y;	
-	for(int i=0; i<4; i++){
-		qword *qdest_y = (qword *) dest_y;
-		*qtop_y++ = *qdest_y;		
-		dest_y+=stride_y;
-	}
-	dest_cb+= 6*stride_c;
-	dest_cr+= 6*stride_c;
-	uint64_t *dtop_cb = (uint64_t *) top_ls[mb_x].top_borders_cb;
-	uint64_t *dtop_cr = (uint64_t *) top_ls[mb_x].top_borders_cr;
-	for(int i=0; i<2; i++){
-		uint64_t *ddest_cb = (uint64_t *) dest_cb;
-		uint64_t *ddest_cr = (uint64_t *) dest_cr;
-		
-		*dtop_cb++  = *ddest_cb;
-		*dtop_cr++  = *ddest_cr;
-		
-		dest_cb+=stride_c;
-		dest_cr+=stride_c;
-	}
-	uint8_t* top_border_tgt = spe->tgt_spe + (unsigned) &top_ls[mb_x];
-	spu_dma_put(&top_ls[mb_x], (unsigned) top_border_tgt, sizeof(TopBorder), MBD_put);
-}
-
-static void extend_edges_left(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c){
-	for (int i=0; i<lines; i++){
-		memset(dma_y, dma_y[32], 32);
-		dma_y+=64;
-	}
-
-	for (int i=0; i<lines_c; i++){
-		memset(dma_cb, dma_cb[16], 16);
-		memset(dma_cr, dma_cr[16], 16);
-		dma_cb+=32; dma_cr+=32;
-	}
-}
-
-static void extend_edges_right(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c, int slots){
-		
-	for (int i=0; i<lines; i++){
-		memset(dma_y, dma_y[-1], slots*16);
-		dma_y+=64;
-	}
-	
-	for (int i=0; i<lines_c; i++){
-		memset(dma_cb, dma_cb[-1], slots*8);
-		memset(dma_cr, dma_cr[-1], slots*8);
-		dma_cb+=32; dma_cr+=32;
-	}
-}
-
-static void extend_edges_top(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr ){
-	qword *qborder_y = (qword *) dma_y;
-	for (int i=1; i<=32; i++){
-		qword *qdma_y = (qword *) (dma_y - i*64);
-		*qdma_y = *qborder_y;
-	}
-
-	uint64_t *dborder_cb = (uint64_t *) dma_cb;
-	uint64_t *dborder_cr = (uint64_t *) dma_cr;
-	for (int i=1; i<=16; i++){
-		uint64_t *ddma_cb = (uint64_t *) (dma_cb - i*32);
-		uint64_t *ddma_cr = (uint64_t *) (dma_cr - i*32);
-		*ddma_cb = *dborder_cb;
-		*ddma_cr = *dborder_cr;
-	}
-}
-
-static void extend_edges_bottom(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr){
-	qword *qborder_y = (qword *) dma_y;
-	for (int i=1; i<=32; i++){
-		qword *qdma_y = (qword *) (dma_y + i*64);
-		*qdma_y = *qborder_y;
-	}
-	
-	uint64_t *dborder_cb = (uint64_t *) dma_cb;
-	uint64_t *dborder_cr = (uint64_t *) dma_cr;
-	for (int i=1; i<=16; i++){
-		uint64_t *ddma_cb = (uint64_t *) (dma_cb + i*32);
-		uint64_t *ddma_cr = (uint64_t *) (dma_cr + i*32);
-		*ddma_cb = *dborder_cb;
-		*ddma_cr = *dborder_cr;
-	}
-}
-
-static void extend_extra_edge_right(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr, uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr, int lines, int lines_c){
-
-	for (int i=0; i<lines; i++){
-		memset(extra_y, dma_y[-1], 32);
-		dma_y+=64; extra_y+=32;
-	}
-	
-	for (int i=0; i<lines_c; i++){
-		memset(extra_cb, dma_cb[-1], 16);
-		memset(extra_cr, dma_cr[-1], 16);
-		dma_cb+=32; dma_cr+=32;
-		extra_cb+=16; extra_cr+=16;
-	}
-}
-
-static void extend_extra_edge_top(uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr){
-	qword *qborder_y = (qword *) extra_y;
-	qword *qborder_y2 = (qword *) (extra_y+16);
-	
-	for (int i=1; i<=32; i++){
-		qword *qextra_y = (qword *) (extra_y-i*32);
-		*qextra_y = *qborder_y;
-		*(qextra_y+1) = *qborder_y2;
-	}
-	
-	qword *qborder_cb = (qword *) extra_cb;
-	qword *qborder_cr = (qword *) extra_cr;
-	for (int i=1; i<=16; i++){
-		qword *qextra_cb = (qword *) (extra_cb - i*16);
-		qword *qextra_cr = (qword *) (extra_cr - i*16);
-		*qextra_cb = *qborder_cb;
-		*qextra_cr = *qborder_cr;
-	}
-}
-
-static void extend_extra_edge_bottom(uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr){
-	qword *qborder_y = (qword *) extra_y;
-	qword *qborder_y2 = (qword *) (extra_y+16);
-	
-	for (int i=1; i<=32; i++){
-		qword *qextra_y = (qword *) (extra_y+i*32);
-		*qextra_y = *qborder_y;
-		*(qextra_y+1) = *qborder_y2;
-	}
-	
-	qword *qborder_cb = (qword *) extra_cb;
-	qword *qborder_cr = (qword *) extra_cr;
-	for (int i=1; i<=16; i++){
-		qword *qextra_cb = (qword *) (extra_cb + i*16);
-		qword *qextra_cr = (qword *) (extra_cr + i*16);
-		*qextra_cb = *qborder_cb;
-		*qextra_cr = *qborder_cr;
-	}
-}
-
-static void extend_edges(H264Context_spu *h, int mb_x, int mb_y){
-	H264slice *s = h->s;
-	
-	uint8_t *dma_y; 
-	uint8_t *dma_cb; 
-	uint8_t *dma_cr;
-	
-	uint8_t *extra_y  = extra_edge_y;
-	uint8_t *extra_cb = extra_edge_cb;
-	uint8_t *extra_cr = extra_edge_cr;
-	
-	int pos = (mb_x+2) %4;
-	if (mb_x == 0){
-		if (mb_y ==0){
-			extend_edges_left(&dma_y_ls[32*64], &dma_cb_ls[16*32], &dma_cr_ls[16*32], 12, 6);
-		}else if (mb_y == s->mb_height -1){
-			extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 20, 10);
-		}else {
-			extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 16, 8);
-		}
-	}else if (mb_x == s->mb_width-1){
-		dma_y  = &dma_y_ls [(pos+1)*16];
-		dma_cb = &dma_cb_ls[(pos+1)*8];
-		dma_cr = &dma_cr_ls[(pos+1)*8];
-		if (mb_y ==0){
-			dma_y   += 32*64;
-			dma_cb  += 16*32;
-			dma_cr  += 16*32;
-			extra_y = extra_edge_y  + 32*32;
-			extra_cb= extra_edge_cb + 16*16;
-			extra_cr= extra_edge_cr + 16*16;
-			
-			if (pos==2){
-				extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 1);
-				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6);
-			}else if (pos==3){
-				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6);
-			}else{
-				extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 2);
-			}
-		}else if (mb_y == s->mb_height -1){
-			if (pos==2){
-				extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 1);
-				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10);
-			}else if (pos==3){
-				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10);
-			}else{
-				extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 2);
-			}				
-		}else {
-			if (pos==2){
-				extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1);
-				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8);
-			}else if (pos==3){
-				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8);
-			}else{
-				extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1);
-			}
-		}
-	}
-		
-	if (mb_y == 0){
-		dma_y  = &dma_y_ls [32*64];
-		dma_cb = &dma_cb_ls[16*32];
-		dma_cr = &dma_cr_ls[16*32];
-		extra_y = extra_edge_y  + 32*32;
-		extra_cb= extra_edge_cb + 16*16;
-		extra_cr= extra_edge_cr + 16*16;
-		
-		if (mb_x ==0){
-			extend_edges_top (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8);
-			extend_edges_top (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8);
-			extend_edges_top (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8);
-		}else if (mb_x == s->mb_width -1){
-			if (pos==2){
-				extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
-				extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
-				extend_extra_edge_top(extra_y, extra_cb, extra_cr);
-			}else if (pos == 3){
-				extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
-				extend_extra_edge_top(extra_y, extra_cb, extra_cr);
-			}else{
-				extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
-				extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
-				extend_edges_top (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8);
-			}			
-		}else {
-			extend_edges_top (dma_y + pos*16, dma_cb + pos*8, dma_cr + pos*8);
-		}
-	}else if (mb_y == s->mb_height -1){
-		dma_y  = &dma_y_ls [19*64];
-		dma_cb = &dma_cb_ls[9*32];
-		dma_cr = &dma_cr_ls[9*32];
-		extra_y = extra_edge_y  + 19*32;
-		extra_cb= extra_edge_cb + 9*16;
-		extra_cr= extra_edge_cr + 9*16;
-		
-		if (mb_x ==0){
-			extend_edges_bottom (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8);
-			extend_edges_bottom (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8);
-			extend_edges_bottom (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8);
-		}else if (mb_x == s->mb_width -1){
-			if (pos==2){
-				extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
-				extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
-				extend_extra_edge_bottom(extra_y, extra_cb, extra_cr);
-			}else if (pos == 3){
-				extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
-				extend_extra_edge_bottom(extra_y, extra_cb, extra_cr);
-			}else{				
-				extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
-				extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
-				extend_edges_bottom (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8);
-			}
-		}else {
-			extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
-		}
-	}
-}
-
-static void send_pic_data(H264Context_spu *h, int mb_x, int mb_y, int pos, int stride_y, int stride_c){
-	H264slice *s = h->s;
-	int lines, lines_c;
-	int linesize = s->linesize;
-	int uvlinesize = s->uvlinesize;
-	
-	uint8_t* dst_y  = s->dst_y + (mb_x-pos)*16 + (mb_y*16)*linesize;
-	uint8_t* dst_cb = s->dst_cb +(mb_x-pos)*8 + (mb_y*8)*uvlinesize;
-	uint8_t* dst_cr = s->dst_cr +(mb_x-pos)*8 + (mb_y*8)*uvlinesize;
-
-	if (mb_y == 0){
-		dst_y -= 32 *linesize;
-		dst_cb-= 16 *uvlinesize;
-		dst_cr-= 16 *uvlinesize;
-	}else {
-		dst_y -= 4 *linesize;
-		dst_cb-= 2 *uvlinesize;
-		dst_cr-= 2 *uvlinesize;
-	}
-	
-	if (mb_y == 0){
-		lines = 12+32; lines_c=6+16;
-	}else if (mb_y == s->mb_height-1){
-		lines = 20+32; lines_c=10+16;
-	}else{
-		lines = 16; lines_c=8;
-	}
-	
-	put_list = put_list_buf;
-	put_dma_list(dma_y_ls, dst_y, stride_y, lines, linesize, MBD_pic);
-	put_dma_list(dma_cb_ls, dst_cb, stride_c, lines_c, uvlinesize, MBD_pic);
-	put_dma_list(dma_cr_ls, dst_cr, stride_c, lines_c, uvlinesize, MBD_pic);
-
-	if (mb_x == s->mb_width-1 && pos>1){		
-		put_dma_list(extra_edge_y, dst_y+64, 32, lines, linesize, MBD_pic);
-		put_dma_list(extra_edge_cb, dst_cb+32, 16, lines_c, uvlinesize, MBD_pic);
-		put_dma_list(extra_edge_cr, dst_cr+32, 16, lines_c, uvlinesize, MBD_pic);
-   	}
-}
-
-void copy_data_and_send(H264Context_spu *h, int mb_x, int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
-	H264slice *s = h->s;
-	int lines, lines_c;
-	int pos = (mb_x+2)%4; //4 slots in our 64 byte wide transfer buffer. Offset 2 for edge emulation
-	uint8_t *dma_y = &dma_y_ls[pos*16];
-	uint8_t *dma_cb = &dma_cb_ls[pos*8];
-	uint8_t *dma_cr = &dma_cr_ls[pos*8];
-	
-	if (mb_y == 0){
-		dma_y += 32*64;
-		dma_cb+= 16*32;
-		dma_cr+= 16*32;
-	}else{		
-		dest_y -= 4*stride_y;
-		dest_cb-= 2*stride_c;
-		dest_cr-= 2*stride_c;		
-	}
-	
-	if (mb_y == 0){
-		lines = 12; lines_c=6;
-	}else if (mb_y == s->mb_height-1){
-		lines = 20; lines_c=10;
-	}else{
-		lines = 16; lines_c=8;
-	}
-
-	for(int i=0; i<lines; i++){
-		qword *qdest_y = (qword *) dest_y;
-		qword *qdma_y  = (qword *) dma_y;
-		*qdma_y = *qdest_y;
-		dma_y +=64;
-		dest_y+=stride_y;
-	}
-
-	for(int i=0; i<lines_c; i++){
-		uint64_t *ddest_cb  = (uint64_t *) dest_cb;
-		uint64_t *ddest_cr  = (uint64_t *) dest_cr;
-		uint64_t *ddma_cb   = (uint64_t *) dma_cb;
-		uint64_t *ddma_cr   = (uint64_t *) dma_cr;
-		*ddma_cb = *ddest_cb;
-		*ddma_cr = *ddest_cr;
-		dma_cb +=32;
-		dma_cr +=32;
-		dest_cb+=stride_c;
-		dest_cr+=stride_c;
-	}
-
-	extend_edges(h, mb_x, mb_y);
-
-	//send when dma buf is full
-	if (pos==3){
-		send_pic_data(h, mb_x, mb_y, pos, 64, 32);
-	} else if (mb_x == s->mb_width-1){
-		send_pic_data(h, mb_x, mb_y, pos, 64, 32);
-	}
-}
-
-static void shift_left(int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
-	int lines, lines_c;
-	if (mb_y > 0){
-		lines  =20;
-		lines_c=10;
-		dest_y  -= 4*stride_y;
-		dest_cb -= 2*stride_c;
-		dest_cr -= 2*stride_c;
-	}else {
-		lines  =16;
-		lines_c= 8;		
-	}		
-		
-	for (int i=0; i<lines; i++){
-		qword *left_y  = (qword *) (dest_y -16);
-		qword *qdest_y = (qword *) dest_y;
-		*left_y = *qdest_y;
-		dest_y += stride_y;
-	}
-	
-	for (int i=0; i<lines_c; i++){
-		uint64_t *left_cb  = (uint64_t *) (dest_cb -8);
-		uint64_t *left_cr  = (uint64_t *) (dest_cr -8);
-		uint64_t *ddest_cb = (uint64_t *) dest_cb;
-		uint64_t *ddest_cr = (uint64_t *) dest_cr;
-		*left_cb = *ddest_cb;
-		*left_cr = *ddest_cr;
-		dest_cb += stride_c;
-		dest_cr += stride_c;
-	}
-}
-
-void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c){
-	H264slice *s = h->s;
-	H264Mb *mb = h->mb;
-    const int mb_x= mb->mb_x;
-    const int mb_y= mb->mb_y;    
-    const int mb_type= mb->mb_type;
-	
-	uint8_t *dest_y, *dest_cb, *dest_cr;	//ls ptrs (abstracts the fact it is operating in a ls buffer)
-
-    int i;
-  
-    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
-
-	dest_y  = dest_y_ls + 16 + 4*stride_y;
-	dest_cb = dest_cb_ls + 8 + 2*stride_c;
-	dest_cr = dest_cr_ls + 8 + 2*stride_c;
-	
-	if(IS_8x8DCT(mb_type)){
-		idct_dc_add = ff_idct8_dc_add;
-		idct_add = h->dsp.h264_idct_add[0];
-	}
-	else{
-		idct_dc_add = ff_idct_dc_add;
-		idct_add = h->dsp.h264_idct_add[1];
-	}
-
-	if (mb_y>0){
-		copy_top_borders(mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c);
-	}
-
-	if(IS_INTRA(mb_type)){
-		xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 1);
-
-		h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cb, stride_c);
-		h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cr, stride_c);
-
-		if(IS_INTRA4x4(mb_type)){
-			if(IS_8x8DCT(mb_type)){
-
-				for(i=0; i<16; i+=4){
-					uint8_t * const ptr= dest_y + block_offset[i];
-					const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ];
-					const int nnz = mb->non_zero_count_cache[ scan8[i] ];
-					h->hpc.pred8x8l[ dir ](ptr, (mb->topleft_samples_available<<i)&0x8000,
-												(mb->topright_samples_available<<i)&0x4000, stride_y);
-
-					if(nnz){
-						if(nnz == 1 && mb->mb[i*16])
-							idct_dc_add(ptr, mb->mb + i*16, stride_y);
-						else{
-							idct_add   (ptr, mb->mb + i*16, stride_y);
-						}
-					}
-				}
-			}else{
-				for(i=0; i<16; i++){
-					uint8_t * const ptr= dest_y + block_offset[i];
-					const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ];
-
-					uint8_t *topright;
-					int nnz, tr;
-					if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
-						const int topright_avail= (mb->topright_samples_available<<i)&0x8000;
-						if(!topright_avail){
-							tr= ptr[3 - stride_y]*0x01010101;
-							topright= (uint8_t*) &tr;
-						}else
-							topright= ptr + 4 - stride_y;
-					}else
-						topright= NULL;
-
-					h->hpc.pred4x4[ dir ](ptr, topright, stride_y);
-					nnz = mb->non_zero_count_cache[ scan8[i] ];
-					if(nnz){
-						if(nnz == 1 && mb->mb[i*16])
-							idct_dc_add(ptr, mb->mb + i*16, stride_y);
-						else
-							idct_add   (ptr, mb->mb + i*16, stride_y);
-					}
-				}
-			}
-
-		}else{
-			h->hpc.pred16x16[ mb->intra16x16_pred_mode ](dest_y , stride_y);
-			h264_luma_dc_dequant_idct_c(mb->mb, mb->dequant4_coeff_y);
-		}
-		xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 0);
-
-	}else {
-		hl_motion(h, dest_y, dest_cb, dest_cr, stride_y, stride_c);
-	}
-
-	if(!IS_INTRA4x4(mb_type)){
-		if(IS_INTRA16x16(mb_type)){
-			for(i=0; i<16; i++){
-				if(mb->non_zero_count_cache[ scan8[i] ])
-					idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
-				else if(mb->mb[i*16])
-					idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
-			}
-		}else if(mb->cbp&15){
-			const int incr = IS_8x8DCT(mb_type) ? 4 : 1;
-			for(i=0; i<16; i+=incr){
-				int nnz = mb->non_zero_count_cache[ scan8[i] ];
-				if(nnz){
-					if(nnz==1 && mb->mb[i*16])
-						idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
-					else
-						idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
-				}
-			}
-		}
-	}
-
-	if(mb->cbp&0x30){
-		uint8_t *dest[2] = {dest_cb, dest_cr};
-		chroma_dc_dequant_idct_c(mb->mb + 16*16, mb->dequant4_coeff_cb);
-		chroma_dc_dequant_idct_c(mb->mb + 16*16+4*16, mb->dequant4_coeff_cr);
-
-		idct_add = h->dsp.h264_idct_add[1];
-		idct_dc_add = ff_idct_dc_add;
-		for(i=16; i<16+8; i++){
-			if(mb->non_zero_count_cache[ scan8[i] ])
-				idct_add   (dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c);
-			else if(mb->mb[i*16])
-				idct_dc_add(dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c);
-		}
-	}
-
-	// save unfiltered borders
-	backup_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c);
-	if (mb->deblock_mb){
-		filter_mb( h, dest_y, dest_cb, dest_cr, stride_y, stride_c);
-	}
-
-	if (mb_y < s->mb_height-1){
-		if(mb_x>0){
-			send_top_borders(h, mb_x-1, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c);
-		}
-		if (mb_x == s->mb_width-1){
-			send_top_borders(h, mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c);
-		}
-	}
-	update_tgt_spe_dep(h, 0);
-
-	if (h->blocking){
-		if (mb_x>0){			
-			copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c);
-			wait_dma_id(MBD_pic);
-		}
-		if (mb_x == s->mb_width-1){			
-			copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c);
-			wait_dma_id(MBD_pic);
-		}
-		
-	}else{
-		if (mb_x>0){
-			wait_dma_id(MBD_pic);
-			copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c);
-		}
-		if (mb_x == s->mb_width-1){
-			wait_dma_id(MBD_pic);
-			copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c);
-		}
-	}
-
-	if (mb_x < s->mb_width)
-		shift_left(mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c);
-	
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2009 TUDelft 
- * 
- * Cell Parallel SPU - 2DWave Macroblock Decoding. 
- */
-
-/**
- * @file libavcodec/cell/spu/h264_main_spu.c
- * Cell Parallel SPU - 2DWave Macroblock Decoding
- * @author C C Chi <c.c.chi@student.tudelft.nl>
- * 
- * SIMD kernels 
- * H.264/AVC motion compensation
- * @author Mauricio Alvarez <alvarez@ac.upc.edu>
- * @author Albert Paradis <apar7632@hotmail.com>
- */ 
-
-#ifndef H264_DECODE_MB_SPU_H
-#define H264_DECODE_MB_SPU_H
-
-#define CELL_SPE
-#include "libavcodec/avcodec.h"
-#include "types_spu.h"
-#include "h264_types_spu.h"
-#include "h264_mc_spu.h"
-#include "h264_dma.h"
-#include "dsputil_spu.h"
-#include "h264_intra_spu.h"
-
-/**
- * H264Context
- */
-typedef struct H264Context_spu{
-	DECLARE_ALIGNED_16(H264spe, spe);		// contains simple type parameters that doesn't change
-    DECLARE_ALIGNED_16(H264Mb, mb_buf[3]);			// contains simple type parameters that changes for macroblock
-    DECLARE_ALIGNED_16(H264slice, slice_buf[2]);	// contains simple type parameters that changes for slice
-	
-	DSPContext_spu dsp;  // struct that contains pointers to mc interpolations functions
-	H264PredContext_spu hpc;  // struct that contains pointers to intra prediction functions
-
-	H264slice *s;
-	int sl_idx;
-	int frames;
-	//mc arg buffer
-	H264mc mc_buf[2];
-	H264mc *mc;		//mc ptr to current decoded mb
-	int mc_idx;
-	int n_mc;		//next mb_id to mc
-	int mb_proc;
-	int mb_total;
-	int curr_line;
-	
-	H264Mb* mb;		//mb ptr to current decoded mb
-	int mb_id;		//next mb_id to dma
-	int mb_dec; 	//mb_buf index - decoded mb
-	int mb_mc;		//mb_buf index - prebuffer motion data
-	int mb_dma;		//mb_buf index - target for dma mb data
-	int next_mb_idx;
-/*// for deblocking filter
-    int edges[2];
-    int start[2]; 
-    int bS[2][4][4];				// dir, edge, bS;
-    int qp[2][4];					// dir, edge;
-    int chroma_qp[2][2][4];			// cb/cr, dir, edge;	
-*/
-	int blocking; 
-}H264Context_spu;
-
-void print_output(H264Context_spu* h, const char* msg);
-void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c);
-void update_tgt_spe_dep(H264Context_spu *h, int end);
-
-// IDCT functions
-void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
-
-void ff_idct_dc_add(uint8_t *dst, DCTELEM *block, int stride);
-void ff_idct8_dc_add(uint8_t *dst, DCTELEM *block, int stride);
-
-void ff_cropTbl_init();
-void add_pixels8_c(uint8_t *pixels, DCTELEM *block, int line_size);
-void add_pixels4_c(uint8_t *pixels, DCTELEM *block, int line_size);
-void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul);
-void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul);
-// Filter functions
-//void calculate_bS_qp(H264Context_spu *h);
-
-// Motion compensation function
-void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc);
-void calc_mc_params(H264Mb *mb, H264mc *mc);
-void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c);
-
-
-// Function to get traces
-void trace_event_SPU(int event, int id);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,332 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 direct mb/block decoding.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-#define CELL_SPE
-#include "libavcodec/avcodec.h"
-#include "dsputil_spu.h"
-#include "h264_tables.h"
-#include "h264_types_spu.h"
-#include "libavutil/common.h"
-#include "libavutil/intreadwrite.h"
-#include "mathops_spu.h"
-#include "rectangle_spu.h"
-
-//#undef NDEBUG
-#include <assert.h>
-static void pred_spatial_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){
-    H264Mb *m = s->m;
-    int b4_stride = hc->b_stride;
-	const int mb_x = m->mb_x;    
-    int mb_type_col[2];
-    const int16_t (*l1mv0)[2], (*l1mv1)[2];
-    const int8_t *l1ref0, *l1ref1;
-    const int is_b8x8 = IS_8X8(*mb_type);
-    unsigned int sub_mb_type= MB_TYPE_L0L1;
-    int i8, i4;
-    int ref[2];
-    int mv[2];
-    int list;
-
-    //assert(h->ref_list[1][0].reference&3);
-
-#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
-
-    /* ref = min(neighbors) */
-    for(list=0; list<2; list++){
-        int left_ref = m->ref_cache[list][scan8[0] - 1];
-        int top_ref  = m->ref_cache[list][scan8[0] - 8];
-        int refc = m->ref_cache[list][scan8[0] - 8 + 4];
-        const int16_t *C= m->mv_cache[list][ scan8[0] - 8 + 4];
-        if(refc == PART_NOT_AVAILABLE){
-            refc = m->ref_cache[list][scan8[0] - 8 - 1];
-            C    = m-> mv_cache[list][scan8[0] - 8 - 1];
-        }
-        ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc);
-        if(ref[list] >= 0){
-            //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
-            const int16_t * const A= m->mv_cache[list][ scan8[0] - 1 ];
-            const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ];
-
-            int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
-            if(match_count > 1){ //most common
-                mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]),
-                                     mid_pred(A[1], B[1], C[1]) );
-            }else {
-                assert(match_count==1);
-                if(left_ref==ref[list]){
-                    mv[list]= AV_RN32A(A);
-                }else if(top_ref==ref[list]){
-                    mv[list]= AV_RN32A(B);
-                }else{
-                    mv[list]= AV_RN32A(C);
-                }
-            }
-        }else{
-            int mask= ~(MB_TYPE_L0 << (2*list));
-            mv[list] = 0;
-            ref[list] = -1;
-            if(!is_b8x8)
-                *mb_type &= mask;
-            sub_mb_type &= mask;
-        }
-    }
-
-    if(ref[0] < 0 && ref[1] < 0){
-        ref[0] = ref[1] = 0;
-        if(!is_b8x8)
-            *mb_type |= MB_TYPE_L0L1;
-        sub_mb_type |= MB_TYPE_L0L1;
-    }
-
-    if(!(is_b8x8|mv[0]|mv[1])){
-        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
-        fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
-        fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
-        fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
-        *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
-        return;
-    }
-
-    mb_type_col[0] =
-    mb_type_col[1] = hc->list1_mb_type[mb_x];
-
-    sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
-    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
-        *mb_type   |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */
-    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
-        *mb_type   |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
-    }else{
-        if(!s->direct_8x8_inference_flag){
-            /* FIXME save sub mb types from previous frames (or derive from MVs)
-            * so we know exactly what block size to use */
-            sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */
-        }
-        *mb_type   |= MB_TYPE_8x8;
-    }
-
-//     l1mv0  = (void *) &hc->list1_motion_val[0][4*mb_x];
-//     l1mv1  = (void *) &hc->list1_motion_val[1][4*mb_x];
-	l1mv0  = (void *) hc->list1_motion_val[0];
-    l1mv1  = (void *) hc->list1_motion_val[1];
-    l1ref0 = &hc->list1_ref_index [0][4*mb_x];
-    l1ref1 = &hc->list1_ref_index [1][4*mb_x];
-//     if(!b8_stride){
-//         if(m->mb_y&1){
-//             l1ref0 += 2;
-//             l1ref1 += 2;
-//             l1mv0  +=  2*b4_stride;
-//             l1mv1  +=  2*b4_stride;
-//         }
-//     }
-
-    if(IS_16X16(*mb_type)){
-        int a,b;
-
-        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
-        fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
-        if(!IS_INTRA(mb_type_col[0]) && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
-            || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
-            ))){
-            a=b=0;
-            if(ref[0] > 0)
-                a= mv[0];
-            if(ref[1] > 0)
-                b= mv[1];
-        }else{
-            a= mv[0];
-            b= mv[1];
-        }
-        fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
-        fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
-    }else{
-        int n=0;
-        for(i8=0; i8<4; i8++){
-            const int x8 = i8&1;
-            const int y8 = i8>>1;
-
-            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
-                continue;
-            m->sub_mb_type[i8] = sub_mb_type;
-
-            fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4);
-            fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4);
-            fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
-            fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
-
-            /* col_zero_flag */
-            if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 ))
-                ){
-                const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1;
-                if(IS_SUB_8X8(sub_mb_type)){
-//                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
-					const int16_t *mv_col = l1mv[x8*3 + y8*3*4];
-                    if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
-                        if(ref[0] == 0)
-                            fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
-                        if(ref[1] == 0)
-                            fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
-                        n+=4;
-                    }
-                }else{
-                    int k=0;
-                    for(i4=0; i4<4; i4++){
-                        //const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
-						const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4];
-                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
-                            if(ref[0] == 0)
-                                AV_ZERO32(m->mv_cache[0][scan8[i8*4+i4]]);
-                            if(ref[1] == 0)
-                                AV_ZERO32(m->mv_cache[1][scan8[i8*4+i4]]);
-                            k++;
-                        }
-                    }
-                    if(!(k&3))
-                        m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8;
-                    n+=k;
-                }
-            }
-        }
-        if(!is_b8x8 && !(n&15)){
-            *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
-        }
-    }
-}
-
-static void pred_temp_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){
-    H264Mb *m = s->m;
-	const int mb_x = m->mb_x;
-    int b4_stride = hc->b_stride;    
-    int mb_type_col[2];
-    const int16_t (*l1mv0)[2], (*l1mv1)[2];
-    const int8_t *l1ref0, *l1ref1;
-    const int is_b8x8 = IS_8X8(*mb_type);
-    unsigned int sub_mb_type;
-    int i8, i4;
-    const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]};
-    const int *dist_scale_factor = s->dist_scale_factor;
-
-    mb_type_col[0] =
-    mb_type_col[1] = hc->list1_mb_type[mb_x];
-
-    sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
-    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
-        *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
-    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
-        *mb_type   |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
-    }else{
-        if(!s->direct_8x8_inference_flag){
-            /* FIXME save sub mb types from previous frames (or derive from MVs)
-            * so we know exactly what block size to use */
-            sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
-        }
-        *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
-    }
-
-//     l1mv0  = (void *) &hc->list1_motion_val[0][4*mb_x];
-//     l1mv1  = (void *) &hc->list1_motion_val[1][4*mb_x];
-	l1mv0  = (void *) hc->list1_motion_val[0];
-    l1mv1  = (void *) hc->list1_motion_val[1];
-    l1ref0 = &hc->list1_ref_index [0][4*mb_x];
-    l1ref1 = &hc->list1_ref_index [1][4*mb_x];
-
-    /* one-to-one mv scaling */
-    if(IS_16X16(*mb_type)){
-        int ref, mv0, mv1;
-
-        fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
-        if(IS_INTRA(mb_type_col[0])){
-            ref=mv0=mv1=0;
-        }else{
-            const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
-            : map_col_to_list0[1][l1ref1[0]];
-            const int scale = dist_scale_factor[ref0];
-            const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
-            int mv_l0[2];
-            mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
-            mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
-            ref= ref0;
-            mv0= pack16to32(mv_l0[0],mv_l0[1]);
-            mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
-        }
-        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
-        fill_rectangle(&m-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
-        fill_rectangle(&m-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
-    }else{
-        for(i8=0; i8<4; i8++){
-            const int x8 = i8&1;
-            const int y8 = i8>>1;
-            int ref0, scale;
-            const int16_t (*l1mv)[2]= l1mv0;
-
-            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
-                continue;
-            m->sub_mb_type[i8] = sub_mb_type;
-            fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
-            if(IS_INTRA(mb_type_col[0])){
-                fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
-                fill_rectangle(&m-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
-                fill_rectangle(&m-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
-                continue;
-            }
-
-            ref0 = l1ref0[i8];
-            if(ref0 >= 0)
-                ref0 = map_col_to_list0[0][ref0 ];
-            else{
-                ref0 = map_col_to_list0[1][l1ref1[i8]];
-                l1mv= l1mv1;
-            }
-            scale = dist_scale_factor[ref0];
-
-            fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
-            if(IS_SUB_8X8(sub_mb_type)){
-//                 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
-				const int16_t *mv_col = l1mv[x8*3 + y8*3*4];
-                int mx = (scale * mv_col[0] + 128) >> 8;
-                int my = (scale * mv_col[1] + 128) >> 8;
-                fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
-                fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
-            }else
-            for(i4=0; i4<4; i4++){
-//                 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
-				const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4];
-                int16_t *mv_l0 = m->mv_cache[0][scan8[i8*4+i4]];
-                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
-                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
-                AV_WN32A(m->mv_cache[1][scan8[i8*4+i4]],
-                    pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]));
-            }
-        }
-    }
-}
-
-void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){
-    if(s->direct_spatial_mv_pred){
-        pred_spatial_direct_motion(hc, s, mb_type);
-    }else{
-        pred_temp_direct_motion(hc, s, mb_type);
-    }
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-#ifndef H264_DIRECT_H
-#define H264_DIRECT_H
-
-#include "h264_types_spu.h"
-
-void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,74 +0,0 @@
-#include <spu_mfcio.h>
-#include "h264_dma.h"
-
-DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]);
-dma_list_elem_t* put_list;
-
-DECLARE_ALIGNED_16(dma_list_elem_t, get_list_buf[16*(4+5 + 2*3)]);
-dma_list_elem_t* get_list;
-
-inline void spu_dma_get(void *ls, unsigned ea, int size, int tag){
-	mfc_get(ls, ea, size, tag, 0, 0);
-}
-
-inline void spu_dma_put(void *ls, unsigned ea, int size, int tag){
-	mfc_put(ls, ea, size, tag, 0, 0);
-}
-
-inline void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag){
-	mfc_putb(ls, ea, size, tag, 0, 0);
-}
-
-// Function that wait to finish a DMA transfer with especific id
-inline void wait_dma_id(int id){
-	spu_writech(MFC_WrTagMask, 1<< id);
-	(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
-}
-
-// Functions to get/put a block from/to main memory
-void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier)
-{
-    unsigned int i = 0;
-    unsigned int listsize;
-    unsigned int ea_low;
-
-	dma_list_elem_t* list = get_list;
-	get_list+=h;
-
-    ea_low=(uint32_t) mfc_ea2l(ea);
-
-    /* Create the list, size of each list id the "width" parameter defined by the user */
-    for ( i=0; i<h; i++ ){
-        list[i].size.all32 = w;
-        list[i].ea_low = ea_low;
-        ea_low += stride;
-    }
-    /* Specify the list size and initiate the list transfer */
-    listsize = h*sizeof(dma_list_elem_t);
-    if (barrier)
-		mfc_getlb(dst, (unsigned)ea, list, listsize, tag, 0, 0);
-	else
-		mfc_getl(dst, (unsigned)ea, list, listsize, tag, 0, 0);
-}
-
-
-void put_dma_list(void *src, void* ea, unsigned int size, unsigned int h, unsigned int stride, unsigned int tag){
-    unsigned int i = 0;
-    unsigned int listsize;
-    unsigned int ea_low;
-
-	dma_list_elem_t* list = put_list;
-	put_list+=h;
-
-	ea_low=(uint32_t) mfc_ea2l(ea);
-
-    /* Create the list, size of each list id the "width" parameter defined by the user */
-    for ( i=0; i<h; i++ ) {
-        list[i].size.all32 = size;
-        list[i].ea_low = ea_low;
-        ea_low += stride;
-    }
-    /* Specify the list size and initiate the list transfer */
-    listsize = h*sizeof(dma_list_elem_t);
-	mfc_putl(src, (unsigned) ea, list, listsize, tag, 0, 0);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,59 +0,0 @@
-#ifndef H264_DMA_H
-#define H264_DMA_H
-
-#include "libavutil/mem.h"
-
-typedef struct dma_list_elem {
-	union {
-		unsigned int all32;
-		struct {
-		unsigned int stall    : 1;
-		unsigned int reserved : 15;
-		unsigned int nbytes   : 16;
-		} bits;
-	} size;
-	uint64_t ea_low : 32;
-}dma_list_elem_t;
-
-extern DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]);
-extern dma_list_elem_t* put_list;
-
-extern DECLARE_ALIGNED_16(dma_list_elem_t, get_list_buf[16*(4+5 + 2*3)]);
-extern dma_list_elem_t* get_list;
-
-enum{
-	MBD_slice=1,
-	MBD_buf1,
-	MBD_buf2,
-	MBD_buf3,
-	MBD_put,
-	MBD_pic,
-	MBD_mc_buf1,
-	MBD_mc_buf2
-};
-
-enum{
-	ED_spe=1,
-	ED_slice,
-	ED_raw,
-	ED_get,
-	ED_get2,
-	ED_get_mv,
-	ED_put,
-	ED_putmb0,
-	ED_putmb1,
-};
-
-// Functions to get/put a block from/to main memory
-void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier);
-void put_dma_list(void *src, void* ea, unsigned int size, unsigned int h, unsigned int stride, unsigned int tag);
-
-//Functions to do a dma transfer for 32-bit
-void spu_dma_get(void *ls, unsigned ea, int size, int tag);
-void spu_dma_put(void *ls, unsigned ea, int size, int tag);
-void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag);
-
-// Function that wait to finish a DMA transfer with especific id
-void wait_dma_id(int id);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_filter_spu_vec.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_filter_spu_vec.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,650 +0,0 @@
-/*
- * Copyright (c) 2009 TUDelft 
- * 
- * Cell Parallel SPU - 2DWave Macroblock Decoding. 
- */
-
-/**
- * @file libavcodec/cell/spu/h264_main_spu.c
- * Cell Parallel SPU - 2DWave Macroblock Decoding
- * @author C C Chi <c.c.chi@student.tudelft.nl>
- * 
- * SIMD kernels 
- * H.264/AVC motion compensation
- * @author Mauricio Alvarez <alvarez@ac.upc.edu>
- * @author Albert Paradis <apar7632@hotmail.com>
- */ 
-
-
-#include <stdio.h>
-#include <spu_mfcio.h>
-#include <spu_intrinsics.h>
-
-#include "h264_filter_spu.h"
-#include "h264_decode_mb_spu.h"
-// To use scan8 table
-#include "h264_mc_spu.h"
-
-
-int get_chroma_qp(H264Context_spu *h, int t, int qscale){
-    return h->slice.chroma_qp_table[t][qscale];
-}
-
-static inline int clip(int a, int amin, int amax){
-    if (a < amin)
-        return amin;
-    else if (a > amax)
-        return amax;
-    else
-        return a;
-}
-
-static inline vsint16_t clip_altivec(vsint16_t a, vsint16_t amin, vsint16_t amax){
-    vector unsigned short min_mask,max_mask;
-    min_mask = spu_cmpgt(amin, a);
-    max_mask = spu_cmpgt(a, amax);
-
-    return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask);
-}
-
-static inline vsint16_t clip_uint8_altivec(vsint16_t a){
-    const vsint16_t amax = {255,255,255,255,255,255,255,255};
-    const vsint16_t amin = {0, 0, 0, 0, 0, 0, 0, 0};
-    vector unsigned short min_mask,max_mask;
-    min_mask = spu_cmpgt(amin, a);
-    max_mask = spu_cmpgt(a, amax);
-
-    return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask);
-}
-
-static  inline void h264_loop_filter_chroma(vsint16_t *pix, int alpha, int beta, int8_t *tc0){
-
-    short a = (short) tc0[0];
-    short b = (short) tc0[1];
-    short c = (short) tc0[2];
-    short d = (short) tc0[3];
-    const vsint16_t vec_tc0 = {a,a,b,b,c,c,d,d};
-    const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0};
-    vector unsigned short mask_B0;
-
-    mask_B0 = spu_cmpgt(vec_v0, vec_tc0);
-
-    const vsint16_t p0 = pix[-1];
-    const vsint16_t p1 = pix[-2];
-    const vsint16_t q0 = pix[0];
-    const vsint16_t q1 = pix[1];
-
-    const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha};
-    const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta};
-    const vsint16_t v_2 = {2,2,2,2,2,2,2,2};
-    const vuint16_t v_3 = {3,3,3,3,3,3,3,3};
-    const vsint16_t v_4 = {4,4,4,4,4,4,4,4};
-
-    vsint16_t rp0;
-    vsint16_t rq0;
-    vsint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0;
-    vector unsigned short mask_B1, mask_tmp;
-    vsint16_t i_delta;
-
-    abs_p0mq0 = (vector signed short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
-    abs_p1mp0 = (vector signed short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
-    abs_q1mq0 = (vector signed short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
-
-    mask_B1  = spu_cmpgt(v_alpha, abs_p0mq0);
-    mask_tmp = spu_cmpgt(v_beta, abs_p1mp0);
-    mask_B1  = spu_and(mask_B1, mask_tmp);
-    mask_tmp = spu_cmpgt( v_beta, abs_q1mq0);
-    mask_B1  = spu_and(mask_B1, mask_tmp);
-
-
-    i_delta = clip_altivec(spu_rlmaska(spu_add(spu_sl(spu_sub(q0,p0 ), (vuint16_t)v_2), spu_add(spu_sub(p1,q1),v_4)), (vsint16_t)-v_3), -vec_tc0, vec_tc0);
-
-    rp0 = clip_uint8_altivec( spu_add(p0,i_delta));
-    rq0 = clip_uint8_altivec( spu_sub(q0,i_delta));
-
-    pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0);
-    pix[0]  = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0);
-}
-
-static void h264_v_loop_filter_luma_c(vsint16_t *pix, int alpha, int beta, int8_t *tc0, int inc_low2high){
-
-    short a = (short) tc0[0 + inc_low2high];
-    short b = (short) tc0[1 + inc_low2high];
-    const vsint16_t vec_tc0 = {a,a,a,a,b,b,b,b};
-    const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0};
-    vector unsigned short mask_B0;
-
-    mask_B0 = spu_cmpgt(vec_v0, vec_tc0);
-    const vsint16_t p0 = pix[-1];
-    const vsint16_t p1 = pix[-2];
-    const vsint16_t p2 = pix[-3];
-    const vsint16_t q0 = pix[0];
-    const vsint16_t q1 = pix[1];
-    const vsint16_t q2 = pix[2];
-
-    const vuint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha};
-    const vuint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta};
-
-    const vuint16_t v_1 = {1,1,1,1,1,1,1,1};
-    const vuint16_t v_2 = {2,2,2,2,2,2,2,2};
-    const vuint16_t v_3 = {3,3,3,3,3,3,3,3};
-    const vsint16_t v_4 = {4,4,4,4,4,4,4,4};
-
-    vsint16_t rp0, rp1;
-    vsint16_t rq0, rq1;
-    vsint16_t tc0_B2P, tc0_B2Q, rtc0;
-    vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0;
-    vector unsigned short mask_B1, mask_B2P, mask_B2Q, mask_tmp;
-    vsint16_t i_delta, i_delta2;
-
-    abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
-    abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
-    abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
-    abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0);
-    abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0);
-
-    mask_B1  = spu_cmpgt(v_alpha, abs_p0mq0);
-    mask_tmp = spu_cmpgt(v_beta, abs_p1mp0);
-    mask_B1  = spu_and(mask_B1, mask_tmp);
-    mask_tmp = spu_cmpgt( v_beta, abs_q1mq0);
-    mask_B1  = spu_and(mask_B1, mask_tmp);
-
-    mask_B2P = spu_cmpgt(v_beta, abs_p2mp0);
-    mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0);
-
-    rp1 = spu_add(p1, clip_altivec(spu_sub(spu_rlmaska(spu_add(p2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), p1), -vec_tc0, vec_tc0 ));
-    rq1 = spu_add(q1, clip_altivec(spu_sub(spu_rlmaska(spu_add(q2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), q1), -vec_tc0, vec_tc0 ));
-
-    tc0_B2P = spu_add(vec_tc0, (vsint16_t) v_1);
-    tc0_B2P = spu_sel(vec_tc0, tc0_B2P, mask_B2P);
-
-    tc0_B2Q = spu_add(tc0_B2P, (vsint16_t) v_1);
-    rtc0    = spu_sel(tc0_B2P, tc0_B2Q, mask_B2Q);
-    i_delta2 = spu_add(spu_sub(p1,q1),v_4);
-    i_delta = spu_sl(spu_sub(q0,p0 ), v_2);
-    i_delta = spu_add(i_delta,i_delta2 );
-    i_delta = spu_rlmaska(i_delta, (vsint16_t)-v_3);
-    i_delta = clip_altivec(i_delta, -rtc0, rtc0);
-
-    rp0 = clip_uint8_altivec( spu_add(p0,i_delta));    /* p0' */
-    rq0 = clip_uint8_altivec( spu_sub(q0,i_delta));    /* q0' */
-
-    pix[-2] = spu_sel(spu_sel(p1,spu_sel(p1,rp1,mask_B2P) ,mask_B1), p1,mask_B0);
-    pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0);
-    pix[0]  = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0);
-    pix[1]  = spu_sel(spu_sel(q1,spu_sel(q1,rq1,mask_B2Q) ,mask_B1), q1,mask_B0);
-}
-
-
-
-static inline void h264_loop_filter_chroma_intra(vsint16_t *pix, int alpha, int beta){
-
-    const vuint16_t p0 = (vuint16_t) pix[-1];
-    const vuint16_t p1 = (vuint16_t) pix[-2];
-    const vuint16_t q0 = (vuint16_t) pix[0];
-    const vuint16_t q1 = (vuint16_t) pix[1];
-
-    const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha};
-    const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta};
-    const vuint16_t v_2 = {2,2,2,2,2,2,2,2};
-
-    vuint16_t rp0;
-    vuint16_t rq0;
-    vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0;
-    vector unsigned short mask_B0, mask_tmp;
-
-    abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
-    abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
-    abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
-
-    mask_B0  = spu_cmpgt(v_alpha, (vsint16_t)abs_p0mq0);
-    mask_tmp = spu_cmpgt(v_beta, (vsint16_t)abs_p1mp0);
-    mask_B0  = spu_and(mask_B0, mask_tmp);
-    mask_tmp = spu_cmpgt( v_beta, (vsint16_t)abs_q1mq0);
-    mask_B0  = spu_and(mask_B0, mask_tmp);
-
-    rp0 = spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2;
-    rp0 = spu_rlmaska(rp0, (vsint16_t)-v_2);
-    rq0 = spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2;
-    rq0 = spu_rlmaska(rq0, (vsint16_t)-v_2);
-
-    pix[-1] = (vsint16_t) spu_sel(p0, rp0, mask_B0);
-    pix[0]  = (vsint16_t) spu_sel(q0, rq0, mask_B0);
-}
-int slice_alpha_c0_offset;
-int slice_beta_offset;
-static void filter_mb_edgecv(vsint16_t *pix, int bS[4], int qp ) {
-    int i;	
-    const int index_a = qp + slice_alpha_c0_offset;
-    const int alpha = (alpha_table+52)[index_a];
-    const int beta  = (beta_table+52)[qp + slice_beta_offset];
-
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-        for(i=0; i<4; i++)
-            tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
-        h264_loop_filter_chroma(pix, alpha, beta, tc);
-    } else {
-        h264_loop_filter_chroma_intra(pix, alpha, beta);
-    }
-}
-
-static void filter_mb_edgeh(vsint16_t *pix, int bS[4], int qp, int inc_low2high ) {
-    int i;
-    const int index_a = qp + slice_alpha_c0_offset;
-    const int alpha = (alpha_table+52)[index_a];
-    const int beta  = (beta_table+52)[qp + slice_beta_offset];
-
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-        for(i=0; i<4; i++)
-            tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
-        h264_v_loop_filter_luma_c(pix, alpha, beta, tc, inc_low2high);
-    } else {
-
-        const vuint16_t p0 = (vuint16_t) pix[-1];
-        const vuint16_t p1 = (vuint16_t) pix[-2];
-        const vuint16_t p2 = (vuint16_t) pix[-3];
-        const vuint16_t p3 = (vuint16_t) pix[-4];
-        const vuint16_t q0 = (vuint16_t) pix[0];
-        const vuint16_t q1 = (vuint16_t) pix[1];
-        const vuint16_t q2 = (vuint16_t) pix[2];
-        const vuint16_t q3 = (vuint16_t) pix[3];
-
-    	const vuint16_t v_alpha = {(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha};
-    	const vuint16_t v_beta = {(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta};
-    	const vuint16_t v_2 = {2,2,2,2,2,2,2,2};
-    	const vuint16_t v_3 = {3,3,3,3,3,3,3,3};
-    	const vsint16_t v_4 = {4,4,4,4,4,4,4,4};
-
-        vuint16_t rp0_B1f, rp0_B2t, rp0_B2f, rp1_B2t, rp2_B2t;
-        vuint16_t rq0_B1f, rq0_B2t, rq0_B2f, rq1_B2t, rq2_B2t;
-        vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0;
-        vuint16_t v_alpha_2 = spu_rlmaska(v_alpha, (vsint16_t)-v_2);
-        vector unsigned short mask_B0, mask_B1, mask_B2P, mask_B2Q, mask_tmp;
-
-        v_alpha_2 = spu_add(v_alpha_2, v_2);
-
-	abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
-    	abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
-    	abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
-        abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0);
-        abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0);
-
-	mask_B0  = spu_cmpgt(v_alpha, abs_p0mq0);
-	mask_tmp = spu_cmpgt(v_beta, abs_p1mp0);
-	mask_B0  = spu_and(mask_B0, mask_tmp);
-	mask_tmp = spu_cmpgt( v_beta, abs_q1mq0);
-	mask_B0  = spu_and(mask_B0, mask_tmp);
-
-        mask_B1  = spu_cmpgt(v_alpha_2, abs_p0mq0);
-        mask_B2P = spu_cmpgt(v_beta,abs_p2mp0);
-        mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0);
-
-        rp0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p2,p1),spu_add(p1,p0)),spu_add(spu_add(p0,q0),spu_add(q0,q1))),(vuint16_t)v_4),(vsint16_t) -v_3);
-        		//( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
-        rp1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p2,p1),spu_add(q0,p0)),v_2),(vsint16_t)-v_2);//( p2 + p1 + p0 + q0 + 2 ) >> 2;
-        rp2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p3,p3),spu_add(p2,p2)),spu_add(spu_add(p2,p1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3);
-        		//( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
-        rq0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p1,p0),spu_add(p0,q0)),spu_add(spu_add(q0,q1),spu_add(q1,q2))),(vuint16_t)v_4),(vsint16_t)-v_3);
-
-        		//( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
-        rq1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p0,q0),spu_add(q1,q2)),v_2),(vsint16_t)-v_2);//( p0 + q0 + q1 + q2 + 2 ) >> 2;
-        rq2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(q3,q3),spu_add(q2,q2)),spu_add(spu_add(q2,q1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3);
-        		//( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
-        rp0_B1f =
-        rp0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2),(vsint16_t)-v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2;
-        rq0_B1f =
-        rq0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2),(vsint16_t)-v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2;
-
-        pix[-1] = (vsint16_t) spu_sel(p0, spu_sel(rp0_B1f, spu_sel(rp0_B2f, rp0_B2t, mask_B2P), mask_B1), mask_B0);
-        pix[-2] = (vsint16_t) spu_sel(p1, spu_sel(p1, spu_sel(p1, rp1_B2t, mask_B2P), mask_B1), mask_B0);
-        pix[-3] = (vsint16_t) spu_sel(p2, spu_sel(p2, spu_sel(p2, rp2_B2t, mask_B2P), mask_B1), mask_B0);
-        pix[0] = (vsint16_t) spu_sel(q0, spu_sel(rq0_B1f, spu_sel(rq0_B2f, rq0_B2t, mask_B2Q), mask_B1), mask_B0);
-        pix[1] = (vsint16_t) spu_sel(q1, spu_sel(q1, spu_sel(q1, rq1_B2t,mask_B2Q), mask_B1), mask_B0);
-        pix[2] = (vsint16_t) spu_sel(q2, spu_sel(q2, spu_sel(q2, rq2_B2t,mask_B2Q), mask_B1), mask_B0);
-    }
-}
-
-// This function gets bS and qp for luma and chroma before the filter
-void calculate_bS_qp(H264Context_spu *h){
-	H264mb* mb = &h->mb;
-	H264slice* slice = h->slice;
-    int dir;
-    const int mvy_limit = 4;
-    /* FIXME: A given frame may occupy more than one position in
-     * the reference list. So ref2frm should be populated with
-     * frame numbers, not indices. */
-
-	int (*ref2frm)[64] = slice->ref2frm;
-	int mb_x = mb->mb_x;
-	int mb_y = mb->mb_y;
-	int mb_type =mb->mb_type;
-    /* dir : 0 -> vertical edge, 1 -> horizontal edge */
-    for( dir = 0; dir < 2; dir++ ){
-        int edge;
-		const int mbm_type = dir == 0 ? mb->mb_type_xy_n1 : mb->mb_type_top;
-        const int8_t qscale_mbm = dir == 0 ? mb->qscale_mbxy_n1 : mb->qscale_mbxy_top;
-
-        // how often to recheck mv-based bS when iterating between edges
-        const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :(mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
-        // how often to recheck mv-based bS when iterating along each edge
-        const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
-
-		h->edges[dir] = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
-
-		if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0))
-			h->start[dir] =1;
-		else
-			h->start[dir] =0;
-
-        /* Calculate bS */
-        for( edge = h->start[dir]; edge < h->edges[dir]; edge++ ) {
-            /* mbn_xy: neighbor macroblock */
-            const int mbn_type = edge > 0 ? mb_type : mbm_type;
-            const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm;
-			int* bS = h->bS[dir][edge];
-
-            if( (edge&1) && IS_8x8DCT(mb_type) ){
-                bS[0] = bS[1] = bS[2] = bS[3] = 0; //extra code due to decoupling
-                continue;
-            }
-            if( IS_INTRA(mb_type) ||
-                IS_INTRA(mbn_type) ) {
-                int value;
-                if (edge == 0) {
-					value = 4;
-				} else {
-					value = 3;
-				}
-                bS[0] = bS[1] = bS[2] = bS[3] = value;
-            } else {
-                int i, l;
-                int mv_done;
-
-                if( edge & mask_edge ) {
-					bS[0] = bS[1] = bS[2] = bS[3] = 0;
-                    mv_done = 1;
-                }
-                else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
-                    int b_idx= 8 + 4 + edge * (dir ? 8:1);
-                    int bn_idx= b_idx - (dir ? 8:1);
-                    int v = 0;
-
-                    for( l = 0; !v && l < 1 + (slice->slice_type_nos == FF_B_TYPE); l++ ) {
-                        v |= ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] ||
-                             FFABS(mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
-                             FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit;
-                    }
-                    bS[0] = bS[1] = bS[2] = bS[3] = v;
-
-					mv_done = 1;
-                }
-                else
-                    mv_done = 0;
-
-                for( i = 0; i < 4; i++ ) {
-                    int x = dir == 0 ? edge : i;
-                    int y = dir == 0 ? i    : edge;
-                    int b_idx= 8 + 4 + x + 8*y;
-                    int bn_idx= b_idx - (dir ? 8:1);
-
-                    if( mb->non_zero_count_cache[b_idx] != 0 ||
-                        mb->non_zero_count_cache[bn_idx] != 0 ) {
-                        bS[i] = 2;
-                    }
-                    else if(!mv_done)
-                    {
-                        bS[i] = 0;
-                        for( l = 0; l < 1 + (slice->slice_type == B_TYPE); l++ ) {
-                            if( ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] ||
-                                FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
-                                FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
-                                bS[i] = 1;
-                                break;
-                            }
-                        }
-                    }
-                }
-
-                if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
-                    continue;
-            }
-
-            /* Filter edge */
-            // Do not use s->qscale as luma quantizer because it has not the same
-            // value in IPCM macroblocks.
-            h->qp[dir][edge] = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1;
-            h->chroma_qp[0][dir][edge] = ( mb->chroma_qp[0] + get_chroma_qp(h, 0, qscale_mbn_xy ) + 1 ) >> 1;
-
-			h->chroma_qp[1][dir][edge] = ( mb->chroma_qp[1] + get_chroma_qp(h, 1, qscale_mbn_xy ) + 1 ) >> 1;
-        }
-		slice_alpha_c0_offset=slice->slice_alpha_c0_offset;
-		slice_beta_offset= slice->slice_beta_offset;
-    }
-}
-
-
-#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7,merge_h,merge_l) \
-    b0 = spu_shuffle( a0, a4, merge_h); \
-    b1 = spu_shuffle( a0, a4, merge_l ); \
-    b2 = spu_shuffle( a1, a5, merge_h ); \
-    b3 = spu_shuffle( a1, a5, merge_l ); \
-    b4 = spu_shuffle( a2, a6, merge_h ); \
-    b5 = spu_shuffle( a2, a6, merge_l ); \
-    b6 = spu_shuffle( a3, a7, merge_h ); \
-    b7 = spu_shuffle( a3, a7, merge_l ); \
-    a0 = spu_shuffle( b0, b4, merge_h ); \
-    a1 = spu_shuffle( b0, b4, merge_l ); \
-    a2 = spu_shuffle( b1, b5, merge_h ); \
-    a3 = spu_shuffle( b1, b5, merge_l ); \
-    a4 = spu_shuffle( b2, b6, merge_h ); \
-    a5 = spu_shuffle( b2, b6, merge_l); \
-    a6 = spu_shuffle( b3, b7, merge_h ); \
-    a7 = spu_shuffle( b3, b7, merge_l ); \
-    b0 = spu_shuffle( a0, a4, merge_h ); \
-    b1 = spu_shuffle( a0, a4, merge_l ); \
-    b2 = spu_shuffle( a1, a5, merge_h ); \
-    b3 = spu_shuffle( a1, a5, merge_l); \
-    b4 = spu_shuffle( a2, a6, merge_h ); \
-    b5 = spu_shuffle( a2, a6, merge_l ); \
-    b6 = spu_shuffle( a3, a7, merge_h ); \
-    b7 = spu_shuffle( a3, a7, merge_l )
-
-void filter_mb_spu(vsint16_t *img_y, vsint16_t *img_cb, vsint16_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int edges[2], int bS[2][4][4], int qp[2][4], int chroma_qp[2][2][4], int start[2]){
-
-    int dir,x;
-    vsint16_t o_vec_img_y[(16+8)*2];
-    vsint16_t t_vec_img_y[(16+8)*2];
-    vsint16_t *vec_img_y_o = o_vec_img_y;
-    vsint16_t *vec_img_y_t = t_vec_img_y;
-
-    vsint16_t o_vec_img_cb[8+8+4];
-    vsint16_t t_vec_img_cb[8+8];
-    vsint16_t *vec_img_cb_o = &o_vec_img_cb[2];
-    vsint16_t *vec_img_cb_t = t_vec_img_cb;
-
-    vsint16_t o_vec_img_cr[8+8+4];
-    vsint16_t t_vec_img_cr[8+8];
-    vsint16_t *vec_img_cr_o = &o_vec_img_cr[2];
-    vsint16_t *vec_img_cr_t = t_vec_img_cr;
-
-    vuint8_t *pvec_tmp;
-
-    const vuint8_t patt_high = {16,  0, 17,  1, 18,  2, 19,  3, 20,  4, 21,  5, 22,  6, 23,  7};
-    const vuint8_t patt_low  = {16,  8, 17,  9, 18, 10, 19, 11, 20, 12, 21, 13, 22, 14, 23, 15};
-    const vuint8_t patt_unpack={ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
-    const vuint8_t patt_pack_hw={0,  1,  2,  3,  4,  5,  6,  7, 17, 19, 21, 23, 25, 27, 29, 31};
-    const vuint8_t patt_pack_chroma_aligned={0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F,
-                                             0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F};
-    const vuint8_t patt_pack_chroma_unaligned={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                                               0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F};
-    const vuint8_t v_0  	   = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-    const vuint8_t mergehu16 = {0x00,0x01,0x10,0x11,0x02,0x03,0x12,0x13,0x04,0x05,0x14,0x15,0x06,0x07,0x16,0x17};
-    const vuint8_t mergelu16 = {0x08,0x09,0x18,0x19,0x0A,0x0B,0x1A,0x1B,0x0C,0x0D,0x1C,0x1D,0x0E,0x0F,0x1E,0x1F};
-    vuint8_t store_chroma, store_chroma_n1, load_chroma, load_chroma_n1;
-    int mb_xy_n1;
-    const int unalign_chroma = (unsigned int) img_cb & 15;
-
-    if(unalign_chroma==0){
-        load_chroma = patt_high;
-        load_chroma_n1 = patt_low;  // for load chroma mb_x-1
-        store_chroma = patt_pack_chroma_aligned;
-        store_chroma_n1 = patt_pack_chroma_unaligned;  // for store chroma mb_x-1
-        mb_xy_n1 = 1;   //  si no hay desalineamineto se necesita el bloque anterior para filtrar horizontalmente
-    }
-    else{
-        load_chroma = patt_low;
-        load_chroma_n1 = patt_high; // for load mb_x-1
-        store_chroma = patt_pack_chroma_unaligned;
-        store_chroma_n1 = patt_pack_chroma_aligned;    // for store chroma mb_x-1
-        mb_xy_n1 = 0;   //  si hay desalineamineto 8 no se necesita el bloque anterior
-    }
-
-    /* dir : 0 -> vertical edge, 1 -> horizontal edge */
-
-    // LOAD MB_X -1
-
-    for (x = 0; x < 16; x++){  //Unpack Memory to 8 positions vector
-        vec_img_y_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize - 1], v_0 , patt_low);
-    }
-
-    for (x = 0; x < 8; x++){  //Unpack Memory to 8 positions vector
-	vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cb[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1);
-	vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cr[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1);
-    }
-
-    VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16);
-
-    vec_img_y_t  = &vec_img_y_t[8];
-    vec_img_y_o  = &vec_img_y_o[8];
-    vec_img_cb_t = &vec_img_cb_t[8];
-    vec_img_cb_o = &vec_img_cb_o[10];
-    vec_img_cr_t = &vec_img_cr_t[8];
-    vec_img_cr_o = &vec_img_cr_o[10];
-
-    //LOAD CURRENT MB
-    for (x = 0; x < 16; x++){  //Unpack Memory to 8 positions vector
-        pvec_tmp  	  = (vuint8_t *) &img_y[x*linesize];
-	vec_img_y_o[x]    = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_high);
-	vec_img_y_o[x+24] = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_low);
-    }
-
-    for (x = 0; x < 8; x++){  //Unpack Memory to 8 positions vector
-	vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma);
-	vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma);
-    }
-
-    //TRANSPOSE MATRIX
-
-    VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31], vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39], vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16);
-
-    //PROCESS
-    dir = 0;
-    {
-        int edge;
-        for( edge = start[dir]; edge < edges[dir]; edge++ ) {
-            if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0)
-            {
-            	filter_mb_edgeh( &vec_img_y_t[4*edge   ], bS[dir][edge], qp[dir][edge],0);//low
-            	filter_mb_edgeh( &vec_img_y_t[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high
-
-                if( (edge&1) == 0 ) {
-                    filter_mb_edgecv( &vec_img_cb_t[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] );
-                    filter_mb_edgecv( &vec_img_cr_t[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] );
-                }
-            }
-        }
-    }
-
-    //SAVE MB_X -1 RESULTS
-
-    VEC_TRANSPOSE_8(vec_img_y_t[-8], vec_img_y_t[-7], vec_img_y_t[-6], vec_img_y_t[-5], vec_img_y_t[-4], vec_img_y_t[-3], vec_img_y_t[-2], vec_img_y_t[-1], vec_img_y_o[-8], vec_img_y_o[-7], vec_img_y_o[-6], vec_img_y_o[-5], vec_img_y_o[-4], vec_img_y_o[-3], vec_img_y_o[-2], vec_img_y_o[-1],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_y_t[16], vec_img_y_t[17], vec_img_y_t[18], vec_img_y_t[19], vec_img_y_t[20], vec_img_y_t[21], vec_img_y_t[22], vec_img_y_t[23], vec_img_y_o[16], vec_img_y_o[17], vec_img_y_o[18], vec_img_y_o[19], vec_img_y_o[20], vec_img_y_o[21], vec_img_y_o[22], vec_img_y_o[23],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_cb_t[ -8], vec_img_cb_t[-7], vec_img_cb_t[-6], vec_img_cb_t[-5], vec_img_cb_t[-4], vec_img_cb_t[-3], vec_img_cb_t[-2], vec_img_cb_t[-1], vec_img_cb_o[-10], vec_img_cb_o[-9], vec_img_cb_o[-8], vec_img_cb_o[-7], vec_img_cb_o[-6], vec_img_cb_o[-5], vec_img_cb_o[-4], vec_img_cb_o[-3],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_cr_t[ -8], vec_img_cr_t[-7], vec_img_cr_t[-6], vec_img_cr_t[-5], vec_img_cr_t[-4], vec_img_cr_t[-3], vec_img_cr_t[-2], vec_img_cr_t[-1], vec_img_cr_o[-10], vec_img_cr_o[-9], vec_img_cr_o[-8], vec_img_cr_o[-7], vec_img_cr_o[-6], vec_img_cr_o[-5], vec_img_cr_o[-4], vec_img_cr_o[-3],mergehu16, mergelu16);
-
-    for (x = 0; x < 8; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
-    	img_y[x*linesize - 1] = spu_shuffle(img_y[x*linesize - 1], vec_img_y_o[-8+x], patt_pack_hw);
-    }
-
-    for (x = 0; x < 8; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
-    	img_y[(x+8)*linesize - 1] = spu_shuffle(img_y[(x+8)*linesize - 1], vec_img_y_o[16+x], patt_pack_hw);
-    }
-
-    for (x = 0; x < 8; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
-    	img_cb[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cb[x*uvlinesize - mb_xy_n1], vec_img_cb_o[-10+x], store_chroma_n1);
-    	img_cr[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cr[x*uvlinesize - mb_xy_n1], vec_img_cr_o[-10+x], store_chroma_n1);
-    }
-
-    //TRANSPOSE MATRIX
-
-    VEC_TRANSPOSE_8(vec_img_y_t[ 0], vec_img_y_t[ 1], vec_img_y_t[ 2], vec_img_y_t[ 3], vec_img_y_t[ 4], vec_img_y_t[ 5], vec_img_y_t[ 6], vec_img_y_t[ 7], vec_img_y_o[ 0], vec_img_y_o[ 1], vec_img_y_o[ 2], vec_img_y_o[ 3], vec_img_y_o[ 4], vec_img_y_o[ 5], vec_img_y_o[ 6], vec_img_y_o[ 7],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15], vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31], vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39], vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7], vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7],mergehu16, mergelu16);
-
-    VEC_TRANSPOSE_8(vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7], vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7],mergehu16, mergelu16);
-
-
-    //LOAD MB_Y - 1
-    for (x = -4; x < 0; x++){  //Unpack Memory to 8 positions vector
-	vec_img_y_o[x]    = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_high);
-	vec_img_y_o[x+24] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_low);
-    }
-
-    for (x = -2; x < 0; x++){  //Unpack Memory to 8 positions vector
-	vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma);
-	vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma);
-    }
-
-    //PROCESS
-    dir = 1;
-    {
-        int edge;
-        for( edge = start[dir]; edge < edges[dir]; edge++ ) {
-            if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0)
-            {
-            	filter_mb_edgeh( &vec_img_y_o[4*edge   ], bS[dir][edge], qp[dir][edge],0);//low
-            	filter_mb_edgeh( &vec_img_y_o[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high
-            	if( (edge&1) == 0 ) {
-            	    filter_mb_edgecv( &vec_img_cb_o[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] );
-                    filter_mb_edgecv( &vec_img_cr_o[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] );
-            	}
-            }
-        }
-
-        for (x = -3; x < 16; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
-    	    img_y[x*linesize] = spu_shuffle(vec_img_y_o[x], vec_img_y_o[x+24], patt_unpack);
-        }
-
-        for (x = -1; x < 8; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
-            img_cb[x*uvlinesize] = spu_shuffle(img_cb[x*uvlinesize], vec_img_cb_o[x], store_chroma);
-            img_cr[x*uvlinesize] = spu_shuffle(img_cr[x*uvlinesize], vec_img_cr_o[x], store_chroma);
-        }
-    }
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,408 +0,0 @@
-/*
- * Copyright (c) 2009 TUDelft 
- * 
- * Cell Parallel SPU - Macroblock Decoding.
- */
-
-/**
- * @file libavcodec/cell/spu/h264_main_spu.c
- * Cell Parallel SPU - Macroblock Decoding
- * @author C C Chi <c.c.chi@student.tudelft.nl>
- * 
- * SIMD kernels 
- * H.264/AVC motion compensation
- * @author Mauricio Alvarez <alvarez@ac.upc.edu>
- * @author Albert Paradis <apar7632@hotmail.com>
- */ 
-
-#include <spu_intrinsics.h>
-#include "types_spu.h"
-#include "h264_tables.h"
-#include "h264_idct_spu.h"
-#include "h264_intra_spu.h"
-
-/***********************************************************************
- * ff_h264_idct_add_spu
- ***********************************************************************
- *  h264 idct 4x4 transform with SPU SIMD intrinsics
- *  using the factorized algorithm 
- *  Mauricio Alvarez: alvarez@ac.upc.edu
- *  - DCTELEM* block: transformed coefficients are stored consecutvely in memory, 
- *  - for the 4x4 transform the structure is like that:
- *       || coef_00 | coef_01 || coef_02 | coef_03 ||..||coef_0F||
- *  - Usually the DCTELEM block is declared with an alignment modificator in such a way 
- *    that the  array is 128 bit (16 byte, 8 short) aligned.
- *  - The dst pointer can be unaligned with unaligment as a multiple of 4.
- ***********************************************************************/
-
-// idct_dc
-void ff_idct_dc_add(uint8_t *dst, short *block, int stride){
-    int i, j;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-    int dc = (block[0] + 32) >> 6;
-    for( j = 0; j < 4; j++ ){
-        for( i = 0; i < 4; i++ )
-            dst[i] = cm[ dst[i] + dc ];
-        dst += stride;
-    }
-}
-
-void ff_idct8_dc_add(uint8_t *dst, short *block, int stride){
-    int i, j;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-    int dc = (block[0] + 32) >> 6;
-    for( j = 0; j < 8; j++ ){
-        for( i = 0; i < 8; i++ )
-            dst[i] = cm[ dst[i] + dc ];
-        dst += stride;
-    }
-}
-
-// add without idct
-
-void add_pixels8_c(uint8_t *pixels, short *block, int line_size)
-{
-    int i;
-    for(i=0;i<8;i++) {
-        pixels[0] += block[0];
-        pixels[1] += block[1];
-        pixels[2] += block[2];
-        pixels[3] += block[3];
-        pixels[4] += block[4];
-        pixels[5] += block[5];
-        pixels[6] += block[6];
-        pixels[7] += block[7];
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-void add_pixels4_c(uint8_t *pixels, short *block, int line_size)
-{
-    int i;
-    for(i=0;i<4;i++) {
-        pixels[0] += block[0];
-        pixels[1] += block[1];
-        pixels[2] += block[2];
-        pixels[3] += block[3];
-        pixels += line_size;
-        block += 4;
-    }
-}
-
-void h264_luma_dc_dequant_idct_c(short *block, int qmul){
-	#define stride 16
-	int i;
-	int temp[16]; //FIXME check if this is a good idea
-	static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
-	static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
-
-	for(i=0; i<4; i++){
-		const int offset= y_offset[i];
-		const int z0= block[offset+stride*0] + block[offset+stride*4];
-		const int z1= block[offset+stride*0] - block[offset+stride*4];
-		const int z2= block[offset+stride*1] - block[offset+stride*5];
-		const int z3= block[offset+stride*1] + block[offset+stride*5];
-
-		temp[4*i+0]= z0+z3;
-		temp[4*i+1]= z1+z2;
-		temp[4*i+2]= z1-z2;
-		temp[4*i+3]= z0-z3;
-	}
-
-	for(i=0; i<4; i++){
-		const int offset= x_offset[i];
-		const int z0= temp[4*0+i] + temp[4*2+i];
-		const int z1= temp[4*0+i] - temp[4*2+i];
-		const int z2= temp[4*1+i] - temp[4*3+i];
-		const int z3= temp[4*1+i] + temp[4*3+i];
-
-		block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
-		block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
-		block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
-		block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
-	}
-}
-#undef stride
-
-void chroma_dc_dequant_idct_c(short *block, int qmul){
-	const int stride= 16*2;
-	const int xStride= 16;
-	int a,b,c,d,e;
-
-	a= block[stride*0 + xStride*0];
-	b= block[stride*0 + xStride*1];
-	c= block[stride*1 + xStride*0];
-	d= block[stride*1 + xStride*1];
-
-	e= a-b;
-	a= a+b;
-	b= c-d;
-	c= c+d;
-
-	block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
-	block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
-	block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
-	block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
-}
-
-void h264_idct4_add_spu(uint8_t *dst, short *block, int stride)
-{
-  vsint16_t __vz0, __vz1, __vz2, __vz3; // used as temporal storage in for VEC_1D_DCT
-  vsint16_t va0, va1, va2, va3;
-  vsint16_t vtmp0, vtmp1, vtmp2, vtmp3;
-  vuint16_t sat;
-  vuint8_t va_u8;
-  vsint16_t vdst_ss;
-  vuint8_t dstperm;
-  vuint8_t vdst, vdst_orig, vfdst;
-  const int16_t imax = 255;
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  const int shift_dst = (unsigned int) dst  & 15;
-  const vuint8_t packu16   = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F);
-  const vuint8_t mergehu8  = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17);
-  //for optimized matrix transpose:
-  const vuint8_t tr0 =AVV(0x00,0x01,0x08,0x09,0x10,0x11,0x18,0x19,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
-  const vuint8_t tr1 =AVV(0x02,0x03,0x0A,0x0B,0x12,0x13,0x1A,0x1B,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
-  const vuint8_t tr2 =AVV(0x04,0x05,0x0C,0x0D,0x14,0x15,0x1C,0x1D,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
-  const vuint8_t tr3 =AVV(0x06,0x07,0x0E,0x0F,0x16,0x17,0x1E,0x1F,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
-  const vuint8_t conc =AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17);
-
-  block[0] += 32;  // add 32 as a DC-level for rounding
-
-  //load matrix
-  vtmp0 = *(vsint16_t *)(block);
-  vtmp1 = spu_rlqwbyte(vtmp0,8);
-  vtmp2 = *(vsint16_t *)(block+8);
-  vtmp3 = spu_rlqwbyte(vtmp2,8);
-
-  VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
-
-  //concatenate first two rows of matrix
-  va0=spu_shuffle(va0,va1,conc);
-  //concatenate last two rows of matrix
-  va2=spu_shuffle(va2,va3,conc);
-
-  //do transpose starting from two vectors, storing as four vectors of which the second part is unused
-  vtmp0 = spu_shuffle( va0, va2, tr0);
-  vtmp1 = spu_shuffle( va0, va2, tr1);
-  vtmp2 = spu_shuffle( va0, va2, tr2);
-  vtmp3 = spu_shuffle( va0, va2, tr3);
-
-  VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
-
-  // division by 64
-  va0 = spu_rlmaska(va0,-6);
-  va1 = spu_rlmaska(va1,-6);
-  va2 = spu_rlmaska(va2,-6);
-  va3 = spu_rlmaska(va3,-6);
-
-  switch (shift_dst){
-    case 0: {
-      dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
-                              0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-    } break;
-    case 4: {
-      dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-                              0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-    } break;
-    case 8: {
-      dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-  	                      0x10, 0x11, 0x12, 0x13, 0x0C, 0x0D, 0x0E, 0x0F);
-    } break;
-    case 12: {
-      dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                              0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13);
-    } break;
-    default: {
-      dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
-                              0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-    } break;
-  }
-
-  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm);
-  dst += stride;
-  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm);
-  dst += stride;
-  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm);
-  dst += stride;
-  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm);
-}
-
-void h264_idct8_add_spu(uint8_t *dst, short *block, int stride)
-{
-	vsint16_t va0, va1, va2, va3, va4, va5, va6, va7;
-	vsint16_t vza0, vza1, vza2, vza3, vza4, vza5, vza6, vza7, vzal,vzah;
-	vsint16_t vzb0, vzb1, vzb2, vzb3, vzb4, vzb5, vzb6, vzb7;
-	vsint16_t vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, vtmp6, vtmp7;
-	vuint16_t sat;
-	vuint8_t va_u8;
-	const int block_stride=8;
-	vsint16_t vdst_ss;
-	const int16_t imax = 255;
-	const vsint32_t vzero = spu_splats(0);
-	const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-	vuint8_t vdst, vdst_orig, vfdst;
-	vuint8_t dstperm;
-	const int shift_dst = (unsigned int) dst  & 15;
-	const vuint8_t packu16   = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F);
-	const vuint8_t mergehu8  = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17);
-	const vuint8_t m1        = AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17);
-	const vuint8_t m2        = AVV(0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F);
-	const vuint8_t m3        = AVV(0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x18,0x19,0x1A,0x1B);
-	const vuint8_t m4        = AVV(0x14,0x15,0x16,0x17,0x04,0x05,0x06,0x07,0x1C,0x1D,0x1E,0x1F,0x0C,0x0D,0x0E,0x0F);
-	const vuint8_t m5        = AVV(0x00,0x01,0x10,0x11,0x04,0x05,0x14,0x15,0x08,0x09,0x18,0x19,0x0C,0x0D,0x1C,0x1D);
-	const vuint8_t m6        = AVV(0x12,0x13,0x02,0x03,0x16,0x17,0x06,0x07,0x1A,0x1B,0x0A,0x0B,0x1E,0x1F,0x0E,0x0F);
-
-	block[0] += 32;  // add 32 as a DC-level for rounding
-
-	vtmp0 = *(vsint16_t *)(block);
-	vtmp1 = *(vsint16_t *)(block + block_stride);
-	vtmp2 = *(vsint16_t *)(block + 2*block_stride);
-	vtmp3 = *(vsint16_t *)(block + 3*block_stride);
-	vtmp4 = *(vsint16_t *)(block + 4*block_stride);
-	vtmp5 = *(vsint16_t *)(block + 5*block_stride);
-	vtmp6 = *(vsint16_t *)(block + 6*block_stride);
-	vtmp7 = *(vsint16_t *)(block + 7*block_stride);
-
-	VEC_1D_DCT8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7);
-	VEC_TRANSPOSE_8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7,va0,va1,va2,va3,va4,va5,va6,va7);
-	VEC_1D_DCT8(va0, va1, va2, va3, va4, va5, va6, va7);
-
-	va0 = spu_rlmaska(va0,-6);
-	va1 = spu_rlmaska(va1,-6);
-	va2 = spu_rlmaska(va2,-6);
-	va3 = spu_rlmaska(va3,-6);
-	va4 = spu_rlmaska(va4,-6);
-	va5 = spu_rlmaska(va5,-6);
-	va6 = spu_rlmaska(va6,-6);
-	va7 = spu_rlmaska(va7,-6);
-
-	if (shift_dst==8)
-		dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-				   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
-	else																		    dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
-			0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-
-	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm);
-	dst += stride;
-	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm);
-	dst += stride;
-	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm);
-	dst += stride;
-	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm);
-	dst += stride;
-	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va4,dstperm);
-	dst += stride;
-	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va5,dstperm);
-	dst += stride;
-	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va6,dstperm);
-	dst += stride;
-	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va7,dstperm);
-
-}
-
-/*
-
-void h264_idct4_add_spu(uint8_t *dst, short *block, int stride){
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    block[0] += 32;
-
-    for(i=0; i<4; i++){
-        const int z0=  block[0 + 4*i]     +  block[2 + 4*i];
-        const int z1=  block[0 + 4*i]     -  block[2 + 4*i];
-        const int z2= (block[1 + 4*i]>>1) -  block[3 + 4*i];
-        const int z3=  block[1 + 4*i]     + (block[3 + 4*i]>>1);
-
-        block[0 + 4*i]= z0 + z3;
-        block[1 + 4*i]= z1 + z2;
-        block[2 + 4*i]= z1 - z2;
-        block[3 + 4*i]= z0 - z3;
-    }
-
-    for(i=0; i<4; i++){
-        const int z0=  block[i + 4*0]     +  block[i + 4*2];
-        const int z1=  block[i + 4*0]     -  block[i + 4*2];
-        const int z2= (block[i + 4*1]>>1) -  block[i + 4*3];
-        const int z3=  block[i + 4*1]     + (block[i + 4*3]>>1);
-
-        dst[i + 0*stride]= cm[ dst[i + 0*stride] + ((z0 + z3) >> 6) ];
-        dst[i + 1*stride]= cm[ dst[i + 1*stride] + ((z1 + z2) >> 6) ];
-        dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ];
-        dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ];
-    }
-}
-
-void h264_idct8_add_spu(uint8_t *dst, short *block, int stride){
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    block[0] += 32;
-	
-    for( i = 0; i < 8; i++ )
-    {
-        const int a0 =  block[0+i*8] + block[4+i*8];
-        const int a2 =  block[0+i*8] - block[4+i*8];
-        const int a4 = (block[2+i*8]>>1) - block[6+i*8];
-        const int a6 = (block[6+i*8]>>1) + block[2+i*8];
-
-        const int b0 = a0 + a6;
-        const int b2 = a2 + a4;
-        const int b4 = a2 - a4;
-        const int b6 = a0 - a6;
-
-        const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1);
-        const int a3 =  block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
-        const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
-        const int a7 =  block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
-
-        const int b1 = (a7>>2) + a1;
-        const int b3 =  a3 + (a5>>2);
-        const int b5 = (a3>>2) - a5;
-        const int b7 =  a7 - (a1>>2);
-
-        block[0+i*8] = b0 + b7;
-        block[7+i*8] = b0 - b7;
-        block[1+i*8] = b2 + b5;
-        block[6+i*8] = b2 - b5;
-        block[2+i*8] = b4 + b3;
-        block[5+i*8] = b4 - b3;
-        block[3+i*8] = b6 + b1;
-        block[4+i*8] = b6 - b1;
-    }
-    for( i = 0; i < 8; i++ )
-    {
-        const int a0 =  block[i+0*8] + block[i+4*8];
-        const int a2 =  block[i+0*8] - block[i+4*8];
-        const int a4 = (block[i+2*8]>>1) - block[i+6*8];
-        const int a6 = (block[i+6*8]>>1) + block[i+2*8];
-
-        const int b0 = a0 + a6;
-        const int b2 = a2 + a4;
-        const int b4 = a2 - a4;
-        const int b6 = a0 - a6;
-
-        const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
-        const int a3 =  block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
-        const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
-        const int a7 =  block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
-
-        const int b1 = (a7>>2) + a1;
-        const int b3 =  a3 + (a5>>2);
-        const int b5 = (a3>>2) - a5;
-        const int b7 =  a7 - (a1>>2);
-			
-		dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ];
-		dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ];
-		dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ];
-		dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ];
-		dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ];
-		dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ];
-		dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ];
-		dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
-	}
-}*/
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,141 +0,0 @@
-#ifndef H264_IDCT_SPU_H
-#define H264_IDCT_SPU_H
-
-void h264_idct4_add_spu(uint8_t *dst, short *block, int stride);
-void h264_idct8_add_spu(uint8_t *dst, short *block, int stride);
-
-/***********************************************************************
- * VEC_1D_IDCT
- ***********************************************************************
- * 1-dimensional 4x4 H264 integer DCT inverse transform.
- * Actually source and destination are 8x4. The low elements of the
- * source are discarded and the low elements of the destination mustn't
- * be used. 
- * __vz0-__vz3 registers need to be declared in the caller function
- ***********************************************************************/
-#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)				\
-  /* 1st stage */								\
-  __vz0 = spu_add(vb0,vb2);		/* temp[0] = Y[0] + Y[2] 	*/	\
-  __vz1 = spu_sub(vb0,vb2);		/* temp[1] = Y[0] - Y[2] 	*/	\
-  __vz2 = spu_rlmaska(vb1,-1);							\
-  __vz2 = spu_sub(__vz2,vb3);		/* temp[2] = Y[1].1/2 - Y[3] 	*/	\
-  __vz3 = spu_rlmaska(vb3,-1);							\
-  __vz3 = spu_add(vb1,__vz3);		/* temp[3] = Y[1] + Y[3].1/2 	*/	\
-										\
-  /* 2nd stage: output */							\
-  va0 = spu_add(__vz0,__vz3);		/* x[0] = temp[0] + temp[3] 	*/	\
-  va1 = spu_add(__vz1,__vz2);		/* x[1] = temp[1] + temp[2] 	*/	\
-  va2 = spu_sub(__vz1,__vz2);		/* x[2] = temp[1] - temp[2] 	*/  	\
-  va3 = spu_sub(__vz0,__vz3)		/* x[3] = temp[0] - temp[3] 	*/	
-
-/***********************************************************************
- * VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8
- ***********************************************************************
- * load a vuint8_t vector from a unaligned memory position p
- * Converts the vector to vsint16_t
- * Adds the loaded and converted vector to a defined vector va
- * converts back the result to vuint8_t and store it to memory
- **********************************************************************/
-
-#define VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(p,shift,va,align_dst)	\
-    vdst_orig = *(vuint8_t *) (p);					\
-    vdst = spu_or(spu_slqwbyte(vdst_orig, shift),(vuint8_t) vzero);	\
-    vdst_ss = (vsint16_t) spu_shuffle((vuint8_t)vzero,vdst,mergehu8);	\
-    va = spu_add(va,vdst_ss);						\
-    sat = spu_cmpgt(va,(vsint16_t)vzero);				\
-    va = spu_and(va,(vsint16_t)sat);					\
-    sat = spu_cmpgt(va,vmax);						\
-    va = spu_sel(va,vmax,sat);						\
-    va_u8 = (vuint8_t) spu_shuffle(va,(vsint16_t) vzero,packu16);	\
-    vfdst = spu_shuffle(vdst_orig, va_u8, align_dst);			\
-    *(vuint8_t *) (dst) = vfdst
-
-/***********************************************************************
- * VEC_TRANSPOSE_8
- ***********************************************************************
- * Transposes a 8x8 matrix of s16 vectors
- **********************************************************************/
-#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
-    b0 = spu_shuffle( a0, a4, m1 ); \
-    b1 = spu_shuffle( a1, a5, m1 ); \
-    b2 = spu_shuffle( a2, a6, m1 ); \
-    b3 = spu_shuffle( a3, a7, m1 ); \
-    b4 = spu_shuffle( a4, a0, m2 ); \
-    b5 = spu_shuffle( a5, a1, m2 ); \
-    b6 = spu_shuffle( a6, a2, m2 ); \
-    b7 = spu_shuffle( a7, a3, m2 ); \
-    a0 = spu_shuffle( b0, b2, m3 ); \
-    a1 = spu_shuffle( b1, b3, m3 ); \
-    a2 = spu_shuffle( b2, b0, m4 ); \
-    a3 = spu_shuffle( b3, b1, m4 ); \
-    a4 = spu_shuffle( b4, b6, m3 ); \
-    a5 = spu_shuffle( b5, b7, m3 ); \
-    a6 = spu_shuffle( b6, b4, m4 ); \
-    a7 = spu_shuffle( b7, b5, m4 ); \
-    b0 = spu_shuffle( a0, a1, m5 ); \
-    b1 = spu_shuffle( a1, a0, m6 ); \
-    b2 = spu_shuffle( a2, a3, m5 ); \
-    b3 = spu_shuffle( a3, a2, m6 ); \
-    b4 = spu_shuffle( a4, a5, m5 ); \
-    b5 = spu_shuffle( a5, a4, m6 ); \
-    b6 = spu_shuffle( a6, a7, m5 ); \
-    b7 = spu_shuffle( a7, a6, m6 )
-
-/***********************************************************************
- * VEC_1D_IDCT8
- ***********************************************************************
- * 1-dimensional 8x8 H264 integer DCT inverse transform.
- ***********************************************************************/
-#define VEC_1D_DCT8(vb0,vb1,vb2,vb3,vb4,vb5,vb6,vb7)						\
-  vza0 = spu_add(vb0,vb4);		/* a[0] = Y[0] + Y[4] 	*/				\
-  vza2 = spu_sub(vb0,vb4);		/* a[2] = Y[0] - Y[4]	*/				\
-  vza4 = spu_rlmaska(vb2,-1);									\
-  vza4 = spu_sub(vza4,vb6);		/* a[4] = Y[2]>>1 - Y[6]	*/			\
-  vza6 = spu_rlmaska(vb6,-1	);								\
-  vza6 = spu_add(vb2,vza6);		/* a[6] = Y[2]    + Y[6]>>1	*/			\
-  												\
-  vzb0 = spu_add(vza0,vza6);		/* b[0] = a[0] + a[6]	*/				\
-  vzb2 = spu_add(vza2,vza4);		/* b[2] = a[2] + a[4]	*/				\
-  vzb4 = spu_sub(vza2,vza4);		/* b[4] = a[2] - a[4]	*/				\
-  vzb6 = spu_sub(vza0,vza6);		/* b[6] = a[0] - a[6]	*/				\
-  												\
-  vza1 = spu_rlmaska(vb7,-1);									\
-  vzal = spu_add(vza1,vb7);									\
-  vzah = spu_sub(vb5,vb3);									\
-  vza1 = spu_sub(vzah,vzal);	/* a1 = (-Y[3] + Y[5]) - (Y[7] + (Y[7]>>1))	*/		\
-  												\
-  vza3 = spu_rlmaska(vb3,-1);									\
-  vzal = spu_add(vza3,vb3);									\
-  vzah = spu_add(vb1,vb7);									\
-  vza3 = spu_sub(vzah,vzal);  	/* a3 =  (Y[1] + Y[7]) - (Y[3] + (Y[3]>>1))	*/		\
-  												\
-  vza5 = spu_rlmaska(vb5,-1);									\
-  vzal = spu_add(vza5,vb5);									\
-  vzah = spu_sub(vb7,vb1);									\
-  vza5 = spu_add(vzah,vzal);	/* a5 = (-Y[1] + Y[7]) + (Y[5] + Y[5]>>1))	*/		\
-												\
-  vza7 = spu_rlmaska(vb1,-1);									\
-  vzal = spu_add(vza7,vb1);									\
-  vzah = spu_add(vb3,vb5);									\
-  vza7 = spu_add(vzah,vzal);	/* a7 =  (Y[3] + Y[5]) + (Y[1] + (Y[1]>>1))	*/		\
-  												\
-  vzb1 = spu_rlmaska(vza7,-2);									\
-  vzb1 = spu_add(vzb1,vza1);		/* b1 = (a7>>2) + a1	*/				\
-  vzb3 = spu_rlmaska(vza5,-2);									\
-  vzb3 = spu_add(vzb3,vza3);		/* b3 =  a3 + (a5>>2)	*/				\
-  vzb5 = spu_rlmaska(vza3,-2);									\
-  vzb5 = spu_sub(vzb5,vza5);  		/* b5 = (a3>>2) - a5	*/				\
-  vzb7 = spu_rlmaska(vza1,-2);									\
-  vzb7 = spu_sub(vza7,vzb7);		/* b7 =  a7 - (a1>>2)	*/				\
-  												\
-  vb0 = spu_add(vzb0,vzb7); 		/* src[i][0] = b0 + b7	*/				\
-  vb7 = spu_sub(vzb0,vzb7);		/* src[i][7] = b0 - b7	*/				\
-  vb1 = spu_add(vzb2,vzb5);		/* src[i][1] = b2 + b5	*/				\
-  vb6 = spu_sub(vzb2,vzb5);		/* src[i][6] = b2 - b5	*/				\
-  vb2 = spu_add(vzb4,vzb3);		/* src[i][2] = b4 + b3	*/				\
-  vb5 = spu_sub(vzb4,vzb3);		/* src[i][5] = b4 - b3	*/				\
-  vb3 = spu_add(vzb6,vzb1);		/* src[i][3] = b6 + b1	*/				\
-  vb4 = spu_sub(vzb6,vzb1);		/* src[i][4] = b6 - b1	*/
-  
-
-#endif /*H264_IDCT_SPU_H*/
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,802 +0,0 @@
-#include "types_spu.h"
-#include "h264_tables.h"
-#include "h264_intra_spu.h"
-#include <assert.h>
-
-void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const uint32_t a= ((uint32_t*)(src-stride))[0];
-    ((uint32_t*)(src+0*stride))[0]= a;
-    ((uint32_t*)(src+1*stride))[0]= a;
-    ((uint32_t*)(src+2*stride))[0]= a;
-    ((uint32_t*)(src+3*stride))[0]= a;
-}
-
-void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101;
-    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101;
-    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101;
-    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101;
-}
-
-void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
-                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
-}
-
-void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
-
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
-}
-
-void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
-
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
-}
-
-void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
-}
-
-
-#define LOAD_TOP_RIGHT_EDGE\
-    const int t4= topright[0];\
-    const int t5= topright[1];\
-    const int t6= topright[2];\
-    const int t7= topright[3];\
-
-#define LOAD_LEFT_EDGE\
-    const int l0= src[-1+0*stride];\
-    const int l1= src[-1+1*stride];\
-    const int l2= src[-1+2*stride];\
-    const int l3= src[-1+3*stride];\
-
-#define LOAD_TOP_EDGE\
-    const int t0= src[ 0-1*stride];\
-    const int t1= src[ 1-1*stride];\
-    const int t2= src[ 2-1*stride];\
-    const int t3= src[ 3-1*stride];\
-
-void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){	
-	(void) topright;
-    const int lt= src[-1-1*stride];
-    LOAD_TOP_EDGE
-    LOAD_LEFT_EDGE
-
-    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
-    src[0+2*stride]=
-    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
-    src[0+1*stride]=
-    src[1+2*stride]=
-    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
-    src[0+0*stride]=
-    src[1+1*stride]=
-    src[2+2*stride]=
-    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
-    src[1+0*stride]=
-    src[2+1*stride]=
-    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
-    src[2+0*stride]=
-    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
-    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
-}
-
-void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
-    LOAD_TOP_EDGE
-    LOAD_TOP_RIGHT_EDGE
-//    LOAD_LEFT_EDGE
-
-    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
-    src[1+0*stride]=
-    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
-    src[2+0*stride]=
-    src[1+1*stride]=
-    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
-    src[3+0*stride]=
-    src[2+1*stride]=
-    src[1+2*stride]=
-    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
-    src[3+1*stride]=
-    src[2+2*stride]=
-    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
-    src[3+2*stride]=
-    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
-    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
-}
-
-void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int lt= src[-1-1*stride];
-    LOAD_TOP_EDGE
-    LOAD_LEFT_EDGE
-	(void) l3;
-
-    src[0+0*stride]=
-    src[1+2*stride]=(lt + t0 + 1)>>1;
-    src[1+0*stride]=
-    src[2+2*stride]=(t0 + t1 + 1)>>1;
-    src[2+0*stride]=
-    src[3+2*stride]=(t1 + t2 + 1)>>1;
-    src[3+0*stride]=(t2 + t3 + 1)>>1;
-    src[0+1*stride]=
-    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
-    src[1+1*stride]=
-    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
-    src[2+1*stride]=
-    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
-    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
-    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
-    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
-}
-
-void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
-    LOAD_TOP_EDGE
-    LOAD_TOP_RIGHT_EDGE
-	(void) t7;
-
-    src[0+0*stride]=(t0 + t1 + 1)>>1;
-    src[1+0*stride]=
-    src[0+2*stride]=(t1 + t2 + 1)>>1;
-    src[2+0*stride]=
-    src[1+2*stride]=(t2 + t3 + 1)>>1;
-    src[3+0*stride]=
-    src[2+2*stride]=(t3 + t4+ 1)>>1;
-    src[3+2*stride]=(t4 + t5+ 1)>>1;
-    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
-    src[1+1*stride]=
-    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
-    src[2+1*stride]=
-    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
-    src[3+1*stride]=
-    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
-    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
-}
-
-void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    LOAD_LEFT_EDGE
-
-    src[0+0*stride]=(l0 + l1 + 1)>>1;
-    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
-    src[2+0*stride]=
-    src[0+1*stride]=(l1 + l2 + 1)>>1;
-    src[3+0*stride]=
-    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
-    src[2+1*stride]=
-    src[0+2*stride]=(l2 + l3 + 1)>>1;
-    src[3+1*stride]=
-    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
-    src[3+2*stride]=
-    src[1+3*stride]=
-    src[0+3*stride]=
-    src[2+2*stride]=
-    src[2+3*stride]=
-    src[3+3*stride]=l3;
-}
-
-void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int lt= src[-1-1*stride];
-    LOAD_TOP_EDGE
-    LOAD_LEFT_EDGE
-	(void) t3;
-
-    src[0+0*stride]=
-    src[2+1*stride]=(lt + l0 + 1)>>1;
-    src[1+0*stride]=
-    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
-    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
-    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
-    src[0+1*stride]=
-    src[2+2*stride]=(l0 + l1 + 1)>>1;
-    src[1+1*stride]=
-    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
-    src[0+2*stride]=
-    src[2+3*stride]=(l1 + l2+ 1)>>1;
-    src[1+2*stride]=
-    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
-    src[0+3*stride]=(l2 + l3 + 1)>>1;
-    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
-}
-
-void ff_pred16x16_vertical_c(uint8_t *src, int stride){
-    int i;
-	const vuint32_t v= *((vuint32_t*)(src-stride));
-    for(i=0; i<4; i++){
-		*((vuint32_t*) src 			 ) =v;
-		*((vuint32_t*)(src +   stride)) =v;
-		*((vuint32_t*)(src + 2*stride)) =v;
-		*((vuint32_t*)(src + 3*stride)) =v;
-		src+= 4*stride;
-    }
-	
-	/*const uint32_t a= ((uint32_t*)(src-stride))[0];
-	const uint32_t b= ((uint32_t*)(src-stride))[1];
-	const uint32_t c= ((uint32_t*)(src-stride))[2];
-	const uint32_t d= ((uint32_t*)(src-stride))[3];
-
-	for(i=0; i<16; i++){
-		((uint32_t*)(src+i*stride))[0]= a;
-		((uint32_t*)(src+i*stride))[1]= b;
-		((uint32_t*)(src+i*stride))[2]= c;
-		((uint32_t*)(src+i*stride))[3]= d;
-	}*/
-}
-
-void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
-    int i;
-	
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101;
-    }
-}
-
-void ff_pred16x16_dc_c(uint8_t *src, int stride){
-    int i;
-	int dc=0;
-    for(i=0;i<16; i++){
-        dc+= src[-1+i*stride];
-    }
-
-    for(i=0;i<16; i++){
-		dc+= src[i-stride];
-    }
-	dc= 0x01010101*((dc + 16)>>5);
-    
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= dc;
-    }
-}
-
-void ff_pred16x16_left_dc_c(uint8_t *src, int stride){
-    int i;
-	
-	int dc=0;
-    for(i=0;i<16; i++){
-        dc+= src[-1+i*stride];
-    }
-	dc= 0x01010101*((dc + 8)>>4);
-	
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= dc;
-    }
-}
-
-void ff_pred16x16_top_dc_c(uint8_t *src, int stride){
-    int i;
-	int dc0=0;
-    for(i=0;i<16; i++){
-        dc0+= src[i-stride];
-    }
-	
-	dc0= 0x01010101*((dc0 + 8)>>4);
-	
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= dc0;
-    }
-}
-
-void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
-    int i;
-	
-	/*const vuint32_t v= AVV(0x01010101U*128U, 0x01010101U*128U,0x01010101U*128U,0x01010101U*128U);
-	for(i=0; i<4; i++){
-		*((vuint32_t*) src 			  ) =v;
-		*((vuint32_t*)(src +   stride)) =v;
-		*((vuint32_t*)(src + 2*stride)) =v;
-		*((vuint32_t*)(src + 3*stride)) =v;
-		src+= 4*stride;
-	}*/
-	
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
-    }
-}
-
-void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
-	int i, j, k;
-	int a;
-	uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-	const uint8_t * const src0 = src+7-stride;
-	const uint8_t *src1 = src+8*stride-1;
-	const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
-	int H = src0[1] - src0[-1];
-	int V = src1[0] - src2[ 0];
-	for(k=2; k<=8; ++k) {
-		src1 += stride; src2 -= stride;
-		H += k*(src0[k] - src0[-k]);
-		V += k*(src1[0] - src2[ 0]);
-	}
-	if(svq3){
-		H = ( 5*(H/4) ) / 16;
-		V = ( 5*(V/4) ) / 16;
-
-		/* required for 100% accuracy */
-		i = H; H = V; V = i;
-	}else{
-		H = ( 5*H+32 ) >> 6;
-		V = ( 5*V+32 ) >> 6;
-	}
-
-	a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
-	for(j=16; j>0; --j) {
-		int b = a;
-		a += V;
-		for(i=-16; i<0; i+=4) {
-		src[16+i] = cm[ (b    ) >> 5 ];
-		src[17+i] = cm[ (b+  H) >> 5 ];
-		src[18+i] = cm[ (b+2*H) >> 5 ];
-		src[19+i] = cm[ (b+3*H) >> 5 ];
-		b += 4*H;
-		}
-		src += stride;
-	}
-}
-
-void ff_pred16x16_plane_c(uint8_t *src, int stride){
-    pred16x16_plane_compat_c(src, stride, 0);
-}
-
-void ff_pred8x8_vertical_c(uint8_t *src, int stride){
-    int i;
-    const uint32_t a= ((uint32_t*)(src-stride))[0];
-    const uint32_t b= ((uint32_t*)(src-stride))[1];
-
-    for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= a;
-        ((uint32_t*)(src+i*stride))[1]= b;
-    }
-}
-
-void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101;
-    }
-}
-
-void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
-    }
-}
-
-void ff_pred8x8_left_dc_c(uint8_t *src, int stride){
-    int i;
-    int dc0, dc2;
-
-    dc0=dc2=0;
-    for(i=0;i<4; i++){
-        dc0+= src[-1+i*stride];
-        dc2+= src[-1+(i+4)*stride];
-    }
-    dc0= 0x01010101*((dc0 + 2)>>2);
-    dc2= 0x01010101*((dc2 + 2)>>2);
-
-    for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= dc0;
-    }
-    for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= dc2;
-    }
-}
-
-void ff_pred8x8_top_dc_c(uint8_t *src, int stride){
-    int i;
-    int dc0, dc1;
-
-    dc0=dc1=0;
-    for(i=0;i<4; i++){
-        dc0+= src[i-stride];
-        dc1+= src[4+i-stride];
-    }
-    dc0= 0x01010101*((dc0 + 2)>>2);
-    dc1= 0x01010101*((dc1 + 2)>>2);
-
-    for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc1;
-    }
-    for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc1;
-    }
-}
-
-
-void ff_pred8x8_dc_c(uint8_t *src, int stride){
-    int i;
-    int dc0, dc1, dc2, dc3;
-
-    dc0=dc1=dc2=0;
-    for(i=0;i<4; i++){
-        dc0+= src[-1+i*stride] + src[i-stride];
-        dc1+= src[4+i-stride];
-        dc2+= src[-1+(i+4)*stride];
-    }
-    dc3= 0x01010101*((dc1 + dc2 + 4)>>3);
-    dc0= 0x01010101*((dc0 + 4)>>3);
-    dc1= 0x01010101*((dc1 + 2)>>2);
-    dc2= 0x01010101*((dc2 + 2)>>2);
-
-    for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc1;
-    }
-    for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc2;
-        ((uint32_t*)(src+i*stride))[1]= dc3;
-    }
-}
-
-void ff_pred8x8_plane_c(uint8_t *src, int stride){
-  int j, k;
-  int a;
-  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-  const uint8_t * const src0 = src+3-stride;
-  const uint8_t *src1 = src+4*stride-1;
-  const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
-  int H = src0[1] - src0[-1];
-  int V = src1[0] - src2[ 0];
-  for(k=2; k<=4; ++k) {
-    src1 += stride; src2 -= stride;
-    H += k*(src0[k] - src0[-k]);
-    V += k*(src1[0] - src2[ 0]);
-  }
-  H = ( 17*H+16 ) >> 5;
-  V = ( 17*V+16 ) >> 5;
-
-  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
-  for(j=8; j>0; --j) {
-    int b = a;
-    a += V;
-    src[0] = cm[ (b    ) >> 5 ];
-    src[1] = cm[ (b+  H) >> 5 ];
-    src[2] = cm[ (b+2*H) >> 5 ];
-    src[3] = cm[ (b+3*H) >> 5 ];
-    src[4] = cm[ (b+4*H) >> 5 ];
-    src[5] = cm[ (b+5*H) >> 5 ];
-    src[6] = cm[ (b+6*H) >> 5 ];
-    src[7] = cm[ (b+7*H) >> 5 ];
-    src += stride;
-  }
-}
-
-
-#define SRC(x,y) src[(x)+(y)*stride]
-#define PL(y) \
-    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_LEFT \
-    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
-                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
-    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
-    const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
-
-#define PT(x) \
-    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_TOP \
-    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
-                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
-    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
-    const int t7 = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
-                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
-
-#define PTR(x) \
-    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_TOPRIGHT \
-    int t8, t9, t10, t11, t12, t13, t14, t15; \
-    if(has_topright) { \
-        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
-        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
-    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
-
-#define PREDICT_8x8_LOAD_TOPLEFT \
-    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
-
-#define PREDICT_8x8_DC(v) \
-    int y; \
-    for( y = 0; y < 8; y++ ) { \
-        ((uint32_t*)src)[0] = \
-        ((uint32_t*)src)[1] = v; \
-        src += stride; \
-    }
-
-static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-	(void) has_topright;
-	(void) has_topleft;
-    PREDICT_8x8_DC(0x80808080);
-}
-static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-	(void) has_topright;
-    PREDICT_8x8_LOAD_LEFT;
-    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
-    PREDICT_8x8_DC(dc);
-}
-static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-    PREDICT_8x8_LOAD_TOP;
-    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
-    PREDICT_8x8_DC(dc);
-}
-static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-    PREDICT_8x8_LOAD_LEFT;
-    PREDICT_8x8_LOAD_TOP;
-    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
-                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
-    PREDICT_8x8_DC(dc);
-}
-static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-	(void) has_topright;
-    PREDICT_8x8_LOAD_LEFT;
-#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
-               ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
-    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
-#undef ROW
-}
-static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-    int y;
-    PREDICT_8x8_LOAD_TOP;
-    src[0] = t0;
-    src[1] = t1;
-    src[2] = t2;
-    src[3] = t3;
-    src[4] = t4;
-    src[5] = t5;
-    src[6] = t6;
-    src[7] = t7;
-    for( y = 1; y < 8; y++ )
-        *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
-}
-static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_TOPRIGHT;
-    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
-    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
-    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
-    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
-    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
-    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
-    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
-    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
-    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
-    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
-    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
-    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
-    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
-    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
-    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
-}
-static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_LEFT;
-    PREDICT_8x8_LOAD_TOPLEFT;
-    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
-    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
-    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
-    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
-    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
-    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
-    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
-    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
-    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
-    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
-    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
-    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
-    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
-    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
-    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
-
-}
-static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_LEFT;
-    PREDICT_8x8_LOAD_TOPLEFT;
-	(void) l7;
-    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
-    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
-    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
-    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
-    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
-    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
-    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
-    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
-    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
-    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
-    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
-    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
-    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
-    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
-    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
-    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
-    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
-    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
-    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
-    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
-    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
-    SRC(7,0)= (t6 + t7 + 1) >> 1;
-}
-static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_LEFT;
-    PREDICT_8x8_LOAD_TOPLEFT;
-	(void) t7;
-    SRC(0,7)= (l6 + l7 + 1) >> 1;
-    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
-    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
-    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
-    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
-    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
-    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
-    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
-    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
-    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
-    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
-    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
-    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
-    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
-    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
-    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
-    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
-    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
-    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
-    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
-    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
-    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
-}
-static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_TOPRIGHT;
-    SRC(0,0)= (t0 + t1 + 1) >> 1;
-    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
-    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
-    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
-    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
-    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
-    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
-    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
-    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
-    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
-    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
-    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
-    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
-    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
-    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
-    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
-    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
-    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
-    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
-    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
-    SRC(7,6)= (t10 + t11 + 1) >> 1;
-    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
-}
-static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
-{
-	(void) has_topright;
-    PREDICT_8x8_LOAD_LEFT;
-    SRC(0,0)= (l0 + l1 + 1) >> 1;
-    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
-    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
-    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
-    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
-    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
-    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
-    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
-    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
-    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
-    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
-    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
-    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
-    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
-    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
-    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
-    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
-    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
-}
-#undef PREDICT_8x8_LOAD_LEFT
-#undef PREDICT_8x8_LOAD_TOP
-#undef PREDICT_8x8_LOAD_TOPLEFT
-#undef PREDICT_8x8_LOAD_TOPRIGHT
-#undef PREDICT_8x8_DC
-#undef PTR
-#undef PT
-#undef PL
-#undef SRC
-
-void init_pred_ptrs(H264PredContext_spu *i){
-
-    i->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
-    i->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
-    i->pred4x4[DC_PRED             ]= pred4x4_dc_c;
-    i->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
-    i->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
-    i->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
-    i->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
-    i->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
-    i->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
-    i->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
-    i->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
-    i->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
-
-    i->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
-    i->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
-    i->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
-    i->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
-    i->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
-    i->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
-    i->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
-    i->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
-    i->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
-    i->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
-    i->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
-    i->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
-
-  
-    i->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
-    i->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
-    i->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
-	i->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
-    i->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c;
-    i->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c;
-    i->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
-
-    i->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
-    i->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
-    i->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
-    i->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
-    i->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c;
-    i->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c;
-    i->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
-
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,48 +0,0 @@
-#ifndef H264_INTRA_SPU_H
-#define H264_INTRA_SPU_H
-
-#define MAX_NEG_CROP       1024
-
-// For Intra mode
-#define MB_TYPE_INTRA4x4   0x0001
-#define IS_INTRA(a)       ((a)&7)
-#define IS_INTRA4x4(a)    ((a)&MB_TYPE_INTRA4x4)
-
-#define CODEC_FLAG_GRAY   0x2000
-
-#define VERT_PRED             0
-#define HOR_PRED              1
-#define DC_PRED               2
-#define DIAG_DOWN_LEFT_PRED   3
-#define DIAG_DOWN_RIGHT_PRED  4
-#define VERT_RIGHT_PRED       5
-#define HOR_DOWN_PRED         6
-#define VERT_LEFT_PRED        7
-#define HOR_UP_PRED           8
-
-#define LEFT_DC_PRED          9
-#define TOP_DC_PRED           10
-#define DC_128_PRED           11
-
-
-#define DC_PRED8x8            0
-#define HOR_PRED8x8           1
-#define VERT_PRED8x8          2
-#define PLANE_PRED8x8         3
-
-#define LEFT_DC_PRED8x8       4
-#define TOP_DC_PRED8x8        5
-#define DC_128_PRED8x8        6
-
-typedef struct H264PredContext_spu{
-
-  intra_pred4x4 pred4x4[9+3];
-  intra_pred16x16 pred16x16[4+3];
-  intra_pred8x8 pred8x8[4+3];
-  intra_pred8x8l pred8x8l[9+3];
-
-}H264PredContext_spu;
-
-void init_pred_ptrs(H264PredContext_spu *i);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_luma_template_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_luma_template_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1560 +0,0 @@
-static void PREFIX_h264_qpel16_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
-  
-  register int i;
-
-  const int16_t i20ss= 20;
-  const int16_t i5ss= 5;
-  const int16_t i16ss= 16;
-  const int16_t imax = 255;
-
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v20ss = spu_splats(i20ss);
-  const vsint16_t v5ss = spu_splats(i5ss);
-  const vsint16_t v16ss = spu_splats(i16ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const int shift_src =(unsigned int) src & 15;
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  uint8_t *srcbis = src - (STRIDE_Y * 2);
-
-  const vuint8_t srcM2a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcM1a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcP0a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcP1a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcP2a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-
-  vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
-  vsint16_t srcM2ssB = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel);
-  vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
-  vsint16_t srcM1ssB = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel);
-  vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
-  vsint16_t srcP0ssB = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel);
-  vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
-  vsint16_t srcP1ssB = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel);
-  vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
-  vsint16_t srcP2ssB = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel);
-
-  for (i = 0 ; i < h ; i++) {
-    const vuint8_t srcP3a = *(vuint8_t *)(srcbis);
-    const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16);
-    const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16));
-
-    const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
-    const vsint16_t srcP3ssB = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel);
-    srcbis += STRIDE_Y;
-
-    const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA);
-    const vsint16_t sum1B = spu_add(srcP0ssB, srcP1ssB);
-    const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA);
-    const vsint16_t sum2B = spu_add(srcM1ssB, srcP2ssB);
-    const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA);
-    const vsint16_t sum3B = spu_add(srcM2ssB, srcP3ssB);
-
-    srcM2ssA = srcM1ssA;
-    srcM2ssB = srcM1ssB;
-    srcM1ssA = srcP0ssA;
-    srcM1ssB = srcP0ssB;
-    srcP0ssA = srcP1ssA;
-    srcP0ssB = srcP1ssB;
-    srcP1ssA = srcP2ssA;
-    srcP1ssB = srcP2ssB;
-    srcP2ssA = srcP3ssA;
-    srcP2ssB = srcP3ssB;
-
-    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
-    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
-    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
-
-    const vsint32_t pp1B1 = spu_mule(sum1B, v20ss);
-    const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss);
-    const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez);
-    const vsint16_t pp1B = spu_add(pp1B3, v16ss);
-
-    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
-    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
-
-    const vsint32_t pp2B1 = spu_mule(sum2B, v5ss);
-    const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss);
-    const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez);
-
-    const vsint16_t pp3A = spu_add(sum3A, pp1A);
-    const vsint16_t pp3B = spu_add(sum3B, pp1B);
-
-    const vsint16_t psumA = spu_sub(pp3A, pp2A);
-    const vsint16_t psumB = spu_sub(pp3B, pp2B);
-
-    vsint16_t sumA = spu_rlmask(psumA, -5);
-    vsint16_t sumB = spu_rlmask(psumB, -5);
-
-    //Saturation to 0 and 255
-    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
-    sumA = spu_and(sumA,(vsint16_t)sat);
-    sat = spu_cmpgt(sumA,vmax);
-    sumA = spu_sel(sumA,vmax,sat);
-    sat = spu_cmpgt(sumB,(vsint16_t)vzero);
-    sumB = spu_and(sumB,(vsint16_t)sat);
-    sat = spu_cmpgt(sumB,vmax);
-    sumB = spu_sel(sumB,vmax,sat);
-
-    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu);
-
-    /* 16x16 dest luma blocks are alway aligned */
-    const vuint8_t vdst = *(vuint8_t *)dst;
-
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, sum, vdst);
-
-    *(vuint8_t *)dst=fsum;
-    
-    dst += dstStride; /* stride is  multiple of 16 ,so dstperm and dstmask can remain out of the loop */
-  }
-}
-
-static void PREFIX_h264_qpel16_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
-
-  register int i;
-  
-  const int16_t i20ss = 20;
-  const int16_t i5ss = 5;
-  const int16_t i16ss = 16;
-  const int16_t imax = 255;
-
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v20ss = spu_splats(i20ss);
-  const vsint16_t v5ss = spu_splats(i5ss);
-  const vsint16_t v16ss = spu_splats(i16ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  const int permM2 = (unsigned int) (src-2) & 15;
-  const int permM1 = (unsigned int) (src-1) & 15;
-  const int permP0 = (unsigned int) (src) & 15;
-  const int permP1 = (unsigned int) (src+1) & 15;
-  const int permP2 = (unsigned int) (src+2) & 15;
-  const int permP3 = (unsigned int) (src+3) & 15;
-
-  register int align = ((((unsigned long)src) - 2) % 16);
-
-  for (i = 0 ; i < h ; i ++) {
-    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-    vuint8_t srcR1 = *(vuint8_t *)(src-2);
-    vuint8_t srcR2 = *(vuint8_t *)(src+14);
-
-    switch (align) {
-    default: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
-    } break;
-    case 11: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = srcR2;
-    } break;
-    case 12: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = srcR2;
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 13: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = srcR2;
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 14: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = srcR2;
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 15: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = srcR2;
-      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    }
-
-    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
-    const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel);
-    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
-    const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel);
-
-    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
-    const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel);
-    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
-    const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel);
-
-    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
-    const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel);
-    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
-    const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel);
-
-    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
-    const vsint16_t sum1B = spu_add(srcP0B, srcP1B);
-    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
-    const vsint16_t sum2B = spu_add(srcM1B, srcP2B);
-    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
-    const vsint16_t sum3B = spu_add(srcM2B, srcP3B);
-
-    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
-    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
-    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
-
-    const vsint32_t pp1B1 = spu_mule(sum1B, v20ss);
-    const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss);
-    const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez);
-    const vsint16_t pp1B = spu_add(pp1B3, v16ss);
-
-    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
-    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
-
-    const vsint32_t pp2B1 = spu_mule(sum2B, v5ss);
-    const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss);
-    const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez);
-
-    const vsint16_t pp3A = spu_add(sum3A, pp1A);
-    const vsint16_t pp3B = spu_add(sum3B, pp1B);
-
-    const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A);
-    const vsint16_t psumB = spu_sub(pp3B, (vsint16_t)pp2B);
-
-    vsint16_t sumA = spu_rlmask(psumA, -5);
-    vsint16_t sumB = spu_rlmask(psumB, -5);
-
-    //Saturation to 0 and 255
-    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
-    sumA = spu_and(sumA,(vsint16_t)sat);
-    sat = spu_cmpgt(sumA,vmax);
-    sumA = spu_sel(sumA,vmax,sat);
-    sat = spu_cmpgt(sumB,(vsint16_t)vzero);
-    sumB = spu_and(sumB,(vsint16_t)sat);
-    sat = spu_cmpgt(sumB,vmax);
-    sumB = spu_sel(sumB,vmax,sat);
-
-    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu);
-
-    /* 16x16 dest luma blocks are alway aligned */
-    const vuint8_t vdst = *(vuint8_t *)dst;
-
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, sum, vdst);
-
-    *(vuint8_t *)dst=fsum;
-    
-    src += STRIDE_Y;
-    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
-   }
-}
-
-/* this code assume stride % 16 == 0 *and* tmp is properly aligned */
-static void PREFIX_h264_qpel16_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) {
-  register int i;
-
-  const int16_t i20ss = 20;
-  const int16_t i5ss = 5;
-  const int16_t imax = 255;
-
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v20ss = spu_splats(i20ss);
-  const vsint16_t v5ss = spu_splats(i5ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  const int permM2 = (unsigned int) (src-2) & 15;
-  const int permM1 = (unsigned int) (src-1) & 15;
-  const int permP0 = (unsigned int) (src) & 15;
-  const int permP1 = (unsigned int) (src+1) & 15;
-  const int permP2 = (unsigned int) (src+2) & 15;
-  const int permP3 = (unsigned int) (src+3) & 15;
-
-  register int align = ((((unsigned long)src) - 2) % 16);
-
-  src -= (2 * STRIDE_Y);
-
-  for (i = 0 ; i < (h+5) ; i ++) {
-    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-    vuint8_t srcR1 = *(vuint8_t *)(src-2);
-    vuint8_t srcR2 = *(vuint8_t *)(src+14);
-
-    switch (align) {
-    default: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
-    } break;
-    case 11: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = srcR2;
-    } break;
-    case 12: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = srcR2;
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 13: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = srcR2;
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 14: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = srcR2;
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 15: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = srcR2;
-      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    }
-
-    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
-    const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel);
-    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
-    const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel);
-
-    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
-    const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel);
-    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
-    const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel);
-
-    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
-    const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel);
-    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
-    const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel);
-
-    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
-    const vsint16_t sum1B = spu_add(srcP0B, srcP1B);
-    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
-    const vsint16_t sum2B = spu_add(srcM1B, srcP2B);
-    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
-    const vsint16_t sum3B = spu_add(srcM2B, srcP3B);
-
-    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
-    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
-    const vsint16_t pp1A = spu_add(pp1A3, sum3A);
-
-    const vsint32_t pp1B1 = spu_mule(sum1B, v20ss);
-    const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss);
-    const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez);
-    const vsint16_t pp1B = spu_add(pp1B3, sum3B);
-
-    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
-    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
-
-    const vsint32_t pp2B1 = spu_mule(sum2B, v5ss);
-    const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss);
-    const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez);
-
-    const vsint16_t psumA = spu_sub(pp1A, pp2A);
-    const vsint16_t psumB = spu_sub(pp1B, pp2B);
-
-    *(vsint16_t *)tmp = psumA;
-    *(vsint16_t *)(tmp+8) = psumB;
-
-    src += STRIDE_Y;
-    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
-  }
-
-  const int32_t ni10si = -10;
-  const int16_t i1ss = 1;
-  const int32_t i512si = 512;
-  const int32_t ni16si = -16;
-
-  const vsint32_t nv10si = spu_splats(ni10si);
-  const vsint16_t v1ss = spu_splats(i1ss);
-  const vsint32_t v512si = spu_splats(i512si);
-  const vsint32_t nv16si = spu_splats(ni16si);
-
-  const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F};
-  const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F};
-
-  int16_t *tmpbis = tmp - (tmpStride * (h+5));
-
-  vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis);
-  vsint16_t tmpM2ssB = *(vsint16_t *)(tmpbis+8);
-  tmpbis += tmpStride;
-  vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis);
-  vsint16_t tmpM1ssB = *(vsint16_t *)(tmpbis+8);
-  tmpbis += tmpStride;
-  vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis);
-  vsint16_t tmpP0ssB = *(vsint16_t *)(tmpbis+8);
-  tmpbis += tmpStride;
-  vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis);
-  vsint16_t tmpP1ssB = *(vsint16_t *)(tmpbis+8);
-  tmpbis += tmpStride;
-  vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis);
-  vsint16_t tmpP2ssB = *(vsint16_t *)(tmpbis+8);
-  tmpbis += tmpStride;
-
-  for (i = 0 ; i < h ; i++) {
-    const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis);
-    const vsint16_t tmpP3ssB = *(vsint16_t *)(tmpbis+8);
-    tmpbis += tmpStride;
-
-    const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA);
-    const vsint16_t sum1B = spu_add(tmpP0ssB, tmpP1ssB);
-    const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA);
-    const vsint16_t sum2B = spu_add(tmpM1ssB, tmpP2ssB);
-    const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA);
-    const vsint16_t sum3B = spu_add(tmpM2ssB, tmpP3ssB);
-
-    tmpM2ssA = tmpM1ssA;
-    tmpM2ssB = tmpM1ssB;
-    tmpM1ssA = tmpP0ssA;
-    tmpM1ssB = tmpP0ssB;
-    tmpP0ssA = tmpP1ssA;
-    tmpP0ssB = tmpP1ssB;
-    tmpP1ssA = tmpP2ssA;
-    tmpP1ssB = tmpP2ssB;
-    tmpP2ssA = tmpP3ssA;
-    tmpP2ssB = tmpP3ssB;
-
-    const vsint32_t pp1Ae = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss);
-    const vsint32_t pp1Be = spu_mule(sum1B, v20ss);
-    const vsint32_t pp1Bo = spu_mulo(sum1B, v20ss);
-
-    const vsint32_t pp2Ae = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss);
-    const vsint32_t pp2Be = spu_mule(sum2B, v5ss);
-    const vsint32_t pp2Bo = spu_mulo(sum2B, v5ss);
-
-    const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si);
-    const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss);
-    const vsint32_t pp3Be = spu_rlmask((vsint32_t)sum3B, nv16si);
-    const vsint32_t pp3Bo = spu_mulo(sum3B, v1ss);
-
-    const vsint32_t pp1cAe = spu_add(pp1Ae, v512si);
-    const vsint32_t pp1cAo = spu_add(pp1Ao, v512si);
-    const vsint32_t pp1cBe = spu_add(pp1Be, v512si);
-    const vsint32_t pp1cBo = spu_add(pp1Bo, v512si);
-
-    const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae);
-    const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao);
-    const vsint32_t pp32Be = spu_sub(pp3Be, pp2Be);
-    const vsint32_t pp32Bo = spu_sub(pp3Bo, pp2Bo);
-
-    const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae);
-    const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao);
-    const vsint32_t sumBe = spu_add(pp1cBe, pp32Be);
-    const vsint32_t sumBo = spu_add(pp1cBo, pp32Bo);
-
-    const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si);
-    const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si);
-    const vsint32_t ssumBe = spu_rlmask(sumBe, nv10si);
-    const vsint32_t ssumBo = spu_rlmask(sumBo, nv10si);
-
-    vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, ssumBe, packs);
-    vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, ssumBo, packs);
-
-    //Saturation to 0 and 255
-    sat = spu_cmpgt(ssume,(vsint16_t)vzero);
-    ssume = spu_and(ssume,(vsint16_t)sat);
-    sat = spu_cmpgt(ssume,vmax);
-    ssume = spu_sel(ssume,vmax,sat);
-    sat = spu_cmpgt(ssumo,(vsint16_t)vzero);
-    ssumo = spu_and(ssumo,(vsint16_t)sat);
-    sat = spu_cmpgt(ssumo,vmax);
-    ssumo = spu_sel(ssumo,vmax,sat);
-
-    const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu);
-
-    const vuint8_t sum = spu_shuffle(sumv, sumv, mperm);
-
-    /* 16x16 dest luma blocks are alway aligned */
-    const vuint8_t vdst = *(vuint8_t *)dst;
-
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, sum, vdst);
-
-    *(vuint8_t *)dst=fsum;
-    
-    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
-
-  }
-}
-
-static void PREFIX_h264_qpel8_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
-  
-  register int i;
-
-  const int16_t i20ss= 20;
-  const int16_t i5ss= 5;
-  const int16_t i16ss= 16;
-  const int16_t imax = 255;
-
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const vsint16_t v20ss = spu_splats(i20ss);
-  const vsint16_t v5ss = spu_splats(i5ss);
-  const vsint16_t v16ss = spu_splats(i16ss);
-  const int shift_src = (unsigned int) src & 15;
-
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  /* 8x8 dest luma blocks are aligned or desaligned by 8*/
-  const int shift_dst = (unsigned int) dst & 15;
-  vuint8_t dstmask;
-  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
-
-  if(shift_dst==0){
-    dstmask = dst8mask1;
-  }
-  else{
-    dstmask = dst8mask2;
-  }
-
-  uint8_t *srcbis = src - (STRIDE_Y * 2);
-
-  const vuint8_t srcM2a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcM1a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcP0a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcP1a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcP2a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-
-  vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
-  vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
-  vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
-  vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
-  vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
-
-  for (i = 0 ; i < h ; i++) {
-    const vuint8_t srcP3a = *(vuint8_t *)(srcbis);
-    const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16);
-    const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16));
-
-    const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
-    srcbis += STRIDE_Y;
-
-    const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA);
-    const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA);
-    const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA);
-
-    srcM2ssA = srcM1ssA;
-    srcM1ssA = srcP0ssA;
-    srcP0ssA = srcP1ssA;
-    srcP1ssA = srcP2ssA;
-    srcP2ssA = srcP3ssA;
-
-    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
-    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
-    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
-
-    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
-    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
-
-    const vsint16_t pp3A = spu_add(sum3A, pp1A);
-    const vsint16_t psumA = spu_sub(pp3A, pp2A);
-    vsint16_t sumA = spu_rlmask(psumA, -5);
-
-    //Saturation to 0 and 255
-    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
-    sumA = spu_and(sumA,(vsint16_t)sat);
-    sat = spu_cmpgt(sumA,vmax);
-    sumA = spu_sel(sumA,vmax,sat);
-
-    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
-
-    const vuint8_t dst1 = *(vuint8_t *)dst;
-
-    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, dsum, dst1);
-
-    *(vuint8_t *)dst=fsum;
-    
-    dst += dstStride; 
-  }
-}
-
-static void PREFIX_h264_qpel8_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
-
-  register int i;
-  
-  const int16_t i20ss = 20;
-  const int16_t i5ss = 5;
-  const int16_t i16ss = 16;
-  const int16_t imax = 255;
-
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v20ss = spu_splats(i20ss);
-  const vsint16_t v5ss = spu_splats(i5ss);
-  const vsint16_t v16ss = spu_splats(i16ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  /* 8x8 dest luma blocks are aligned or desaligned by 8*/
-  const int shift_dst = (unsigned int) dst & 15;
-  vuint8_t dstmask;
-  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
-
-  if(shift_dst==0){
-    dstmask = dst8mask1;
-  }
-  else{
-    dstmask = dst8mask2;
-  }
-
-  const int permM2 = (unsigned int) (src-2) & 15;
-  const int permM1 = (unsigned int) (src-1) & 15;
-  const int permP0 = (unsigned int) (src) & 15;
-  const int permP1 = (unsigned int) (src+1) & 15;
-  const int permP2 = (unsigned int) (src+2) & 15;
-  const int permP3 = (unsigned int) (src+3) & 15;
-
-  register int align = ((((unsigned long)src) - 2) % 16);
-
-  for (i = 0 ; i < h ; i ++) {
-    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-    vuint8_t srcR1 = *(vuint8_t *)(src-2);
-    vuint8_t srcR2 = *(vuint8_t *)(src+14);
-
-    switch (align) {
-    default: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
-    } break;
-    case 11: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = srcR2;
-    } break;
-    case 12: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = srcR2;
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 13: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = srcR2;
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 14: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = srcR2;
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 15: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = srcR2;
-      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    }
-
-    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
-    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
-
-    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
-    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
-
-    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
-    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
-
-    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
-    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
-    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
-
-    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
-    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
-    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
-
-    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
-    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
-
-    const vsint16_t pp3A = spu_add(sum3A, pp1A);
-
-    const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A);
-    
-    vsint16_t sumA = spu_rlmask(psumA, -5);
-
-    //Saturation to 0 and 255
-    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
-    sumA = spu_and(sumA,(vsint16_t)sat);
-    sat = spu_cmpgt(sumA,vmax);
-    sumA = spu_sel(sumA,vmax,sat);
-
-    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
-
-    const vuint8_t dst1 = *(vuint8_t *)dst;
-
-    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, dsum, dst1);
-
-    *(vuint8_t *)dst=fsum;
-    
-    src += STRIDE_Y;
-    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
-   }
-}
-
-/* this code assume stride % 16 == 0 *and* tmp is properly aligned */
-static void PREFIX_h264_qpel8_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) {
-  register int i;
-
-  const int16_t i20ss = 20;
-  const int16_t i5ss = 5;
-  const int16_t imax = 255;
-
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v20ss = spu_splats(i20ss);
-  const vsint16_t v5ss = spu_splats(i5ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  const int permM2 = (unsigned int) (src-2) & 15;
-  const int permM1 = (unsigned int) (src-1) & 15;
-  const int permP0 = (unsigned int) (src) & 15;
-  const int permP1 = (unsigned int) (src+1) & 15;
-  const int permP2 = (unsigned int) (src+2) & 15;
-  const int permP3 = (unsigned int) (src+3) & 15;
-
-  register int align = ((((unsigned long)src) - 2) % 16);
-
-  src -= (2 * STRIDE_Y);
-
-  for (i = 0 ; i < (h+5) ; i ++) {
-    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-    vuint8_t srcR1 = *(vuint8_t *)(src-2);
-    vuint8_t srcR2 = *(vuint8_t *)(src+14);
-
-    switch (align) {
-    default: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
-    } break;
-    case 11: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = srcR2;
-    } break;
-    case 12: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = srcR2;
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 13: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = srcR2;
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 14: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = srcR2;
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 15: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = srcR2;
-      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    }
-
-    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh);
-
-    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
-    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
-    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
-
-    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
-    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
-    const vsint16_t pp1A = spu_add(pp1A3, sum3A);
-
-    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
-    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
-
-    const vsint16_t psumA = spu_sub(pp1A, pp2A);
-
-    *(vsint16_t *)tmp = psumA;
-
-    src += STRIDE_Y;
-    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
-  }
-
-  const int32_t ni10si = -10;
-  const int16_t i1ss = 1;
-  const int32_t i512si = 512;
-  const int32_t ni16si = -16;
-
-  const vsint32_t nv10si = spu_splats(ni10si);
-  const vsint16_t v1ss = spu_splats(i1ss);
-  const vsint32_t v512si = spu_splats(i512si);
-  const vsint32_t nv16si = spu_splats(ni16si);
-
-  const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F};
-  const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F};
-
-  const int shift_dst = (unsigned int) (dst) & 15;
-  /* 8x8 dest luma blocks are aligned or desaligned by 8*/
-  vuint8_t dstmask;
-  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
-
-  if(shift_dst==0){
-    dstmask = dst8mask1;
-  }
-  else{
-    dstmask = dst8mask2;
-  }
-
-  int16_t *tmpbis = tmp - (tmpStride * (h+5));
-
-  vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-  vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-  vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-  vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-  vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-
-  for (i = 0 ; i < h ; i++) {
-    const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis);
-    tmpbis += tmpStride;
-
-    const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA);
-    const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA);
-    const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA);
-
-    tmpM2ssA = tmpM1ssA;
-    tmpM1ssA = tmpP0ssA;
-    tmpP0ssA = tmpP1ssA;
-    tmpP1ssA = tmpP2ssA;
-    tmpP2ssA = tmpP3ssA;
-
-    const vsint32_t pp1Ae = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss);
-    const vsint32_t pp2Ae = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss);
-
-    const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si);
-    const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss);
-
-    const vsint32_t pp1cAe = spu_add(pp1Ae, v512si);
-    const vsint32_t pp1cAo = spu_add(pp1Ao, v512si);
-
-    const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae);
-    const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao);
-
-    const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae);
-    const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao);
-
-    const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si);
-    const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si);
-
-    vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs);
-    vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs);
-
-    //Saturation to 0 and 255
-    sat = spu_cmpgt(ssume,(vsint16_t)vzero);
-    ssume = spu_and(ssume,(vsint16_t)sat);
-    sat = spu_cmpgt(ssume,vmax);
-    ssume = spu_sel(ssume,vmax,sat);
-    sat = spu_cmpgt(ssumo,(vsint16_t)vzero);
-    ssumo = spu_and(ssumo,(vsint16_t)sat);
-    sat = spu_cmpgt(ssumo,vmax);
-    ssumo = spu_sel(ssumo,vmax,sat);
-
-    const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu);
-
-    const vuint8_t sum = spu_shuffle(sumv, sumv, mperm);
-
-    const vuint8_t dst1 = *(vuint8_t *)dst;
-
-    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, dsum, dst1);
-
-    *(vuint8_t *)dst=fsum;
-    
-    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
-
-  }
-}
-
-static void PREFIX_h264_qpel4_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
-  
-  register int i;
-
-  const int16_t i20ss= 20;
-  const int16_t i5ss= 5;
-  const int16_t i16ss= 16;
-  const int16_t imax = 255;
-
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v20ss = spu_splats(i20ss);
-  const vsint16_t v5ss = spu_splats(i5ss);
-  const vsint16_t v16ss = spu_splats(i16ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const int shift_src = (unsigned int) src & 15;
-
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/
-  const int shift_dst = (unsigned int) dst & 15;
-  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
-
-  switch(shift_dst){
-    case 0:  dstmask = dst4mask0;
-             break;
-    case 4:  dstmask = dst4mask4;
-             break;
-    case 8:  dstmask = dst4mask8;
-             break;
-    case 12: dstmask = dst4mask12;
-             break;
-  }
-
-  uint8_t *srcbis = src - (STRIDE_Y * 2);
-
-  const vuint8_t srcM2a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcM1a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcP0a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcP1a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-  const vuint8_t srcP2a = *(vuint8_t *)(srcbis);
-  const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16);
-  const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16));
-
-  srcbis += STRIDE_Y;
-
-  vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
-  vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
-  vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
-  vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
-  vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
-
-  for (i = 0 ; i < h ; i++) {
-    const vuint8_t srcP3a = *(vuint8_t *)(srcbis);
-    const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16);
-    const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16));
-
-    const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
-    srcbis += STRIDE_Y;
-
-    const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA);
-    const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA);
-    const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA);
-
-    srcM2ssA = srcM1ssA;
-    srcM1ssA = srcP0ssA;
-    srcP0ssA = srcP1ssA;
-    srcP1ssA = srcP2ssA;
-    srcP2ssA = srcP3ssA;
-
-    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
-    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
-    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
-
-    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
-    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
-
-    const vsint16_t pp3A = spu_add(sum3A, pp1A);
-    const vsint16_t psumA = spu_sub(pp3A, pp2A);
-    vsint16_t sumA = spu_rlmask(psumA, -5);
-
-    //Saturation to 0 and 255
-    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
-    sumA = spu_and(sumA,(vsint16_t)sat);
-    sat = spu_cmpgt(sumA,vmax);
-    sumA = spu_sel(sumA,vmax,sat);
-
-    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
-
-    const vuint8_t dst1 = *(vuint8_t *)dst;
-
-    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, dsum, dst1);
-
-    *(vuint8_t *)dst=fsum;
-    
-    dst += dstStride; 
-  }
-}
-
-static void PREFIX_h264_qpel4_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
-
-  register int i;
-  
-  const int16_t i20ss = 20;
-  const int16_t i5ss = 5;
-  const int16_t i16ss = 16;
-  const int16_t imax = 255;
-
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v20ss = spu_splats(i20ss);
-  const vsint16_t v5ss = spu_splats(i5ss);
-  const vsint16_t v16ss = spu_splats(i16ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/
-  const int shift_dst = (unsigned int) dst & 15;
-  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
-
-  switch(shift_dst){
-    case 0:  dstmask = dst4mask0;
-             break;
-    case 4:  dstmask = dst4mask4;
-             break;
-    case 8:  dstmask = dst4mask8;
-             break;
-    case 12: dstmask = dst4mask12;
-             break;
-  }
-
-  const int permM2 = (unsigned int) (src-2) & 15;
-  const int permM1 = (unsigned int) (src-1) & 15;
-  const int permP0 = (unsigned int) (src) & 15;
-  const int permP1 = (unsigned int) (src+1) & 15;
-  const int permP2 = (unsigned int) (src+2) & 15;
-  const int permP3 = (unsigned int) (src+3) & 15;
-
-  register int align = ((((unsigned long)src) - 2) % 16);
-
-  for (i = 0 ; i < h ; i ++) {
-    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-    vuint8_t srcR1 = *(vuint8_t *)(src-2);
-    vuint8_t srcR2 = *(vuint8_t *)(src+14);
-
-    switch (align) {
-    default: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
-    } break;
-    case 11: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = srcR2;
-    } break;
-    case 12: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = srcR2;
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 13: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = srcR2;
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 14: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = srcR2;
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 15: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = srcR2;
-      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    }
-
-    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
-    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
-
-    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
-    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
-
-    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
-    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
-
-    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
-    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
-    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
-
-    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
-    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
-    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
-
-    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
-    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
-
-    const vsint16_t pp3A = spu_add(sum3A, pp1A);
-
-    const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A);
-    
-    vsint16_t sumA = spu_rlmask(psumA, -5);
-
-    //Saturation to 0 and 255
-    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
-    sumA = spu_and(sumA,(vsint16_t)sat);
-    sat = spu_cmpgt(sumA,vmax);
-    sumA = spu_sel(sumA,vmax,sat);
-
-    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
-
-    const vuint8_t dst1 = *(vuint8_t *)dst;
-
-    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, dsum, dst1);
-
-    *(vuint8_t *)dst=fsum;
-    
-    src += STRIDE_Y;
-    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
-   }
-}
-
-static void PREFIX_h264_qpel4_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) {
-  register int i;
-
-  const int16_t i20ss = 20;
-  const int16_t i5ss = 5;
-  const int16_t imax = 255;
-
-  const vsint32_t vzero = spu_splats(0);
-  const vsint16_t v20ss = spu_splats(i20ss);
-  const vsint16_t v5ss = spu_splats(i5ss);
-  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
-  vuint16_t sat;
-
-  const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07};
-  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
-  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
-
-  const int permM2 = (unsigned int) (src-2) & 15;
-  const int permM1 = (unsigned int) (src-1) & 15;
-  const int permP0 = (unsigned int) (src) & 15;
-  const int permP1 = (unsigned int) (src+1) & 15;
-  const int permP2 = (unsigned int) (src+2) & 15;
-  const int permP3 = (unsigned int) (src+3) & 15;
-
-  register int align = ((((unsigned long)src) - 2) % 16);
-
-  src -= (2 * STRIDE_Y);
-
-  for (i = 0 ; i < (h+5) ; i ++) {
-    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-    vuint8_t srcR1 = *(vuint8_t *)(src-2);
-    vuint8_t srcR2 = *(vuint8_t *)(src+14);
-
-    switch (align) {
-    default: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
-    } break;
-    case 11: {
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
-      srcP3 = srcR2;
-    } break;
-    case 12: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
-      srcP2 = srcR2;
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 13: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
-      srcP1 = srcR2;
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 14: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
-      srcP0 = srcR2;
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    case 15: {
-      vuint8_t srcR3 = *(vuint8_t *)(src+30);
-      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
-      srcM1 = srcR2;
-      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
-      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
-      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
-      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
-    } break;
-    }
-
-    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh);
-    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh);
-
-    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
-    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
-    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
-
-    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
-    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
-    const vsint16_t pp1A = spu_add(pp1A3, sum3A);
-
-    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
-    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
-
-    const vsint16_t psumA = spu_sub(pp1A, pp2A);
-
-    *(vsint16_t *)tmp = psumA;
-
-    src += STRIDE_Y;
-    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
-  }
-
-  const int32_t ni10si = -10;
-  const int16_t i1ss = 1;
-  const int32_t i512si = 512;
-  const int32_t ni16si = -16;
-
-  const vsint32_t nv10si = spu_splats(ni10si);
-  const vsint16_t v1ss = spu_splats(i1ss);
-  const vsint32_t v512si = spu_splats(i512si);
-  const vsint32_t nv16si = spu_splats(ni16si);
-
-  const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F};
-  const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F};
-
-  const int shift_dst = (unsigned int) (dst) & 15;
-  /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/
-  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
-  const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
-
-  switch(shift_dst){
-    case 0:  dstmask = dst4mask0;
-             break;
-    case 4:  dstmask = dst4mask4;
-             break;
-    case 8:  dstmask = dst4mask8;
-             break;
-    case 12: dstmask = dst4mask12;
-             break;
-  }
-
-  int16_t *tmpbis = tmp - (tmpStride * (h+5));
-
-  vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-  vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-  vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-  vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-  vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis);
-  tmpbis += tmpStride;
-
-  for (i = 0 ; i < h ; i++) {
-    const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis);
-    tmpbis += tmpStride;
-
-    const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA);
-    const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA);
-    const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA);
-
-    tmpM2ssA = tmpM1ssA;
-    tmpM1ssA = tmpP0ssA;
-    tmpP0ssA = tmpP1ssA;
-    tmpP1ssA = tmpP2ssA;
-    tmpP2ssA = tmpP3ssA;
-
-    const vsint32_t pp1Ae = spu_mule(sum1A, v20ss);
-    const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss);
-    const vsint32_t pp2Ae = spu_mule(sum2A, v5ss);
-    const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss);
-
-    const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si);
-    const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss);
-
-    const vsint32_t pp1cAe = spu_add(pp1Ae, v512si);
-    const vsint32_t pp1cAo = spu_add(pp1Ao, v512si);
-
-    const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae);
-    const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao);
-
-    const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae);
-    const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao);
-
-    const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si);
-    const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si);
-
-    vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs);
-    vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs);
-
-    //Saturation to 0 and 255
-    sat = spu_cmpgt(ssume,(vsint16_t)vzero);
-    ssume = spu_and(ssume,(vsint16_t)sat);
-    sat = spu_cmpgt(ssume,vmax);
-    ssume = spu_sel(ssume,vmax,sat);
-    sat = spu_cmpgt(ssumo,(vsint16_t)vzero);
-    ssumo = spu_and(ssumo,(vsint16_t)sat);
-    sat = spu_cmpgt(ssumo,vmax);
-    ssumo = spu_sel(ssumo,vmax,sat);
-
-    const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu);
-
-    const vuint8_t sum = spu_shuffle(sumv, sumv, mperm);
-
-    const vuint8_t dst1 = *(vuint8_t *)dst;
-
-    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
-    vuint8_t fsum;
-    OP_U8_SPU(fsum, dsum, dst1);
-
-    *(vuint8_t *)dst=fsum;
-    
-    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
-
-  }
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,362 +0,0 @@
-/*
- * Copyright (c) 2009 TUDelft 
- * 
- * Cell Parallel SPU - 2DWave Macroblock Decoding. 
- */
-
-/**
- * @file libavcodec/cell/spu/h264_main_spu.c
- * Cell Parallel SPU - 2DWave Macroblock Decoding
- * @author C C Chi <c.c.chi@student.tudelft.nl>
- * 
- * SIMD kernels 
- * H.264/AVC motion compensation
- * @author Mauricio Alvarez <alvarez@ac.upc.edu>
- * @author Albert Paradis <apar7632@hotmail.com>
- */ 
-
-
-#include <stdio.h>
-#include <spu_intrinsics.h>
-#include <spu_mfcio.h>
-#include <assert.h>
-
-#include "h264_mc_spu.h"
-#include "h264_dma.h"
-#include "h264_tables.h"
-#include "h264_decode_mb_spu.h"
-
-
-//biweight buffer 
-DECLARE_ALIGNED_16(uint8_t, tmp_y_ls[48*16]);	      		
-DECLARE_ALIGNED_16(uint8_t, tmp_cb_ls[32*8]);
-DECLARE_ALIGNED_16(uint8_t, tmp_cr_ls[32*8]);
-
-//ref buffer (double buffered)
-DECLARE_ALIGNED_16(uint8_t, mc_ref[2][16*(4+5)*48 + 2*16*(2+1)*32]);
-uint8_t* ref_ptr;
-
-/** Motion Compensation functions*/
-
-static void fill_mc_part(H264mc *mc, int n, int chroma_height, int x_offset, int y_offset, int itp, int weight, int list0, int list1){
-	H264mc_part *mc_part = mc->mc_part + mc->npart;
-	mc_part->n =n;
-	mc_part->chroma_height =chroma_height;
-	mc_part->x_offset = x_offset;
-	mc_part->y_offset = y_offset;
-	mc_part->itp = itp;
-	mc_part->weight = weight;
-	mc_part->list0 = list0;
-	mc_part->list1 = list1;
-	
-	mc->npart++;
-}
-
-void calc_mc_params(H264Mb* mb, H264mc *mc){
-	int mb_type = mb->mb_type;
-	mc->npart=0;	
-
-	assert(!IS_INTRA(mb_type));
-	if(IS_16X16(mb_type)){
-		fill_mc_part(mc, 0, 8, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
-    }else if(IS_16X8(mb_type)){
-		fill_mc_part(mc, 0, 4, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
-		fill_mc_part(mc, 8, 4, 0, 4, 0, 1, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
-    }else if(IS_8X16(mb_type)){
-		fill_mc_part(mc, 0, 8, 0, 0, 1, 2, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
-		fill_mc_part(mc, 4, 8, 4, 0, 1, 2, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
-    }else{
-        int i;
-        assert(IS_8X8(mb_type));
-
-        for(i=0; i<4; i++){
-            const int sub_mb_type= mb->sub_mb_type[i];
-            const int n= 4*i;
-            int x_offset= (i&1)<<2;
-            int y_offset= (i&2)<<1;
-
-			if(IS_SUB_8X8(sub_mb_type)){
-				fill_mc_part(mc, n, 4, x_offset, y_offset, 1, 3, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-            }else if(IS_SUB_8X4(sub_mb_type)){
-				fill_mc_part(mc, n, 2, x_offset, y_offset, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-				fill_mc_part(mc, n+2, 2, x_offset, y_offset+2, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-            }else if(IS_SUB_4X8(sub_mb_type)){
-				fill_mc_part(mc, n, 4, x_offset, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-				fill_mc_part(mc, n+1, 4, x_offset+2, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-            }else{
-                int j;
-                assert(IS_SUB_4X4(sub_mb_type));
-                for(j=0; j<4; j++){
-                    int sub_x_offset= x_offset + 2*(j&1);
-                    int sub_y_offset= y_offset +   (j&2);
-					fill_mc_part(mc, n+j, 2, sub_x_offset, sub_y_offset, 2, 6, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-                }
-            }
-        }
-    }
-}
-
-/**
-*	Returns a pointer to mc_buf 
-*/
-static void* alloc_mc_buf(int size){
-	void* ptr = ref_ptr;
-	ref_ptr += size;
-	return ptr;
-}
-
-#define TAG_OFFSET_MC MBD_mc_buf1
-static uint8_t* get_mc_data(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){
-	assert(src_ea);
-	int unalign;
-	unsigned address_align;
-	
-	uint8_t* ea;
-	uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride);
-
-	ea = src_ea + pic_xoffset + pic_yoffset*linesize; 
-	address_align = ((unsigned) ea) & 0xFFFFFFF0;
-	unalign = ((unsigned) ea) & 0xF;
-	get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, idx + TAG_OFFSET_MC, 0);
-	return (ref_ptr + unalign);
-}
-
-static uint8_t* get_mc_data_blocking(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){
-	assert(src_ea);
-	int unalign;
-	unsigned address_align;
-
-	uint8_t* ea;
-	uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride);
-
-	ea = src_ea + pic_xoffset + pic_yoffset*linesize;
-	address_align = ((unsigned) ea) & 0xFFFFFFF0;
-	unalign = ((unsigned) ea) & 0xF;
-	get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, MBD_mc_buf1, 0);
-	wait_dma_id(MBD_mc_buf1);
-	return (ref_ptr + unalign);
-}
-
-//#undef TAG_OFFSET_MC
-
-static void get_mc_components(H264Context_spu *h, H264Mb *mb, H264mc_part* mc_part, Picture_spu *pic, int n, int chroma_height, int list, int src_x_offset, int src_y_offset, int idx){
-	assert(pic);
-	H264slice *s = h->s;
-	ref_data *ref = &mc_part->ref[list];
-    const int mx= mb->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
-    const int my= mb->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
-
-    const int pic_width  = 16*s->mb_width;
-    const int pic_height = 16*s->mb_height;
-	
-	int blk_h= chroma_height*2+5;
-	//int blk_w= 8*2+5;
-	
-	int blk_h_c= chroma_height+1;
-	//int blk_w_c= 9;
-
-	int ymx= mx>>2;
-    int ymy= my>>2;
-    int cmy= my>>3;
-    int cmx= mx>>3;
-
-    //truncate the motion vectors references
-    if(ymy>= pic_height+2){
-        ymy=pic_height+1;
-    }else if(ymy <=-19){
-        ymy=-18;
-    }
-    if(ymx>= pic_width+2){
-        ymx= pic_width+1;
-    }else if(ymx<=-19){
-        ymx=-19;
-    }
-
-	if(cmy >= pic_height>>1){
-        cmy = (pic_height>>1) -1;
-    }else if(cmy<=-9){
-        cmy=-8;
-    }
-    if(cmx >= pic_width>>1){
-        cmx = (pic_width>>1) -1;
-    }else if(cmx<=-9){
-        cmx=-8;
-    }
-	if (!h->blocking){
-		ref->data[0]=get_mc_data(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx);
-		ref->data[1]=get_mc_data(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
-		ref->data[2]=get_mc_data(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
-	} else {
-		ref->data[0]=get_mc_data_blocking(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx);
-		ref->data[1]=get_mc_data_blocking(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
-		ref->data[2]=get_mc_data_blocking(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
-
-	}
-	
-}
-
-static void get_ref_data(H264Context_spu *h, H264Mb *mb, H264mc_part *mc_part, int idx){
-	H264slice *s = h->s;
-	int x_offset = mc_part->x_offset;
-	int y_offset = mc_part->y_offset;
-	int list0 = mc_part->list0;
-	int list1 = mc_part->list1;
-	int n = mc_part->n;
-	int chroma_height = mc_part->chroma_height;
-	Picture_spu *refpic;
-	
-	x_offset += 8*mb->mb_x;
-    y_offset += 8*mb->mb_y;
-	
-	if(list0){
-		refpic= &s->ref_list[0][ mb->ref_cache[0][ scan8[n] ] ];
-		get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 0, x_offset, y_offset, idx);
-	}
-	if(list1){
-		refpic= &s->ref_list[1][ mb->ref_cache[1][ scan8[n] ] ];
-		get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 1, x_offset, y_offset, idx);
-	}
-}
-
-void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc){
-	int idx = h->mc_idx;
-	int i;
-
-	get_list = get_list_buf;
-	ref_ptr = mc_ref[idx];
-	for(i=0; i<mc->npart; i++){
-		get_ref_data(h, mb, &mc->mc_part[i], idx);
-	}
-}
-
-static void mc_dir_part(H264Context_spu *h, H264mc_part* mc_part, int n, int chroma_height, int list, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, int stride_y, int stride_c){
-	
-	H264Mb *mb = h->mb;
-	ref_data* ref = &mc_part->ref[list];
-    const int mx= mb->mv_cache[list][ scan8[n] ][0];	//to determine the interpolation mode
-    const int my= mb->mv_cache[list][ scan8[n] ][1];
-    const int luma_xy= (mx&3) + ((my&3)<<2);
-	uint8_t *src_y, *src_cb, *src_cr;
-    
-	src_y = ref->data[0] +2+2*STRIDE_Y;
-	src_cb = ref->data[1];
-	src_cr = ref->data[2];
-	
-	qpix_op[luma_xy](dest_y, src_y, stride_y, chroma_height*2);
-	chroma_op(dest_cb, src_cb, stride_c, chroma_height, mx&7, my&7);
-	chroma_op(dest_cr, src_cr, stride_c, chroma_height, mx&7, my&7);
-}
-
-
-static void mc_part_biweighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg){
-
-	H264Mb *mb = h->mb;
-	H264slice *s = h->s;
-	int n = mc_part->n;
-	int chroma_height = mc_part->chroma_height;
-	int itp = mc_part->itp;
-	int refn0 = mb->ref_cache[0][ scan8[n] ];
-	int refn1 = mb->ref_cache[1][ scan8[n] ];        
-	qpel_mc_func *qpix_put=  h->dsp.put_h264_qpel_pixels_tab[itp];
-    h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp];
-    
-	// don't optimize for luma-only case, since B-frames usually
-	// use implicit weights => chroma too. 
-	mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c);
-	
-	mc_dir_part(h, mc_part, n, chroma_height, 1, tmp_y_ls, tmp_cb_ls, tmp_cr_ls, qpix_put, chroma_put, STRIDE_Y, STRIDE_C);
-
-	if(s->use_weight == 2){
-		int weight0 = s->implicit_weight[refn0][refn1][mb->mb_y&1];
-		int weight1 = 64 - weight0;
-		luma_weight_avg(  dest_y,  tmp_y_ls, stride_y, STRIDE_Y, 5, weight0, weight1, 0);
-		chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0);
-		chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0);
-	}else{
-		luma_weight_avg(dest_y, tmp_y_ls, stride_y, STRIDE_Y, s->luma_log2_weight_denom,  s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0], s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]);
-		
-		chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0], s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]);
-		
-		chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0], s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]);
-	}
-}
-
-static void mc_part_weighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, int list1){
-
-	H264Mb *mb = h->mb;
-	H264slice *s = h->s;
-
-	int n = mc_part->n;
-	int chroma_height = mc_part->chroma_height;
-	int itp = mc_part->itp;
-	qpel_mc_func *qpix_put=  h->dsp.put_h264_qpel_pixels_tab[itp];
-    h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp];
-    
-    int list = list1 ? 1 : 0;
-	int refn = mb->ref_cache[list][ scan8[n] ];      
-
-	mc_dir_part(h, mc_part, n, chroma_height, list, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c);
-
-	luma_weight_op(dest_y, stride_y, s->luma_log2_weight_denom, s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]);
-	if(s->use_weight_chroma){
-		chroma_weight_op(dest_cb, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]);
-		
-		chroma_weight_op(dest_cr, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]);
-	}
-}
-
-
-static void mc_part_std(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, int list0, int list1){
-	int n = mc_part->n;
-	int chroma_height = mc_part->chroma_height;
-	int itp = mc_part->itp;
-
-    qpel_mc_func *qpix_op=  h->dsp.put_h264_qpel_pixels_tab[itp];
-    h264_chroma_mc_func chroma_op= h->dsp.put_h264_chroma_pixels_tab[itp];
-    
-    if(list0){
-        mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c);
-
-        qpix_op=  h->dsp.avg_h264_qpel_pixels_tab[itp];
-        chroma_op= h->dsp.avg_h264_chroma_pixels_tab[itp];
-    }
-
-    if(list1){
-        mc_dir_part(h, mc_part, n, chroma_height, 1, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c);
-    }
-}
-
-static void mc_part(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
-	H264slice *s = h->s;
-	
-	int weight = mc_part->weight;
-	
-	int x_offset = mc_part->x_offset;
-	int y_offset = mc_part->y_offset;
-	int list0 = mc_part->list0;
-	int list1 = mc_part->list1;
-    
-	dest_y  += 2*x_offset + 2*y_offset*stride_y;
-    dest_cb +=   x_offset +   y_offset*stride_c;
-    dest_cr +=   x_offset +   y_offset*stride_c;
-    
-	if(list0 && list1 && s->use_weight !=0){
-		h264_biweight_func *weight_avg = &h->dsp.biweight_h264_pixels_tab[weight];
-        mc_part_biweighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_avg[0], weight_avg[3]);
-	}
-	else if ((list0 || list1) && s->use_weight ==1){
-		h264_weight_func *weight_op = &h->dsp.weight_h264_pixels_tab[weight];
-		mc_part_weighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_op[0], weight_op[3], list1);
-	}
-	else{
-        mc_part_std(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, list0, list1);
-	}
-}
-
-void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
-	int i;
-	H264mc *mc =h->mc; 
-	for(i=0; i<mc->npart; i++){
-		mc_part(h, &mc->mc_part[i], dest_y, dest_cb, dest_cr, stride_y, stride_c);
-	}
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,53 +0,0 @@
-#ifndef H264_MC_SPU_H
-#define H264_MC_SPU_H
-
-//#include "types_spu.h"
-
-// motion compensation constants:
-#define MB_TYPE_16x16      0x0008
-#define MB_TYPE_16x8       0x0010
-#define MB_TYPE_8x16       0x0020
-#define MB_TYPE_8x8        0x0040
-#define MB_TYPE_P0L0       0x1000
-#define IS_16X16(a)        ((a)&MB_TYPE_16x16)
-#define IS_16X8(a)         ((a)&MB_TYPE_16x8)
-#define IS_8X16(a)         ((a)&MB_TYPE_8x16)
-#define IS_8X8(a)          ((a)&MB_TYPE_8x8)
-#define IS_SUB_8X8(a)      ((a)&MB_TYPE_16x16) //note reused
-#define IS_SUB_8X4(a)      ((a)&MB_TYPE_16x8)  //note reused
-#define IS_SUB_4X8(a)      ((a)&MB_TYPE_8x16)  //note reused
-#define IS_SUB_4X4(a)      ((a)&MB_TYPE_8x8)   //note reused
-#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list))))
-
-#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
-#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
-
-//Motion compensation buffer strides
-#define STRIDE_Y 48 
-#define STRIDE_C 32
-
-typedef struct ref_data{
-	uint8_t *data[3];
-}ref_data;
-
-typedef struct H264mc_part{
-	int n;
-	int chroma_height;
-	int x_offset;
-	int y_offset;
-	int itp;
-	int weight;
-	int list0;
-	int list1;
-	int use_weight;
-	ref_data ref[2];
-
-}H264mc_part;
-
-typedef struct H264mc{
-	H264mc_part mc_part[16];
-	int npart;
-}H264mc;
-
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_pred_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_pred_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,90 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 prediction functions.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#ifndef AVCODEC_H264PRED_H
-#define AVCODEC_H264PRED_H
-
-//#include "libavutil/common.h"
-//#include "dsputil.h"
-
-/**
- * Prediction types
- */
-//@{
-#define VERT_PRED             0
-#define HOR_PRED              1
-#define DC_PRED               2
-#define DIAG_DOWN_LEFT_PRED   3
-#define DIAG_DOWN_RIGHT_PRED  4
-#define VERT_RIGHT_PRED       5
-#define HOR_DOWN_PRED         6
-#define VERT_LEFT_PRED        7
-#define HOR_UP_PRED           8
-
-#define LEFT_DC_PRED          9
-#define TOP_DC_PRED           10
-#define DC_128_PRED           11
-
-#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN   12
-#define HOR_UP_PRED_RV40_NODOWN           13
-#define VERT_LEFT_PRED_RV40_NODOWN        14
-
-#define DC_PRED8x8            0
-#define HOR_PRED8x8           1
-#define VERT_PRED8x8          2
-#define PLANE_PRED8x8         3
-
-#define LEFT_DC_PRED8x8       4
-#define TOP_DC_PRED8x8        5
-#define DC_128_PRED8x8        6
-
-#define ALZHEIMER_DC_L0T_PRED8x8 7
-#define ALZHEIMER_DC_0LT_PRED8x8 8
-#define ALZHEIMER_DC_L00_PRED8x8 9
-#define ALZHEIMER_DC_0L0_PRED8x8 10
-//@}
-
-/**
- * Context for storing H.264 prediction functions
- */
-typedef struct H264PredContext{
-    void (*pred4x4  [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
-    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
-    void (*pred8x8  [4+3+4])(uint8_t *src, int stride);
-    void (*pred16x16[4+3])(uint8_t *src, int stride);
-
-    void (*pred4x4_add  [2])(uint8_t *pix/*align  4*/, const DCTELEM *block/*align 16*/, int stride);
-    void (*pred8x8l_add [2])(uint8_t *pix/*align  8*/, const DCTELEM *block/*align 16*/, int stride);
-    void (*pred8x8_add  [3])(uint8_t *pix/*align  8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
-    void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
-}H264PredContext;
-
-void ff_h264_pred_init(H264PredContext *h);
-void ff_h264_pred_init_arm(H264PredContext *h);
-
-
-#endif /* AVCODEC_H264PRED_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-#include <stdint.h>
-#include "h264_tables.h"
-
-uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP] = {0, };
-
-int block_offset[16+4+4];
-
-void ff_cropTbl_init(){
-    int i;
-    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
-    for(i=0;i<MAX_NEG_CROP;i++) {
-        ff_cropTbl[i] = 0;
-        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
-    }
-}
-
-void init_block_offset(int linesize, int uvlinesize){
-	int i;
-	for(i=0; i<16; i++){
-        block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*linesize*((scan8[i] - scan8[0])>>3);
-    }
-    for(i=0; i<4; i++){
-        block_offset[16+i]=
-        block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*uvlinesize*((scan8[i] - scan8[0])>>3);
-    }
-}
\ No newline at end of file
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-#ifndef H264_TABLES_H
-#define H264_TABLES_H
-
-#define MAX_NEG_CROP       1024
-
-extern uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP];
-extern int block_offset[16+4+4];
-
-static const uint8_t scan8[16 + 2*4]={
-	4+1*8, 5+1*8, 4+2*8, 5+2*8,
-	6+1*8, 7+1*8, 6+2*8, 7+2*8,
-	4+3*8, 5+3*8, 4+4*8, 5+4*8,
-	6+3*8, 7+3*8, 6+4*8, 7+4*8,
-	1+1*8, 2+1*8,
-	1+2*8, 2+2*8,
-	1+4*8, 2+4*8,
-	1+5*8, 2+5*8,
-};
-
-static const uint8_t ff_zigzag_direct[64] = {
-    0,   1,  8, 16,  9,  2,  3, 10,
-    17, 24, 32, 25, 18, 11,  4,  5,
-    12, 19, 26, 33, 40, 48, 41, 34,
-    27, 20, 13,  6,  7, 14, 21, 28,
-    35, 42, 49, 56, 57, 50, 43, 36,
-    29, 22, 15, 23, 30, 37, 44, 51,
-    58, 59, 52, 45, 38, 31, 39, 46,
-    53, 60, 61, 54, 47, 55, 62, 63
-};
-
-static const uint8_t zigzag_scan[16]={
- 0+0*4, 1+0*4, 0+1*4, 0+2*4,
- 1+1*4, 2+0*4, 3+0*4, 2+1*4,
- 1+2*4, 0+3*4, 1+3*4, 2+2*4,
- 3+1*4, 3+2*4, 2+3*4, 3+3*4,
-};
-
-static const uint8_t luma_dc_zigzag_scan[16]={
- 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
- 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
- 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
- 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
-};
-
-static const uint8_t chroma_dc_scan[4]={
- (0+0*2)*16, (1+0*2)*16,
- (0+1*2)*16, (1+1*2)*16,  //FIXME
-};
-
-static const uint8_t rem6[52]={
-0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-};
-
-static const uint8_t div6[52]={
-0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
-};
-
-static const uint8_t dequant4_coeff_init[6][3]={
-  {10,13,16},
-  {11,14,18},
-  {13,16,20},
-  {14,18,23},
-  {16,20,25},
-  {18,23,29},
-};
-
-static const uint8_t dequant8_coeff_init_scan[16] = {
-  0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
-};
-static const uint8_t dequant8_coeff_init[6][6]={
-  {20,18,32,19,25,24},
-  {22,19,35,21,28,26},
-  {26,23,42,24,33,31},
-  {28,25,45,26,35,33},
-  {32,28,51,30,40,38},
-  {36,32,58,34,46,43},
-};
-
-
-void init_block_offset(int linesize, int uvlinesize);
-void ff_cropTbl_init();
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_types_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_types_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,203 +0,0 @@
-#ifndef H264_CELL_TYPES_H
-#define H264_CELL_TYPES_H
-
-#include <libsync.h>
-#include <libavcodec/avcodec.h>
-
-typedef struct spe_pos{
-	volatile int count;		//number of mb processed
-	uint32_t pad[3];
-}spe_pos;
-
-//only the picture pointers are needed from the picture struct;
-typedef struct Picture_spu {
-	uint8_t* data[3];
-} Picture_spu;
-
-///For Cell, might be idea to use this instead for everything
-// struct that contains the pararms that change on slice
-typedef struct H264slice{
-	int deblocking_filter;
-    int linesize;
-    int uvlinesize;
-	int mb_width;
-	int mb_height;
-
-    int use_weight;
-    int use_weight_chroma;
-    int luma_log2_weight_denom;
-    int chroma_log2_weight_denom;
-
-    int16_t luma_weight[16][2][2];
-    int16_t chroma_weight[16][2][2][2];
-    int16_t implicit_weight[16][16][2];
-
-	// ref picture ptr
-    Picture_spu ref_list[2][16];
-	int state;
-	int emu_edge_width;
-    int emu_edge_height;
-
-    int slice_type;
-	int slice_type_nos;
-	int slice_alpha_c0_offset;
-    int slice_beta_offset;
-
-	uint8_t chroma_qp_table[2][64];
-
-	H264Mb *blocks;
-	uint8_t  *dst_y, *dst_cb, *dst_cr;
-
-    //uint32_t pad[2];		// padding the structure for multiple of 16 bytes
-}H264slice;
-
-typedef struct 	H264spe{
-#define EDIP 0
-#define EDB  1
-#define MBD  2
-	int type;
-	int idx;
-	int spe_id;
-	int spe_total;
-	int mb_width;
-	int mb_stride;
-	int mb_height;
-	int linesize;
-	int uvlinesize;
-	//H264slice* slice_params;
-	void* src_spe;
-	void* tgt_spe;
-
-	mutex_ea_t lock;
-	cond_ea_t cond;
-	atomic_ea_t cnt;
-
-	mutex_ea_t rl_lock;
-	cond_ea_t rl_cond;
-	atomic_ea_t rl_cnt;
-}H264spe;
-
-typedef struct H264Cabac_spu{
-	int blocking;
-
-    int top_cbp;
-    int left_cbp;
-    int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct
-
-    uint32_t dequant4_buffer[6][52][16];
-    uint32_t dequant8_buffer[2][52][64];
-    uint32_t (*dequant4_coeff[6])[16];
-    uint32_t (*dequant8_coeff[2])[64];
-
-    uint8_t (*non_zero_count_top)[32];
-	uint8_t (*non_zero_count)[32];
-
-	uint8_t (*mvd_top[2])[2];
-	uint8_t (*mvd[2])[2];
-
-	uint8_t *direct_top;
-	uint8_t *direct;    
-
-	uint8_t *chroma_pred_mode_top;
-	uint8_t *chroma_pred_mode;    
-
-	int8_t  *intra4x4_pred_mode_top;
-    int8_t  *intra4x4_pred_mode;	
-
-	uint16_t *cbp_top;
-	uint16_t *cbp;    
-
-	int8_t *qscale_top;
-	int8_t *qscale;	
-
-	int8_t *ref_index_top[2];
-	int8_t *ref_index[2];
-
-	int16_t (*motion_val_top[2])[2];
-	int16_t (*motion_val[2])[2];
-	uint32_t *mb_type_top;
-	uint32_t *mb_type;
-
-	int8_t *list1_ref_index[2];		
-	uint32_t *list1_mb_type;
-	DECLARE_ALIGNED_16(int16_t, list1_motion_val[2][4*4][2]); // fill for a macroblock when required
-
-	int b_stride;
-	int mb_stride;
-	int mb_width;
-	int mb_height;
-
-    uint8_t zigzag_scan[16];
-    uint8_t zigzag_scan8x8[64];
-
-    uint8_t direct_cache[5*8];
-    // Used to calculate loopfilter bS.
-    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2];
-    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8];
-    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
-    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2];
-
-} H264Cabac_spu;
-
-typedef struct EDSlice_spu{
-    PPS pps;                 ///< current pps
-    
-    H264Mb *mbs;
-
-    int state;
-    int qp_thresh;      ///< QP threshold to skip loopfilter
-
-	PictureInfo pic;
-	PictureInfo list1;
-//    Picture *ref_list[2][16];         ///Reordered version of default_ref_list according to picture reordering in slice header
-    int ref_count[2];   ///< counts frames or fields, depending on current mb mode
-	int slice_type;
-    int slice_type_nos;
-	int direct_8x8_inference_flag;
-
-    uint8_t list_count;
-    uint32_t coded_pic_num;
-///stuff only needed for nal/entropy decoding
-    H264Mb *m;
-    //GetBitContext gb;
-	const uint8_t *bytestream_start;
-	int byte_bufsize;
-    int transform_bypass;
-    int direct_spatial_mv_pred;
-    int map_col_to_list0[2][16];
-    int dist_scale_factor[16];
-
-    int cabac_init_idc;
-    int ref2frm[2][64];  ///< reference to frame number lists, the first 2 are for -2,-1
-    int qscale;
-    int chroma_qp[2]; //QPc
-    int last_qscale_diff;
-
-//  Picture* release_ref[MAX_MMCO_COUNT];
-//   int release_cnt;
-
-
-//     int use_weight;
-//     int use_weight_chroma;
-//    int luma_log2_weight_denom;
-//    int chroma_log2_weight_denom;
-
-//     int8_t luma_weight[16][2][2];
-//     int8_t chroma_weight[16][2][2][2];
-//     int8_t implicit_weight[16][16][2];
-
-
-
-//  int slice_alpha_c0_offset;
-//  int slice_beta_offset;
-    
-//    int nal_ref_idc;
-//    int nal_unit_type;
-//     uint8_t *rbsp_buffer;
-//     unsigned int rbsp_buffer_size;
-
-
-
-} EDSlice_spu;
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/mathops_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/mathops_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,137 +0,0 @@
-/*
- * simple math operations
- * Copyright (c) 2001, 2002 Fabrice Bellard
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef AVCODEC_MATHOPS_H
-#define AVCODEC_MATHOPS_H
-
-// #include "libavutil/common.h"
-// #include "libavutil/internal.h"
-// 
-// /* generic implementation */
-// 
-// #ifndef MULL
-// #   define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s))
-// #endif
-// 
-// #ifndef MULH
-// //gcc 3.4 creates an incredibly bloated mess out of this
-// //#    define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32)
-// 
-// static av_always_inline int MULH(int a, int b){
-//     return ((int64_t)(a) * (int64_t)(b))>>32;
-// }
-// #endif
-// 
-// #ifndef UMULH
-// static av_always_inline unsigned UMULH(unsigned a, unsigned b){
-//     return ((uint64_t)(a) * (uint64_t)(b))>>32;
-// }
-// #endif
-// 
-// #ifndef MUL64
-// #   define MUL64(a,b) ((int64_t)(a) * (int64_t)(b))
-// #endif
-// 
-// #ifndef MAC64
-// #   define MAC64(d, a, b) ((d) += MUL64(a, b))
-// #endif
-// 
-// #ifndef MLS64
-// #   define MLS64(d, a, b) ((d) -= MUL64(a, b))
-// #endif
-// 
-// /* signed 16x16 -> 32 multiply add accumulate */
-// #ifndef MAC16
-// #   define MAC16(rt, ra, rb) rt += (ra) * (rb)
-// #endif
-// 
-// /* signed 16x16 -> 32 multiply */
-// #ifndef MUL16
-// #   define MUL16(ra, rb) ((ra) * (rb))
-// #endif
-// 
-// #ifndef MLS16
-// #   define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb))
-// #endif
-
-/* median of 3 */
-#ifndef mid_pred
-#define mid_pred mid_pred
-static inline av_const int mid_pred(int a, int b, int c)
-{
-#if 0
-    int t= (a-b)&((a-b)>>31);
-    a-=t;
-    b+=t;
-    b-= (b-c)&((b-c)>>31);
-    b+= (a-b)&((a-b)>>31);
-
-    return b;
-#else
-    if(a>b){
-        if(c>b){
-            if(c>a) b=a;
-            else    b=c;
-        }
-    }else{
-        if(b>c){
-            if(c>a) b=c;
-            else    b=a;
-        }
-    }
-    return b;
-#endif
-}
-#endif
-
-// #ifndef sign_extend
-// static inline av_const int sign_extend(int val, unsigned bits)
-// {
-//     return (val << (INT_BIT - bits)) >> (INT_BIT - bits);
-// }
-// #endif
-// 
-// #ifndef zero_extend
-// static inline av_const unsigned zero_extend(unsigned val, unsigned bits)
-// {
-//     return (val << (INT_BIT - bits)) >> (INT_BIT - bits);
-// }
-// #endif
-// 
-// #ifndef COPY3_IF_LT
-// #define COPY3_IF_LT(x, y, a, b, c, d)\
-// if ((y) < (x)) {\
-//     (x) = (y);\
-//     (a) = (b);\
-//     (c) = (d);\
-// }
-// #endif
-// 
-// #ifndef NEG_SSR32
-// #   define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s)))
-// #endif
-// 
-// #ifndef NEG_USR32
-// #   define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s)))
-// #endif
-
-#endif /* AVCODEC_MATHOPS_H */
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/rectangle_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/rectangle_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,92 +0,0 @@
-/*
- * rectangle filling function
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * useful rectangle filling function
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#ifndef AVCODEC_RECTANGLE_H
-#define AVCODEC_RECTANGLE_H
-
-#include <assert.h>
-
-#define STRIDE_ALIGN 16
-
-
-/**
- * fill a rectangle.
- * @param h height of the rectangle, should be a constant
- * @param w width of the rectangle, should be a constant
- * @param size the size of val (1, 2 or 4), should be a constant
- */
-static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
-    uint8_t *p= (uint8_t*)vp;
-    assert(size==1 || size==2 || size==4);
-    assert(w<=4);
-
-    w      *= size;
-    stride *= size;
-
-    assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
-    assert((stride&(w-1))==0);
-    if(w==2){
-        const uint16_t v= size==4 ? val : val*0x0101;
-        *(uint16_t*)(p + 0*stride)= v;
-        if(h==1) return;
-        *(uint16_t*)(p + 1*stride)= v;
-        if(h==2) return;
-        *(uint16_t*)(p + 2*stride)= v;
-        *(uint16_t*)(p + 3*stride)= v;
-    }else if(w==4){
-        const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101;
-        *(uint32_t*)(p + 0*stride)= v;
-        if(h==1) return;
-        *(uint32_t*)(p + 1*stride)= v;
-        if(h==2) return;
-        *(uint32_t*)(p + 2*stride)= v;
-        *(uint32_t*)(p + 3*stride)= v;
-    }else if(w==8){
-        const uint64_t v=  size==2 ? val*0x0001000100010001ULL : val*0x0100000001ULL;
-        *(uint64_t*)(p + 0*stride)= v;
-        if(h==1) return;
-        *(uint64_t*)(p + 1*stride)= v;
-        if(h==2) return;
-        *(uint64_t*)(p + 2*stride)= v;
-        *(uint64_t*)(p + 3*stride)= v;
-    }else if(w==16){
-        const uint64_t v= val*0x0100000001ULL;
-        *(uint64_t*)(p + 0+0*stride)= v;
-        *(uint64_t*)(p + 8+0*stride)= v;
-        *(uint64_t*)(p + 0+1*stride)= v;
-        *(uint64_t*)(p + 8+1*stride)= v;
-        if(h==2) return;
-        *(uint64_t*)(p + 0+2*stride)= v;
-        *(uint64_t*)(p + 8+2*stride)= v;
-        *(uint64_t*)(p + 0+3*stride)= v;
-        *(uint64_t*)(p + 8+3*stride)= v;
-    }else
-        assert(0);
-    assert(h==4);
-}
-
-#endif /* AVCODEC_RECTANGLE_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/spe_ed.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/spe_ed.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,508 +0,0 @@
-#define CELL_SPE
-
-#include <string.h>
-#include <stdio.h>
-#include <spu_intrinsics.h>
-#include <spu_mfcio.h>
-#include "libavcodec/avcodec.h"
-#include "h264_cabac_spu.h"
-#include "cabac_spu.h"
-#include "h264_types_spu.h"
-#include "h264_tables.h"
-#include "h264_dma.h"
-#include "h264_tables.h"
-
-#define MB_WIDTH 240
-#define MB_STRIDE (MB_WIDTH+16)
-
-H264Cabac_spu hcabac;
-CABACContext cabac;
-DECLARE_ALIGNED_16(EDSlice_spu, slice[2]);
-DECLARE_ALIGNED_16(H264Mb, mb[2]);
-DECLARE_ALIGNED_16(H264spe, spe);
-
-DECLARE_ALIGNED_16(uint8_t, non_zero_count_table[2][MB_STRIDE][32]);
-DECLARE_ALIGNED_16(uint8_t, mvd_table[2][2][8*MB_STRIDE][2]);
-DECLARE_ALIGNED_16(uint8_t, direct_table[2][4*MB_STRIDE]);
-DECLARE_ALIGNED_16(uint8_t, chroma_pred_mode_table[2][MB_STRIDE]);
-DECLARE_ALIGNED_16(uint8_t, intra4x4_pred_mode_table[2][8*MB_STRIDE]);
-DECLARE_ALIGNED_16(uint16_t,cbp_table[2][MB_STRIDE]);
-DECLARE_ALIGNED_16(uint8_t, qscale_table[2][MB_STRIDE]);
-
-DECLARE_ALIGNED_16(uint32_t, mb_type_table[2][MB_STRIDE]);
-DECLARE_ALIGNED_16(int8_t, ref_index_table[2][2][4*MB_STRIDE]);
-DECLARE_ALIGNED_16(int16_t, motion_val_table[2][2][4*4*MB_WIDTH][2]);
-
-DECLARE_ALIGNED(128, uint8_t, bytestream_ls[4096]);
-DECLARE_ALIGNED_16(uint32_t, list1_mb_type_table[2][MB_STRIDE]);
-DECLARE_ALIGNED_16(int8_t, list1_ref_index_table[2][2][4*MB_STRIDE]);
-
-DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending
-//mb position of neighbouring spes
-DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1
-static int total_lines;
-
-static inline int dep_resolved(H264spe *p){
-	int spe_id = p->spe_id;
-	volatile int lines_proc = src_spe.count;
-	if (spe_id==0)
-		return (total_lines < lines_proc-1 +p->mb_height)? 1:0;
-	else
-		return (total_lines < lines_proc-1)? 1:0;
-}
-
-static void update_tgt_spe_dep(H264spe *p, int end){
-	// 	if (end ){
-   total_lines++;
-   spe_pos* dma_spe = &dma_temp;
-   spe_pos* tgt_spe = p->tgt_spe + (unsigned) &src_spe; //located in target spe local store
-   dma_spe->count = end? total_lines+1: total_lines;
-   spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), ED_put);
-   // 	}
-   
-}
-
-static int init_cabac(H264spe *p, H264Cabac_spu *hc){
-	hc->mb_height = p->mb_height;
-	hc->mb_width = p->mb_width;
-	hc->b_stride = 4*p->mb_width;
-	hc->mb_stride = p->mb_stride;
-	
-	for(int i=0; i<16; i++){
-		#define T(x) (x>>2) | ((x<<2) & 0xF)
-		hc->zigzag_scan[i] = T(zigzag_scan[i]);
-		#undef T
-	}
-	for(int i=0; i<64; i++){
-		#define T(x) (x>>3) | ((x&7)<<3)
-		hc->zigzag_scan8x8[i] = T(ff_zigzag_direct[i]);
-		#undef T
-	}
-}
-
-static void reset_cabac_buffers(){
- memset(intra4x4_pred_mode_table, 0, sizeof(intra4x4_pred_mode_table));
-	memset(mvd_table, 0, sizeof(mvd_table));
-	memset(direct_table, 0, sizeof(direct_table));
-	memset(chroma_pred_mode_table, 0, sizeof(chroma_pred_mode_table));
-	memset(cbp_table, 0, sizeof(cbp_table));
-	memset(qscale_table, 0, sizeof(qscale_table));
- 	memset(mb_type_table, 0, sizeof(mb_type_table));
-	memset(ref_index_table, 0, sizeof(ref_index_table));
-	memset(motion_val_table, 0, sizeof(motion_val_table));
-}
-
-static void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int bufsize){
-	int align = (unsigned) buf & 0xF;
-	int dma_size;
-	
-	c->bytestream_ea_start=
-	c->bytestream_ea= buf;
-	c->bytestream_ea_end= buf + bufsize;
-	c->bufsize = bufsize;
-	
-	if (bufsize + align >= sizeof(bytestream_ls)){
-		dma_size = sizeof(bytestream_ls);
-		c->bufsize = c->bufsize +align - sizeof(bytestream_ls);				
-	}else{
-		int align_end = (bufsize+align) &0xF;
-		if (align_end)
-			dma_size = bufsize+align + 16-align_end;
-		else
-			dma_size = bufsize+align;
-		c->bufsize = 0;
-	}
-// 	printf("%d\n", dma_size);
-	c->bytestream_end  = &bytestream_ls[dma_size]; 
-	c->bytestream_start= c->bytestream = &bytestream_ls[align];
- 	spu_dma_get(bytestream_ls, (unsigned) buf - align, dma_size, ED_get );
-	c->bytestream_ea_start=
-	c->bytestream_ea= buf + dma_size -align;
-
-	wait_dma_id(ED_get);
-	
-	if (align %2){
-		c->low =  (*c->bytestream++)<<18;
-		c->low+=  (*c->bytestream++)<<10;
-		c->low+= ((*c->bytestream++)<<2) + 2;
-	}else {
-		c->low =  (*c->bytestream++)<<18;
-		c->low+=  (*c->bytestream++)<<10;
-		c->low+=  (2<<8);
-	}
-
-	c->range= 0x1FE;
-	bytecount=0;
-}
-
-static void init_dequant8_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){
-    int i,q,x;
-    const int transpose = HAVE_ALTIVEC;
-    hc->dequant8_coeff[0] = hc->dequant8_buffer[0];
-    hc->dequant8_coeff[1] = hc->dequant8_buffer[1];
-
-    for(i=0; i<2; i++){
-        if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
-            hc->dequant8_coeff[1] = hc->dequant8_buffer[0];
-            break;
-        }
-
-        for(q=0; q<52; q++){
-            int shift = div6[q];
-            int idx = rem6[q];
-            for(x=0; x<64; x++)
-                hc->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
-                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
-                    s->pps.scaling_matrix8[i][x]) << shift;
-        }
-    }
-}
-
-static void init_dequant4_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){
-    int i,j,q,x;
-    const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON;
-    for(i=0; i<6; i++ ){
-        hc->dequant4_coeff[i] = hc->dequant4_buffer[i];
-        for(j=0; j<i; j++){
-            if(!memcmp(s->pps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
-                hc->dequant4_coeff[i] = hc->dequant4_buffer[j];
-                break;
-            }
-        }
-        if(j<i)
-            continue;
-
-        for(q=0; q<52; q++){
-            int shift = div6[q] + 2;
-            int idx = rem6[q];
-            for(x=0; x<16; x++)
-                hc->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
-                    ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
-                    s->pps.scaling_matrix4[i][x]) << shift;
-        }
-    }
-}
-
-static void init_dequant_tables(EDSlice_spu *s, H264Cabac_spu *hc){
-    int i,x;
-
-    init_dequant4_coeff_table(s, hc);
-    if(s->pps.transform_8x8_mode)
-        init_dequant8_coeff_table(s, hc);
-    if(s->transform_bypass){
-        for(i=0; i<6; i++)
-            for(x=0; x<16; x++)
-                hc->dequant4_coeff[i][0][x] = 1<<6;
-        if(s->pps.transform_8x8_mode)
-            for(i=0; i<2; i++)
-                for(x=0; x<64; x++)
-                    hc->dequant8_coeff[i][0][x] = 1<<6;
-    }
-}
-
-static void init_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s){
-	hc->non_zero_count_top 		= non_zero_count_table[0];
-	hc->non_zero_count     		= non_zero_count_table[1];
-	hc->mvd_top[0]				= mvd_table[0][0];
-	hc->mvd[0]					= mvd_table[0][1];
-	hc->mvd_top[1]				= mvd_table[1][0];
-	hc->mvd[1]					= mvd_table[1][1];
-	hc->direct_top		   		= direct_table[0];
-	hc->direct			   		= direct_table[1];
-	hc->chroma_pred_mode_top	= chroma_pred_mode_table[0];
-	hc->chroma_pred_mode  		= chroma_pred_mode_table[1];
-	hc->intra4x4_pred_mode_top	= intra4x4_pred_mode_table[0];
-	hc->intra4x4_pred_mode  	= intra4x4_pred_mode_table[1];
-	hc->cbp_top			   		= cbp_table[0];
-	hc->cbp				   		= cbp_table[1];
-	hc->qscale_top			   	= qscale_table[0] +1;
-	hc->qscale				   	= qscale_table[1] +1;
-
-	hc->mb_type_top 			= mb_type_table[0]+1;
-	hc->mb_type		 			= mb_type_table[1]+1;
-	hc->ref_index_top[0]		= ref_index_table[0][0];
-	hc->ref_index_top[1]		= ref_index_table[1][0];
-	hc->ref_index[0]			= ref_index_table[0][1];
-	hc->ref_index[1]			= ref_index_table[1][1];
-	hc->motion_val_top[0] 		= motion_val_table[0][0];
-	hc->motion_val_top[1] 		= motion_val_table[1][0];
-	hc->motion_val[0] 			= motion_val_table[0][1];
-	hc->motion_val[1] 			= motion_val_table[1][1];
-
-	int mb_stride = hc->mb_stride;
-
-	if (s->slice_type_nos == FF_B_TYPE){
-		while(!dep_resolved(&spe));
-		spu_dma_get(list1_mb_type_table[0], (unsigned) (s->list1.mb_type -1), mb_stride*sizeof(uint32_t), ED_get);
-		spu_dma_get(list1_ref_index_table[0][0], (unsigned) s->list1.ref_index[0], mb_stride*4*sizeof(int8_t), ED_get);
-		spu_dma_get(list1_ref_index_table[0][1], (unsigned) s->list1.ref_index[1], mb_stride*4*sizeof(int8_t), ED_get);
-		wait_dma_id(ED_get);
-		spu_dma_get(list1_mb_type_table[1], (unsigned) (s->list1.mb_type -1 + mb_stride), mb_stride*sizeof(uint32_t), ED_get);
-		spu_dma_get(list1_ref_index_table[1][0], (unsigned) (s->list1.ref_index[0] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
-		spu_dma_get(list1_ref_index_table[1][1], (unsigned) (s->list1.ref_index[1] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
-		hc->list1_mb_type = list1_mb_type_table[0]+1;
-		hc->list1_ref_index[0] = list1_ref_index_table[0][0];
-		hc->list1_ref_index[1] = list1_ref_index_table[0][1];
-	}	
-
-}
-
-static void update_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s, int line){
-	int mb_stride = hc->mb_stride;
-	int mb_width = hc->mb_width;
-	int top = (line+1)%2;
-	int cur = line%2;
-	int bottom = (line+1)%2; //same as top, but to identify prebuffering of next line.
-
-	hc->non_zero_count_top 		= non_zero_count_table[top];
-	hc->non_zero_count     		= non_zero_count_table[cur];
-	hc->mvd_top[0]				= mvd_table[0][top];
-	hc->mvd[0]					= mvd_table[0][cur];
-	hc->mvd_top[1]				= mvd_table[1][top];
-	hc->mvd[1]					= mvd_table[1][cur];
-	hc->direct_top		   		= direct_table[top];
-	hc->direct			   		= direct_table[cur];
-	hc->chroma_pred_mode_top	= chroma_pred_mode_table[top];
-	hc->chroma_pred_mode  		= chroma_pred_mode_table[cur];
-	hc->intra4x4_pred_mode_top	= intra4x4_pred_mode_table[top];
-	hc->intra4x4_pred_mode  	= intra4x4_pred_mode_table[cur];
-	hc->cbp_top			   		= cbp_table[top];
-	hc->cbp				   		= cbp_table[cur];
-	hc->qscale_top			   	= qscale_table[top] +1;
-	hc->qscale				   	= qscale_table[cur] +1;
-
-	hc->mb_type_top 			= mb_type_table[top]+1;
-	hc->mb_type		 			= mb_type_table[cur]+1;
-	hc->ref_index_top[0]		= ref_index_table[0][top];
-	hc->ref_index_top[1]		= ref_index_table[1][top];
-	hc->ref_index[0]			= ref_index_table[0][cur];
-	hc->ref_index[1]			= ref_index_table[1][cur];
-	hc->motion_val_top[0] 		= motion_val_table[0][top];
-	hc->motion_val_top[1] 		= motion_val_table[1][top];
-	hc->motion_val[0] 			= motion_val_table[0][cur];
-	hc->motion_val[1] 			= motion_val_table[1][cur];
-
-	wait_dma_id(ED_put);
-	
-	spu_dma_put(mb_type_table[top], (unsigned) (s->pic.mb_type -1 + line*mb_stride), mb_stride*sizeof(uint32_t), ED_put);
-	spu_dma_put(ref_index_table[0][top], (unsigned) (s->pic.ref_index[0] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put);
-	spu_dma_put(ref_index_table[1][top], (unsigned) (s->pic.ref_index[1] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put);
-	spu_dma_put(motion_val_table[0][top], (unsigned) (s->pic.motion_val[0]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put);
-	spu_dma_put(motion_val_table[1][top], (unsigned) (s->pic.motion_val[1]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put);
-
-	if (s->slice_type_nos == FF_B_TYPE){
-		update_tgt_spe_dep(&spe, 0);
-		wait_dma_id(ED_get);
-						
-		if (line + 2 < hc->mb_height){
-			while(!dep_resolved(&spe));
-			spu_dma_get(list1_mb_type_table[cur], (unsigned) (s->list1.mb_type -1 + (line+2)*mb_stride), mb_stride*sizeof(uint32_t), ED_get);
-			spu_dma_get(list1_ref_index_table[cur][0], (unsigned) (s->list1.ref_index[0] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
-			spu_dma_get(list1_ref_index_table[cur][1], (unsigned) (s->list1.ref_index[1] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
-		}
-		hc->list1_mb_type = list1_mb_type_table[bottom]+1;
-		hc->list1_ref_index[0] = list1_ref_index_table[bottom][0];
-		hc->list1_ref_index[1] = list1_ref_index_table[bottom][1];
-	}
-
-}
-
-// void printmbdiff(EDSlice_spu *s, H264Cabac_spu *hc, H264Mb *mp, H264Mb *ms){
-// 
-// 	printf("mb_x %d, %d\n", mp->mb_x, ms->mb_x);
-// 	printf("mb_y %d, %d\n", mp->mb_y, ms->mb_y);
-// 	printf("mb_xy %d, %d\n", mp->mb_xy, ms->mb_xy);
-// 	printf("top_mb_xy %d, %d\n", mp->top_mb_xy, ms->top_mb_xy);
-// 	printf("left_mb_xy %d, %d\n", mp->left_mb_xy, ms->left_mb_xy);
-// 	printf("chroma_pred_mode %d, %d\n", mp->chroma_pred_mode, ms->chroma_pred_mode);
-// 	printf("intra16x16_pred_mode %d, %d\n", mp->intra16x16_pred_mode, ms->intra16x16_pred_mode);
-// 	printf("topleft_samples %d, %d\n", mp->topleft_samples_available, ms->topleft_samples_available);
-// 	printf("topright_samples %d, %d\n", mp->topright_samples_available, ms->topright_samples_available);
-// 	printf("top_samples %d, %d\n", mp->top_samples_available, ms->top_samples_available);
-// 	printf("left_samples %d, %d\n", mp->left_samples_available, ms->left_samples_available);
-// 
-// 	if (memcmp(mp->intra4x4_pred_mode_cache, ms->intra4x4_pred_mode_cache, 40)){
-// 		for (int i=0; i<5; i++){
-// 			for (int j=0; j<8; j++){
-// 				printf("%d, %d\t", mp->intra4x4_pred_mode_cache[i*8+j],ms->intra4x4_pred_mode_cache[i*8+j]);
-// 			}
-// 			printf("\n");
-// 		}
-// 	}
-// 
-// 	if (memcmp(mp->non_zero_count_cache, ms->non_zero_count_cache, 48)){
-// 		for (int i=0; i<6; i++){
-// 			for (int j=0; j<8; j++){
-// 				printf("%u, %u\t", mp->non_zero_count_cache[i*8+j],ms->non_zero_count_cache[i*8+j]);
-// 			}
-// 			printf("\n");
-// 		}
-// 	}
-// 
-// 	if (memcmp(mp->sub_mb_type, ms->sub_mb_type, 8)){
-// 		for (int i=0; i<4; i++){
-// 			printf("%u, %u\t", mp->sub_mb_type[i], mp->sub_mb_type[i]);
-// 			printf("\n");
-// 		}
-// 	}
-// 
-// 	if (memcmp(mp->mv_cache, ms->mv_cache, 320)){
-// 		for (int k=0; k<2; k++){
-// 			for (int i=0; i<5; i++){
-// 				for (int j=0; j<8; j++){
-// 					printf("%d, %d, %d, %d\t", mp->mv_cache[k][i*8+j][0], mp->mv_cache[k][i*8+j][1], ms->mv_cache[k][i*8+j][0], ms->mv_cache[k][i*8+j][1]);
-// 				}
-// 				printf("\n");
-// 			}
-// 		}
-// 	}
-// 
-// 	if (memcmp(mp->ref_cache, ms->ref_cache, 80)){
-// 		for (int k=0; k<2; k++){
-// 			for (int i=0; i<5; i++){
-// 				for (int j=0; j<8; j++){
-// 					printf("%d, %d\t", mp->ref_cache[k][i*8+j], ms->ref_cache[k][i*8+j]);
-// 				}
-// 				printf("\n");
-// 			}
-// 		}
-// 	}
-// 
-// 	printf("cbp %d, %d\n", mp->cbp, ms->cbp);
-// 	for (int i=0; i<hc->mb_stride; i++){
-//    		printf("%d, ", hc->cbp[i]); fflush(0);
-//    	}
-// 	printf("\n");
-// 
-// 	printf("mb_type %x, %x\n", mp->mb_type, ms->mb_type);
-// 	printf("mb_type IS_INTRA %d, IS_INTRA16x16 %d, IS_DIRECT %d\n", IS_INTRA(ms->mb_type), IS_INTRA16x16(ms->mb_type), IS_DIRECT(ms->mb_type) );
-// 	printf("left_type %d, %d\n", mp->left_type, ms->left_type);
-// 	printf("top_type %d, %d\n", mp->top_type, ms->top_type);
-// 	printf("qscale_mb_xy %d, %d\n", mp->qscale_mb_xy, ms->qscale_mb_xy);
-// 	printf("qscale_left_mb_xy %d, %d\n", mp->qscale_left_mb_xy, ms->qscale_left_mb_xy);
-// 	printf("qscale_top_mb_xy %d, %d\n", mp->qscale_top_mb_xy, ms->qscale_top_mb_xy);
-// // 	for (int i=0; i<hc->mb_stride; i++){
-// // 		printf("%d, ", qscale_table[0][i]); fflush(0);
-// // 	}
-// 
-// 	if (memcmp(mp->mb, ms->mb, 768)){
-// 		for (int i=0; i<16; i++){
-// 			for (int j=0; j<16; j++){
-// 				printf("%d, %d\t", mp->mb[j + i*16], ms->ref_cache[j + i*16]);
-// 			}
-// 			printf("\n");
-// 		}
-// 		for (int i=0; i<8; i++){
-// 			for (int j=0; j<8; j++){
-// 				printf("%d, %d\t", mp->mb[256 + j + i*8], ms->ref_cache[j + i*8]);
-// 			}
-// 			printf("\n");
-// 		}
-// 		for (int i=0; i<8; i++){
-// 			for (int j=0; j<8; j++){
-// 				printf("%d, %d\t", mp->mb[320+ j + i*8], ms->ref_cache[j + i*8]);
-// 			}
-// 			printf("\n");
-// 		}
-// 	}
-// 
-// 	if (memcmp(mp->bS, ms->bS, 32)){
-// 		for (int k=0; k<2; k++){
-// 			for (int i=0; i<4; i++){
-// 				for (int j=0; j<4; j++){
-// 					printf("%d, %d\t", mp->bS[k][i][j], mp->mv_cache[k][i][j]);
-// 				}
-// 				printf("\n");
-// 			}
-// 		}
-// 	}
-// 	if (memcmp(mp->edges, ms->edges, 4)){
-// 		printf("edges %d, %d, %d, %d\n", mp->edges[0], ms->edges[0], mp->edges[1], ms->edges[1]);
-// 		printf("deblock %d, %d\n", mp->deblock_mb, ms->deblock_mb);
-// 	}
-// 
-// 	printf("dequant4_coeff_y %d, %d\n", mp->dequant4_coeff_y, ms->dequant4_coeff_y);
-// 	printf("dequant4_coeff_cb %d, %d\n", mp->dequant4_coeff_cb, ms->dequant4_coeff_cb);
-// 	printf("dequant4_coeff_cr %d, %d\n", mp->dequant4_coeff_cr, ms->dequant4_coeff_cr);
-// }
-// DECLARE_ALIGNED_16(H264Mb, tmp);
-
-
-int main(unsigned long long id, unsigned long long argp){
-	EDSlice_spu *s;
-	H264Cabac_spu *hc = &hcabac;
-	CABACContext *c = &cabac;
-	H264spe *p = &spe;
-	
-	spu_write_out_mbox((unsigned) slice);
-	spu_dma_get(p, (unsigned) argp, sizeof(H264spe), ED_spe); //ID_slice is used out of convienience
-	wait_dma_id(ED_spe);
-
-	ff_init_cabac_states();
-	init_cabac(p, hc);
-	hc->blocking=0;
-	for(;;){
-		spu_read_in_mbox();
-		s = &slice[0];
-		reset_cabac_buffers();
-		init_entropy_buf(hc, s);
-
-		if (hc->blocking) wait_dma_id(ED_get);
-		//printf("framesize %d\n", s->byte_bufsize);fflush(0);
- 		init_dequant_tables(s, hc);
-		ff_init_cabac_decoder( c, s->bytestream_start, s->byte_bufsize );
- 		ff_h264_init_cabac_states(s, c);
-
-		int mb_slot=0;
- 		for(int j=0; j<hc->mb_height; j++){
-			for(int i=0; i<hc->mb_width; i++){
-				int eos,ret;
-				H264Mb *m = &mb[mb_slot];
-				m->mb_x=i;
-				m->mb_y=j;
-				s->m = m;
-
-				ret = ff_h264_decode_mb_cabac(hc, s, c);
-
-// 				spu_dma_get(&tmp, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_get);
-// 				wait_dma_id(ED_get);
-// 				if (memcmp(&tmp, m, sizeof(H264Mb))){
-// 					printf("coded pic num %d\n", s->coded_pic_num);
-// 					printmbdiff(s, hc,&tmp, m);
-// 					return 0;
-// 				}
-				//printf("qscale %d\n", m->qscale_mb_xy);
-				if (!hc->blocking){
-					if (mb_slot){
-						spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb1);
-						wait_dma_id(ED_putmb0);
-					}else {
-						spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0);
-						wait_dma_id(ED_putmb1);
-					}
-					mb_slot++; mb_slot%=2;
-				}else {
-					spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0);
-					wait_dma_id(ED_putmb0);
-				}
-				
-
-				eos = get_cabac_terminate( c);
-
-				if( ret < 0) {
-					fprintf(stderr, "error at %d bytecount\n", bytecount);
-					return -1;
-				}
-			}
-			update_entropy_buf(hc, s, j);
-			if (hc->blocking){ wait_dma_id(ED_get); wait_dma_id(ED_put);}
-		}
-		wait_dma_id(ED_put);
-		spu_write_out_mbox(1);
-
-	}
-
-	return 0;
-
-
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/spe_mbd.c
--- a/ffmpeg_smp/h264dec/libavcodec/cell/spe_mbd.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2009 TUDelft 
- * 
- * Cell Parallel SPU - 2DWave Macroblock Decoding. 
- */
-
-/**
- * @file libavcodec/cell/spu/h264_main_spu.c
- * Cell Parallel SPU - 2DWave Macroblock Decoding
- * @author C C Chi <c.c.chi@student.tudelft.nl>
- * 
- * SIMD kernels 
- * H.264/AVC motion compensation
- * @author Mauricio Alvarez <alvarez@ac.upc.edu>
- * @author Albert Paradis <apar7632@hotmail.com>
- */ 
-
-
-/* Enable this lines to enable simulator statistic or generate traces */
-
-//#define ENABLE_SIMULATOR
-//#define ENABLE_PARAVER_TRACING_CELL
-
-#ifdef ENABLE_SIMULATOR
-	#include "/opt/ibm/systemsim-cell/include/callthru/spu/profile.h"
-#endif
-
-#ifdef ENABLE_TRACES
-	#include "spu_trace.h"
-#endif
-#include <unistd.h>
-#include <stdio.h>
-#include <spu_intrinsics.h>
-#include <spu_mfcio.h>
-#include <libsync.h>
-#include <sys/time.h>
-#include <assert.h>
-
-//#include "dsputil_cell.h"
-#include "types_spu.h"
-#include "h264_intra_spu.h"
-#include "h264_decode_mb_spu.h"
-#include "h264_mc_spu.h"
-#include "h264_tables.h"
-#include "h264_dma.h"
-
-
-/** functions for supporting tracing with paraver for the SPU 
- *
- */
-inline void trace_init_SPU(){
-#ifdef ENABLE_PARAVER_TRACING_CELL
-	SPUtrace_init ();
-#endif
-}
-
-inline void trace_fini_SPU(){
-#ifdef ENABLE_PARAVER_TRACING_CELL
-	SPUtrace_fini ();
-#endif
-}
-
-inline void trace_event_SPU(int event, int id){
-#ifdef ENABLE_PARAVER_TRACING_CELL
-	SPUtrace_event (event, id);
-#else
-	(void) event;
-	(void) id;
-#endif
-}
-
-// for simulator statistic
-inline void clear_statistic(){
-#ifdef ENABLE_SIMULATOR
-	prof_clear();
-#endif
-}
-
-inline void start_statistic(){
-#ifdef ENABLE_SIMULATOR
-	prof_start();
-#endif
-}
-
-inline void stop_statistic(){
-#ifdef ENABLE_SIMULATOR
-	prof_stop();
-#endif
-}
-
-H264Context_spu h_context;  // struct that contain all the params to decode a macroblock
-
-DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending
-//mb position of neighbouring spes
-DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1
-//DECLARE_ALIGNED_16(spe_pos, tgt_spe); //written by SPE_ID +1
-
-/**	
-*	Initializes the buffering of the mb data and associated mc data. The init_mb_buffer needs to 
-*	be called before any get_next_mb and only once at the beginning of the slice.
-*
-*	Note: init_mc_buffer and get_next_mb expect the width of the picture to be more than 2 mb's
-*/
-#define TAG_OFFSET_MB MBD_buf1
-#define TAG_OFFSET_MC MBD_mc_buf1
-static void init_mb_buffer(H264Context_spu* h){
-	H264slice *s = h->s;
-	H264Mb *next_mb;
-	int mb_height = s->mb_height;
-	int mb_width = s->mb_width;
-
-	h->mc_idx =0;
-	
-	h->mb_dec = 0;
-	h->mb_mc = 0;
-	h->mb_dma = 0;
-		
-	h->curr_line %= mb_height;
-	h->next_mb_idx = h->curr_line * mb_width;
-	h->mb_id = h->curr_line * mb_width;
-	h->n_mc= h->curr_line * mb_width;
-	
-	next_mb = s->blocks + h->mb_id;
-	spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB);
-	h->mb_dma++;
-	h->mb_id++;
-	
-	next_mb = s->blocks + h->mb_id;
-	spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB);
-	h->mb_dma++;
-	h->mb_id++;
-	wait_dma_id(0 + TAG_OFFSET_MB);	
-	
-	H264Mb *mb = &h->mb_buf[0];
-	H264mc *mc = &h->mc_buf[0];
-	if(!IS_INTRA(mb->mb_type)){
-		calc_mc_params(mb, mc);
-		fill_ref_buf(h, mb, mc);
-	}
-	h->n_mc++;
-	h->mb_mc++;
-}
-
-static void *get_next_mb(H264Context_spu *h){
-	H264slice *s = h->s;
-	H264spe *spe = &h->spe;
-	H264Mb *mb_buf = h->mb_buf;	
-	H264mc *mc_buf = h->mc_buf;
-	H264Mb *next_mb;
-	H264Mb *next_dma_mb;
-	
-	if (h->curr_line >= s->mb_height)
-		return NULL;
-	
-	if (h->mb_id < h->mb_total){
-		next_dma_mb = s->blocks + h->mb_id;
-		spu_dma_get(&mb_buf[h->mb_dma], (unsigned) next_dma_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB);
-		h->mb_dma = (h->mb_dma+1)%3;
-		h->mb_id++;
-		if (h->mb_id%s->mb_width ==0){
-			h->mb_id+=(spe->spe_total-1)*s->mb_width;			
-		}
-	}
-	
-	h->mc = &mc_buf[h->mc_idx];
-	wait_dma_id(h->mc_idx + TAG_OFFSET_MC);
-	h->mc_idx = (h->mc_idx+1)%2;
-	if (h->n_mc < h->mb_total){
-		wait_dma_id(h->mb_mc + TAG_OFFSET_MB);
-		H264Mb *mb = &mb_buf[h->mb_mc];
-		H264mc *mc = &mc_buf[h->mc_idx];
-		if(!IS_INTRA(mb->mb_type)){
-			calc_mc_params(mb, mc);
-			fill_ref_buf(h, mb, mc);
-		}
-		h->n_mc++;
-		if (h->n_mc%s->mb_width ==0){
-			h->n_mc+=(spe->spe_total-1)*s->mb_width;			
-		}
-	}
-	h->next_mb_idx++;
-	if (h->next_mb_idx % s->mb_width ==0){
-		h->next_mb_idx+=(spe->spe_total-1)*s->mb_width;
-		h->curr_line+=spe->spe_total;		
-	}
-	
-	h->mb_mc = (h->mb_mc+1)%3;	
-	next_mb = &mb_buf[h->mb_dec];
-	h->mb_dec = (h->mb_dec+1)%3;
-	return next_mb;
-}
-
-static void *get_next_mb_blocking(H264Context_spu *h){
-	H264slice *s = h->s;
-	H264spe *spe = &h->spe;
-	H264Mb *mb_buf = h->mb_buf;
-	H264mc *mc_buf = h->mc_buf;
-	H264Mb *next_mb;
-	H264Mb *next_dma_mb;
-
-	if (h->mb_id >= h->mb_total)
-		return NULL;
-
-	//printf("%d\n", h->mb_id);
-	next_dma_mb = s->blocks + h->mb_id;
-	spu_dma_get(&mb_buf[0], (unsigned) next_dma_mb, sizeof(H264Mb), MBD_buf1);
-	//h->mb_dma = (h->mb_dma+1)%3;
-	h->mb_id++;
-	if (h->mb_id%s->mb_width ==0){
-		h->mb_id+=(spe->spe_total-1)*s->mb_width;
-	}
-	wait_dma_id(MBD_buf1);
-
-	h->mc = &mc_buf[0];	
-	//h->mc_idx = (h->mc_idx+1)%2;
-	//if (h->n_mc < h->mb_total){
-	H264Mb *mb = &mb_buf[0];
-	H264mc *mc = &mc_buf[0];
-	if(!IS_INTRA(mb->mb_type)){
-		calc_mc_params(mb, mc);
-		fill_ref_buf(h, mb, mc);
-	}
-	//h->n_mc++;
-	/*if (h->n_mc%s->mb_width ==0){
-		h->n_mc+=(spe->spe_total-1)*s->mb_width;
-	}*/	
-//	wait_dma_id(MBD_mc_buf1);
-
-// 	h->next_mb_idx++;
-// 	if (h->next_mb_idx % s->mb_width ==0){
-// 		h->next_mb_idx+=(spe->spe_total-1)*s->mb_width;
-// 		h->curr_line+=spe->spe_total;
-// 	}
-
-// 	h->mb_mc = (h->mb_mc+1)%3;
-	next_mb = &mb_buf[0];
-// 	h->mb_dec = (h->mb_dec+1)%3;
-	return next_mb;
-}
-
-
-#undef TAG_OFFSET_MB
-#undef TAG_OFFSET_MC
-static inline int dep_resolved(H264Context_spu *h){
-	H264slice *s = h->s;
-	int spe_id = h->spe.spe_id;
-	volatile int mb_proc_dep = src_spe.count;
-	if (spe_id==0)
-		return (h->mb_proc < mb_proc_dep-1 +s->mb_width)? 1:0;
-	else
-		return (h->mb_proc < mb_proc_dep-1)? 1:0;
-}
-
-void update_tgt_spe_dep(H264Context_spu *h, int end){
-	H264Mb *mb = h->mb;
-	H264slice *s = h->s;
-	H264spe *spe = &h->spe;
-	int mb_x = mb->mb_x;
-	
-	if (end || (mb_x%2==0 && mb_x!=0) || mb_x==s->mb_width-1){
-		spe_pos* dma_spe = &dma_temp;
-		spe_pos* tgt_spe = (spe_pos*) ((unsigned) spe->tgt_spe + (unsigned) &src_spe); //located in target spe local store
-		dma_spe->count = end? h->mb_proc+1: h->mb_proc;
-		spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), MBD_put);
-	}
-	h->mb_proc++;
-}
-
-
-int main(unsigned long long id, unsigned long long argp)
-{
-	(void) id;
-	H264Context_spu* h = &h_context;
-	H264spe *spe_params = (H264spe *) (unsigned) argp;    
-	
-	spu_dma_get(&h->spe, (unsigned) spe_params, sizeof(H264spe), MBD_slice); //ID_slice is used out of convienience
-	wait_dma_id(MBD_slice);
-
-    //clear_statistic();
-    dsputil_h264_init_cell(&h->dsp);
-    ff_cropTbl_init();
-    init_pred_ptrs(&h->hpc);
-
-	//send slice_buf to ppe
-	spu_write_out_mbox((unsigned) h->slice_buf);
-	h->sl_idx=0;
-	// initialize tracing with paraver
-    //trace_init_SPU();
-	h->frames =0;	
-	src_spe.count =0;
-	h->mb_proc = 0;
-
-	h->mb_id=0;
-	h->mc_idx=0;
-	h->mb_dec=0;
-	h->mb_mc=0;
-	h->mb_dma=0;
-	h->next_mb_idx=0;
-
-	h->blocking=0;
-
-
-	H264spe* p = &h->spe;
-	h->curr_line =p->spe_id;
-	h->mb_total = p->mb_height*p->mb_width;
-	int stride_y = 32;
-	int stride_c = 16;
-	//init block_offset array
-	init_block_offset(stride_y, stride_c);
-	for(;;){
-		spu_read_in_mbox();
-
-		h->s = &h->slice_buf[h->sl_idx];
-		h->sl_idx++; h->sl_idx%=2;
-
-		if (h->s->state< 0){			
-			break;
-		}
-
-		{
-			if(!h->blocking){
-				init_mb_buffer(h);
-				while((h->mb=(H264Mb *)get_next_mb(h))){
-					while(!dep_resolved(h));
-					//printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p-	>spe_id);
-					hl_decode_mb_internal(h, stride_y, stride_c);
-				}
-				update_tgt_spe_dep(h, 1);
-			}else{
-				h->mb_id=0;
-				while((h->mb=(H264Mb *)get_next_mb_blocking(h))){
-					while(!dep_resolved(h));
-					//printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p-	>spe_id);
-					hl_decode_mb_internal(h, stride_y, stride_c);
-				}
-				update_tgt_spe_dep(h, 1);
-			}
-			
-		}
-
-		h->frames++;
-		
-		if (p->spe_id == ((h->frames*p->mb_height -1)%p->spe_total)){
-			//printf("spe %d, %d\n", atomic_read(p->rl_cnt), h->frames);
-			//MBSlice is copied beforehand.
-			//only inc cnt.
-			atomic_inc(p->rl_cnt);		
-		}
-		{
-			atomic_dec(p->cnt);
-		}
-	}
-	
-	return 0;
-}
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/types_spu.h
--- a/ffmpeg_smp/h264dec/libavcodec/cell/types_spu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef TYPES_SPU_H
-#define TYPES_SPU_H
-
-/***********************************************************************
- * Scalar types
- **********************************************************************/
-    typedef signed char  int8_t;
-    typedef signed short int16_t;
-    typedef signed int   int32_t;
-    typedef unsigned char  uint8_t;
-    typedef unsigned short uint16_t;
-    typedef unsigned int   uint32_t;
-    typedef unsigned long long uint64_t;
-
-//     typedef short DCTELEM;		// transform coeficients of dct
-
-/***********************************************************************
- * Vector types
- **********************************************************************/
-    typedef	vector	signed int	vsint32_t;
-    typedef	vector	unsigned int	vuint32_t;
-    typedef	vector	signed short	vsint16_t;
-    typedef	vector	unsigned short	vuint16_t;
-    typedef	vector	signed char	vsint8_t;
-    typedef	vector	unsigned char	vuint8_t;
-
-/***********************************************************************
- * Functions
- **********************************************************************/
-    typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h);
-    typedef void (*h264_chroma_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h, int x, int y);
-    typedef void (*h264_idct_func)(uint8_t *dst, short *block, int stride);
-    typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
-    typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd,
-                  int weights, int offset);
-    typedef void(* intra_pred4x4)(uint8_t *src, uint8_t *topright, int stride);
-    typedef void(* intra_pred16x16)(uint8_t *src, int stride);
-    typedef void(* intra_pred8x8)(uint8_t *src, int stride);
-    typedef void(* intra_pred8x8l)(uint8_t *src, int topleft, int topright, int stride);
-
-
-#define AVV(x...) {x}
-	
-	
-#endif // AVCODEC_TYPES_SPU_H
-
-
-
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/dsputil.c
--- a/ffmpeg_smp/h264dec/libavcodec/dsputil.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1057 +0,0 @@
-/*
- * DSP utils
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * DSP utils
- */
-
-#include "libavutil/log.h"
-#include "dsputil.h"
-#include "simple_idct.h"
-#include "mathops.h"
-#include "config.h"
-
-uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
-uint32_t ff_squareTbl[512] = {0, };
-
-const uint8_t ff_zigzag_direct[64] = {
-    0,   1,  8, 16,  9,  2,  3, 10,
-    17, 24, 32, 25, 18, 11,  4,  5,
-    12, 19, 26, 33, 40, 48, 41, 34,
-    27, 20, 13,  6,  7, 14, 21, 28,
-    35, 42, 49, 56, 57, 50, 43, 36,
-    29, 22, 15, 23, 30, 37, 44, 51,
-    58, 59, 52, 45, 38, 31, 39, 46,
-    53, 60, 61, 54, 47, 55, 62, 63
-};
-
-
-#define PIXOP2(OPNAME, OP) \
-static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
-        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a,b;\
-        a= AV_RN32(&src1[i*src_stride1  ]);\
-        b= AV_RN32(&src2[i*src_stride2  ]);\
-        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
-        a= AV_RN32(&src1[i*src_stride1+4]);\
-        b= AV_RN32(&src2[i*src_stride2+4]);\
-        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a,b;\
-        a= AV_RN32(&src1[i*src_stride1  ]);\
-        b= AV_RN32(&src2[i*src_stride2  ]);\
-        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
-        a= AV_RN32(&src1[i*src_stride1+4]);\
-        b= AV_RN32(&src2[i*src_stride2+4]);\
-        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a,b;\
-        a= AV_RN32(&src1[i*src_stride1  ]);\
-        b= AV_RN32(&src2[i*src_stride2  ]);\
-        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a,b;\
-        a= AV_RN16(&src1[i*src_stride1  ]);\
-        b= AV_RN16(&src2[i*src_stride2  ]);\
-        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
-    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
-    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
-                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a, b, c, d, l0, l1, h0, h1;\
-        a= AV_RN32(&src1[i*src_stride1]);\
-        b= AV_RN32(&src2[i*src_stride2]);\
-        c= AV_RN32(&src3[i*src_stride3]);\
-        d= AV_RN32(&src4[i*src_stride4]);\
-        l0=  (a&0x03030303UL)\
-           + (b&0x03030303UL)\
-           + 0x02020202UL;\
-        h0= ((a&0xFCFCFCFCUL)>>2)\
-          + ((b&0xFCFCFCFCUL)>>2);\
-        l1=  (c&0x03030303UL)\
-           + (d&0x03030303UL);\
-        h1= ((c&0xFCFCFCFCUL)>>2)\
-          + ((d&0xFCFCFCFCUL)>>2);\
-        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-        a= AV_RN32(&src1[i*src_stride1+4]);\
-        b= AV_RN32(&src2[i*src_stride2+4]);\
-        c= AV_RN32(&src3[i*src_stride3+4]);\
-        d= AV_RN32(&src4[i*src_stride4+4]);\
-        l0=  (a&0x03030303UL)\
-           + (b&0x03030303UL)\
-           + 0x02020202UL;\
-        h0= ((a&0xFCFCFCFCUL)>>2)\
-          + ((b&0xFCFCFCFCUL)>>2);\
-        l1=  (c&0x03030303UL)\
-           + (d&0x03030303UL);\
-        h1= ((c&0xFCFCFCFCUL)>>2)\
-          + ((d&0xFCFCFCFCUL)>>2);\
-        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
-                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a, b, c, d, l0, l1, h0, h1;\
-        a= AV_RN32(&src1[i*src_stride1]);\
-        b= AV_RN32(&src2[i*src_stride2]);\
-        c= AV_RN32(&src3[i*src_stride3]);\
-        d= AV_RN32(&src4[i*src_stride4]);\
-        l0=  (a&0x03030303UL)\
-           + (b&0x03030303UL)\
-           + 0x01010101UL;\
-        h0= ((a&0xFCFCFCFCUL)>>2)\
-          + ((b&0xFCFCFCFCUL)>>2);\
-        l1=  (c&0x03030303UL)\
-           + (d&0x03030303UL);\
-        h1= ((c&0xFCFCFCFCUL)>>2)\
-          + ((d&0xFCFCFCFCUL)>>2);\
-        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-        a= AV_RN32(&src1[i*src_stride1+4]);\
-        b= AV_RN32(&src2[i*src_stride2+4]);\
-        c= AV_RN32(&src3[i*src_stride3+4]);\
-        d= AV_RN32(&src4[i*src_stride4+4]);\
-        l0=  (a&0x03030303UL)\
-           + (b&0x03030303UL)\
-           + 0x01010101UL;\
-        h0= ((a&0xFCFCFCFCUL)>>2)\
-          + ((b&0xFCFCFCFCUL)>>2);\
-        l1=  (c&0x03030303UL)\
-           + (d&0x03030303UL);\
-        h1= ((c&0xFCFCFCFCUL)>>2)\
-          + ((d&0xFCFCFCFCUL)>>2);\
-        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-    }\
-}\
-static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
-                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
-    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
-    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
-}\
-static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
-                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
-    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
-    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
-}\
-\
-static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-        int i, a0, b0, a1, b1;\
-        a0= pixels[0];\
-        b0= pixels[1] + 2;\
-        a0 += b0;\
-        b0 += pixels[2];\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            a1= pixels[0];\
-            b1= pixels[1];\
-            a1 += b1;\
-            b1 += pixels[2];\
-\
-            block[0]= (a1+a0)>>2; /* FIXME non put */\
-            block[1]= (b1+b0)>>2;\
-\
-            pixels+=line_size;\
-            block +=line_size;\
-\
-            a0= pixels[0];\
-            b0= pixels[1] + 2;\
-            a0 += b0;\
-            b0 += pixels[2];\
-\
-            block[0]= (a1+a0)>>2;\
-            block[1]= (b1+b0)>>2;\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-}\
-\
-static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-        int i;\
-        const uint32_t a= AV_RN32(pixels  );\
-        const uint32_t b= AV_RN32(pixels+1);\
-        uint32_t l0=  (a&0x03030303UL)\
-                    + (b&0x03030303UL)\
-                    + 0x02020202UL;\
-        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
-                   + ((b&0xFCFCFCFCUL)>>2);\
-        uint32_t l1,h1;\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            uint32_t a= AV_RN32(pixels  );\
-            uint32_t b= AV_RN32(pixels+1);\
-            l1=  (a&0x03030303UL)\
-               + (b&0x03030303UL);\
-            h1= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-            a= AV_RN32(pixels  );\
-            b= AV_RN32(pixels+1);\
-            l0=  (a&0x03030303UL)\
-               + (b&0x03030303UL)\
-               + 0x02020202UL;\
-            h0= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-}\
-\
-static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-    int j;\
-    for(j=0; j<2; j++){\
-        int i;\
-        const uint32_t a= AV_RN32(pixels  );\
-        const uint32_t b= AV_RN32(pixels+1);\
-        uint32_t l0=  (a&0x03030303UL)\
-                    + (b&0x03030303UL)\
-                    + 0x02020202UL;\
-        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
-                   + ((b&0xFCFCFCFCUL)>>2);\
-        uint32_t l1,h1;\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            uint32_t a= AV_RN32(pixels  );\
-            uint32_t b= AV_RN32(pixels+1);\
-            l1=  (a&0x03030303UL)\
-               + (b&0x03030303UL);\
-            h1= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-            a= AV_RN32(pixels  );\
-            b= AV_RN32(pixels+1);\
-            l0=  (a&0x03030303UL)\
-               + (b&0x03030303UL)\
-               + 0x02020202UL;\
-            h0= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-        pixels+=4-line_size*(h+1);\
-        block +=4-line_size*h;\
-    }\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-    int j;\
-    for(j=0; j<2; j++){\
-        int i;\
-        const uint32_t a= AV_RN32(pixels  );\
-        const uint32_t b= AV_RN32(pixels+1);\
-        uint32_t l0=  (a&0x03030303UL)\
-                    + (b&0x03030303UL)\
-                    + 0x01010101UL;\
-        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
-                   + ((b&0xFCFCFCFCUL)>>2);\
-        uint32_t l1,h1;\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            uint32_t a= AV_RN32(pixels  );\
-            uint32_t b= AV_RN32(pixels+1);\
-            l1=  (a&0x03030303UL)\
-               + (b&0x03030303UL);\
-            h1= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-            a= AV_RN32(pixels  );\
-            b= AV_RN32(pixels+1);\
-            l0=  (a&0x03030303UL)\
-               + (b&0x03030303UL)\
-               + 0x01010101UL;\
-            h0= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-        pixels+=4-line_size*(h+1);\
-        block +=4-line_size*h;\
-    }\
-}\
-\
-CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
-
-#define op_avg(a, b) a = rnd_avg32(a, b)
-
-#define op_put(a, b) a = b
-
-PIXOP2(avg, op_avg)
-PIXOP2(put, op_put)
-#undef op_avg
-#undef op_put
-
-
-#define H264_CHROMA_MC(OPNAME, OP)\
-static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
-    const int A=(8-x)*(8-y);\
-    const int B=(  x)*(8-y);\
-    const int C=(8-x)*(  y);\
-    const int D=(  x)*(  y);\
-    int i;\
-    \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
-\
-    if(D){\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
-            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }else{\
-        const int E= B+C;\
-        const int step= C ? stride : 1;\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + E*src[step+0]));\
-            OP(dst[1], (A*src[1] + E*src[step+1]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }\
-}\
-\
-static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
-    const int A=(8-x)*(8-y);\
-    const int B=(  x)*(8-y);\
-    const int C=(8-x)*(  y);\
-    const int D=(  x)*(  y);\
-    int i;\
-    \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
-\
-    if(D){\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
-            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
-            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
-            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }else{\
-        const int E= B+C;\
-        const int step= C ? stride : 1;\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + E*src[step+0]));\
-            OP(dst[1], (A*src[1] + E*src[step+1]));\
-            OP(dst[2], (A*src[2] + E*src[step+2]));\
-            OP(dst[3], (A*src[3] + E*src[step+3]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }\
-}\
-\
-static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
-    const int A=(8-x)*(8-y);\
-    const int B=(  x)*(8-y);\
-    const int C=(8-x)*(  y);\
-    const int D=(  x)*(  y);\
-    int i;\
-    \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
-\
-    if(D){\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
-            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
-            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
-            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
-            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
-            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
-            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
-            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }else{\
-        const int E= B+C;\
-        const int step= C ? stride : 1;\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + E*src[step+0]));\
-            OP(dst[1], (A*src[1] + E*src[step+1]));\
-            OP(dst[2], (A*src[2] + E*src[step+2]));\
-            OP(dst[3], (A*src[3] + E*src[step+3]));\
-            OP(dst[4], (A*src[4] + E*src[step+4]));\
-            OP(dst[5], (A*src[5] + E*src[step+5]));\
-            OP(dst[6], (A*src[6] + E*src[step+6]));\
-            OP(dst[7], (A*src[7] + E*src[step+7]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }\
-}
-
-#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
-#define op_put(a, b) a = (((b) + 32)>>6)
-
-H264_CHROMA_MC(put_       , op_put)
-H264_CHROMA_MC(avg_       , op_avg)
-#undef op_avg
-#undef op_put
-
-
-#define H264_LOWPASS(OPNAME, OP, OP2) \
-static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int h=2;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<h; i++)\
-    {\
-        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
-        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
-        dst+=dstStride;\
-        src+=srcStride;\
-    }\
-}\
-\
-static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int w=2;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<w; i++)\
-    {\
-        const int srcB= src[-2*srcStride];\
-        const int srcA= src[-1*srcStride];\
-        const int src0= src[0 *srcStride];\
-        const int src1= src[1 *srcStride];\
-        const int src2= src[2 *srcStride];\
-        const int src3= src[3 *srcStride];\
-        const int src4= src[4 *srcStride];\
-        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
-        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
-        dst++;\
-        src++;\
-    }\
-}\
-\
-static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    const int h=2;\
-    const int w=2;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    src -= 2*srcStride;\
-    for(i=0; i<h+5; i++)\
-    {\
-        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
-        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
-        tmp+=tmpStride;\
-        src+=srcStride;\
-    }\
-    tmp -= tmpStride*(h+5-2);\
-    for(i=0; i<w; i++)\
-    {\
-        const int tmpB= tmp[-2*tmpStride];\
-        const int tmpA= tmp[-1*tmpStride];\
-        const int tmp0= tmp[0 *tmpStride];\
-        const int tmp1= tmp[1 *tmpStride];\
-        const int tmp2= tmp[2 *tmpStride];\
-        const int tmp3= tmp[3 *tmpStride];\
-        const int tmp4= tmp[4 *tmpStride];\
-        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
-        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
-        dst++;\
-        tmp++;\
-    }\
-}\
-static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int h=4;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<h; i++)\
-    {\
-        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
-        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
-        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
-        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
-        dst+=dstStride;\
-        src+=srcStride;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int w=4;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<w; i++)\
-    {\
-        const int srcB= src[-2*srcStride];\
-        const int srcA= src[-1*srcStride];\
-        const int src0= src[0 *srcStride];\
-        const int src1= src[1 *srcStride];\
-        const int src2= src[2 *srcStride];\
-        const int src3= src[3 *srcStride];\
-        const int src4= src[4 *srcStride];\
-        const int src5= src[5 *srcStride];\
-        const int src6= src[6 *srcStride];\
-        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
-        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
-        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
-        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
-        dst++;\
-        src++;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    const int h=4;\
-    const int w=4;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    src -= 2*srcStride;\
-    for(i=0; i<h+5; i++)\
-    {\
-        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
-        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
-        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
-        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
-        tmp+=tmpStride;\
-        src+=srcStride;\
-    }\
-    tmp -= tmpStride*(h+5-2);\
-    for(i=0; i<w; i++)\
-    {\
-        const int tmpB= tmp[-2*tmpStride];\
-        const int tmpA= tmp[-1*tmpStride];\
-        const int tmp0= tmp[0 *tmpStride];\
-        const int tmp1= tmp[1 *tmpStride];\
-        const int tmp2= tmp[2 *tmpStride];\
-        const int tmp3= tmp[3 *tmpStride];\
-        const int tmp4= tmp[4 *tmpStride];\
-        const int tmp5= tmp[5 *tmpStride];\
-        const int tmp6= tmp[6 *tmpStride];\
-        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
-        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
-        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
-        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
-        dst++;\
-        tmp++;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int h=8;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<h; i++)\
-    {\
-        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
-        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
-        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
-        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
-        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
-        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
-        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
-        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
-        dst+=dstStride;\
-        src+=srcStride;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int w=8;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<w; i++)\
-    {\
-        const int srcB= src[-2*srcStride];\
-        const int srcA= src[-1*srcStride];\
-        const int src0= src[0 *srcStride];\
-        const int src1= src[1 *srcStride];\
-        const int src2= src[2 *srcStride];\
-        const int src3= src[3 *srcStride];\
-        const int src4= src[4 *srcStride];\
-        const int src5= src[5 *srcStride];\
-        const int src6= src[6 *srcStride];\
-        const int src7= src[7 *srcStride];\
-        const int src8= src[8 *srcStride];\
-        const int src9= src[9 *srcStride];\
-        const int src10=src[10*srcStride];\
-        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
-        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
-        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
-        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
-        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
-        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
-        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
-        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
-        dst++;\
-        src++;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    const int h=8;\
-    const int w=8;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    src -= 2*srcStride;\
-    for(i=0; i<h+5; i++)\
-    {\
-        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
-        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
-        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
-        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
-        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
-        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
-        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
-        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
-        tmp+=tmpStride;\
-        src+=srcStride;\
-    }\
-    tmp -= tmpStride*(h+5-2);\
-    for(i=0; i<w; i++)\
-    {\
-        const int tmpB= tmp[-2*tmpStride];\
-        const int tmpA= tmp[-1*tmpStride];\
-        const int tmp0= tmp[0 *tmpStride];\
-        const int tmp1= tmp[1 *tmpStride];\
-        const int tmp2= tmp[2 *tmpStride];\
-        const int tmp3= tmp[3 *tmpStride];\
-        const int tmp4= tmp[4 *tmpStride];\
-        const int tmp5= tmp[5 *tmpStride];\
-        const int tmp6= tmp[6 *tmpStride];\
-        const int tmp7= tmp[7 *tmpStride];\
-        const int tmp8= tmp[8 *tmpStride];\
-        const int tmp9= tmp[9 *tmpStride];\
-        const int tmp10=tmp[10*tmpStride];\
-        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
-        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
-        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
-        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
-        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
-        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
-        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
-        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
-        dst++;\
-        tmp++;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
-    src += 8*srcStride;\
-    dst += 8*dstStride;\
-    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
-}\
-\
-static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
-    src += 8*srcStride;\
-    dst += 8*dstStride;\
-    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
-}\
-\
-static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
-    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
-    src += 8*srcStride;\
-    dst += 8*dstStride;\
-    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
-    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
-}\
-
-#define H264_MC(OPNAME, SIZE) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t half[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t half[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t half[SIZE*SIZE];\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t half[SIZE*SIZE];\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
-    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
-    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfHV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfHV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    uint8_t halfV[SIZE*SIZE];\
-    uint8_t halfHV[SIZE*SIZE];\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    uint8_t halfV[SIZE*SIZE];\
-    uint8_t halfHV[SIZE*SIZE];\
-    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
-}\
-
-#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
-#define op_put(a, b)  a = cm[((b) + 16)>>5]
-#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
-#define op2_put(a, b)  a = cm[((b) + 512)>>10]
-
-H264_LOWPASS(put_       , op_put, op2_put)
-H264_LOWPASS(avg_       , op_avg, op2_avg)
-H264_MC(put_, 2)
-H264_MC(put_, 4)
-H264_MC(put_, 8)
-H264_MC(put_, 16)
-H264_MC(avg_, 4)
-H264_MC(avg_, 8)
-H264_MC(avg_, 16)
-
-#undef op_avg
-#undef op_put
-#undef op2_avg
-#undef op2_put
-
-static void clear_block_c(DCTELEM *block)
-{
-    memset(block, 0, sizeof(DCTELEM)*64);
-}
-
-/**
- * memset(blocks, 0, sizeof(DCTELEM)*6*64)
- */
-static void clear_blocks_c(DCTELEM *blocks)
-{
-    memset(blocks, 0, sizeof(DCTELEM)*6*64);
-}
-
-static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
-
-/* init static data */
-av_cold void dsputil_static_init(void)
-{
-    int i;
-
-    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
-    for(i=0;i<MAX_NEG_CROP;i++) {
-        ff_cropTbl[i] = 0;
-        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
-    }
-
-    for(i=0;i<512;i++) {
-        ff_squareTbl[i] = (i - 256) * (i - 256);
-    }
-}
-
-int ff_check_alignment(void){
-    static int did_fail=0;
-    DECLARE_ALIGNED(16, int, aligned);
-
-    if((intptr_t)&aligned & 15){
-        if(!did_fail){
-#if HAVE_MMX || HAVE_ALTIVEC
-            av_log(AV_LOG_ERROR,
-                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
-                "and may be very slow or crash. This is not a bug in libavcodec,\n"
-                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
-                "Do not report crashes to FFmpeg developers.\n");
-#endif
-            did_fail=1;
-        }
-        return -1;
-    }
-    return 0;
-}
-
-av_cold void dsputil_init(DSPContext* c)
-{
-    (void) avg_pixels2_c; // kill a warning, avg_pixels2_c is a macro created function.
-    ff_check_alignment();
-    dsputil_static_init();
- 
-    c->idct_put= ff_simple_idct_put;
-    c->idct_add= ff_simple_idct_add;
-    c->idct    = ff_simple_idct;
-
-    c->clear_block = clear_block_c;
-    c->clear_blocks = clear_blocks_c;
-
-#define dspfunc(PFX, IDX, NUM) \
-    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
-    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
-    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
-    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
-    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
-    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
-    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
-    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
-    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
-    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
-    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
-    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
-    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
-    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
-    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
-    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
-
-
-    dspfunc(put_h264_qpel, 0, 16);
-    dspfunc(put_h264_qpel, 1, 8);
-    dspfunc(put_h264_qpel, 2, 4);
-    dspfunc(put_h264_qpel, 3, 2);
-    dspfunc(avg_h264_qpel, 0, 16);
-    dspfunc(avg_h264_qpel, 1, 8);
-    dspfunc(avg_h264_qpel, 2, 4);
-
-#undef dspfunc
-    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
-    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
-    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
-    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
-    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
-    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
-
-
-    c->prefetch= just_return;
-
-    if (HAVE_MMX)        dsputil_init_mmx   (c);
-    if (ARCH_ARM)        dsputil_init_arm   (c);
-    if (HAVE_ALTIVEC)    dsputil_init_ppc   (c); //fixme PPC prefetch
-}
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/dsputil.h
--- a/ffmpeg_smp/h264dec/libavcodec/dsputil.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,465 +0,0 @@
-/*
- * DSP utils
- * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * DSP utils.
- * note, many functions in here may use MMX which trashes the FPU state, it is
- * absolutely necessary to call emms_c() between dsp & float/double code
- */
-
-#ifndef AVCODEC_DSPUTIL_H
-#define AVCODEC_DSPUTIL_H
-
-#include "libavutil/intreadwrite.h"
-#include "avcodec.h"
-#include "h264_idct.h"
-// 
-void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
-                             const float *win, float add_bias, int len);
-void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
-void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels);
-
-/* encoding scans */
-extern const uint8_t ff_alternate_horizontal_scan[64];
-extern const uint8_t ff_alternate_vertical_scan[64];
-extern const uint8_t ff_zigzag_direct[64];
-extern const uint8_t ff_zigzag248_direct[64];
-
-/* pixel operations */
-#define MAX_NEG_CROP 1024
-
-/* temporary */
-extern uint32_t ff_squareTbl[512];
-extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
-
-/* VP3 DSP functions */
-void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
-void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
-void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
-void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
-
-void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
-void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
-
-/* VP6 DSP functions */
-void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride,
-                           const int16_t *h_weights, const int16_t *v_weights);
-
-/* Bink functions */
-void ff_bink_idct_c    (DCTELEM *block);
-void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
-void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
-
-/* CAVS functions */
-void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-
-/* VC1 functions */
-void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-
-/* EA functions */
-void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
-
-/* 1/2^n downscaling functions from imgconvert.c */
-void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
-void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
-void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
-void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
-
-void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
-              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
-
-/* minimum alignment rules ;)
-If you notice errors in the align stuff, need more alignment for some ASM code
-for some CPU or need to use a function with less aligned data then send a mail
-to the ffmpeg-devel mailing list, ...
-
-!warning These alignments might not match reality, (missing attribute((align))
-stuff somewhere possible).
-I (Michael) did not check them, these are just the alignments which I think
-could be reached easily ...
-
-!future video codecs might need functions with less strict alignment
-*/
-
-/*
-void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
-void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
-void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
-void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
-void clear_blocks_c(DCTELEM *blocks);
-*/
-
-/* add and put pixel (decoding) */
-// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
-//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller then 4
-typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
-typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
-typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
-typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
-
-typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);
-
-#define DEF_OLD_QPEL(name)\
-void ff_put_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
-void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
-void ff_avg_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
-
-DEF_OLD_QPEL(qpel16_mc11_old_c)
-DEF_OLD_QPEL(qpel16_mc31_old_c)
-DEF_OLD_QPEL(qpel16_mc12_old_c)
-DEF_OLD_QPEL(qpel16_mc32_old_c)
-DEF_OLD_QPEL(qpel16_mc13_old_c)
-DEF_OLD_QPEL(qpel16_mc33_old_c)
-DEF_OLD_QPEL(qpel8_mc11_old_c)
-DEF_OLD_QPEL(qpel8_mc31_old_c)
-DEF_OLD_QPEL(qpel8_mc12_old_c)
-DEF_OLD_QPEL(qpel8_mc32_old_c)
-DEF_OLD_QPEL(qpel8_mc13_old_c)
-DEF_OLD_QPEL(qpel8_mc33_old_c)
-
-#define CALL_2X_PIXELS(a, b, n)\
-static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    b(block  , pixels  , line_size, h);\
-    b(block+n, pixels+n, line_size, h);\
-}
-
-/* motion estimation */
-// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller then 2
-// although currently h<4 is not used as functions with width <8 are neither used nor implemented
-typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
-
-/**
- * Scantable.
- */
-typedef struct ScanTable{
-    const uint8_t *scantable;
-    uint8_t permutated[64];
-    uint8_t raster_end[64];
-#if ARCH_PPC
-                /** Used by dct_quantize_altivec to find last-non-zero */
-    DECLARE_ALIGNED(16, uint8_t, inverse)[64];
-#endif
-} ScanTable;
-
-void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
-
-void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize,
-                         int block_w, int block_h,
-                         int src_x, int src_y, int w, int h);
-
-
-/**
- * DSPContext.
- */
-typedef struct DSPContext {
-    /* pixel ops : interface with DCT */
-    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
-    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
-    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
-    void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
-    void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
-    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
-    void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
-    void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
-    
-    void (*clear_block)(DCTELEM *block/*align 16*/);
-    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
-
-
-    /**
-     * Halfpel motion compensation with rounding (a+b+1)>>1.
-     * this is an array[4][4] of motion compensation functions for 4
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
-     * @param block destination where the result is stored
-     * @param pixels source
-     * @param line_size number of bytes in a horizontal line of block
-     * @param h height
-     */
-    op_pixels_func put_pixels_tab[4][4];
-
-    /**
-     * Halfpel motion compensation with rounding (a+b+1)>>1.
-     * This is an array[4][4] of motion compensation functions for 4
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
-     * @param block destination into which the result is averaged (a+b+1)>>1
-     * @param pixels source
-     * @param line_size number of bytes in a horizontal line of block
-     * @param h height
-     */
-    op_pixels_func avg_pixels_tab[4][4];
-
-    /**
-     * Halfpel motion compensation with no rounding (a+b)>>1.
-     * this is an array[2][4] of motion compensation functions for 2
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
-     * @param block destination where the result is stored
-     * @param pixels source
-     * @param line_size number of bytes in a horizontal line of block
-     * @param h height
-     */
-    op_pixels_func put_no_rnd_pixels_tab[4][4];
-
-    /**
-     * Halfpel motion compensation with no rounding (a+b)>>1.
-     * this is an array[2][4] of motion compensation functions for 2
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
-     * @param block destination into which the result is averaged (a+b)>>1
-     * @param pixels source
-     * @param line_size number of bytes in a horizontal line of block
-     * @param h height
-     */
-    op_pixels_func avg_no_rnd_pixels_tab[4][4];
-
-    void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
-
-
-    qpel_mc_func put_qpel_pixels_tab[2][16];
-    qpel_mc_func avg_qpel_pixels_tab[2][16];
-    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
-    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
-    qpel_mc_func put_mspel_pixels_tab[8];
-
-    /**
-     * h264 Chroma MC
-     */
-    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
-    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
-    /* This is really one func used in VC-1 decoding */
-    h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3];
-    h264_chroma_mc_func avg_no_rnd_vc1_chroma_pixels_tab[3];
-
-    qpel_mc_func put_h264_qpel_pixels_tab[4][16];
-    qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
-
-    qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
-    qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];
-
-   
-    /* (I)DCT */
-    void (*fdct)(DCTELEM *block/* align 16*/);
-    void (*fdct248)(DCTELEM *block/* align 16*/);
-
-    /* IDCT really*/
-    void (*idct)(DCTELEM *block/* align 16*/);
-
-    /**
-     * block -> idct -> clip to unsigned 8 bit -> dest.
-     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
-     * @param line_size size in bytes of a horizontal line of dest
-     */
-    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
-
-    /**
-     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
-     * @param line_size size in bytes of a horizontal line of dest
-     */
-    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
-
-    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w);
-#define EDGE_WIDTH 32
-
-    void (*prefetch)(void *mem, int stride, int h);
-
-} DSPContext;
-
-void dsputil_static_init(void);
-void dsputil_init(DSPContext* p);
-
-int ff_check_alignment(void);
-
-/**
- * permute block according to permuatation.
- * @param last last non zero element in scantable order
- */
-void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
-
-void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);
-
-#define         BYTE_VEC32(c)   ((c)*0x01010101UL)
-
-static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
-{
-    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
-}
-
-static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
-{
-    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
-}
-
-
-/**
- * Empty mmx state.
- * this must be called between any dsp function and float/double code.
- * for example sin(); dsp->idct_put(); emms_c(); cos()
- */
-#define emms_c()
-
-/* should be defined by architectures supporting
-   one or more MultiMedia extension */
-int mm_support(void);
-extern int mm_flags;
-
-void dsputil_init_arm(DSPContext* c);
-void dsputil_init_mmx(DSPContext* c);
-void dsputil_init_ppc(DSPContext* c);
-
-void ff_dsputil_init_dwt(DSPContext *c);
-
-#if HAVE_MMX
-
-#undef emms_c
-
-static inline void emms(void)
-{
-    __asm__ volatile ("emms;":::"memory");
-}
-
-
-#define emms_c() \
-{\
-    if (mm_flags & FF_MM_MMX)\
-        emms();\
-}
-
-#elif ARCH_ARM
-
-#if HAVE_NEON
-#   define STRIDE_ALIGN 16
-#endif
-
-#elif ARCH_PPC || ARCH_PPC64 || ARCH_CELL
-
-#define STRIDE_ALIGN 16
-
-#endif
-
-#ifndef STRIDE_ALIGN
-#   define STRIDE_ALIGN 8
-#endif
-
-#define WRAPPER8_16(name8, name16)\
-static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
-    return name8(s, dst           , src           , stride, h)\
-          +name8(s, dst+8         , src+8         , stride, h);\
-}
-
-#define WRAPPER8_16_SQ(name8, name16)\
-static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
-    int score=0;\
-    score +=name8(s, dst           , src           , stride, 8);\
-    score +=name8(s, dst+8         , src+8         , stride, 8);\
-    if(h==16){\
-        dst += 8*stride;\
-        src += 8*stride;\
-        score +=name8(s, dst           , src           , stride, 8);\
-        score +=name8(s, dst+8         , src+8         , stride, 8);\
-    }\
-    return score;\
-}
-
-static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN16(dst   , AV_RN16(src   ));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        AV_WN32(dst+4 , AV_RN32(src+4 ));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        AV_WN32(dst+4 , AV_RN32(src+4 ));
-        dst[8]= src[8];
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        AV_WN32(dst+4 , AV_RN32(src+4 ));
-        AV_WN32(dst+8 , AV_RN32(src+8 ));
-        AV_WN32(dst+12, AV_RN32(src+12));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        AV_WN32(dst+4 , AV_RN32(src+4 ));
-        AV_WN32(dst+8 , AV_RN32(src+8 ));
-        AV_WN32(dst+12, AV_RN32(src+12));
-        dst[16]= src[16];
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-#endif /* AVCODEC_DSPUTIL_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/get_bits.h
--- a/ffmpeg_smp/h264dec/libavcodec/get_bits.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,325 +0,0 @@
-/*
- * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * bitstream reader API header.
- */
-
-#ifndef AVCODEC_GET_BITS_H
-#define AVCODEC_GET_BITS_H
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <assert.h>
-#include "libavutil/bswap.h"
-#include "libavutil/common.h"
-#include "libavutil/intreadwrite.h"
-#include "libavutil/log.h"
-#include "mathops.h"
-
-
-typedef struct GetBitContext {
-    uint8_t *rbsp;
-    unsigned int rbsp_size;
-    uint8_t *raw;
-    const uint8_t *buffer, *buffer_end;
-    unsigned int alloc_size;
-    unsigned int buf_size;
-    uint32_t *buffer_ptr;
-    uint32_t cache0;
-    uint32_t cache1;
-    int bit_count;
-    int size_in_bits;
-} GetBitContext;
-
-/* Bitstream reader API docs:
-name
-    arbitrary name which is used as prefix for the internal variables
-
-gb
-    getbitcontext
-
-OPEN_READER(name, gb)
-    loads gb into local variables
-
-CLOSE_READER(name, gb)
-    stores local vars in gb
-
-UPDATE_CACHE(name, gb)
-    refills the internal cache from the bitstream
-    after this call at least MIN_CACHE_BITS will be available,
-
-GET_CACHE(name, gb)
-    will output the contents of the internal cache, next bit is MSB of 32 or 64 bit (FIXME 64bit)
-
-SHOW_UBITS(name, gb, num)
-    will return the next num bits
-
-SHOW_SBITS(name, gb, num)
-    will return the next num bits and do sign extension
-
-SKIP_BITS(name, gb, num)
-    will skip over the next num bits
-    note, this is equivalent to SKIP_CACHE; SKIP_COUNTER
-
-SKIP_CACHE(name, gb, num)
-    will remove the next num bits from the cache (note SKIP_COUNTER MUST be called before UPDATE_CACHE / CLOSE_READER)
-
-SKIP_COUNTER(name, gb, num)
-    will increment the internal bit counter (see SKIP_CACHE & SKIP_BITS)
-
-LAST_SKIP_CACHE(name, gb, num)
-    will remove the next num bits from the cache if it is needed for UPDATE_CACHE otherwise it will do nothing
-
-LAST_SKIP_BITS(name, gb, num)
-    is equivalent to LAST_SKIP_CACHE; SKIP_COUNTER
-
-for examples see get_bits, show_bits, skip_bits, get_vlc
-*/
-
-#define MIN_CACHE_BITS 32
-
-#define OPEN_READER(name, gb)\
-	int name##_bit_count=(gb)->bit_count;\
-	uint32_t name##_cache0= (gb)->cache0;\
-	uint32_t name##_cache1= (gb)->cache1;\
-	uint32_t * name##_buffer_ptr=(gb)->buffer_ptr;\
-
-#define CLOSE_READER(name, gb)\
-	(gb)->bit_count= name##_bit_count;\
-	(gb)->cache0= name##_cache0;\
-	(gb)->cache1= name##_cache1;\
-	(gb)->buffer_ptr= name##_buffer_ptr;\
-
-#define UPDATE_CACHE(name, gb)\
-	if(name##_bit_count > 0){\
-		const uint32_t next= be2me_32( *name##_buffer_ptr );\
-		name##_cache0 |= NEG_USR32(next,name##_bit_count);\
-		name##_cache1 |= next<<name##_bit_count;\
-		name##_buffer_ptr++;\
-		name##_bit_count-= 32;\
-	}\
-
-#if ARCH_X86
-#   define SKIP_CACHE(name, gb, num)\
-        __asm__(\
-            "shldl %2, %1, %0          \n\t"\
-            "shll %2, %1               \n\t"\
-            : "+r" (name##_cache0), "+r" (name##_cache1)\
-            : "Ic" ((uint8_t)(num))\
-           );
-#else
-#   define SKIP_CACHE(name, gb, num)\
-        name##_cache0 <<= (num);\
-        name##_cache0 |= NEG_USR32(name##_cache1,num);\
-        name##_cache1 <<= (num);
-#endif
-
-#define SKIP_COUNTER(name, gb, num)\
-	name##_bit_count += (num);\
-
-#define SKIP_BITS(name, gb, num)\
-	{\
-		SKIP_CACHE(name, gb, num)\
-		SKIP_COUNTER(name, gb, num)\
-	}\
-
-#define LAST_SKIP_BITS(name, gb, num) SKIP_BITS(name, gb, num)
-#define LAST_SKIP_CACHE(name, gb, num) SKIP_CACHE(name, gb, num)
-
-#define SHOW_UBITS(name, gb, num)\
-	NEG_USR32(name##_cache0, num)
-
-#define SHOW_SBITS(name, gb, num)\
-        NEG_SSR32(name##_cache0, num)
-
-#define GET_CACHE(name, gb)\
-	(name##_cache0)
-
-static inline int get_bits_count(const GetBitContext *s){
-    return ((uint8_t*)s->buffer_ptr - s->buffer)*8 - 32 + s->bit_count;
-}
-
-static inline void skip_bits_long(GetBitContext *s, int n){
-    OPEN_READER(re, s)
-    re_bit_count += n;
-    re_buffer_ptr += re_bit_count>>5;
-    re_bit_count &= 31;
-    re_cache0 = be2me_32( re_buffer_ptr[-1] ) << re_bit_count;
-    re_cache1 = 0;
-    UPDATE_CACHE(re, s)
-    CLOSE_READER(re, s)
-}
-
-/**
- * read mpeg1 dc style vlc (sign bit + mantisse with no MSB).
- * if MSB not set it is negative
- * @param n length in bits
- * @author BERO
- */
-static inline int get_xbits(GetBitContext *s, int n){
-    register int sign;
-    register int32_t cache;
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    cache = GET_CACHE(re,s);
-    sign=(~cache)>>31;
-    LAST_SKIP_BITS(re, s, n)
-    CLOSE_READER(re, s)
-    return (NEG_USR32(sign ^ cache, n) ^ sign) - sign;
-}
-
-static inline int get_sbits(GetBitContext *s, int n){
-    register int tmp;
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    tmp= SHOW_SBITS(re, s, n);
-    LAST_SKIP_BITS(re, s, n)
-    CLOSE_READER(re, s)
-    return tmp;
-}
-
-/**
- * reads 1-17 bits.
- * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't
- */
-static inline unsigned int get_bits(GetBitContext *s, int n){
-    register int tmp;
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    tmp= SHOW_UBITS(re, s, n);
-    LAST_SKIP_BITS(re, s, n)
-    CLOSE_READER(re, s)
-    return tmp;
-}
-
-/**
- * shows 1-17 bits.
- * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't
- */
-static inline unsigned int show_bits(GetBitContext *s, int n){
-    register int tmp;
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    tmp= SHOW_UBITS(re, s, n);
-//    CLOSE_READER(re, s)
-    return tmp;
-}
-
-static inline void skip_bits(GetBitContext *s, int n){
- //Note gcc seems to optimize this to s->index+=n for the ALT_READER :))
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    LAST_SKIP_BITS(re, s, n)
-    CLOSE_READER(re, s)
-}
-
-static inline unsigned int get_bits1(GetBitContext *s){
-    return get_bits(s, 1);
-}
-
-static inline unsigned int show_bits1(GetBitContext *s){
-    return show_bits(s, 1);
-}
-
-static inline void skip_bits1(GetBitContext *s){
-    skip_bits(s, 1);
-}
-
-/**
- * reads 0-32 bits.
- */
-static inline unsigned int get_bits_long(GetBitContext *s, int n){
-    if(n<=MIN_CACHE_BITS) return get_bits(s, n);
-    else{
-        int ret= get_bits(s, 16) << (n-16);
-        return ret | get_bits(s, n-16);
-    }
-}
-
-/**
- * reads 0-32 bits as a signed integer.
- */
-static inline int get_sbits_long(GetBitContext *s, int n) {
-    return sign_extend(get_bits_long(s, n), n);
-}
-
-/**
- * shows 0-32 bits.
- */
-static inline unsigned int show_bits_long(GetBitContext *s, int n){
-    if(n<=MIN_CACHE_BITS) return show_bits(s, n);
-    else{
-        GetBitContext gb= *s;
-        return get_bits_long(&gb, n);
-    }
-}
-
-static inline int check_marker(GetBitContext *s, const char *msg)
-{
-    int bit= get_bits1(s);
-    if(!bit)
-        av_log(AV_LOG_INFO, "Marker bit missing %s\n", msg);
-
-    return bit;
-}
-
-/**
- * init GetBitContext.
- * @param buffer bitstream buffer, must be FF_INPUT_BUFFER_PADDING_SIZE bytes larger then the actual read bits
- * because some optimized bitstream readers read 32 or 64 bit at once and could read over the end
- * @param bit_size the size of the buffer in bits
- *
- * While GetBitContext stores the buffer size, for performance reasons you are
- * responsible for checking for the buffer end yourself (take advantage of the padding)!
- */
-static inline void init_get_bits(GetBitContext *s,
-                   const uint8_t *buffer, int bit_size)
-{
-    int buffer_size= (bit_size+7)>>3;
-    if(buffer_size < 0 || bit_size < 0) {
-        buffer_size = bit_size = 0;
-        buffer = NULL;
-    }
-
-    s->buffer= buffer;
-    s->size_in_bits= bit_size;
-    s->buffer_end= buffer + buffer_size;
-
-    s->buffer_ptr = (uint32_t*)((intptr_t)buffer&(~3));
-    s->bit_count = 32 + 8*((intptr_t)buffer&3);
-    skip_bits_long(s, 0);
-}
-
-static inline void align_get_bits(GetBitContext *s)
-{
-    int n= (-get_bits_count(s)) & 7;
-    if(n) skip_bits(s, n);
-}
-
-#define tprintf(p, ...) {}
-
-static inline int get_bits_left(GetBitContext *gb)
-{
-    return gb->size_in_bits - get_bits_count(gb);
-}
-
-#endif /* AVCODEC_GET_BITS_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/golomb.c
--- a/ffmpeg_smp/h264dec/libavcodec/golomb.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,184 +0,0 @@
-/*
- * exp golomb vlc stuff
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * @brief
- *     exp golomb vlc stuff
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "libavutil/common.h"
-
-const uint8_t ff_log2_tab[256]={
-    0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
-    5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
-    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
-    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
-    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
-    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
-    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
-	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
-};
-
-const uint8_t ff_golomb_vlc_len[512]={
-14,13,12,12,11,11,11,11,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
-5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
-5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-};
-
-const uint8_t ff_ue_golomb_vlc_code[512]={
-31,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,
- 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-const int8_t ff_se_golomb_vlc_code[512]={
- 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  8, -8,  9, -9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15,
-  4,  4,  4,  4, -4, -4, -4, -4,  5,  5,  5,  5, -5, -5, -5, -5,  6,  6,  6,  6, -6, -6, -6, -6,  7,  7,  7,  7, -7, -7, -7, -7,
-  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
-  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-};
-
-
-const uint8_t ff_ue_golomb_len[256]={
- 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,11,
-11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,13,
-13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
-13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,15,
-15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,17,
-};
-
-const uint8_t ff_interleaved_golomb_vlc_len[256]={
-9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
-9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
-9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-};
-
-const uint8_t ff_interleaved_ue_golomb_vlc_code[256]={
- 15,16,7, 7, 17,18,8, 8, 3, 3, 3, 3, 3, 3, 3, 3,
- 19,20,9, 9, 21,22,10,10,4, 4, 4, 4, 4, 4, 4, 4,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 23,24,11,11,25,26,12,12,5, 5, 5, 5, 5, 5, 5, 5,
- 27,28,13,13,29,30,14,14,6, 6, 6, 6, 6, 6, 6, 6,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-const int8_t ff_interleaved_se_golomb_vlc_code[256]={
-  8, -8,  4,  4,  9, -9, -4, -4,  2,  2,  2,  2,  2,  2,  2,  2,
- 10,-10,  5,  5, 11,-11, -5, -5, -2, -2, -2, -2, -2, -2, -2, -2,
-  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
- 12,-12,  6,  6, 13,-13, -6, -6,  3,  3,  3,  3,  3,  3,  3,  3,
- 14,-14,  7,  7, 15,-15, -7, -7, -3, -3, -3, -3, -3, -3, -3, -3,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-};
-
-const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]={
-0, 1, 0, 0, 2, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
-4, 5, 2, 2, 6, 7, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-8, 9, 4, 4, 10,11,5, 5, 2, 2, 2, 2, 2, 2, 2, 2,
-12,13,6, 6, 14,15,7, 7, 3, 3, 3, 3, 3, 3, 3, 3,
-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,};
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/golomb.h
--- a/ffmpeg_smp/h264dec/libavcodec/golomb.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,410 +0,0 @@
-/*
- * exp golomb vlc stuff
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- * Copyright (c) 2004 Alex Beregszaszi
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * @brief
- *     exp golomb vlc stuff
- * @author Michael Niedermayer <michaelni@gmx.at> and Alex Beregszaszi
- */
-
-#ifndef AVCODEC_GOLOMB_H
-#define AVCODEC_GOLOMB_H
-
-#include <stdint.h>
-#include "get_bits.h"
-
-#define INVALID_VLC           0x80000000
-
-extern const uint8_t ff_golomb_vlc_len[512];
-extern const uint8_t ff_ue_golomb_vlc_code[512];
-extern const  int8_t ff_se_golomb_vlc_code[512];
-extern const uint8_t ff_ue_golomb_len[256];
-
-extern const uint8_t ff_interleaved_golomb_vlc_len[256];
-extern const uint8_t ff_interleaved_ue_golomb_vlc_code[256];
-extern const  int8_t ff_interleaved_se_golomb_vlc_code[256];
-extern const uint8_t ff_interleaved_dirac_golomb_vlc_code[256];
-
-
- /**
- * read unsigned exp golomb code.
- */
-static inline int get_ue_golomb(GetBitContext *gb){
-    unsigned int buf;
-    int log;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf=GET_CACHE(re, gb);
-
-    if(buf >= (1<<27)){
-        buf >>= 32 - 9;
-        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
-        CLOSE_READER(re, gb);
-
-        return ff_ue_golomb_vlc_code[buf];
-    }else{
-        log= 2*av_log2_c(buf) - 31;
-        buf>>= log;
-        buf--;
-        LAST_SKIP_BITS(re, gb, 32 - log);
-        CLOSE_READER(re, gb);
-
-        return buf;
-    }
-}
-
- /**
- * read unsigned exp golomb code, constraint to a max of 31.
- * the return value is undefined if the stored value exceeds 31.
- */
-static inline int get_ue_golomb_31(GetBitContext *gb){
-    unsigned int buf;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf=GET_CACHE(re, gb);
-
-    buf >>= 32 - 9;
-    LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
-    CLOSE_READER(re, gb);
-
-    return ff_ue_golomb_vlc_code[buf];
-}
-
-static inline int svq3_get_ue_golomb(GetBitContext *gb){
-    uint32_t buf;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf=GET_CACHE(re, gb);
-
-    if(buf&0xAA800000){
-        buf >>= 32 - 8;
-        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
-        CLOSE_READER(re, gb);
-
-        return ff_interleaved_ue_golomb_vlc_code[buf];
-    }else{
-        int ret = 1;
-
-        while (1) {
-            buf >>= 32 - 8;
-            LAST_SKIP_BITS(re, gb, FFMIN(ff_interleaved_golomb_vlc_len[buf], 8));
-
-            if (ff_interleaved_golomb_vlc_len[buf] != 9){
-                ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1;
-                ret |= ff_interleaved_dirac_golomb_vlc_code[buf];
-                break;
-            }
-            ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf];
-            UPDATE_CACHE(re, gb);
-            buf = GET_CACHE(re, gb);
-        }
-
-        CLOSE_READER(re, gb);
-        return ret - 1;
-    }
-}
-
-/**
- * read unsigned truncated exp golomb code.
- */
-static inline int get_te0_golomb(GetBitContext *gb, int range){
-    assert(range >= 1);
-
-    if(range==1)      return 0;
-    else if(range==2) return get_bits1(gb)^1;
-    else              return get_ue_golomb(gb);
-}
-
-/**
- * read unsigned truncated exp golomb code.
- */
-static inline int get_te_golomb(GetBitContext *gb, int range){
-    assert(range >= 1);
-
-    if(range==2) return get_bits1(gb)^1;
-    else         return get_ue_golomb(gb);
-}
-
-
-/**
- * read signed exp golomb code.
- */
-static inline int get_se_golomb(GetBitContext *gb){
-    unsigned int buf;
-    int log;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf=GET_CACHE(re, gb);
-
-    if(buf >= (1<<27)){
-        buf >>= 32 - 9;
-        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
-        CLOSE_READER(re, gb);
-
-        return ff_se_golomb_vlc_code[buf];
-    }else{
-        log= 2*av_log2_c(buf) - 31;
-        buf>>= log;
-
-        LAST_SKIP_BITS(re, gb, 32 - log);
-        CLOSE_READER(re, gb);
-
-        if(buf&1) buf= -(buf>>1);
-        else      buf=  (buf>>1);
-
-        return buf;
-    }
-}
-
-static inline int svq3_get_se_golomb(GetBitContext *gb){
-    unsigned int buf;
-    int log;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf=GET_CACHE(re, gb);
-
-    if(buf&0xAA800000){
-        buf >>= 32 - 8;
-        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
-        CLOSE_READER(re, gb);
-
-        return ff_interleaved_se_golomb_vlc_code[buf];
-    }else{
-        LAST_SKIP_BITS(re, gb, 8);
-        UPDATE_CACHE(re, gb);
-        buf |= 1 | (GET_CACHE(re, gb) >> 8);
-
-        if((buf & 0xAAAAAAAA) == 0)
-            return INVALID_VLC;
-
-        for(log=31; (buf & 0x80000000) == 0; log--){
-            buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30);
-        }
-
-        LAST_SKIP_BITS(re, gb, 63 - 2*log - 8);
-        CLOSE_READER(re, gb);
-
-        return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1;
-    }
-}
-
-static inline int dirac_get_se_golomb(GetBitContext *gb){
-    uint32_t buf;
-    uint32_t ret;
-
-    ret = svq3_get_ue_golomb(gb);
-
-    if (ret) {
-        OPEN_READER(re, gb);
-        UPDATE_CACHE(re, gb);
-        buf = SHOW_SBITS(re, gb, 1);
-        LAST_SKIP_BITS(re, gb, 1);
-        ret = (ret ^ buf) - buf;
-        CLOSE_READER(re, gb);
-    }
-
-    return ret;
-}
-
-/**
- * read unsigned golomb rice code (ffv1).
- */
-static inline int get_ur_golomb(GetBitContext *gb, int k, int limit, int esc_len){
-    unsigned int buf;
-    int log;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf=GET_CACHE(re, gb);
-
-    log= av_log2_c(buf);
-
-    if(log > 31-limit){
-        buf >>= log - k;
-        buf += (30-log)<<k;
-        LAST_SKIP_BITS(re, gb, 32 + k - log);
-        CLOSE_READER(re, gb);
-
-        return buf;
-    }else{
-        LAST_SKIP_BITS(re, gb, limit);
-        UPDATE_CACHE(re, gb);
-
-        buf = SHOW_UBITS(re, gb, esc_len);
-
-        LAST_SKIP_BITS(re, gb, esc_len);
-        CLOSE_READER(re, gb);
-
-        return buf + limit - 1;
-    }
-}
-
-/**
- * read unsigned golomb rice code (jpegls).
- */
-static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit, int esc_len){
-    unsigned int buf;
-    int log;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf=GET_CACHE(re, gb);
-
-    log= av_log2_c(buf);
-
-    if(log - k >= 32-MIN_CACHE_BITS+(MIN_CACHE_BITS==32) && 32-log < limit){
-        buf >>= log - k;
-        buf += (30-log)<<k;
-        LAST_SKIP_BITS(re, gb, 32 + k - log);
-        CLOSE_READER(re, gb);
-
-        return buf;
-    }else{
-        int i;
-        for(i=0; SHOW_UBITS(re, gb, 1) == 0; i++){
-            LAST_SKIP_BITS(re, gb, 1);
-            UPDATE_CACHE(re, gb);
-        }
-        SKIP_BITS(re, gb, 1);
-
-        if(i < limit - 1){
-            if(k){
-                buf = SHOW_UBITS(re, gb, k);
-                LAST_SKIP_BITS(re, gb, k);
-            }else{
-                buf=0;
-            }
-
-            CLOSE_READER(re, gb);
-            return buf + (i<<k);
-        }else if(i == limit - 1){
-            buf = SHOW_UBITS(re, gb, esc_len);
-            LAST_SKIP_BITS(re, gb, esc_len);
-            CLOSE_READER(re, gb);
-
-            return buf + 1;
-        }else
-            return -1;
-    }
-}
-
-/**
- * read signed golomb rice code (ffv1).
- */
-static inline int get_sr_golomb(GetBitContext *gb, int k, int limit, int esc_len){
-    int v= get_ur_golomb(gb, k, limit, esc_len);
-
-    v++;
-    if (v&1) return v>>1;
-    else return -(v>>1);
-
-//    return (v>>1) ^ -(v&1);
-}
-
-/**
- * read signed golomb rice code (flac).
- */
-static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit, int esc_len){
-    int v= get_ur_golomb_jpegls(gb, k, limit, esc_len);
-    return (v>>1) ^ -(v&1);
-}
-
-/**
- * read unsigned golomb rice code (shorten).
- */
-static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k){
-        return get_ur_golomb_jpegls(gb, k, INT_MAX, 0);
-}
-
-/**
- * read signed golomb rice code (shorten).
- */
-static inline int get_sr_golomb_shorten(GetBitContext* gb, int k)
-{
-    int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0);
-    if (uvar & 1)
-        return ~(uvar >> 1);
-    else
-        return uvar >> 1;
-}
-
-
-
-#ifdef TRACE
-
-static inline int get_ue(GetBitContext *s, char *file, const char *func, int line){
-    int show= show_bits(s, 24);
-    int pos= get_bits_count(s);
-    int i= get_ue_golomb(s);
-    int len= get_bits_count(s) - pos;
-    int bits= show>>(24-len);
-
-    print_bin(bits, len);
-
-    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
-
-    return i;
-}
-
-static inline int get_se(GetBitContext *s, char *file, const char *func, int line){
-    int show= show_bits(s, 24);
-    int pos= get_bits_count(s);
-    int i= get_se_golomb(s);
-    int len= get_bits_count(s) - pos;
-    int bits= show>>(24-len);
-
-    print_bin(bits, len);
-
-    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
-
-    return i;
-}
-
-static inline int get_te(GetBitContext *s, int r, char *file, const char *func, int line){
-    int show= show_bits(s, 24);
-    int pos= get_bits_count(s);
-    int i= get_te0_golomb(s, r);
-    int len= get_bits_count(s) - pos;
-    int bits= show>>(24-len);
-
-    print_bin(bits, len);
-
-    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
-
-    return i;
-}
-
-#define get_ue_golomb(a) get_ue(a, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-#define get_se_golomb(a) get_se(a, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-#define get_te_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-#define get_te0_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-
-#endif
-
-
-#endif /* AVCODEC_GOLOMB_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,215 +0,0 @@
-#include "config.h"
-#include "h264.h"
-#include "h264_misc.h"
-#include <math.h>
-
-H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int width, int height, h264_options *opts){
-    int i;
-    const int mb_height = (height + 15) / 16;
-    const int mb_width  = (width  + 15) / 16;
-    const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16
-
-    ff_init_cabac_states();
-
-    H264Context *h= av_mallocz(sizeof(H264Context));
-
-    start_timer(h, TOTAL);
-    h->file_name = file_name;
-    h->profile = opts->profile;
-    for (i=0; i<PROFILE_STAGES; i++)
-        h->total_time[i]=0;
-
-    h->ifile=ifile;
-    h->ofile =ofile;
-
-    h->verbose =opts->verbose;
-    h->no_mbd =opts->no_mbd;
-    h->static_3d =opts->static_3d;
-    h->pipe_bufs = opts->pipe_bufs;
-    h->slice_bufs = opts->slice_bufs;
-
-    h->ed_ppe_threads =0;
-    if (opts->ppe_ed){
-        h->ed_ppe_threads = (opts->threads >opts->ppe_ed)? opts->ppe_ed :opts->threads;
-    }
-
-    h->threads = opts->threads - h->ed_ppe_threads;
-    h->smt = opts->smt;
-    if (h->smt){
-        h->threads *= 2;
-    }
-
-    h->num_frames = opts->numframes;
-
-    h->frame_width = width;
-    h->frame_height = height;
-
-    while ((width/2) %STRIDE_ALIGN)
-        width+=STRIDE_ALIGN;
-    h->width = width;
-    h->height = mb_height*16;
-
-    h->mb_height = mb_height;
-    h->mb_width = mb_width;
-    h->mb_stride = mb_stride;
-    h->b4_stride = mb_width*4 + 1;
-    h->b_stride = mb_width*4;
-
-    h->smb_width = opts->smb_size[0];
-    h->smb_height = opts->smb_size[1] < h->smb_width ?  opts->smb_size[1]  : h->smb_width;
-    h->smbc = getSuperMBContext(h, h->smb_width, h->smb_height);    
-
-    h->wave_order = opts->wave_order;
-
-    h->pipe_bufs = opts->pipe_bufs;
-
-    h->max_dpb_cnt = DPB_SIZE + opts->pipe_bufs;
-    h->free_dpb_cnt = h->max_dpb_cnt;
-    h->dpb = av_mallocz (h->max_dpb_cnt* sizeof (DecodedPicture));
-    
-
-    h->free_sb_cnt = h->threads*opts->slice_bufs + (h->no_mbd != 0) ;  //one extra to overlap some latency of signaling/freeing slicebuffers in entropy only mode
-    h->sb_size = h->free_sb_cnt;
-    h->sb = av_mallocz(h->sb_size* sizeof(SliceBufferEntry));
-
-    h->rl_q.size = FFMAX(1, FFMIN( (h->height-3 - 512)/16, h->mb_width/2)) +1;
-    h->rl_q.free = h->rl_q.size -1;
-    h->rl_q.ready=0;
-    h->rl_q.fi = h->rl_q.fo= 0;
-    h->rl_q.queue = av_malloc(h->rl_q.size* sizeof(RingLineEntry*));
-    for (i=0; i<h->rl_q.size; i++){
-        if( posix_memalign((void**)&h->rl_q.queue[i],64,sizeof(RingLineEntry)))
-            h->rl_q.queue[i]=NULL;
-        h->rl_q.queue[i]->top = av_malloc(h->mb_width*sizeof(TopBorder));
-    }
-
-    h->rl_q.queue[0]->prev_line = h->rl_q.queue[h->rl_q.size-1];
-    for (i=1; i<h->rl_q.size; i++){
-        h->rl_q.queue[i]->prev_line = h->rl_q.queue[i-1];
-    }
-
-    if( HAVE_MMX | HAVE_ALTIVEC| HAVE_NEON ){
-        for(i=0; i<16; i++){
-            #define T(x) (x>>2) | ((x<<2) & 0xF)
-            h->zigzag_scan[i] = T(zigzag_scan[i]);
-            #undef T
-        }
-        for(i=0; i<64; i++){
-            #define T(x) (x>>3) | ((x&7)<<3)
-            h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
-            #undef T
-        }
-    }else{
-        memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
-        memcpy(h->zigzag_scan8x8, ff_zigzag_direct, 64*sizeof(uint8_t));
-    }
-
-    pthread_mutex_init(&h->smb_lock, NULL);
-    pthread_mutex_init(&h->sdl_lock, NULL);
-    pthread_cond_init(&h->sdl_cond, NULL);
-
-    ///pthread initialization
-    pthread_mutex_init(&h->ilock, NULL);
-    pthread_cond_init(&h->icond, NULL);
-    pthread_mutex_init(&h->slock, NULL);
-    pthread_cond_init(&h->scond, NULL);
-    pthread_mutex_init(&h->tlock, NULL);
-    pthread_cond_init(&h->tcond, NULL);
-    pthread_mutex_init(&h->tdlock, NULL);
-    pthread_cond_init(&h->tdcond, NULL);
-    h->start =!opts->numamap; //default dont wait for start signal
-    h->statmbd = opts->statmbd;
-    h->rl_side_touch= opts->numamap;
-    h->touch_start=0;
-    h->setaff =opts->statsched;
-    h->init_threads=0;
-
-    pthread_mutex_init(&h->task_lock, NULL);
-    pthread_cond_init(&h->task_cond, NULL);
-    for (i=0; i<STAGES; i++){
-        pthread_mutex_init (&h->lock[i], NULL);
-        pthread_cond_init (&h->cond[i], NULL);
-
-        pthread_mutex_init (&h->sb_q[i].lock, NULL);
-        pthread_cond_init (&h->sb_q[i].cond, NULL);
-        h->sb_q[i].size = h->free_sb_cnt; //change to num threads later
-        h->sb_q[i].queue = av_malloc(h->free_sb_cnt* sizeof(SliceBufferEntry*));
-        h->sb_q[i].cnt = h->sb_q[i].fi = h->sb_q[i].fo =0;
-    }
-
-#if HAVE_LIBSDL2
-    h->sdlq.size=2;
-    h->sdlq.ready=2;
-    h->sdlq.queue = av_malloc(2* sizeof(SDL_Texture*));
-    pthread_mutex_init (&h->sdlq.sdl_lock, NULL);
-    pthread_cond_init (&h->sdlq.sdl_cond, NULL);
-#endif
-
-    h->display=opts->display;
-    h->fullscreen=opts->fullscreen;
-
-    return h;
-}
-
-
-void free_h264dec_context(H264Context *h) {
-    int i;
-
-    for(i=0; i<h->max_dpb_cnt; i++)
-        free_dp(&h->dpb[i]);
-    av_free (h->dpb);
-
-    for(i=0; i<h->sb_size; i++){
-        if (h->sb[i].initialized){
-            free_sb_entry(&h->sb[i]);
-        }
-    }
-    av_freep(&h->sb);
-
-    for (i=0; i<h->rl_q.size; i++){
-        av_freep(&h->rl_q.queue[i]->top);
-        av_freep(&h->rl_q.queue[i]);
-    }
-    av_freep(&h->rl_q.queue);
-
-    ///pthread cleanup
-    pthread_mutex_destroy (&h->task_lock);
-    pthread_cond_destroy (&h->task_cond);
-    for (i=0; i<STAGES; i++){
-        pthread_mutex_destroy (&h->lock[i]);
-        pthread_cond_destroy (&h->cond[i]);
-
-        pthread_mutex_destroy (&h->sb_q[i].lock);
-        pthread_cond_destroy (&h->sb_q[i].cond);
-        av_freep( &h->sb_q[i].queue);
-    }
-    pthread_mutex_destroy (&h->slock);
-    pthread_cond_destroy (&h->scond);
-    pthread_mutex_destroy (&h->ilock);
-    pthread_cond_destroy (&h->icond);
-
-    pthread_mutex_destroy(&h->smb_lock);
-    pthread_mutex_destroy (&h->sdl_lock);
-    pthread_cond_destroy (&h->sdl_cond);
-#if HAVE_LIBSDL2
-    av_free(h->sdlq.queue);
-    pthread_mutex_destroy (&h->sdlq.sdl_lock);
-    pthread_cond_destroy (&h->sdlq.sdl_cond);
-#endif
-
-    stop_timer(h, TOTAL);
-    if (h->threads==0){
-        for (i=0; i<PROFILE_STAGES; i++)
-            h->total_time[i] /= h->num_frames;
-        double others = h->total_time[TOTAL];
-        for (i=1; i<PROFILE_STAGES; i++)
-            others-=h->total_time[i];
-        if (h->profile == 1){
-            printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [MBREC %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED], h->total_time[REC], others);
-        }else if (h->profile ==2){
-            printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [PRED  %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED],h->total_time[REC], others);
-        }
-    }
-
-    av_free(h);
-}
\ No newline at end of file
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,76 +0,0 @@
-/*
-* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
-* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
-*
-* This file is part of FFmpeg.
-*
-* FFmpeg is free software; you can redistribute it and/or
-* modify it under the terms of the GNU Lesser General Public
-* License as published by the Free Software Foundation; either
-* version 2.1 of the License, or (at your option) any later version.
-*
-* FFmpeg is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-* Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public
-* License along with FFmpeg; if not, write to the Free Software
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-*/
-
-/**
-* @file
-* H.264 / AVC / MPEG4 part10 codec.
-* @author Michael Niedermayer <michaelni@gmx.at>
-*/
-
-#ifndef H264_H
-#define H264_H
-
-#include "h264_entropy.h"
-#include "h264_data.h"
-#include "h264_mc.h"
-#include "h264_misc.h"
-#include "h264_dsp.h"
-#include "h264_pred.h"
-#include "h264_parser.h"
-#include "h264_nal.h"
-#include "h264_rec.h"
-#include "h264_deblock.h"
-#include "h264_types.h"
-
-typedef struct h264_options{
-    int statsched;
-    int statmbd;
-    int numamap;
-    int no_mbd;
-    int numframes;
-    int display;
-    int fullscreen;
-    int verbose;
-    int ppe_ed;         // only useful for Cell
-    int profile;
-    int threads;
-    int smb_size[2];    // only useful for OmpSs
-    int wave_order;
-    int static_3d;
-    int pipe_bufs;
-    int slice_bufs;
-    int smt;
-}h264_options;
-
-int h264_decode_cell(H264Context *h);
-int h264_decode_cell_seq(H264Context *h);
-
-int h264_decode_ompss(H264Context *h);
-
-int h264_decode_pthread(H264Context *h);
-int h264_decode_seq(H264Context *h);
-
-
-H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int frame_width, int frame_height, h264_options *opts);
-void free_h264dec_context(H264Context *h);
-
-
-#endif /* AVCODEC_H264_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_cell.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_cell.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1242 +0,0 @@
-
-#include "h264_types.h"
-#include "h264_parser.h"
-#include "h264_nal.h"
-#include "h264_entropy.h"
-#include "h264_rec.h"
-#include "h264_misc.h"
-#include "cell/h264_types_spu.h"
-#include "h264_pthread.h"
-
-#include <pthread.h>
-#include <assert.h>
-#include <unistd.h>
-
-#include <libspe2.h>
-#include <ppu_intrinsics.h>
-#include <cbe_mfc.h>
-#include <libsync.h>
-
-// spe global variables
-unsigned rl_cnt_var, rl_mutex_var, rl_cond_var;
-atomic_ea_t rl_cnt;
-cond_ea_t rl_cond;
-mutex_ea_t rl_lock;
-
-H264spe * spe_params;
-unsigned mutex_var[16];
-unsigned cond_var[16];
-unsigned atomic_var[16];
-
-pthread_t * spe_tid;
-spe_context_ptr_t *spe_context;
-void** spe_control_area;
-void** spe_ls_area;
-H264slice **spe_slice_buf;
-
-H264spe * spe_ed_params;
-unsigned mutex_ed_var[16];
-unsigned cond_ed_var[16];
-unsigned atomic_ed_var[16];
-
-pthread_t * spe_ed_tid;
-spe_context_ptr_t *spe_ed_context;
-void** spe_ed_control_area;
-void** spe_ed_ls_area;
-EDSlice_spu **spe_ed_slice_buf;
-
-//structs to propagate stop signal
-MBSlice last_slice;
-EDSlice last_ed_slice;
-DecodedPicture last_pic;
-RawFrame last_frm;
-
-static int direct_B_resolved(EDSlice *s, int *poc_list, int *poc_cnt){
-    int i;
-    int cnt = *poc_cnt;
-    for(i=0; i<cnt; i++){
-        if (poc_list[i]==s->ref_list[1][0]->poc){
-            *poc_cnt=i+1;
-            while(++i<cnt)
-                poc_list[i]=0;
-            return 1;
-        }
-    }
-    return 0;
-}
-
-static void update_IP_poc_list(int *poc_list, int *poc_cnt, int poc) {
-    int i=0;
-    int cnt = *poc_cnt;
-
-    while (poc_list[i] > poc) { i++;}
-    if ( i< cnt)
-        memmove(&poc_list[i+1], &poc_list[i], (cnt-i)*sizeof(int));
-
-    poc_list[i]=poc;
-    (*poc_cnt)++;
-}
-
-static void *spe_ed_thread(void *arg){
-    H264spe *params = (H264spe *)arg;
-    unsigned int idx = params->idx;
-    unsigned int runflags = 0;
-    unsigned int entry = SPE_DEFAULT_ENTRY;
-    // run SPE context
-    spe_context_run(spe_ed_context[idx],  &entry, runflags, (void*) params, NULL, NULL);
-    // done - now exit thread
-    pthread_exit(NULL);
-}
-
-static void create_spe_ED_threads(H264Context *h, int ip_threads, int b_threads) {
-    int i;
-    int num_threads = ip_threads+b_threads;
-    spe_program_handle_t * spe_program = spe_image_open("spe_ed");
-    // reserve memory for spe thread id, context and argument addresses
-    spe_ed_tid = av_malloc(num_threads * sizeof (pthread_t));
-    spe_ed_context = av_malloc(num_threads * sizeof (spe_context_ptr_t));
-    spe_ed_params = av_malloc(num_threads * sizeof (H264spe));
-    spe_ed_control_area = av_malloc(num_threads * sizeof (void*));
-    spe_ed_ls_area = av_malloc(num_threads * sizeof (void*));
-    spe_ed_slice_buf = av_malloc(num_threads * sizeof (void*));
-
-    if (spe_program == NULL)
-        av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno));
-
-    for (i = 0; i < num_threads; i++) {
-        // create context for spe program
-        spe_ed_context[i] = spe_context_create(SPE_MAP_PS, NULL);
-        if (spe_ed_context[i] == NULL)
-            av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno));
-        // load SPE program into main memory
-        if ((spe_program_load(spe_ed_context[i], spe_program)) == -1)
-            av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno));
-        //get the control_area for fast mailboxing
-        if ((spe_ed_control_area[i] = spe_ps_area_get(spe_ed_context[i], SPE_CONTROL_AREA)) == NULL)
-            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno));
-        //get ls area for inter spe communication
-        if ((spe_ed_ls_area[i] = spe_ls_area_get(spe_ed_context[i])) == NULL)
-            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno));
-    }
-
-    for (i = 0; i < ip_threads; i++) {
-        spe_ed_params[i].mb_width = h->mb_width;
-        spe_ed_params[i].mb_stride = h->mb_stride;
-        spe_ed_params[i].mb_height = h->mb_height;
-        spe_ed_params[i].type = EDIP;
-        spe_ed_params[i].spe_id = i;
-        spe_ed_params[i].idx = i;
-        //spe_ed_params[i].spe_total = ip_threads; //not used
-        //spe_params[i].slice_params= &slice_params;
-        spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads];
-        spe_ed_params[i].tgt_spe = spe_ed_ls_area[(i+1)%num_threads];
-
-        spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i];
-        spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i];
-        spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0);
-
-        mutex_init(spe_ed_params[i].lock);
-        cond_init(spe_ed_params[i].cond);
-        if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i]))
-            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
-
-        //slicebufaddr
-        spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]);
-        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
-    }
-    for (int j = 0; j < b_threads; j++) {
-        i = j+ip_threads;
-        spe_ed_params[i].mb_width = h->mb_width;
-        spe_ed_params[i].mb_stride = h->mb_stride;
-        spe_ed_params[i].mb_height = h->mb_height;
-        spe_ed_params[i].type = EDB;
-        spe_ed_params[i].idx = i;
-        spe_ed_params[i].spe_id = j;
-        spe_ed_params[i].spe_total = b_threads;
-        //spe_params[i].slice_params= &slice_params;
-        //spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads];
-        spe_ed_params[i].tgt_spe = spe_ed_ls_area[((j+1)%b_threads) + ip_threads];
-
-        spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i];
-        spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i];
-        spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0);
-
-        mutex_init(spe_ed_params[i].lock);
-        cond_init(spe_ed_params[i].cond);
-        if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i]))
-            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
-
-        //slicebufaddr
-        spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]);
-        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
-    }
-    spe_image_close(spe_program);
-
-}
-
-static void fill_EDSlice_spu(EDSlice_spu *dst, EDSlice *src){
-    dst->pps 	= src->pps;
-    dst->mbs 	= src->mbs;
-    dst->state 	= src->state;
-    dst->qp_thresh = src->qp_thresh;
-    dst->pic	= *src->current_picture;
-
-    dst->ref_count[0] = src->ref_count[0];
-    dst->ref_count[1] = src->ref_count[1];
-    dst->slice_type	  = src->slice_type;
-    dst->slice_type_nos = src->slice_type_nos;
-    dst->direct_8x8_inference_flag = src->direct_8x8_inference_flag;
-    dst->list_count = src->list_count;
-    dst->coded_pic_num = src->coded_pic_num;
-
-    GetBitContext *gb = &src->gb;
-    align_get_bits( gb);
-    dst->bytestream_start = gb->buffer + get_bits_count(gb)/8;
-    dst->byte_bufsize = (get_bits_left(gb) + 7)/8;
-
-    dst->transform_bypass = src->transform_bypass;
-    dst->direct_spatial_mv_pred = src->direct_spatial_mv_pred;
-    memcpy(dst->map_col_to_list0, src->map_col_to_list0, 2*16*sizeof(int));
-    memcpy(dst->dist_scale_factor, src->dist_scale_factor, 16*sizeof(int));
-    dst->cabac_init_idc = src->cabac_init_idc;
-    memcpy(dst->ref2frm, src->ref2frm, 2*64*sizeof(int));
-    dst->chroma_qp[0]= src->chroma_qp[0];
-    dst->chroma_qp[1]= src->chroma_qp[1];
-    dst->qscale = src->qscale;
-    dst->last_qscale_diff = src->last_qscale_diff;
-
-    if (src->slice_type_nos == FF_B_TYPE) dst->list1 = *src->ref_list[1][0];
-}
-
-static void send_slice_to_spe_and_wait(EDSlice_spu *s, int id){
-    unsigned status;
-
-    spe_mfcio_get(spe_ed_context[id], (unsigned) spe_ed_slice_buf[id], s, sizeof(EDSlice_spu), 14, 0, 0);
-    spe_mfcio_tag_status_read(spe_ed_context[id], 1<<14, SPE_TAG_ALL, &status);
-
-
-    _spe_in_mbox_write(spe_ed_control_area[id], 0);
-
-    while (!spe_out_mbox_status(spe_ed_context[id])){
-        //pthread_yield();
-        usleep(1000);
-    }
-    _spe_out_mbox_read(spe_ed_control_area[id]);
-}
-
-static int decode_slice_entropy_cell(EntropyContext *ec, EDSlice *s, int id){
-    int i,j;
-
-    if( !s->pps.cabac ){
-        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
-        return -1;
-    }
-    DECLARE_ALIGNED(16, EDSlice_spu, slice);
-    fill_EDSlice_spu(&slice, s);
-
-    send_slice_to_spe_and_wait(&slice, id);
-
-    return 0;
-}
-
-static int decode_slice_entropy_cell_seq(H264Context *h, EntropyContext *ec, EDSlice *s){
-    int i,j;
-
-    if( !s->pps.cabac ){
-        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
-        return -1;
-    }
-    DECLARE_ALIGNED(16, EDSlice_spu, slice);
-    fill_EDSlice_spu(&slice, s);
-
-    send_slice_to_spe_and_wait(&slice, 0);
-    
-    if (s->release_cnt>0) {
-        for (int i=0; i<s->release_cnt; i++){
-            release_pib_entry(h, s->release_ref[i], 2);
-        }
-        s->release_cnt=0;
-    }
-
-    release_pib_entry(h, s->current_picture, 1);
-    av_freep(&s->gb.raw);
-    if (s->gb.rbsp)
-        av_freep(&s->gb.rbsp);
-
-    return 0;
-}
-
-static void *entr_IP_spe_thread(void *arg){
-    EDThreadContext *eip = (EDThreadContext *) arg;
-    H264Context *h = eip->h;
-// 	printf("eip %d, pid %d\n", eip->thread_num, syscall(SYS_gettid));
-    for (int i=0; i<SLICE_BUFS; i++){
-        eip->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb));
-    }
-
-    EntropyContext *ec = get_entropy_context(h);
-    EDSlice *s;
-
-    for(;;){
-        {
-            pthread_mutex_lock(&eip->ed_lock);
-            while (eip->ed_cnt <= 0)
-                pthread_cond_wait(&eip->ed_cond, &eip->ed_lock);
-            s = &eip->ed_q[eip->ed_fo];
-            eip->ed_fo++; eip->ed_fo %= MAX_SLICE_COUNT;
-            pthread_mutex_unlock(&eip->ed_lock);
-        }
-
-        if (s->state<0)
-            break;
-        {
-            pthread_mutex_lock(&eip->mbs_lock);
-            while (eip->mbs_cnt <= 0)
-                pthread_cond_wait(&eip->mbs_cond, &eip->mbs_lock);
-
-            s->mbs = eip->mbs[eip->mbs_fo];
-            s->ed = eip;
-            eip->mbs_cnt--;
-            eip->mbs_fo++; eip->mbs_fo%=SLICE_BUFS;
-            pthread_mutex_unlock(&eip->mbs_lock);
-        }
-        if (eip->cell){
-            decode_slice_entropy_cell(ec, s, eip->thread_num);
-        }else{
-            decode_slice_entropy(ec, s);
-        }
-
-//         {
-//             pthread_mutex_lock(&h->lock[ENTROPY2]);
-//             h->ed_poc[h->ed_poc_fi++ % MAX_SLICE_COUNT] = s->current_picture->poc;
-//             while (h->ed_poc_fi > h->ed_poc_fo + MAX_SLICE_COUNT)
-//                 h->ed_poc_fo++;
-//
-//             pthread_cond_signal(&h->cond[ENTROPY2]);
-//             pthread_mutex_unlock(&h->lock[ENTROPY2]);
-//         }
-
-        {
-            pthread_mutex_lock(&h->lock[ENTROPY4]);
-            while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
-                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
-            h->ed_reorder_q[h->ed_reorder_fi] = *s;
-            h->ed_reorder_cnt++;
-            h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
-            pthread_cond_signal(&h->cond[ENTROPY4]);
-            pthread_mutex_unlock(&h->lock[ENTROPY4]);
-        }
-
-        {
-            pthread_mutex_lock(&eip->ed_lock);
-            eip->ed_cnt--;
-            pthread_cond_signal(&eip->ed_cond);
-            pthread_mutex_unlock(&eip->ed_lock);
-        }
-    }
-
-    free_entropy_context(ec);
-
-    pthread_exit(NULL);
-    return NULL;
-}
-
-static void *entr_B_spe_thread(void *arg){
-    EDThreadContext *eb = (EDThreadContext *) arg;
-    H264Context *h = eb->h;
-// 	printf("eb %d, pid %d\n", eb->thread_num, syscall(SYS_gettid));
-    for (int i=0; i<SLICE_BUFS; i++){
-        eb->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb));
-    }
-
-    EntropyContext *ec = get_entropy_context(h);
-    EDSlice *s;
-
-    for(;;){
-        {
-            pthread_mutex_lock(&eb->ed_lock);
-            while (eb->ed_cnt <= 0)
-                pthread_cond_wait(&eb->ed_cond, &eb->ed_lock);
-            s = &eb->ed_q[eb->ed_fo];
-            eb->ed_fo++; eb->ed_fo %= MAX_SLICE_COUNT;
-            pthread_mutex_unlock(&eb->ed_lock);
-        }
-
-        if (s->state<0)
-            break;
-        {
-            pthread_mutex_lock(&eb->mbs_lock);
-            while (eb->mbs_cnt <= 0)
-                pthread_cond_wait(&eb->mbs_cond, &eb->mbs_lock);
-            s->mbs = eb->mbs[eb->mbs_fo];
-            s->ed = eb;
-            eb->mbs_cnt--;
-            eb->mbs_fo++; eb->mbs_fo%=SLICE_BUFS;
-            pthread_mutex_unlock(&eb->mbs_lock);
-        }
-        //decode_B_slice_entropy(&hcabac, &cabac, s, eb, eb->prev_ed);
-        decode_slice_entropy_cell(ec, s, eb->thread_num + h->edip_threads);
-
-        {
-            pthread_mutex_lock(&h->lock[ENTROPY4]);
-            while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
-                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
-            h->ed_reorder_q[h->ed_reorder_fi] = *s;
-            h->ed_reorder_cnt++;
-            h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
-            pthread_cond_signal(&h->cond[ENTROPY4]);
-            pthread_mutex_unlock(&h->lock[ENTROPY4]);
-
-        }
-
-        {
-            pthread_mutex_lock(&eb->ed_lock);
-            eb->ed_cnt--;
-            pthread_cond_signal(&eb->ed_cond);
-            pthread_mutex_unlock(&eb->ed_lock);
-        }
-    }
-    eb->lines_cnt++;
-
-    free_entropy_context(ec);
-
-    pthread_exit(NULL);
-    return NULL;
-}
-
-static void *entr_B_distribute(void *arg){
-    H264Context *h = (H264Context *) arg;
-    EDSlice *s;
-
-    int i, n=0, poc;
-
-// 	printf("eb dist, pid %d\n", syscall(SYS_gettid));
-
-    for(i=0; i<h->edb_threads; i++){
-        h->b[i].h =h;
-        h->b[i].thread_num =i;
-        h->b[i].thread_total =h->edb_threads;
-        pthread_mutex_init(&h->b[i].mbs_lock, NULL);
-        pthread_cond_init(&h->b[i].mbs_cond, NULL);
-        h->b[i].mbs_fo = 0;
-        h->b[i].mbs_cnt = SLICE_BUFS;
-        h->b[i].ed_fi =0;
-        h->b[i].ed_fo =0;
-        h->b[i].ed_cnt =0;
-        h->b[i].lines_cnt =0;
-        h->b[i].prev_ed = &h->b[(i-1 +h->edb_threads) % h->edb_threads];
-        pthread_mutex_init(&h->b[i].ed_lock, NULL);
-        pthread_cond_init(&h->b[i].ed_cond, NULL);
-        pthread_create(&h->ed_B_thr[i], NULL, entr_B_spe_thread, &h->b[i]);
-    }
-
-    for(;;){
-        {
-            pthread_mutex_lock(&h->lock[ENTROPY3B]);
-            while (h->ed_B_cnt<=0)
-                pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
-            s= &h->ed_B_q[h->ed_B_fo];
-            h->ed_B_fo++; h->ed_B_fo %= MAX_SLICE_COUNT;
-            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
-
-        }
-        if (s->state<0)
-            break;
-
-        if (s->ref_list[1][0]->slice_type_nos != FF_B_TYPE){
-            while (poc < s->ref_list[1][0]->poc){
-                pthread_mutex_lock(&h->lock[ENTROPY2]);
-                while (poc == h->ed_poc)
-                    pthread_cond_wait(&h->cond[ENTROPY2], &h->lock[ENTROPY2]);
-                poc = h->ed_poc;
-                pthread_mutex_unlock(&h->lock[ENTROPY2]);
-            }
-        }
-        {
-            pthread_mutex_lock(&h->b[n].ed_lock);
-            while (h->b[n].ed_cnt >= MAX_SLICE_COUNT)
-                pthread_cond_wait(&h->b[n].ed_cond, &h->b[n].ed_lock);
-            h->b[n].ed_q[ h->b[n].ed_fi] = *s;
-            h->b[n].ed_cnt++;
-            h->b[n].ed_fi++; h->b[n].ed_fi %= MAX_SLICE_COUNT;
-            pthread_cond_signal(&h->b[n].ed_cond);
-            pthread_mutex_unlock(&h->b[n].ed_lock);
-
-            n++; n%=h->edb_threads;
-        }
-        {
-            pthread_mutex_lock(&h->lock[ENTROPY3B]);
-            h->ed_B_cnt--;
-            pthread_cond_signal(&h->cond[ENTROPY3B]);
-            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
-
-        }
-
-    }
-
-    for (i=0; i<h->edb_threads; i++){
-        pthread_mutex_lock(&h->b[i].ed_lock);
-        while (h->b[i].ed_cnt >= MAX_SLICE_COUNT)
-            pthread_cond_wait(&h->b[i].ed_cond, &h->b[i].ed_lock);
-        h->b[i].ed_q[ h->b[i].ed_fi] = *s;
-        h->b[i].ed_cnt++;
-        h->b[i].ed_fi++; h->b[i].ed_fi %= MAX_SLICE_COUNT;
-        pthread_cond_signal(&h->b[i].ed_cond);
-        pthread_mutex_unlock(&h->b[i].ed_lock);
-
-    }
-    for(int i=0; i<h->edb_threads; i++){
-        pthread_join(h->ed_B_thr[i], NULL);
-    }
-    pthread_exit(NULL);
-    return NULL;
-}
-
-
-static void *entr_IPB_distribute(void *arg){
-    H264Context *h = (H264Context *) arg;
-    EDSlice *s;
-    int i,n=0;
-
-    create_spe_ED_threads(h, h->edip_threads, h->edb_threads);
-    pthread_create(&h->ed_B_dist, NULL, entr_B_distribute, h);
-    for(i=0; i<h->edip_threads + h->edip_ppe_threads; i++){
-        h->ip[i].h =h;
-        h->ip[i].cell = (i >= h->edip_ppe_threads);
-        pthread_mutex_init(&h->ip[i].mbs_lock, NULL);
-        pthread_cond_init(&h->ip[i].mbs_cond, NULL);
-        h->ip[i].thread_num = i - h->edip_ppe_threads;
-        h->ip[i].thread_total=h->edip_threads+ h->edip_ppe_threads;
-        h->ip[i].mbs_fo = 0;
-        h->ip[i].mbs_cnt = SLICE_BUFS;
-        h->ip[i].ed_fi =0;
-        h->ip[i].ed_fo =0;
-        pthread_mutex_init(&h->ip[i].ed_lock, NULL);
-        pthread_cond_init(&h->ip[i].ed_cond, NULL);
-        pthread_create(&h->ed_IP_thr[i], NULL, entr_IP_spe_thread, &h->ip[i]);
-    }
-
-    for(;;){
-        {
-            pthread_mutex_lock(&h->lock[ENTROPY]);
-            while (h->ed_cnt<=0)
-                pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]);
-            s= &h->ed_q[h->ed_fo];
-
-            pthread_mutex_unlock(&h->lock[ENTROPY]);
-            h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT;
-        }
-        if (s->state<0)
-            break;
-
-        assert(s->current_picture);
-        if (s->slice_type_nos == FF_B_TYPE )
-        {
-            pthread_mutex_lock(&h->lock[ENTROPY3B]);
-            while (h->ed_B_cnt>=MAX_SLICE_COUNT)
-                pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
-            h->ed_B_q[h->ed_B_fi] = *s;
-            h->ed_B_cnt++;
-            h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT;
-            pthread_cond_signal(&h->cond[ENTROPY3B]);
-            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
-        }else
-        {
-            ///round robin now, change to based on rawframes size.
-            pthread_mutex_lock(&h->ip[n].ed_lock);
-            while (h->ip[n].ed_cnt >= MAX_SLICE_COUNT)
-                pthread_cond_wait(&h->ip[n].ed_cond, &h->ip[n].ed_lock);
-            h->ip[n].ed_q[ h->ip[n].ed_fi] = *s;
-            h->ip[n].ed_cnt++;
-            h->ip[n].ed_fi++; h->ip[n].ed_fi %= MAX_SLICE_COUNT;
-            pthread_cond_signal(&h->ip[n].ed_cond);
-            pthread_mutex_unlock(&h->ip[n].ed_lock);
-
-            n++; n %=(h->edip_threads+h->edip_ppe_threads);
-        }
-        {
-            pthread_mutex_lock(&h->lock[ENTROPY]);
-            h->ed_cnt--;
-            pthread_cond_signal(&h->cond[ENTROPY]);
-            pthread_mutex_unlock(&h->lock[ENTROPY]);
-
-        }
-    }
-
-    {
-        pthread_mutex_lock(&h->lock[ENTROPY3B]);
-        while (h->ed_B_cnt>=MAX_SLICE_COUNT)
-            pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
-        h->ed_B_q[h->ed_B_fi] = *s;
-        h->ed_B_cnt++;
-        h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT;
-        pthread_cond_signal(&h->cond[ENTROPY3B]);
-        pthread_mutex_unlock(&h->lock[ENTROPY3B]);
-    }
-    {
-        for (i=0; i<h->edip_threads + h->edip_ppe_threads; i++){
-            pthread_mutex_lock(&h->ip[i].ed_lock);
-            while (h->ip[i].ed_cnt >= MAX_SLICE_COUNT)
-                pthread_cond_wait(&h->ip[i].ed_cond, &h->ip[i].ed_lock);
-            h->ip[i].ed_q[ h->ip[i].ed_fi] = *s;
-            h->ip[i].ed_cnt++;
-            h->ip[i].ed_fi++; h->ip[i].ed_fi %= MAX_SLICE_COUNT;
-            pthread_cond_signal(&h->ip[i].ed_cond);
-            pthread_mutex_unlock(&h->ip[i].ed_lock);
-        }
-    }
-    {
-        pthread_mutex_lock(&h->lock[ENTROPY4]);
-        while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
-            pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
-        h->ed_reorder_q[h->ed_reorder_fi] = *s;
-        h->ed_reorder_cnt++;
-        h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
-        pthread_cond_signal(&h->cond[ENTROPY4]);
-        pthread_mutex_unlock(&h->lock[ENTROPY4]);
-
-    }
-    pthread_join(h->ed_B_dist, NULL);
-    for(i=0; i<h->edip_threads; i++){
-        pthread_join(h->ed_IP_thr[i], NULL);
-    }
-    pthread_exit(NULL);
-    return NULL;
-}
-
-static pthread_t ed_IPB_dist;
-static void *entropy_IPB_cell_thread(void *arg){
-    H264Context *h = (H264Context *) arg;
-    int i;
-    EDSlice reorder[MAX_SLICE_COUNT];
-    int ip_poc[MAX_SLICE_COUNT][2]={0,};
-    int next_ip_id=0;
-    int ip_poc_cnt=0;
-    EDSlice *s;
-    int reorder_cnt=0;
-    unsigned next_pic_num=0;
-
-    pthread_create(&ed_IPB_dist, NULL, entr_IPB_distribute, h);
-    int count =0;
-    for(;;){
-        //signals received from the entropy decoders
-        {
-            pthread_mutex_lock(&h->lock[ENTROPY4]);
-            while (h->ed_reorder_cnt<=0)
-                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
-            s= &h->ed_reorder_q[h->ed_reorder_fo];
-            h->ed_reorder_fo++; h->ed_reorder_fo %=MAX_SLICE_COUNT;
-            pthread_mutex_unlock(&h->lock[ENTROPY4]);
-        }
-
-        if (s->state >=0 && s->slice_type_nos != FF_B_TYPE){
-            for (i=0; i<ip_poc_cnt; i++){
-                if (s->ip_id < ip_poc[i][0]){
-                    memmove(ip_poc[i+1], ip_poc[i], 2*(ip_poc_cnt-i)*sizeof(int));
-                    break;
-                }
-            }
-            ip_poc[i][0]= s->ip_id;
-            ip_poc[i][1]= s->current_picture->poc;
-            ip_poc_cnt++;
-
-            while (next_ip_id == ip_poc[0][0]){
-                pthread_mutex_lock(&h->lock[ENTROPY2]);
-                h->ed_poc = ip_poc[0][1];
-
-                pthread_cond_signal(&h->cond[ENTROPY2]);
-                pthread_mutex_unlock(&h->lock[ENTROPY2]);
-                memmove(ip_poc[0], ip_poc[1], 2*(ip_poc_cnt-1)*sizeof(int));
-                ip_poc_cnt--;
-                next_ip_id++;
-            }
-        }
-
-        for(i=reorder_cnt; i>0; i--){
-            if (s->coded_pic_num < reorder[i-1].coded_pic_num)
-                break;
-            reorder[i]=reorder[i-1];
-        }
-        reorder[i]=*s;
-
-        while(reorder_cnt>=0){
-            if (next_pic_num!=reorder[reorder_cnt].coded_pic_num){
-                break;
-            }
-            EDSlice *es = &reorder[reorder_cnt];
-
-            {
-                pthread_mutex_lock(&h->lock[MBDEC]);
-                while (h->mbdec_cnt >= MAX_SLICE_COUNT)
-                    pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
-                copyEDtoMBSlice(&h->mbdec_q[h->mbdec_fi], es);
-
-                h->mbdec_cnt++;
-                h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
-                pthread_cond_signal(&h->cond[MBDEC]);
-                pthread_mutex_unlock(&h->lock[MBDEC]);
-
-            }
-
-            if (es->state<0)
-                goto end;
-
-            assert(es->current_picture);
-            for (int i=0; i<es->release_cnt; i++){
-                release_pib_entry(h, es->release_ref[i], 2);
-            }
-            release_pib_entry(h, es->current_picture, 1);
-            av_freep(&es->gb.raw);
-            if (es->gb.rbsp)
-                av_freep(&es->gb.rbsp);
-
-            next_pic_num++;
-            reorder_cnt--;
-        }
-        reorder_cnt++;
-
-        {
-            pthread_mutex_lock(&h->lock[ENTROPY4]);
-            h->ed_reorder_cnt--;
-            pthread_cond_signal(&h->cond[ENTROPY4]);
-            pthread_mutex_unlock(&h->lock[ENTROPY4]);
-        }
-    }
-
-end:
-    pthread_join(ed_IPB_dist, NULL);
-    pthread_exit(NULL);
-    return NULL;
-}
-
-
-static void fill_spe_slice(H264slice *dst, const MBSlice *src, H264Context *h){
-    dst->deblocking_filter =1;
-    dst->linesize = src->current_picture->linesize[0];
-    dst->uvlinesize = src->current_picture->linesize[1];
-    dst->mb_width = h->mb_width;
-    dst->mb_height = h->mb_height;
-    dst->use_weight = src->use_weight;
-    dst->use_weight_chroma = src->use_weight_chroma;
-    dst->luma_log2_weight_denom = src->luma_log2_weight_denom;
-    dst->chroma_log2_weight_denom = src->chroma_log2_weight_denom;
-
-    //weights later
-    memcpy(dst->luma_weight, src->luma_weight, 16*2*2*sizeof(int16_t));
-    memcpy(dst->chroma_weight, src->chroma_weight, 16*2*2*2*sizeof(int16_t));
-    memcpy(dst->implicit_weight, src->implicit_weight, 16*16*2*sizeof(int16_t));
-
-    for(int list=0; list<2; list++){
-        for (int i=0; i<src->ref_count[list]; i++){
-            Picture_spu *p_dst = &dst->ref_list[list][i];
-            DecodedPicture *p_src = src->ref_list[list][i];
-            if (p_src){
-                p_dst->data[0] = p_src->data[0];
-                p_dst->data[1] = p_src->data[1];
-                p_dst->data[2] = p_src->data[2];
-            }
-        }
-    }
-    dst->state = src->state;
-
-    dst->emu_edge_width  =32;
-    dst->emu_edge_height =32;
-    dst->slice_type = src->slice_type;
-    dst->slice_type_nos = src->slice_type_nos;
-    dst->slice_alpha_c0_offset = src->slice_alpha_c0_offset;
-    dst->slice_beta_offset = src->slice_beta_offset;
-
-    memcpy(dst->chroma_qp_table, src->pps.chroma_qp_table, 2*64);
-
-    dst->blocks = src->mbs;
-    dst->dst_y = src->current_picture->data[0];
-    dst->dst_cb = src->current_picture->data[1];
-    dst->dst_cr = src->current_picture->data[2];
-}
-
-static void decode_slice_mb_seq_cell(H264Context *h, MBRecContext *d, MBSlice *s, DecodedPicture *tmp){
-    static int rl_fi=0;
-
-    DECLARE_ALIGNED(16, H264slice, spe_slice);
-    H264spe *p=&spe_params[0];
-    unsigned status;
-    uint8_t *dst_y, *dst_cb, *dst_cr;
-
-    DecodedPicture *dp;
-
-    for (int i=0; i<2; i++){
-        for(int j=0; j< s->ref_count[i]; j++){
-            if (s->ref_list_cpn[i][j] ==-1)
-                continue;
-            int k;
-            for (k=0; k<DPB_SIZE; k++){
-                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
-                    s->ref_list[i][j] = &h->dpb[k];
-                    break;
-                }
-            }
-        }
-    }
-
-    dp = get_dpb_entry(h);
-    init_dpb_entry(dp, s, d->width, d->height);
-
-    if (h->no_mbd)
-        return;
-
-
-    fill_spe_slice(&spe_slice, s, h);
-    spe_mfcio_get(spe_context[0], (unsigned) (spe_slice_buf[0] + rl_fi), &spe_slice, sizeof(H264slice), 15, 0, 0);
-    spe_mfcio_tag_status_read(spe_context[0], 1<<15, SPE_TAG_ALL, &status);
-    rl_fi++; rl_fi %= 2;
-
-    _spe_in_mbox_write(spe_control_area[0], 0);
-    while (atomic_read(rl_cnt)<=0){
-        //pthread_yield();
-        usleep(1000);
-    }
-    atomic_dec(rl_cnt);
-
-
-/** This is error free, no visual artifacts, however, md5sum fails.... (WTF) **/
-// 	memcpy(tmp->data[0], s->current_picture->data[0], tmp->linesize[0]*h->mb_height*16);
-// 	memcpy(tmp->data[1], s->current_picture->data[1], tmp->linesize[1]*h->mb_height*8);
-// 	memcpy(tmp->data[2], s->current_picture->data[2], tmp->linesize[1]*h->mb_height*8);
-//
-// 	memset(s->current_picture->data[0], 0, tmp->linesize[0]*h->mb_height*16);
-// 	memset(s->current_picture->data[1], 0, tmp->linesize[1]*h->mb_height*8);
-// 	memset(s->current_picture->data[2], 0, tmp->linesize[1]*h->mb_height*8);
-//
-// 	decode_slice_mb_seq(d, s);
-//
-// 	for (int i=0; i<h->mb_height*16; i++){
-// 		for (int j=0; j<h->width; j++){
-// 			if (tmp->data[0][j + i*tmp->linesize[0]] != s->current_picture->data[0][j + i*tmp->linesize[0]]){
-// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[0][j + i*tmp->linesize[0]], s->current_picture->data[0][j + i*tmp->linesize[0]]);
-// 				return;
-// 			}
-// 		}
-// 	}
-//
-// 	for (int i=0; i<h->mb_height*8; i++){
-// 		for (int j=0; j<h->width/2; j++){
-// 			if (tmp->data[1][j + i*tmp->linesize[1]] != s->current_picture->data[1][j + i*tmp->linesize[1]]){
-// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[1][j + i*tmp->linesize[1]], s->current_picture->data[1][j + i*tmp->linesize[1]]);
-// 				return;
-// 			}
-// 		}
-// 	}
-//
-// 	for (int i=0; i<h->mb_height*8; i++){
-// 		for (int j=0; j<h->width/2; j++){
-// 			if (tmp->data[2][j + i*tmp->linesize[1]] != s->current_picture->data[2][j + i*tmp->linesize[1]]){
-// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[2][j + i*tmp->linesize[1]], s->current_picture->data[2][j + i*tmp->linesize[1]]);
-// 				return;
-// 			}
-// 		}
-// 	}
-
-
-    //printf("dst_y %p\n", dst_y);
-
-
-     for (int i=0; i<s->release_cnt; i++){
-        for(int j=0; j<DPB_SIZE; j++){
-            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
-                release_dpb_entry(h, &h->dpb[j], 2);
-                break;
-            }
-        }
-    }
-    s->release_cnt=0;
-
-}
-
-static void *h264_spe_thread(void * thread_args ) {
-    H264spe *params = (H264spe *)thread_args;
-    unsigned int spe_id = params->spe_id;
-    unsigned int runflags = 0;
-    unsigned int entry = SPE_DEFAULT_ENTRY;
-    // run SPE context
-    spe_context_run(spe_context[spe_id],  &entry, runflags, (void*) params, NULL, NULL);
-    // done - now exit thread
-    pthread_exit(NULL);
-}
-
-static int create_spe_MBR_threads(H264Context *h, int num_threads) {
-    int i;
-
-    // reserve memory for spe thread id, context and argument addresses
-    spe_tid = av_malloc(num_threads * sizeof (pthread_t));
-    spe_context = av_malloc(num_threads * sizeof (spe_context_ptr_t));
-    spe_params = av_malloc(num_threads * sizeof (H264spe));
-    spe_control_area = av_malloc(num_threads * sizeof (void*));
-    spe_ls_area = av_malloc(num_threads * sizeof (void*));
-    spe_slice_buf = av_malloc(num_threads * sizeof (void*));
-
-    spe_program_handle_t *spe_program = spe_image_open("spe_mbd");
-
-    if (spe_program == NULL)
-        av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno));
-
-    for (i = 0; i < num_threads; i++) {
-        // create context for spe program
-        spe_context[i] = spe_context_create(SPE_MAP_PS, NULL);
-        if (spe_context[i] == NULL)
-            av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno));
-        // load SPE program into main memory
-        if ((spe_program_load(spe_context[i], spe_program)) == -1)
-            av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno));
-        //get the control_area for fast mailboxing
-        if ((spe_control_area[i] = spe_ps_area_get(spe_context[i], SPE_CONTROL_AREA)) == NULL)
-            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno));
-        //get ls area for inter spe communication
-        if ((spe_ls_area[i] = spe_ls_area_get(spe_context[i])) == NULL)
-            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno));
-    }
-
-    for (i = 0; i < num_threads; i++) {
-        spe_params[i].mb_width = h->mb_width;
-        spe_params[i].mb_height = h->mb_height;
-        spe_params[i].mb_stride = h->mb_stride;
-        spe_params[i].spe_id = i;
-        spe_params[i].spe_total = num_threads;
-        //spe_params[i].slice_params= &slice_params;
-        spe_params[i].src_spe = spe_ls_area[(i-1+num_threads)%num_threads];
-        spe_params[i].tgt_spe = spe_ls_area[(i+1)%num_threads];
-
-        spe_params[i].rl_lock = rl_lock;
-        spe_params[i].rl_cond = rl_cond;
-        spe_params[i].rl_cnt = rl_cnt;
-        spe_params[i].lock = (mutex_ea_t) (unsigned) &mutex_var[i];
-        spe_params[i].cond = (cond_ea_t) (unsigned) &cond_var[i];
-        spe_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_var[i]; atomic_set(spe_params[i].cnt, 0);
-
-        mutex_init(spe_params[i].lock);
-        cond_init(spe_params[i].cond);
-        if (pthread_create(&spe_tid[i], NULL, h264_spe_thread, (void *) &spe_params[i]))
-            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
-
-        //slicebufaddr
-        spe_slice_buf[i] = (H264slice *) _spe_out_mbox_read(spe_control_area[i]);
-
-        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
-    }
-    spe_image_close(spe_program);
-    return 0;
-}
-
-//_spe_out_mbox_read(spe_control_area[i]);
-/**
-* joins all the spe worker threads.
-*/
-static void join_spe_worker_threads(H264slice *s, int num_threads, int *rl_fi) {
-    int i;
-    ///just to keep coding consistency.
-    {
-        for (i=0; i<num_threads; i++){
-            H264spe *p=&spe_params[i];
-            unsigned status;
-
-            while (atomic_read(p->cnt)>=2) {//double buffered
-                usleep(1000);//cond_wait(p->cond, p->lock);
-            }
-
-            spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), s, sizeof(H264slice), 15, 0, 0);
-            spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status);
-            //mutex_unlock(p->lock);
-            _spe_in_mbox_write(spe_control_area[i], 0);
-        }
-    }
-
-    for (i=0; i<num_threads; i++){
-        pthread_join(spe_tid[i], NULL);
-    }
-
-    for (i=0; i<num_threads; i++){
-        spe_context_destroy(spe_context[i]);
-    }
-    atomic_inc(rl_cnt);
-
-    // destroy memory reserved for spe thread id, context and argument addresses
-    av_freep(&spe_tid);
-    av_freep(&spe_context);
-    av_freep(&spe_params);
-    av_freep(&spe_control_area);
-    av_freep(&spe_slice_buf);
-}
-
-
-static void *rl_dist_thread(void *arg){
-    int i;
-    H264Context *h = (H264Context *) arg;
-    MBSlice *s;
-    DecodedPicture *dp;
-    int rl_fi[16]={0,};
-    DECLARE_ALIGNED(16, H264slice, spe_slice);
-
-    create_spe_MBR_threads(h, h->rl_threads);
-    for(;;){
-        {
-            pthread_mutex_lock(&h->lock[MBDEC]);
-            while (h->mbdec_cnt<=0)
-                pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
-            s= &h->mbdec_q[h->mbdec_fo];
-            h->mbdec_fo++; h->mbdec_fo %= MAX_SLICE_COUNT;
-            pthread_mutex_unlock(&h->lock[MBDEC]);
-        }
-
-        if (s->state<0){
-            break;
-        }
-        for (int i=0; i<2; i++){
-            for(int j=0; j< s->ref_count[i]; j++){
-                if (s->ref_list_cpn[i][j] ==-1)
-                    continue;
-                int k;
-                for (k=0; k<DPB_SIZE; k++){
-                    if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
-                        s->ref_list[i][j] = &h->dpb[k];
-                        break;
-                    }
-                }
-
-            }
-        }
-        dp = get_dpb_entry(h);
-        init_dpb_entry(dp, s, h->width, h->height);
-        assert(s->current_picture);
-        {
-            while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){
-                usleep(1000);
-            }
-            h->mbrel_q[h->mbrel_fi] = *s;
-
-            h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT;
-        }
-        {
-            if(h->no_mbd){
-                atomic_inc(rl_cnt);
-            }else {
-                fill_spe_slice(&spe_slice, s, h);
-                for (i=0; i<h->rl_threads; i++){
-                    H264spe *p=&spe_params[i];
-                    unsigned status;
-                    while (atomic_read(p->cnt)>=2){ //double buffered
-                        usleep(1000);
-                        //cond_wait(p->cond, p->lock);
-                    }
-                    spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), &spe_slice, sizeof(H264slice), 15, 0, 0);
-                    spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status);
-                    rl_fi[i]++; rl_fi[i] %= 2;
-                    atomic_inc(p->cnt);
-
-                    _spe_in_mbox_write(spe_control_area[i], 0);
-                }
-            }
-        }
-
-        {
-            pthread_mutex_lock(&h->lock[MBDEC]);
-            h->mbdec_cnt--;
-            pthread_cond_signal(&h->cond[MBDEC]);
-            pthread_mutex_unlock(&h->lock[MBDEC]);
-        }
-
-    }
-
-    {
-        while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){
-            usleep(1000);
-        }
-        h->mbrel_q[h->mbrel_fi] = *s;
-
-        h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT;
-    }
-    spe_slice.state=-1;
-    join_spe_worker_threads(&spe_slice, h->rl_threads, rl_fi);
-    pthread_exit(NULL);
-    return NULL;
-}
-
-static void *mbdec_cell_thread(void *arg){
-    H264Context *h = (H264Context *) arg;
-
-    rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var;
-    rl_cond = (cond_ea_t) (unsigned) &rl_cond_var;
-    rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var;
-    atomic_set(rl_cnt, 0);
-    mutex_init(rl_lock);
-    cond_init(rl_cond);
-// 	printf("mbdec, pid %d\n", syscall(SYS_gettid));
-    pthread_create(&h->rl_dist_thr, NULL, rl_dist_thread, h);
-
-    for(;;){
-        MBSlice *s=NULL;
-        {
-            while (atomic_read(rl_cnt)<=0){
-                usleep(1000);
-            }
-            s= &h->mbrel_q[h->mbrel_fo];
-            h->mbrel_fo++; h->mbrel_fo %= MAX_SLICE_COUNT;
-        }
-
-        if (s->state<0)
-            break;
-
-        for (int i=0; i<s->release_cnt; i++){
-            for(int j=0; j<DPB_SIZE; j++){
-                if(h->dpb[j].cpn== s->release_ref_cpn[i]){
-                    release_dpb_entry(h, &h->dpb[j], 2);
-                    break;
-                }
-            }
-        }
-
-        {
-            EDThreadContext *ed = s->ed;
-            pthread_mutex_lock(&ed->mbs_lock);
-            ed->mbs_cnt++;
-            pthread_cond_signal(&ed->mbs_cond);
-            pthread_mutex_unlock(&ed->mbs_lock);
-        }
-
-        {
-            pthread_mutex_lock(&h->lock[WRITE]);
-            while (h->write_cnt>= DPB_SIZE)
-                pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
-            assert(s);
-            assert(s->current_picture);
-            h->write_q[h->write_fi]= s->current_picture;
-            h->write_cnt++;
-            h->write_fi++; h->write_fi %= DPB_SIZE;
-            pthread_cond_signal(&h->cond[WRITE]);
-            pthread_mutex_unlock(&h->lock[WRITE]);
-
-        }
-        {
-            atomic_dec(rl_cnt);
-        }
-
-    }
-
-    {//propagate exit
-        pthread_mutex_lock(&h->lock[WRITE]);
-        while (h->write_cnt>= DPB_SIZE)
-            pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
-        last_pic.reference = -1;
-        h->write_q[h->write_fi] = &last_pic;
-        h->write_cnt++;
-        h->write_fi++; h->write_fi %= DPB_SIZE;
-        pthread_cond_signal(&h->cond[WRITE]);
-        pthread_mutex_unlock(&h->lock[WRITE]);
-
-    }
-    pthread_join(h->rl_dist_thr, NULL);
-    pthread_exit(NULL);
-    return NULL;
-}
-
-/*
-* The following code is the main loop of the file converter
-*/
-int h264_decode_cell(H264Context *h) {
-
-    pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr;   
-
-    start_timer();
-
-    pthread_create(&read_thr, NULL, read_thread, h);
-    pthread_create(&parsenal_thr, NULL, parsenal_thread, h);
-    pthread_create(&entropy_thr, NULL, entropy_IPB_cell_thread, h);
-    pthread_create(&mbdec_thr, NULL, mbdec_cell_thread, h);
-    pthread_create(&write_thr, NULL, write_thread, h);
-
-    pthread_join(read_thr, NULL);
-    pthread_join(parsenal_thr, NULL);
-    pthread_join(entropy_thr, NULL);
-    pthread_join(mbdec_thr, NULL);
-    pthread_join(write_thr, NULL);
-
-    return 0;
-}
-
-/*
-* The following code is the main loop of the file converter
-*/
-int h264_decode_cell_seq(H264Context *h) {
-ParserContext *pc;
-    NalContext *nc;
-    EntropyContext *ec;
-    MBRecContext *rc;
-    OutputContext *oc;
-
-    RawFrame frm;
-    EDSlice slice, *s=&slice;
-    MBSlice mbslice, *s2=&mbslice;
-    PictureInfo *pic=NULL;
-    DecodedPicture *out;
-    int size;
-    int frames=0;
-    
-    pc = get_parse_context(h->ifile);
-    nc = get_nal_context(h->width, h->height);
-    ec = get_entropy_context( h );
-    rc = get_mbrec_context(h);
-    oc = get_output_context( h );
-
-    rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var;
-    rl_cond = (cond_ea_t) (unsigned) &rl_cond_var;
-    rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var;
-    atomic_set(rl_cnt, 0);
-    mutex_init(rl_lock);
-    cond_init(rl_cond);
-
-    memset(s, 0, sizeof(EDSlice));
-    ff_init_slice(nc, s);
-    s->mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb));
-
-    DecodedPicture tmp;
-    tmp.base[0]=0;
-    ///fix this when want to debug the Cell errors
-    //init_dpb_entry(&tmp, h->width, h->height);
-
-    create_spe_ED_threads(h, 1, 0);
-    create_spe_MBR_threads(h, 1);
-    
-    start_timer();
-
-    while(!pc->final_frame && frames++ < h->num_frames){
-
-        av_read_frame_internal(pc, &frm);
-        
-        PictureInfo *pic=get_pib_entry(h);
-        ff_alloc_picture_info(nc, s, pic);
-        decode_nal_units(nc, s, &frm);
-
-        copyEDtoMBSlice(s2, s);
-        decode_slice_entropy_cell_seq(h, ec, s);
-        
-        decode_slice_mb_seq_cell(h, rc, s2, &tmp);
-
-        out =output_frame(h, oc, s2->current_picture, h->ofile, h->frame_width, h->frame_height);
-        
-        if (out){
-            release_dpb_entry(h, out, 1);
-        }
-        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
-    }
-    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ;
-
-    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
-
-    /* finished ! */
-    av_freep(&s->mbs);
-
-    free_parse_context(pc);
-    free_nal_context  (nc);
-    free_entropy_context(ec);
-    free_mbrec_context(rc);
-    free_output_context(oc);                
-    return 0;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_data.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_data.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,243 +0,0 @@
-/*
- * H26L/H264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * @brief
- *     H264 / AVC / MPEG4 part10 codec data table
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#ifndef AVCODEC_H264DATA_H
-#define AVCODEC_H264DATA_H
-
-#include <stdint.h>
-#include "avcodec.h"
-//#include "h264.h"
-
-/*
-o-o o-o
- / / /
-o-o o-o
- ,---'
-o-o o-o
- / / /
-o-o o-o
-*/
-//This table must be here because scan8[constant] must be known at compiletime
-static const uint8_t scan8[16 + 2*4]={
- 4+1*8, 5+1*8, 4+2*8, 5+2*8,
- 6+1*8, 7+1*8, 6+2*8, 7+2*8,
- 4+3*8, 5+3*8, 4+4*8, 5+4*8,
- 6+3*8, 7+3*8, 6+4*8, 7+4*8,
- 1+1*8, 2+1*8,
- 1+2*8, 2+2*8,
- 1+4*8, 2+4*8,
- 1+5*8, 2+5*8,
-};
-
-static const uint8_t golomb_to_pict_type[5]=
-{FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
-
-static const uint8_t golomb_to_intra4x4_cbp[48]={
- 47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46,
- 16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4,
-  8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41
-};
-
-static const uint8_t golomb_to_inter_cbp[48]={
-  0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13,
- 14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46,
- 17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41
-};
-
-static const uint8_t zigzag_scan[16]={
- 0+0*4, 1+0*4, 0+1*4, 0+2*4,
- 1+1*4, 2+0*4, 3+0*4, 2+1*4,
- 1+2*4, 0+3*4, 1+3*4, 2+2*4,
- 3+1*4, 3+2*4, 2+3*4, 3+3*4,
-};
-
-static const uint8_t field_scan[16]={
- 0+0*4, 0+1*4, 1+0*4, 0+2*4,
- 0+3*4, 1+1*4, 1+2*4, 1+3*4,
- 2+0*4, 2+1*4, 2+2*4, 2+3*4,
- 3+0*4, 3+1*4, 3+2*4, 3+3*4,
-};
-
-static const uint8_t luma_dc_zigzag_scan[16]={
- 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
- 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
- 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
- 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
-};
-
-static const uint8_t luma_dc_field_scan[16]={
- 0*16 + 0*64, 2*16 + 0*64, 1*16 + 0*64, 0*16 + 2*64,
- 2*16 + 2*64, 3*16 + 0*64, 1*16 + 2*64, 3*16 + 2*64,
- 0*16 + 1*64, 2*16 + 1*64, 0*16 + 3*64, 2*16 + 3*64,
- 1*16 + 1*64, 3*16 + 1*64, 1*16 + 3*64, 3*16 + 3*64,
-};
-
-static const uint8_t chroma_dc_scan[4]={
- (0+0*2)*16, (1+0*2)*16,
- (0+1*2)*16, (1+1*2)*16,  //FIXME
-};
-
-
-static const uint8_t field_scan8x8[64]={
- 0+0*8, 0+1*8, 0+2*8, 1+0*8,
- 1+1*8, 0+3*8, 0+4*8, 1+2*8,
- 2+0*8, 1+3*8, 0+5*8, 0+6*8,
- 0+7*8, 1+4*8, 2+1*8, 3+0*8,
- 2+2*8, 1+5*8, 1+6*8, 1+7*8,
- 2+3*8, 3+1*8, 4+0*8, 3+2*8,
- 2+4*8, 2+5*8, 2+6*8, 2+7*8,
- 3+3*8, 4+1*8, 5+0*8, 4+2*8,
- 3+4*8, 3+5*8, 3+6*8, 3+7*8,
- 4+3*8, 5+1*8, 6+0*8, 5+2*8,
- 4+4*8, 4+5*8, 4+6*8, 4+7*8,
- 5+3*8, 6+1*8, 6+2*8, 5+4*8,
- 5+5*8, 5+6*8, 5+7*8, 6+3*8,
- 7+0*8, 7+1*8, 6+4*8, 6+5*8,
- 6+6*8, 6+7*8, 7+2*8, 7+3*8,
- 7+4*8, 7+5*8, 7+6*8, 7+7*8,
-};
-
-typedef struct IMbInfo{
-    uint16_t type;
-    uint8_t pred_mode;
-    uint8_t cbp;
-} IMbInfo;
-
-static const IMbInfo i_mb_type_info[26]={
-{MB_TYPE_INTRA4x4  , -1, -1},
-{MB_TYPE_INTRA16x16,  2,  0},
-{MB_TYPE_INTRA16x16,  1,  0},
-{MB_TYPE_INTRA16x16,  0,  0},
-{MB_TYPE_INTRA16x16,  3,  0},
-{MB_TYPE_INTRA16x16,  2,  16},
-{MB_TYPE_INTRA16x16,  1,  16},
-{MB_TYPE_INTRA16x16,  0,  16},
-{MB_TYPE_INTRA16x16,  3,  16},
-{MB_TYPE_INTRA16x16,  2,  32},
-{MB_TYPE_INTRA16x16,  1,  32},
-{MB_TYPE_INTRA16x16,  0,  32},
-{MB_TYPE_INTRA16x16,  3,  32},
-{MB_TYPE_INTRA16x16,  2,  15+0},
-{MB_TYPE_INTRA16x16,  1,  15+0},
-{MB_TYPE_INTRA16x16,  0,  15+0},
-{MB_TYPE_INTRA16x16,  3,  15+0},
-{MB_TYPE_INTRA16x16,  2,  15+16},
-{MB_TYPE_INTRA16x16,  1,  15+16},
-{MB_TYPE_INTRA16x16,  0,  15+16},
-{MB_TYPE_INTRA16x16,  3,  15+16},
-{MB_TYPE_INTRA16x16,  2,  15+32},
-{MB_TYPE_INTRA16x16,  1,  15+32},
-{MB_TYPE_INTRA16x16,  0,  15+32},
-{MB_TYPE_INTRA16x16,  3,  15+32},
-{MB_TYPE_INTRA_PCM , -1, -1},
-};
-
-typedef struct PMbInfo{
-    uint16_t type;
-    uint8_t partition_count;
-} PMbInfo;
-
-static const PMbInfo p_mb_type_info[5]={
-{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
-{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0, 4},
-{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4},
-};
-
-static const PMbInfo p_sub_mb_type_info[4]={
-{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
-{MB_TYPE_16x8 |MB_TYPE_P0L0             , 2},
-{MB_TYPE_8x16 |MB_TYPE_P0L0             , 2},
-{MB_TYPE_8x8  |MB_TYPE_P0L0             , 4},
-};
-
-static const PMbInfo b_mb_type_info[23]={
-{MB_TYPE_DIRECT2|MB_TYPE_L0L1                                      , 1, },
-{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
-{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
-{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
-{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
-{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
-};
-
-static const PMbInfo b_sub_mb_type_info[13]={
-{MB_TYPE_DIRECT2                                                   , 1, },
-{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
-{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
-{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
-{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
-{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x8  |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 4, },
-{MB_TYPE_8x8               |MB_TYPE_P0L1             |MB_TYPE_P1L1, 4, },
-{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
-};
-
-static const uint8_t dequant4_coeff_init[6][3]={
-  {10,13,16},
-  {11,14,18},
-  {13,16,20},
-  {14,18,23},
-  {16,20,25},
-  {18,23,29},
-};
-
-static const uint8_t dequant8_coeff_init_scan[16] = {
-  0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
-};
-static const uint8_t dequant8_coeff_init[6][6]={
-  {20,18,32,19,25,24},
-  {22,19,35,21,28,26},
-  {26,23,42,24,33,31},
-  {28,25,45,26,35,33},
-  {32,28,51,30,40,38},
-  {36,32,58,34,46,43},
-};
-
-#endif /* AVCODEC_H264DATA_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_deblock.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_deblock.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,507 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... loop filter
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 loop filter.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "dsputil.h"
-#include "mathops.h"
-#include "rectangle.h"
-#include "h264_types.h"
-#include "h264_misc.h"
-#include "h264_data.h"
-//#undef NDEBUG
-#include <assert.h>
-
-/* Deblocking filter (p153) */
-static const uint8_t alpha_table[52*3] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
-     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
-    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
-    80, 90,101,113,127,144,162,182,203,226,
-   255,255,
-   255,255,255,255,255,255,255,255,255,255,255,255,255,
-   255,255,255,255,255,255,255,255,255,255,255,255,255,
-   255,255,255,255,255,255,255,255,255,255,255,255,255,
-   255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
-static const uint8_t beta_table[52*3] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
-     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
-     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
-    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
-    18, 18,
-    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
-    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
-    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
-    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
-};
-static const uint8_t tc0_table[52*3][4] = {
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
-    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
-    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
-    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
-    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
-    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
-    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-};
-
-av_always_inline static void filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s) {
-    const unsigned int index_a = qp + s->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a];
-    const int beta  = beta_table[qp + s->slice_beta_offset];
-    if (alpha ==0 || beta == 0) return;
-
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-        tc[0] = tc0_table[index_a][bS[0]];
-        tc[1] = tc0_table[index_a][bS[1]];
-        tc[2] = tc0_table[index_a][bS[2]];
-        tc[3] = tc0_table[index_a][bS[3]];
-        mrc->hdsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
-    } else {
-        mrc->hdsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
-    }
-}
-
-av_always_inline static void filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
-    const unsigned int index_a = qp + s->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a];
-    const int beta  = beta_table[qp + s->slice_beta_offset];
-    if (alpha ==0 || beta == 0) return;
-
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-        tc[0] = tc0_table[index_a][bS[0]]+1;
-        tc[1] = tc0_table[index_a][bS[1]]+1;
-        tc[2] = tc0_table[index_a][bS[2]]+1;
-        tc[3] = tc0_table[index_a][bS[3]]+1;
-        mrc->hdsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
-    } else {
-        mrc->hdsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
-    }
-}
-
-
-av_always_inline static void filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
-    const unsigned int index_a = qp + s->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a];
-    const int beta  = beta_table[qp + s->slice_beta_offset];
-    if (alpha ==0 || beta == 0) return;
-
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-        tc[0] = tc0_table[index_a][bS[0]];
-        tc[1] = tc0_table[index_a][bS[1]];
-        tc[2] = tc0_table[index_a][bS[2]];
-        tc[3] = tc0_table[index_a][bS[3]];
-        mrc->hdsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
-    } else {
-        mrc->hdsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
-    }
-}
-
-av_always_inline static void filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
-    const unsigned int index_a = qp + s->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a];
-    const int beta  = beta_table[qp + s->slice_beta_offset];
-    if (alpha ==0 || beta == 0) return;
-
-    if( bS[0] < 4 ) {
-        int8_t tc[4];
-        tc[0] = tc0_table[index_a][bS[0]]+1;
-        tc[1] = tc0_table[index_a][bS[1]]+1;
-        tc[2] = tc0_table[index_a][bS[2]]+1;
-        tc[3] = tc0_table[index_a][bS[3]]+1;
-        mrc->hdsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
-    } else {
-        mrc->hdsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
-    }
-}
-
-static av_always_inline void filter_mb_dir(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, int dir) {
-    const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type;
-    const int qp_xy= m->qscale_mb_xy;
-    const int qp_dir = dir == 0 ? m->qscale_left_mb_xy : m->qscale_top_mb_xy;
-    const int linesize = mrc->linesize;
-    const int uvlinesize = mrc->uvlinesize;
-    const int mb_type = m->mb_type;
-    int edge;
-    const int edges = mrs->edges[dir];
-
-    if(mbm_type){
-        int16_t* bS=mrs->bS[dir][0];
-        /* Filter edge */
-        // Do not use s->qscale as luma quantizer because it has not the same
-        // value in IPCM macroblocks.
-        if(bS[0]+bS[1]+bS[2]+bS[3]){
-            int qp = ( qp_xy + qp_dir + 1 ) >> 1;
-            if( dir == 0 ) {
-                filter_mb_edgev( &img_y[0], linesize, bS, qp, mrc, s );
-                {
-                    int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
-                    filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, mrc, s);
-                    filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, mrc, s);
-                }
-            } else {
-                filter_mb_edgeh( &img_y[0], linesize, bS, qp, mrc, s );
-                {
-                    int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
-                    filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, mrc, s);
-                    filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, mrc, s);
-                }
-            }
-        }
-    }
-
-    for( edge = 1; edge < edges; edge++ ) {
-        int16_t* bS=mrs->bS[dir][edge];
-        int qp = qp_xy;
-
-        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
-            continue;
-
-        if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
-            continue;
-
-        /* Filter edge */
-        // Do not use s->qscale as luma quantizer because it has not the same
-        // value in IPCM macroblocks.
-
-        if( dir == 0 ) {
-            filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, mrc, s);
-            if( (edge&1) == 0 ) {
-                filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s);
-                filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s);
-            }
-        } else {
-            filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, mrc, s );
-            if( (edge&1) == 0 ) {
-                filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s);
-                filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s);
-            }
-        }
-    }
-}
-
-static int check_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, long b_idx, long bn_idx, int mvy_limit){
-    int v;
-    v= mrs->ref_cache[0][b_idx] != mrs->ref_cache[0][bn_idx];
-    if(!v && mrs->ref_cache[0][b_idx]!=-1)
-        // absolute value >= 7 | ...
-        v= ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) |
-        ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit);
-
-    if(s->list_count==2){
-        if(!v)
-            v = (mrs->ref_cache[1][b_idx] != mrs->ref_cache[1][bn_idx]) |
-            ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) |
-            ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit);
-
-        if(v){
-            if((mrs->ref_cache[0][b_idx] != mrs->ref_cache[1][bn_idx]) |
-                (mrs->ref_cache[1][b_idx] != mrs->ref_cache[0][bn_idx]))
-                return 1;
-            return
-            ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) |
-            ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit) |
-            ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) |
-            ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit);
-        }
-    }
-
-    return v;
-}
-
-static void calc_bS_values(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mvy_limit, int dir) {
-    int mb_type = m->mb_type;
-    int edge;
-    const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type;
-
-    // how often to recheck mv-based bS when iterating between edges
-    static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1},
-    {0,3,1,1,3,3,3,3}};
-    const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7];
-    const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4;
-    // how often to recheck mv-based bS when iterating along each edge
-    const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
-
-    mrs->edges[dir]= edges;
-
-    if(mbm_type){
-        int16_t* bS=mrs->bS[dir][0];
-        if( IS_INTRA(mb_type|mbm_type)) {
-            AV_WN64A(bS, 0x0004000400040004ULL);
-        } else {
-            int i;
-            int mv_done;
-            if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
-                int b_idx= 8 + 4;
-                int bn_idx= b_idx - (dir ? 8:1);
-
-                bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, 8 + 4, bn_idx, mvy_limit);
-                mv_done = 1;
-            }
-            else
-                mv_done = 0;
-
-            for( i = 0; i < 4; i++ ) {
-                int x = dir == 0 ? 0 : i;
-                int y = dir == 0 ? i    : 0;
-                int b_idx= 8 + 4 + x + 8*y;
-                int bn_idx= b_idx - (dir ? 8:1);
-
-                if( mrs->non_zero_count_cache[b_idx] |
-                    mrs->non_zero_count_cache[bn_idx] ) {
-                    bS[i] = 2;
-                }
-                else if(!mv_done)
-                {
-                    bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
-                }
-            }
-        }
-    }
-
-    /* Calculate bS */
-    for( edge = 1; edge < edges; edge++ ) {
-        int16_t* bS=mrs->bS[dir][edge];
-
-        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
-            continue;
-
-        if( IS_INTRA(mb_type)) {
-            AV_WN64A(bS, 0x0003000300030003ULL);
-        } else {
-            int i;
-            int mv_done;
-
-            if( edge & mask_edge ) {
-                AV_ZERO64(bS);
-                mv_done = 1;
-            }
-            else if( mask_par0 ) {
-                int b_idx= 8 + 4 + edge * (dir ? 8:1);
-                int bn_idx= b_idx - (dir ? 8:1);
-
-                bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
-                mv_done = 1;
-            }
-            else
-                mv_done = 0;
-
-            for( i = 0; i < 4; i++ ) {
-                int x = dir == 0 ? edge : i;
-                int y = dir == 0 ? i    : edge;
-                int b_idx= 8 + 4 + x + 8*y;
-                int bn_idx= b_idx - (dir ? 8:1);
-
-                if( mrs->non_zero_count_cache[b_idx] |
-                    mrs->non_zero_count_cache[bn_idx] ) {
-                    bS[i] = 2;
-                }
-                else if(!mv_done)
-                {
-                    bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
-                }
-            }
-
-            if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
-                continue;
-        }
-
-    }
-}
-
-
-/**
-*
-* @return zero if the loop filter can be skiped
-*/
-static int fill_filter_caches(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
-    H264Mb *m_top = m - mrc->mb_width;
-    H264Mb *m_left = m - 1;
-    const int mb_x = m->mb_x;
-    const int mb_y = m->mb_y;
-    int top_type, left_type;
-    int qp, top_qp, left_qp;
-    int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice
-
-    qp = m->qscale_mb_xy ;
-    left_qp = m->qscale_left_mb_xy ;
-    top_qp  = m->qscale_top_mb_xy ;
-
-    //for sufficiently low qp, filtering wouldn't do anything
-    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
-    if(qp <= qp_thresh
-        && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh)
-        && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){
-        return 0;
-    }
-
-    if(IS_INTRA(mb_type)){
-        return 1;
-    }
-
-    {
-        int list;
-        for(list=0; list<s->list_count; list++){
-            int8_t *ref;
-
-            if(!USES_LIST(mb_type, list)){
-                fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
-                fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
-                AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-                AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-                AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-                AV_WN32A(&mrs->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-                continue;
-            }
-
-            ref = &mrs->ref_index[list][4*mb_x];
-            {
-                int (*ref2frm)[64] =(void *) (s->ref2frm[0] +  2);
-                AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-                AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-                ref += 2;
-
-                AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-                AV_WN32A(&mrs->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-            }
-        }
-    }
-
-    /*
-    0 . T T. T T T T
-    1 L . .L . . . .
-    2 L . .L . . . .
-    3 . T TL . . . .
-    4 L . .L . . . .
-    5 L . .. . . . .
-    */
-
-    if (IS_SKIP(mb_type)){
-        memset(mrs->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
-    }
-
-    //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
-    top_type  = mrs->top_type;
-    left_type = mrs->left_type;
-    if(top_type){
-        AV_COPY32(&mrs->non_zero_count_cache[4+8*0], &m_top->non_zero_count[3*4]);
-    }
-
-    if(left_type){
-        mrs->non_zero_count_cache[3+8*1]= m_left->non_zero_count[3+0*4];
-        mrs->non_zero_count_cache[3+8*2]= m_left->non_zero_count[3+1*4];
-        mrs->non_zero_count_cache[3+8*3]= m_left->non_zero_count[3+2*4];
-        mrs->non_zero_count_cache[3+8*4]= m_left->non_zero_count[3+3*4];
-    }
-
-    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
-        int list;
-        for(list=0; list<s->list_count; list++){
-            if(USES_LIST(top_type, list)){
-                const int b_xy= 4*mb_x + 3*mrc->b_stride;
-                const int b8_x= 4*mb_x + 2;
-                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
-                AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]);
-
-                mrs->ref_cache[list][scan8[0] + 0 - 1*8]=
-                mrs->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 0]];
-                mrs->ref_cache[list][scan8[0] + 2 - 1*8]=
-                mrs->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 1]];
-            }else{
-                AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]);
-                AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u);
-            }
-
-            if(USES_LIST(left_type, list)){
-                const int b_x = 4*(mb_x-1) + 3;
-                const int b8_x= 4*(mb_x-1) + 1;
-                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
-                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 0 ], mrs->motion_val[list][b_x + mrc->b_stride*0]);
-                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 8 ], mrs->motion_val[list][b_x + mrc->b_stride*1]);
-                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +16 ], mrs->motion_val[list][b_x + mrc->b_stride*2]);
-                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +24 ], mrs->motion_val[list][b_x + mrc->b_stride*3]);
-
-                mrs->ref_cache[list][scan8[0] - 1 + 0 ]=
-                mrs->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*0]];
-                mrs->ref_cache[list][scan8[0] - 1 +16 ]=
-                mrs->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*1]];
-
-            }else{
-                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 0 ]);
-                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 8 ]);
-                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +16 ]);
-                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +24 ]);
-
-                mrs->ref_cache[list][scan8[0] - 1 + 0  ]=
-                mrs->ref_cache[list][scan8[0] - 1 + 8  ]=
-                mrs->ref_cache[list][scan8[0] - 1 + 16 ]=
-                mrs->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED;
-            }
-        }
-    }
-    return 1;
-}
-
-void ff_h264_filter_mb(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) {
-    if (fill_filter_caches(mrc, mrs, s, m, m->mb_type)){
-        calc_bS_values(mrc, mrs, s, m, 4, 0);
-        calc_bS_values(mrc, mrs, s, m, 4, 1);
-        filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 0);
-        filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 1);
-    }
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_deblock.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_deblock.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-#ifndef H264_LOOPFILTER_H
-#define H264_LOOPFILTER_H
-
-#include "h264_types.h"
-
-void ff_h264_filter_mb(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_dsp.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_dsp.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,320 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 DSP functions.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include <stdint.h>
-#include "avcodec.h"
-#include "h264_dsp.h"
-
-#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
-#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
-#define H264_WEIGHT(W,H) \
-static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
-    int y; \
-    offset <<= log2_denom; \
-    if(log2_denom) offset += 1<<(log2_denom-1); \
-    for(y=0; y<H; y++, block += stride){ \
-        op_scale1(0); \
-        op_scale1(1); \
-        if(W==2) continue; \
-        op_scale1(2); \
-        op_scale1(3); \
-        if(W==4) continue; \
-        op_scale1(4); \
-        op_scale1(5); \
-        op_scale1(6); \
-        op_scale1(7); \
-        if(W==8) continue; \
-        op_scale1(8); \
-        op_scale1(9); \
-        op_scale1(10); \
-        op_scale1(11); \
-        op_scale1(12); \
-        op_scale1(13); \
-        op_scale1(14); \
-        op_scale1(15); \
-    } \
-} \
-static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
-    int y; \
-    offset = ((offset + 1) | 1) << log2_denom; \
-    for(y=0; y<H; y++, dst += stride, src += stride){ \
-        op_scale2(0); \
-        op_scale2(1); \
-        if(W==2) continue; \
-        op_scale2(2); \
-        op_scale2(3); \
-        if(W==4) continue; \
-        op_scale2(4); \
-        op_scale2(5); \
-        op_scale2(6); \
-        op_scale2(7); \
-        if(W==8) continue; \
-        op_scale2(8); \
-        op_scale2(9); \
-        op_scale2(10); \
-        op_scale2(11); \
-        op_scale2(12); \
-        op_scale2(13); \
-        op_scale2(14); \
-        op_scale2(15); \
-    } \
-}
-
-H264_WEIGHT(16,16)
-H264_WEIGHT(16,8)
-H264_WEIGHT(8,16)
-H264_WEIGHT(8,8)
-H264_WEIGHT(8,4)
-H264_WEIGHT(4,8)
-H264_WEIGHT(4,4)
-H264_WEIGHT(4,2)
-H264_WEIGHT(2,4)
-H264_WEIGHT(2,2)
-
-#undef op_scale1
-#undef op_scale2
-#undef H264_WEIGHT
-
-static av_always_inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
-{
-    int i, d;
-    for( i = 0; i < 4; i++ ) {
-        if( tc0[i] < 0 ) {
-            pix += 4*ystride;
-            continue;
-        }
-        for( d = 0; d < 4; d++ ) {
-            const int p0 = pix[-1*xstride];
-            const int p1 = pix[-2*xstride];
-            const int p2 = pix[-3*xstride];
-            const int q0 = pix[0];
-            const int q1 = pix[1*xstride];
-            const int q2 = pix[2*xstride];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-
-                int tc = tc0[i];
-                int i_delta;
-
-                if( FFABS( p2 - p0 ) < beta ) {
-                    if(tc0[i])
-                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
-                    tc++;
-                }
-                if( FFABS( q2 - q0 ) < beta ) {
-                    if(tc0[i])
-                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
-                    tc++;
-                }
-
-                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
-                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
-            }
-            pix += ystride;
-        }
-    }
-}
-static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
-}
-static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
-}
-
-static av_always_inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
-{
-    int d;
-    for( d = 0; d < 16; d++ ) {
-        const int p2 = pix[-3*xstride];
-        const int p1 = pix[-2*xstride];
-        const int p0 = pix[-1*xstride];
-
-        const int q0 = pix[ 0*xstride];
-        const int q1 = pix[ 1*xstride];
-        const int q2 = pix[ 2*xstride];
-
-        if( FFABS( p0 - q0 ) < alpha &&
-            FFABS( p1 - p0 ) < beta &&
-            FFABS( q1 - q0 ) < beta ) {
-
-            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
-                if( FFABS( p2 - p0 ) < beta)
-                {
-                    const int p3 = pix[-4*xstride];
-                    /* p0', p1', p2' */
-                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
-                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
-                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
-                } else {
-                    /* p0' */
-                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-                }
-                if( FFABS( q2 - q0 ) < beta)
-                {
-                    const int q3 = pix[3*xstride];
-                    /* q0', q1', q2' */
-                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
-                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
-                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
-                } else {
-                    /* q0' */
-                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-                }
-            }else{
-                /* p0', q0' */
-                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-            }
-        }
-        pix += ystride;
-    }
-}
-static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
-}
-static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
-}
-
-static av_always_inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
-{
-    int i, d;
-    for( i = 0; i < 4; i++ ) {
-        const int tc = tc0[i];
-        if( tc <= 0 ) {
-            pix += 2*ystride;
-            continue;
-        }
-        for( d = 0; d < 2; d++ ) {
-            const int p0 = pix[-1*xstride];
-            const int p1 = pix[-2*xstride];
-            const int q0 = pix[0];
-            const int q1 = pix[1*xstride];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-
-                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-
-                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
-                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
-            }
-            pix += ystride;
-        }
-    }
-}
-static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
-}
-static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
-}
-
-static av_always_inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
-{
-    int d;
-    for( d = 0; d < 8; d++ ) {
-        const int p0 = pix[-1*xstride];
-        const int p1 = pix[-2*xstride];
-        const int q0 = pix[0];
-        const int q1 = pix[1*xstride];
-
-        if( FFABS( p0 - q0 ) < alpha &&
-            FFABS( p1 - p0 ) < beta &&
-            FFABS( q1 - q0 ) < beta ) {
-
-            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
-            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
-        }
-        pix += ystride;
-    }
-}
-static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
-}
-static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
-}
-
-void ff_h264dsp_init(H264DSPContext *c)
-{
-    c->h264_idct_add= ff_h264_idct_add_c;
-    c->h264_idct8_add= ff_h264_idct8_add_c;
-    c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
-    c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
-    c->h264_idct_add16     = ff_h264_idct_add16_c;
-    c->h264_idct8_add4     = ff_h264_idct8_add4_c;
-    c->h264_idct_add8      = ff_h264_idct_add8_c;
-    c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
-
-    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
-    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
-    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
-    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
-    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
-    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
-    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
-    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
-    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
-    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
-    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
-    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
-    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
-    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
-    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
-    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
-    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
-    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
-    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
-    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
-
-    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
-    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
-    c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
-    c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
-    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
-    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
-    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
-    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
-    c->h264_loop_filter_strength= NULL;
-
-    if (ARCH_ARM) ff_h264dsp_init_arm(c);
-    if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c);
-    if (HAVE_MMX) ff_h264dsp_init_x86(c);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_dsp.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_dsp.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 DSP functions.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#ifndef AVCODEC_H264DSP_H
-#define AVCODEC_H264DSP_H
-
-#include <stdint.h>
-#include "dsputil.h"
-
-//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
-typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
-
-/**
- * Context for storing H.264 DSP functions
- */
-typedef struct H264DSPContext{
-    /* weighted MC */
-    h264_weight_func weight_h264_pixels_tab[10];
-    h264_biweight_func biweight_h264_pixels_tab[10];
-
-    /* loop filter */
-    void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
-    void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
-    /* v/h_loop_filter_luma_intra: align 16 */
-    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
-    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
-    void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
-    void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
-    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
-    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
-    // h264_loop_filter_strength: simd only. the C version is inlined in h264.c
-    void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
-                                      int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field);
-
-    /* IDCT */
-    /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them
-       NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them
-        The reason for above, is that no 2 out of one list may use a different permutation.
-    */
-    void (*h264_idct_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
-    void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
-    void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
-    void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
-    void (*h264_dct)(DCTELEM block[4][4]);
-    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-
-    qpel_mc_func (*qpel_put)[16];
-    qpel_mc_func (*qpel_avg)[16];
-}H264DSPContext;
-
-void ff_h264dsp_init(H264DSPContext *c);
-void ff_h264dsp_init_arm(H264DSPContext *c);
-void ff_h264dsp_init_ppc(H264DSPContext *c);
-void ff_h264dsp_init_x86(H264DSPContext *c);
-
-#endif /* AVCODEC_H264DSP_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_entropy.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_entropy.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2065 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 cabac decoding.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "avcodec.h"
-#include "h264_types.h"
-#include "h264_data.h"
-#include "cabac.h"
-#include "rectangle.h"
-#include "h264_misc.h"
-
-// #undef NDEBUG
-#include <assert.h>
-
-/* Cabac pre state table */
-
-static const int8_t cabac_context_init_I[460][2] =
-{
-    /* 0 - 10 */
-    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
-    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
-    { -6,  53 }, { -1, 54 },  {  7,  51 },
-
-    /* 11 - 23 unsused for I */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },
-
-    /* 24- 39 */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-
-    /* 40 - 53 */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },
-
-    /* 54 - 59 */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },
-
-    /* 60 - 69 */
-    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
-    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
-    { 13, 41 },  { 3, 62 },
-
-    /* 70 -> 87 */
-    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
-    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
-    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
-    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
-    { -12, 115 },{ -16, 122 },
-
-    /* 88 -> 104 */
-    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
-    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
-    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
-    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
-    { -22, 125 },
-
-    /* 105 -> 135 */
-    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
-    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
-    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
-    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
-    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
-    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
-    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
-    { 14, 62 },  { -13, 108 },{ -15, 100 },
-
-    /* 136 -> 165 */
-    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
-    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
-    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
-    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
-    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
-    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
-    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
-    { 0, 62 },   { 12, 72 },
-
-    /* 166 -> 196 */
-    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
-    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
-    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
-    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
-    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
-    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
-    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
-    { 0, 89 },   { 26, -19 }, { 22, -17 },
-
-    /* 197 -> 226 */
-    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
-    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
-    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
-    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
-    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
-    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
-    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
-    { 12, 68 },  { 2, 97 },
-
-    /* 227 -> 251 */
-    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
-    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
-    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
-    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
-    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
-    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
-    { -4, 65 },
-
-    /* 252 -> 275 */
-    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
-    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
-    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
-    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
-    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
-    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
-
-    /* 276 a bit special (not used, bypass is used instead) */
-    { 0, 0 },
-
-    /* 277 -> 307 */
-    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
-    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
-    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
-    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
-    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
-    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
-    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
-    { 9, 64 },   { -12, 104 },{ -11, 97 },
-
-    /* 308 -> 337 */
-    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
-    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
-    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
-    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
-    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
-    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
-    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
-    { 5, 64 },   { 12, 70 },
-
-    /* 338 -> 368 */
-    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
-    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
-    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
-    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
-    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
-    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
-    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
-    { -12, 109 },{ 36, -35 }, { 36, -34 },
-
-    /* 369 -> 398 */
-    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
-    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
-    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
-    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
-    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
-    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
-    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
-    { 29, 39 },  { 19, 66 },
-
-    /* 399 -> 435 */
-    {  31,  21 }, {  31,  31 }, {  25,  50 },
-    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
-    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
-    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
-    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
-    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
-    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
-    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
-    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
-    {   0,  68 }, {  -9,  92 },
-
-    /* 436 -> 459 */
-    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
-    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
-    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
-    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
-    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
-    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
-};
-
-static const int8_t cabac_context_init_PB[3][460][2] =
-{
-    /* i_cabac_init_idc == 0 */
-    {
-        /* 0 - 10 */
-        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
-        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
-        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
-
-        /* 11 - 23 */
-        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
-        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
-        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
-        {  17,  50 },
-
-        /* 24 - 39 */
-        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
-        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
-        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
-        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
-
-        /* 40 - 53 */
-        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
-        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
-        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
-        {  -3,  81 }, {   0,  88 },
-
-        /* 54 - 59 */
-        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
-        {  -7,  72 }, {   1,  58 },
-
-        /* 60 - 69 */
-        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
-        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
-        {  13,  41 }, {   3,  62 },
-
-        /* 70 - 87 */
-        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
-        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
-        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
-        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
-        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
-        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
-        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
-        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
-        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
-
-        /* 105 -> 165 */
-        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
-        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
-        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
-        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
-        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
-        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
-        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
-        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
-        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
-        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
-        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
-        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
-        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
-        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
-        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
-        {   9,  69 },
-
-        /* 166 - 226 */
-        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
-        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
-        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
-        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
-        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
-        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
-        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
-        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
-        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
-        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
-        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
-        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
-        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
-        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
-        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
-        {  -9, 108 },
-
-        /* 227 - 275 */
-        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
-        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
-        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
-        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
-        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
-        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
-        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
-        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
-        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
-        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
-        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
-        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
-        {  -8,  85 },
-
-        /* 276 a bit special (not used, bypass is used instead) */
-        { 0, 0 },
-
-        /* 277 - 337 */
-        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
-        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
-        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
-        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
-        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
-        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
-        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
-        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
-        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
-        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
-        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
-        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
-        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
-        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
-        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
-        {  26,  43 },
-
-        /* 338 - 398 */
-        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
-        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
-        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
-        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
-        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
-        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
-        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
-        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
-        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
-        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
-        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
-        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
-        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
-        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
-        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
-        {  11,  86 },
-
-        /* 399 - 435 */
-        {  12,  40 }, {  11,  51 }, {  14,  59 },
-        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
-        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
-        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
-        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
-        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
-        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
-        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
-        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
-        {  -8,  66 }, {  -8,  76 },
-
-        /* 436 - 459 */
-        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
-        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
-        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
-        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
-        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
-        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
-    },
-
-    /* i_cabac_init_idc == 1 */
-    {
-        /* 0 - 10 */
-        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
-        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
-        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
-
-        /* 11 - 23 */
-        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
-        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
-        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
-        {  10,  54 },
-
-        /* 24 - 39 */
-        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
-        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
-        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
-        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
-
-        /* 40 - 53 */
-        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
-        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
-        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
-        {  -7,  86 },{  -5,  95 },
-
-        /* 54 - 59 */
-        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
-        {  -5,  72 },{   0,  61 },
-
-        /* 60 - 69 */
-        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
-        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
-        { 13, 41 },  { 3, 62 },
-
-        /* 70 - 104 */
-        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
-        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
-        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
-        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
-        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
-        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
-        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
-        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
-        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
-
-        /* 105 -> 165 */
-        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
-        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
-        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
-        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
-        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
-        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
-        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
-        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
-        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
-        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
-        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
-        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
-        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
-        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
-        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
-        {   0,  89 },
-
-        /* 166 - 226 */
-        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
-        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
-        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
-        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
-        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
-        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
-        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
-        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
-        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
-        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
-        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
-        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
-        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
-        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
-        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
-        { -10, 116 },
-
-        /* 227 - 275 */
-        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
-        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
-        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
-        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
-        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
-        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
-        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
-        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
-        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
-        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
-        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
-        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
-        {  -4,  78 },
-
-        /* 276 a bit special (not used, bypass is used instead) */
-        { 0, 0 },
-
-        /* 277 - 337 */
-        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
-        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
-        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
-        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
-        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
-        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
-        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
-        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
-        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
-        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
-        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
-        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
-        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
-        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
-        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
-        {  18,  50 },
-
-        /* 338 - 398 */
-        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
-        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
-        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
-        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
-        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
-        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
-        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
-        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
-        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
-        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
-        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
-        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
-        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
-        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
-        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
-        {  11,  83 },
-
-        /* 399 - 435 */
-        {  25,  32 }, {  21,  49 }, {  21,  54 },
-        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
-        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
-        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
-        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
-        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
-        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
-        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
-        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
-        {  -4,  67 }, {  -7,  82 },
-
-        /* 436 - 459 */
-        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
-        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
-        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
-        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
-        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
-        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
-    },
-
-    /* i_cabac_init_idc == 2 */
-    {
-        /* 0 - 10 */
-        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
-        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
-        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
-
-        /* 11 - 23 */
-        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
-        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
-        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
-        {  14,  57 },
-
-        /* 24 - 39 */
-        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
-        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
-        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
-        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
-
-        /* 40 - 53 */
-        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
-        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
-        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
-        {  -3,  90 },{  -1,  101 },
-
-        /* 54 - 59 */
-        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
-        {  -7,  50 },{   1,  60 },
-
-        /* 60 - 69 */
-        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
-        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
-        { 13, 41 },  { 3, 62 },
-
-        /* 70 - 104 */
-        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
-        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
-        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
-        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
-        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
-        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
-        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
-        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
-        {   3,  68 }, {  -8,  71 }, { -13,  98 },
-
-        /* 105 -> 165 */
-        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
-        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
-        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
-        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
-        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
-        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
-        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
-        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
-        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
-        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
-        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
-        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
-        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
-        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
-        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
-        { -22, 127 },
-
-        /* 166 - 226 */
-        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
-        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
-        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
-        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
-        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
-        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
-        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
-        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
-        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
-        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
-        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
-        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
-        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
-        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
-        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
-        { -24, 127 },
-
-        /* 227 - 275 */
-        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
-        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
-        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
-        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
-        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
-        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
-        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
-        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
-        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
-        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
-        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
-        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
-        { -10,  87 },
-
-        /* 276 a bit special (not used, bypass is used instead) */
-        { 0, 0 },
-
-        /* 277 - 337 */
-        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
-        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
-        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
-        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
-        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
-        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
-        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
-        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
-        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
-        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
-        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
-        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
-        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
-        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
-        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
-        {  25,  42 },
-
-        /* 338 - 398 */
-        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
-        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
-        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
-        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
-        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
-        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
-        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
-        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
-        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
-        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
-        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
-        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
-        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
-        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
-        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
-        {  25,  61 },
-
-        /* 399 - 435 */
-        {  21,  33 }, {  19,  50 }, {  17,  61 },
-        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
-        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
-        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
-        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
-        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
-        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
-        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
-        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
-        {  -6,  68 }, { -10,  79 },
-
-        /* 436 - 459 */
-        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
-        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
-        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
-        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
-        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
-        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
-    }
-};
-
-static const uint8_t left_block_options[4][16]={
-    {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
-    {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
-    {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
-    {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
-};
-
-static const uint8_t rem6[52]={
-0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-};
-
-static const uint8_t div6[52]={
-0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
-};
-
-static void init_dequant8_coeff_table(H264Slice *s, EntropyContext *ec){
-    int i,q,x;
-    const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON;
-    ec->dequant8_coeff[0] = ec->dequant8_buffer[0];
-    ec->dequant8_coeff[1] = ec->dequant8_buffer[1];
-
-    for(i=0; i<2; i++){
-        if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
-            ec->dequant8_coeff[1] = ec->dequant8_buffer[0];
-            break;
-        }
-
-        for(q=0; q<52; q++){
-            int shift = div6[q];
-            int idx = rem6[q];
-            for(x=0; x<64; x++)
-                ec->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
-                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
-                    s->pps.scaling_matrix8[i][x]) << shift;
-        }
-    }
-}
-
-static void init_dequant4_coeff_table(H264Slice *s, EntropyContext *ec){
-    int i,j,q,x;
-    const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON;
-    for(i=0; i<6; i++ ){
-        ec->dequant4_coeff[i] = ec->dequant4_buffer[i];
-        for(j=0; j<i; j++){
-            if(!memcmp(s->pps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
-                ec->dequant4_coeff[i] = ec->dequant4_buffer[j];
-                break;
-            }
-        }
-        if(j<i)
-            continue;
-
-        for(q=0; q<52; q++){
-            int shift = div6[q] + 2;
-            int idx = rem6[q];
-            for(x=0; x<16; x++)
-                ec->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
-                    ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
-                    s->pps.scaling_matrix4[i][x]) << shift;
-        }
-    }
-}
-
-void init_dequant_tables(H264Slice *s, EntropyContext *ec){
-    int i,x;
-
-    init_dequant4_coeff_table(s, ec);
-    if(s->pps.transform_8x8_mode)
-        init_dequant8_coeff_table(s, ec);
-    if(s->transform_bypass){
-        for(i=0; i<6; i++)
-            for(x=0; x<16; x++)
-                ec->dequant4_coeff[i][0][x] = 1<<6;
-        if(s->pps.transform_8x8_mode)
-            for(i=0; i<2; i++)
-                for(x=0; x<64; x++)
-                    ec->dequant8_coeff[i][0][x] = 1<<6;
-    }
-}
-
-void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c) {
-    int i;
-    const int8_t (*tab)[2];
-
-    if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I;
-    else                                 tab = cabac_context_init_PB[s->cabac_init_idc];
-
-    /* calculate pre-state */
-    for( i= 0; i < 460; i++ ) {
-        int pre = 2*(((tab[i][0] * ec->curr_qscale) >>4 ) + tab[i][1]) - 127;
-
-        pre^= pre>>31;
-        if(pre > 124)
-            pre= 124 + (pre&1);
-
-        c->cabac_state[i] =  pre;
-    }
-}
-
-static void fill_decode_neighbors(EntropyContext *ec, H264Slice *s){
-    H264Mb *m = ec->m;
-	const int mb_x = m->mb_x;
-
-    if (m->mb_y){
-        ec->top_type     = ec->mb_type_top[mb_x];
-        ec->topright_type= ec->mb_type_top[mb_x+1];
-        ec->topleft_type = ec->mb_type_top[mb_x-1];
-        m->qscale_top_mb_xy = ec->qscale_top[mb_x];
-    } else {
-        ec->top_type     = 0;
-        ec->topright_type= 0;
-        ec->topleft_type = 0;
-        m->qscale_top_mb_xy = 0;
-    }
-
-    ec->left_type    = ec->mb_type[mb_x-1] ;
-    m->qscale_left_mb_xy = ec->qscale[mb_x-1];
-
-}
-
-static void fill_decode_caches(EntropyContext *ec, H264Slice *s, int mb_type){
-    H264Mb *m = ec->m;
-    int topleft_type, top_type, topright_type, left_type;
-    const uint8_t * left_block= left_block_options[0];
-	const int mb_x = m->mb_x;
-    int i;
-
-    topleft_type = ec->topleft_type;
-	top_type     = ec->top_type;
-    topright_type= ec->topright_type;
-	left_type    = ec->left_type;
-
-    if(!IS_SKIP(mb_type)){
-        if(top_type){
-            AV_COPY32(&ec->non_zero_count_cache[4+8*0], &ec->non_zero_count_top[mb_x][0]);
-            ec->non_zero_count_cache[1+8*0]= ec->non_zero_count_top[mb_x][4];
-            ec->non_zero_count_cache[2+8*0]= ec->non_zero_count_top[mb_x][5];
-            ec->non_zero_count_cache[1+8*3]= ec->non_zero_count_top[mb_x][6];
-            ec->non_zero_count_cache[2+8*3]= ec->non_zero_count_top[mb_x][7];
-
-        }else {
-            ec->non_zero_count_cache[1+8*0]=
-            ec->non_zero_count_cache[2+8*0]=
-            ec->non_zero_count_cache[1+8*3]=
-            ec->non_zero_count_cache[2+8*3]=
-            AV_WN32A(&ec->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 0 : 0x40404040);
-        }
-
-        if(left_type){
-            for (i=0; i<2; i++) {
-                ec->non_zero_count_cache[3+8*1 + 2*8*i]= ec->non_zero_count_left[i*2+0];
-                ec->non_zero_count_cache[3+8*2 + 2*8*i]= ec->non_zero_count_left[i*2+1];
-                ec->non_zero_count_cache[0+8*1 + 3*8*i]= ec->non_zero_count_left[4+i*2+0];
-                ec->non_zero_count_cache[0+8*2 + 3*8*i]= ec->non_zero_count_left[4+i*2+1];
-            }
-        }
-        else{
-            for (i=0; i<2; i++) {
-                ec->non_zero_count_cache[3+8*1 + 2*8*i]=
-                ec->non_zero_count_cache[3+8*2 + 2*8*i]=
-                ec->non_zero_count_cache[0+8*1 + 3*8*i]=
-                ec->non_zero_count_cache[0+8*2 + 3*8*i]= !IS_INTRA(mb_type) ? 0 : 64;
-            }
-        }
-
-		// top_cbp
-		if(top_type) {
-			ec->top_cbp = ec->cbp_top[mb_x];
-		} else {
-			ec->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
-		}
-		// left_cbp
-		if (left_type) {
-			ec->left_cbp = (ec->cbp[mb_x-1] & 0x1f0)
-			|  ((ec->cbp[mb_x-1]>>(left_block[0]&(~1)))&2)
-			| (((ec->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2);
-		} else {
-			ec->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
-		}
-    }
-
-    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
-        int list;
-
-        ec->ref_cache[0][scan8[5 ]+1] = ec->ref_cache[0][scan8[7 ]+1] = ec->ref_cache[0][scan8[13]+1] =
-        ec->ref_cache[1][scan8[5 ]+1] = ec->ref_cache[1][scan8[7 ]+1] = ec->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
-
-        for(list=0; list<s->list_count; list++){
-            if(!USES_LIST(mb_type, list)){
-                continue;
-            }
-            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
-
-            if(USES_LIST(top_type, list)){
-                ec->ref_cache[list][scan8[0] + 0 - 1*8]=
-                ec->ref_cache[list][scan8[0] + 1 - 1*8]= ec->ref_index_top[list][4*mb_x + 2];
-                ec->ref_cache[list][scan8[0] + 2 - 1*8]=
-                ec->ref_cache[list][scan8[0] + 3 - 1*8]= ec->ref_index_top[list][4*mb_x + 3];
-            }else{
-                AV_WN32A(&ec->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
-            }
-
-            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
-                for(i=0; i<2; i++){
-                    int cache_idx = scan8[0] - 1 + i*2*8;
-                    if(USES_LIST(left_type, list)){
-                        const int b8_x= 4*(mb_x-1) + 1;
-                        ec->ref_cache[list][cache_idx  ]= ec->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
-                        ec->ref_cache[list][cache_idx+8]= ec->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
-                    }else{
-                        ec->ref_cache[list][cache_idx  ]=
-                        ec->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
-                    }
-                }
-            }else{
-                if(USES_LIST(left_type, list)){
-                    const int b8_x= 4*(mb_x-1) + 1;
-                    ec->ref_cache[list][scan8[0] - 1]= ec->ref_index[list][b8_x + (left_block[0]&~1)];
-                }else{
-                    ec->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-                }
-            }
-
-            if(USES_LIST(topright_type, list)){
-                ec->ref_cache[list][scan8[0] + 4 - 1*8]= ec->ref_index_top[list][4*(mb_x+1) + 2];
-            }else{
-                ec->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-            }
-            if(ec->ref_cache[list][scan8[0] + 4 - 1*8] < 0){
-                int topleft_partition= -1;
-                if(USES_LIST(topleft_type, list)){
-                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
-                    ec->ref_cache[list][scan8[0] - 1 - 1*8]= ec->ref_index_top[list][b8_x];
-                }else{
-                    ec->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-                }
-            }
-
-            if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)))
-                continue;
-
-            if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) {
-                ec->ref_cache[list][scan8[4 ]] =
-                ec->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
-
-				/* XXX beurk, Load mvd */
-				if(USES_LIST(top_type, list)){
-					AV_COPY64(ec->mvd_cache[list][scan8[0] + 0 - 1*8], ec->mvd_top[list][8*mb_x + 0]);
-				}else{
-					AV_ZERO64(ec->mvd_cache[list][scan8[0] + 0 - 1*8]);
-				}
-				if(USES_LIST(left_type, list)){
-					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 0*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[0]]);
-					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 1*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[1]]);
-				}else{
-					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 0*8]);
-					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 1*8]);
-				}
-				if(USES_LIST(left_type, list)){
-					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 2*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[2]]);
-					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 3*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[3]]);
-				}else{
-					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 2*8]);
-					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 3*8]);
-				}
-				AV_ZERO16(ec->mvd_cache [list][scan8[4 ]]);
-				AV_ZERO16(ec->mvd_cache [list][scan8[12]]);
-				if(s->slice_type_nos == FF_B_TYPE){
-					fill_rectangle(&ec->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);
-
-					if(IS_DIRECT(top_type)){
-						AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1));
-					}else if(IS_8X8(top_type)){
-						int b8_x = 4*mb_x;
-						ec->direct_cache[scan8[0] + 0 - 1*8]= ec->direct_top[b8_x + 2];
-						ec->direct_cache[scan8[0] + 2 - 1*8]= ec->direct_top[b8_x + 3];
-					}else{
-						AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1));
-					}
-
-					if(IS_DIRECT(left_type))
-						ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1;
-					else if(IS_8X8(left_type))
-						ec->direct_cache[scan8[0] - 1 + 0*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)];
-					else
-						ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1;
-
-					if(IS_DIRECT(left_type))
-						ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1;
-					else if(IS_8X8(left_type))
-						ec->direct_cache[scan8[0] - 1 + 2*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)];
-					else
-						ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1;
-				}
-            }
-        }
-    }
-    ec->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type);
-}
-
-static inline void write_back_non_zero_count(EntropyContext *ec, H264Slice *s){
-    H264Mb *m = ec->m;
-    const int mb_x= m->mb_x;
-
-    //bottom nnz
-    AV_COPY32(&ec->non_zero_count[mb_x][0], &ec->non_zero_count_cache[4+8*4] );
-    ec->non_zero_count[mb_x][4] = ec->non_zero_count_cache[1+8*2];
-    ec->non_zero_count[mb_x][5] = ec->non_zero_count_cache[2+8*2];
-    ec->non_zero_count[mb_x][6] = ec->non_zero_count_cache[1+8*5];
-    ec->non_zero_count[mb_x][7] = ec->non_zero_count_cache[2+8*5];
-
-    for (int i=0; i<2; i++) {
-        ec->non_zero_count_left[i*2+0]   = ec->non_zero_count_cache[7+8*1 + 2*8*i];
-        ec->non_zero_count_left[i*2+1]   = ec->non_zero_count_cache[7+8*2 + 2*8*i];
-        ec->non_zero_count_left[4+i*2+0] = ec->non_zero_count_cache[2+8*1 + 3*8*i];
-        ec->non_zero_count_left[4+i*2+1] = ec->non_zero_count_cache[2+8*2 + 3*8*i];
-    }
-
-    AV_COPY32(&m->non_zero_count[ 0], &ec->non_zero_count_cache[4+8*1]);
-    AV_COPY32(&m->non_zero_count[ 4], &ec->non_zero_count_cache[4+8*2]);
-    AV_COPY32(&m->non_zero_count[ 8], &ec->non_zero_count_cache[4+8*3]);
-    AV_COPY32(&m->non_zero_count[12], &ec->non_zero_count_cache[4+8*4]);
-
-    for (int i=0; i<2; i++) {
-        m->non_zero_count[16 + i*2   ] = ec->non_zero_count_cache[8*1 + 8*i + 1];
-        m->non_zero_count[16 + i*2 +1] = ec->non_zero_count_cache[8*1 + 8*i + 2];
-        m->non_zero_count[20 + i*2   ] = ec->non_zero_count_cache[8*4 + 8*i + 1];
-        m->non_zero_count[20 + i*2 +1] = ec->non_zero_count_cache[8*4 + 8*i + 2];
-    }
-}
-
-static inline void write_back_motion(EntropyContext *ec, H264Slice *s, int mb_type){
-    H264Mb *m = ec->m;
-	const int mb_x = m->mb_x;
-    const int b_x = 4*m->mb_x; //try mb2b(8)_xy
-    int list;
-
-    for(list=0; list<s->list_count; list++){
-        if(!USES_LIST(mb_type, list))
-            continue;
-
-        {
-            uint8_t (*mvd_dst)[2] = (void *) ec->mvd[list][8*mb_x];
-            uint8_t (*mvd_src)[2] = &ec->mvd_cache[list][scan8[0]];
-            if(IS_SKIP(mb_type))
-                AV_ZERO128(mvd_dst);
-            else{
-				AV_COPY64(mvd_dst, mvd_src + 8*3);
-                AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0);
-                AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1);
-                AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2);
-            }
-        }
-        int8_t *ref_index = &ec->ref_index[list][b_x];
-        {
-            ref_index[0+0*2]= ec->ref_cache[list][scan8[0]];
-            ref_index[1+0*2]= ec->ref_cache[list][scan8[4]];
-            ref_index[0+1*2]= ec->ref_cache[list][scan8[8]];
-            ref_index[1+1*2]= ec->ref_cache[list][scan8[12]];
-        }
-    }
-
-    if(s->slice_type_nos == FF_B_TYPE){
-        if(IS_8X8(mb_type)){
-            uint8_t *direct = &ec->direct[4*mb_x];
-            direct[1] = m->sub_mb_type[1]>>1;
-            direct[2] = m->sub_mb_type[2]>>1;
-            direct[3] = m->sub_mb_type[3]>>1;
-        }
-    }
-}
-
-static inline int get_dct8x8_allowed(EntropyContext *ec, H264Slice *s){
-    H264Mb *m = ec->m;
-    if(s->direct_8x8_inference_flag)
-        return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
-    else
-        return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
-}
-
-/**
- * decodes a P_SKIP or B_SKIP macroblock
- */
-static void decode_mb_skip(EntropyContext *ec, H264Slice *s){
-    H264Mb *m = ec->m;
-	const int mb_x = m->mb_x;
-    int mb_type;
-
-    if( s->slice_type_nos == FF_B_TYPE )
-        mb_type= MB_TYPE_16x16|MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
-    else
-        mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
-
-    fill_rectangle(&ec->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
-    write_back_motion(ec, s, mb_type);
-    m->mb_type = ec->mb_type[mb_x] = mb_type;
-    m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale;
-
-    AV_ZERO64(ec->non_zero_count[mb_x]);
-    AV_ZERO64(ec->non_zero_count_left);
-    memset(m->non_zero_count, 0, 24);
-}
-
-static int decode_cabac_intra_mb_type(EntropyContext *ec, H264Slice *s, CABACContext *c, int ctx_base, int intra_slice) {
-    uint8_t *state= &c->cabac_state[ctx_base];
-    int mb_type;
-
-    if(intra_slice){
-        int ctx=0;
-        if( ec->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
-            ctx++;
-        if( ec->top_type     & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
-            ctx++;
-        if( get_cabac_noinline( c, &state[ctx] ) == 0 )
-            return 0;   /* I4x4 */
-        state += 2;
-    }else{
-        if( get_cabac_noinline( c, state ) == 0 )
-            return 0;   /* I4x4 */
-    }
-
-    if( get_cabac_terminate( c ) )
-        return 25;  /* PCM */
-
-    mb_type = 1; /* I16x16 */
-    mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */
-    if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */
-        mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] );
-    mb_type += 2 * get_cabac_noinline(c, &state[3+intra_slice] );
-    mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] );
-    return mb_type;
-}
-
-static int decode_cabac_mb_skip(EntropyContext *ec, H264Slice *s, H264Mb *m, CABACContext *c) {
-    int ctx = 0;
-
-	if( m->mb_x>0 && !IS_SKIP( ec->left_type ))
-        ctx++;
-	if( m->mb_y>0 && !IS_SKIP( ec->top_type ))
-        ctx++;
-
-    if( s->slice_type_nos == FF_B_TYPE )
-        ctx += 13;
-    return get_cabac_noinline(c, &c->cabac_state[11+ctx] );
-}
-
-static int decode_cabac_mb_intra4x4_pred_mode_delta( CABACContext *c) {
-    int mode = 0;
-
-    if( get_cabac(c, &c->cabac_state[68] ) )
-        return -1;
-
-    mode += 1 * get_cabac(c, &c->cabac_state[69] );
-    mode += 2 * get_cabac(c, &c->cabac_state[69] );
-    mode += 4 * get_cabac(c, &c->cabac_state[69] );
-
-    return mode;
-}
-
-static int decode_cabac_mb_chroma_pre_mode(EntropyContext *ec, H264Slice *s, CABACContext *c) {
-    H264Mb *m = ec->m;
-	const int mb_x = m->mb_x;
-
-    int ctx = 0;
-
-    /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */
-    if( ec->left_type && ec->chroma_pred_mode[mb_x-1] != 0 )
-        ctx++;
-
-    if( ec->top_type     && ec->chroma_pred_mode_top[mb_x] != 0 )
-        ctx++;
-
-    if( get_cabac_noinline(c, &c->cabac_state[64+ctx] ) == 0 )
-        return 0;
-
-    if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 )
-        return 1;
-    if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 )
-        return 2;
-    else
-        return 3;
-}
-
-static int decode_cabac_mb_cbp_luma(EntropyContext *ec, CABACContext *c) {
-    int cbp_b, cbp_a, ctx, cbp = 0;
-
-    cbp_a = ec->left_cbp;
-    cbp_b = ec->top_cbp;
-
-    ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
-    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]);
-    ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
-    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1;
-    ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
-    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 2;
-    ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
-    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3;
-    return cbp;
-}
-static int decode_cabac_mb_cbp_chroma(EntropyContext *ec, CABACContext *c) {
-    int ctx;
-    int cbp_a, cbp_b;
-
-    cbp_a = (ec->left_cbp>>4)&0x03;
-    cbp_b = (ec-> top_cbp>>4)&0x03;
-
-    ctx = 0;
-    if( cbp_a > 0 ) ctx++;
-    if( cbp_b > 0 ) ctx += 2;
-    if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 )
-        return 0;
-
-    ctx = 4;
-    if( cbp_a == 2 ) ctx++;
-    if( cbp_b == 2 ) ctx += 2;
-    return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] );
-}
-
-static int decode_cabac_p_mb_sub_type( CABACContext *c) {
-    if( get_cabac(c, &c->cabac_state[21] ) )
-        return 0;   /* 8x8 */
-    if( !get_cabac(c, &c->cabac_state[22] ) )
-        return 1;   /* 8x4 */
-    if( get_cabac(c, &c->cabac_state[23] ) )
-        return 2;   /* 4x8 */
-    return 3;       /* 4x4 */
-}
-static int decode_cabac_b_mb_sub_type(CABACContext *c) {
-    int type;
-    if( !get_cabac(c, &c->cabac_state[36] ) )
-        return 0;   /* B_Direct_8x8 */
-    if( !get_cabac(c, &c->cabac_state[37] ) )
-        return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
-    type = 3;
-    if( get_cabac(c, &c->cabac_state[38] ) ) {
-        if( get_cabac(c, &c->cabac_state[39] ) )
-            return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
-        type += 4;
-    }
-    type += 2*get_cabac(c, &c->cabac_state[39] );
-    type +=   get_cabac(c, &c->cabac_state[39] );
-    return type;
-}
-
-static int decode_cabac_mb_ref(EntropyContext *ec, H264Slice *s, CABACContext *c, int list, int n ) {
-    int refa = ec->ref_cache[list][scan8[n] - 1];
-    int refb = ec->ref_cache[list][scan8[n] - 8];
-    int ref  = 0;
-    int ctx  = 0;
-
-    if( s->slice_type_nos == FF_B_TYPE) {
-        if( refa > 0 && !(ec->direct_cache[scan8[n] - 1]&(MB_TYPE_DIRECT2>>1)) )
-            ctx++;
-        if( refb > 0 && !(ec->direct_cache[scan8[n] - 8]&(MB_TYPE_DIRECT2>>1)) )
-            ctx += 2;
-    } else {
-        if( refa > 0 )
-            ctx++;
-        if( refb > 0 )
-            ctx += 2;
-    }
-
-    while( get_cabac(c, &c->cabac_state[54+ctx] ) ) {
-        ref++;
-        ctx = (ctx>>2)+4;
-        if(ref >= 32 /*h->ref_list[list]*/){
-            return -1;
-        }
-    }
-    return ref;
-}
-
-static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) {
-    int mvd;
-
-    if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){
-        *mvda= 0;
-        return 0;
-    }
-
-    mvd= 1;
-    ctxbase+= 3;
-    while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) {
-        if( mvd < 4 )
-            ctxbase++;
-        mvd++;
-    }
-
-    if( mvd >= 9 ) {
-        int k = 3;
-        while( get_cabac_bypass(c ) ) {
-            mvd += 1 << k;
-            k++;
-            if(k>24){
-                av_log(AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
-                return INT_MIN;
-            }
-        }
-        while( k-- ) {
-            mvd += get_cabac_bypass(c )<<k;
-        }
-        *mvda=mvd < 70 ? mvd : 70;
-    }else
-        *mvda=mvd;
-    return get_cabac_bypass_sign(c, -mvd );
-}
-
-#define DECODE_CABAC_MB_MVD( ec, c, list,  n )\
-{\
-    int amvd0 = ec->mvd_cache[list][scan8[n] - 1][0] +\
-                ec->mvd_cache[list][scan8[n] - 8][0];\
-    int amvd1 = ec->mvd_cache[list][scan8[n] - 1][1] +\
-                ec->mvd_cache[list][scan8[n] - 8][1];\
-\
-    m->mvd[list][mp][0] = decode_cabac_mb_mvd( c, 40, amvd0, &mpx ); \
-    m->mvd[list][mp][1] = decode_cabac_mb_mvd( c, 47, amvd1, &mpy ); \
-    mp++; \
-}
-
-static av_always_inline int get_cabac_cbf_ctx(EntropyContext *ec, H264Slice *s, int cat, int idx, int is_dc ) {
-    int nza, nzb;
-    int ctx = 0;
-
-    if( is_dc ) {
-        if( cat == 0 ) {
-            nza = ec->left_cbp&0x100;
-            nzb = ec-> top_cbp&0x100;
-        } else {
-            nza = (ec->left_cbp>>(6+idx))&0x01;
-            nzb = (ec-> top_cbp>>(6+idx))&0x01;
-        }
-    } else {
-        assert(cat == 1 || cat == 2 || cat == 4);
-        nza = ec->non_zero_count_cache[scan8[idx] - 1];
-        nzb = ec->non_zero_count_cache[scan8[idx] - 8];
-    }
-
-    if( nza > 0 )
-        ctx++;
-
-    if( nzb > 0 )
-        ctx += 2;
-
-    return ctx + 4 * cat;
-}
-
-DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
-};
-
-static const int significant_coeff_flag_offset[2][6] = {
-    { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
-    { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
-};
-static const int last_coeff_flag_offset[2][6] = {
-    { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
-    { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
-};
-static const int coeff_abs_level_m1_offset[6] = {
-    227+0, 227+10, 227+20, 227+30, 227+39, 426
-};
-static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
-    { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
-    4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
-    7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
-    12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
-    { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
-    6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
-    9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
-    9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
-};
-/* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
-* 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
-* map node ctx => cabac ctx for level=1 */
-static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
-/* map node ctx => cabac ctx for level>1 */
-static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
-static const uint8_t coeff_abs_level_transition[2][8] = {
-    /* update node ctx after decoding a level=1 */
-    { 1, 2, 3, 3, 4, 5, 6, 7 },
-    /* update node ctx after decoding a level>1 */
-    { 4, 4, 4, 4, 5, 6, 7, 7 }
-};
-
-static av_always_inline void decode_cabac_residual_internal(EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
-    H264Mb *m = ec->m;
-	const int mb_x = m->mb_x;
-    int index[64];
-
-    int av_unused last;
-    int coeff_count = 0;
-    int node_ctx = 0;
-
-    uint8_t *significant_coeff_ctx_base;
-    uint8_t *last_coeff_ctx_base;
-    uint8_t *abs_level_m1_ctx_base;
-
-    /* read coded block flag */
-    if( is_dc || cat != 5 ) {
-        if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( ec, s, cat, n, is_dc ) ] ) == 0 ) {
-            if( !is_dc )
-                ec->non_zero_count_cache[scan8[n]] = 0;
-            return;
-        }
-    }
-
-    significant_coeff_ctx_base = c->cabac_state
-        + significant_coeff_flag_offset[0][cat];
-    last_coeff_ctx_base = c->cabac_state
-        + last_coeff_flag_offset[0][cat];
-    abs_level_m1_ctx_base = c->cabac_state
-        + coeff_abs_level_m1_offset[cat];
-
-    if( !is_dc && cat == 5 ) {
-#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
-        for(last= 0; last < coefs; last++) { \
-            uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
-            if( get_cabac( c, sig_ctx )) { \
-                uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
-                index[coeff_count++] = last; \
-                if( get_cabac( c, last_ctx ) ) { \
-                    last= max_coeff; \
-                    break; \
-                } \
-            } \
-        }\
-        if( last == max_coeff -1 ) {\
-            index[coeff_count++] = last;\
-        }
-
-        const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0];
-        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
-    } else {
-        DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
-    }
-    assert(coeff_count > 0);
-
-    if( is_dc ) {
-        if( cat == 0 )
-            ec->cbp[mb_x] |= 0x100;
-        else
-            ec->cbp[mb_x] |= 0x40 << n;
-    } else {
-        if( cat == 5 )
-            fill_rectangle(&ec->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
-        else {
-            assert( cat == 1 || cat == 2 || cat == 4 );
-            ec->non_zero_count_cache[scan8[n]] = coeff_count;
-        }
-    }
-
-    do {
-        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
-
-        int j= scantable[index[--coeff_count]];
-
-        if( get_cabac( c, ctx ) == 0 ) {
-            node_ctx = coeff_abs_level_transition[0][node_ctx];
-            if( is_dc ) {
-                block[j] = get_cabac_bypass_sign( c, -1);
-            }else{
-                block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6;
-            }
-        } else {
-            int coeff_abs = 2;
-            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
-            node_ctx = coeff_abs_level_transition[1][node_ctx];
-
-            while( coeff_abs < 15 && get_cabac( c, ctx ) ) {
-                coeff_abs++;
-            }
-
-            if( coeff_abs >= 15 ) {
-                int j = 0;
-                while( get_cabac_bypass( c ) ) {
-                    j++;
-                }
-
-                coeff_abs=1;
-                while( j-- ) {
-                    coeff_abs += coeff_abs + get_cabac_bypass( c );
-                }
-                coeff_abs+= 14;
-            }
-
-            if( is_dc ) {
-                block[j] = get_cabac_bypass_sign( c, -coeff_abs );
-            }else{
-                block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6;
-            }
-        }
-    } while( coeff_count );
-
-}
-
-static void decode_cabac_residual_dc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
-    decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, NULL, max_coeff, 1);
-}
-
-static void decode_cabac_residual_nondc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
-    decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, qmul, max_coeff, 0);
-}
-
-/**
- * decodes a macroblock
- * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
- */
-int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c) {
-    H264Mb *m = ec->m;
-	int mb_x = m->mb_x;
-    int mb_type, partition_count, cbp = 0;
-    int dct8x8_allowed= s->pps.transform_8x8_mode;
-
-    fill_decode_neighbors(ec, s);
-
-    if( s->slice_type_nos != FF_I_TYPE ) {
-        int skip;
-        /* a skipped mb needs the aff flag from the following mb */
-        skip = decode_cabac_mb_skip( ec, s, m, c);
-
-        /* read skip flags */
-        if( skip ) {
-            decode_mb_skip(ec, s);
-            m->cbp = ec->cbp[mb_x] = 0;
-            ec->chroma_pred_mode[mb_x] = 0;
-            ec->last_qscale_diff = 0;
-            return 0;
-        }
-    }
-
-    if( s->slice_type_nos == FF_B_TYPE ) {
-        int ctx = 0;
-
-        if( !IS_DIRECT( ec->left_type-1 ) )
-            ctx++;
-        if( !IS_DIRECT( ec->top_type-1 ) )
-            ctx++;
-
-        if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){
-            mb_type= 0; /* B_Direct_16x16 */
-        }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) {
-            mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */
-        }else{
-            int bits;
-            bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3;
-            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2;
-            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1;
-            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] );
-            if( bits < 8 ){
-                mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
-            }else if( bits == 13 ){
-                mb_type= decode_cabac_intra_mb_type(ec, s, c, 32, 0);
-                goto decode_intra_mb;
-            }else if( bits == 14 ){
-                mb_type= 11; /* B_L1_L0_8x16 */
-            }else if( bits == 15 ){
-                mb_type= 22; /* B_8x8 */
-            }else{
-                bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] );
-                mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
-            }
-        }
-        partition_count= b_mb_type_info[mb_type].partition_count;
-        mb_type=         b_mb_type_info[mb_type].type;
-    } else if( s->slice_type_nos == FF_P_TYPE ) {
-        if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) {
-            /* P-type */
-            if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) {
-                /* P_L0_D16x16, P_8x8 */
-                mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] );
-            } else {
-                /* P_L0_D8x16, P_L0_D16x8 */
-                mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] );
-            }
-            partition_count= p_mb_type_info[mb_type].partition_count;
-            mb_type=         p_mb_type_info[mb_type].type;
-        } else {
-            mb_type= decode_cabac_intra_mb_type(ec, s, c, 17, 0);
-            goto decode_intra_mb;
-        }
-    } else {
-        mb_type= decode_cabac_intra_mb_type(ec, s ,c, 3, 1);
-        if(s->slice_type == FF_SI_TYPE && mb_type)
-            mb_type--;
-        assert(s->slice_type_nos == FF_I_TYPE);
-decode_intra_mb:
-        partition_count = 0;
-        cbp= i_mb_type_info[mb_type].cbp;
-        m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
-        mb_type= i_mb_type_info[mb_type].type;
-    }
-
-    if(IS_INTRA_PCM(mb_type)) {
-        const uint8_t *ptr;
-        // We assume these blocks are very rare so we do not optimize it.
-        // FIXME The two following lines get the bitstream position in the cabac
-        // decode, I think it should be done by a function in cabac.h (or cabac.c).
-        ptr=c->bytestream;
-        if(c->low&0x1) ptr--;
-        if(CABAC_BITS==16){
-            if(c->low&0x1FF) ptr--;
-        }
-		//printf("pcm\n");
-        // The pixels are stored in the same order as levels in h->mb array.
-        memcpy(m->mb, ptr, 256); ptr+=256;
-		memcpy(m->mb+128, ptr, 128); ptr+=128;
-
-        ff_init_cabac_decoder(c, ptr, c->bytestream_end - ptr);
-
-        // All blocks are present
-        m->cbp= ec->cbp[mb_x] = 0x1ef;
-        ec->chroma_pred_mode[mb_x] = 0;
-        // In deblocking, the quantizer is 0
-        m->qscale_mb_xy = ec->qscale[mb_x]= 0;
-        // All coeffs are present
-        memset(ec->non_zero_count[mb_x], 16, 8);
-		m->mb_type = ec->mb_type[mb_x]=  mb_type;
-        ec->last_qscale_diff = 0;
-
-        return 0;
-    }
-
-    fill_decode_caches(ec, s, mb_type);
-
-    int mp = 0;
-    if( IS_INTRA( mb_type ) ) {
-        int i, pred_mode;
-        if( IS_INTRA4x4( mb_type ) ) {
-            if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] ) ) {
-                mb_type |= MB_TYPE_8x8DCT;
-                for( i = 0; i < 16; i+=4 ) {
-                    m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c);
-                }
-            } else {
-                for( i = 0; i < 16; i++ ) {
-                    m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c);
-                }
-            }
-        }
-
-        m->chroma_pred_mode= ec->chroma_pred_mode[mb_x] =
-		pred_mode = decode_cabac_mb_chroma_pre_mode( ec, s, c );
-
-    } else if( partition_count == 4 ) {
-        int i, j, sub_partition_count[4], list;
-
-        if( s->slice_type_nos == FF_B_TYPE ) {
-            for( i = 0; i < 4; i++ ) {
-                m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c );
-                sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
-                m->sub_mb_type[i]=      b_sub_mb_type_info[ m->sub_mb_type[i] ].type;
-            }
-            if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] |
-                          m->sub_mb_type[2] | m->sub_mb_type[3]) ) {
-                ec->ref_cache[0][scan8[4]] =
-                ec->ref_cache[1][scan8[4]] =
-                ec->ref_cache[0][scan8[12]] =
-                ec->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
-
-                for( i = 0; i < 4; i++ )
-                    fill_rectangle( &ec->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 );
-            }
-        } else {
-            for( i = 0; i < 4; i++ ) {
-                m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c );
-                sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
-                m->sub_mb_type[i]=      p_sub_mb_type_info[ m->sub_mb_type[i] ].type;
-            }
-        }
-
-        for( list = 0; list < s->list_count; list++ ) {
-            for( i = 0; i < 4; i++ ) {
-                if(IS_DIRECT(m->sub_mb_type[i])) continue;
-                if(IS_DIR(m->sub_mb_type[i], 0, list)){
-                    if( s->ref_count[list] > 1 ){
-                        m->ref_index[list][i] = decode_cabac_mb_ref(ec, s, c, list, 4*i );
-                        if(m->ref_index[list][i] >= s->ref_count[list]){
-                            av_log(AV_LOG_ERROR, "Reference %d >= %d\n", m->ref_index[list][i], s->ref_count[list]);
-                            return -1;
-                        }
-                    }else
-                        m->ref_index[list][i] = 0;
-                } else {
-                    m->ref_index[list][i] = -1;
-                }
-                ec->ref_cache[list][ scan8[4*i]   ]=ec->ref_cache[list][ scan8[4*i]+1 ]=
-                ec->ref_cache[list][ scan8[4*i]+8 ]=ec->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i];
-            }
-        }
-
-        if(dct8x8_allowed){
-//             assert(0);
-            dct8x8_allowed = get_dct8x8_allowed(ec, s);
-        }
-
-        for(list=0; list<s->list_count; list++){
-            for(i=0; i<4; i++){
-//                 ec->ref_cache[list][ scan8[4*i]   ]=ec->ref_cache[list][ scan8[4*i]+1 ];
-                if(IS_DIRECT(m->sub_mb_type[i])){
-                    fill_rectangle(ec->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2);
-                    continue;
-                }
-
-                if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){
-                    const int sub_mb_type= m->sub_mb_type[i];
-                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
-                    for(j=0; j<sub_partition_count[i]; j++){
-                        int mpx, mpy;
-                        const int index= 4*i + block_width*j;
-                        uint8_t (* mvd_cache)[2]= &ec->mvd_cache[list][ scan8[index]];
-
-                        DECODE_CABAC_MB_MVD( ec, c, list, index)
-
-                        if(IS_SUB_8X8(sub_mb_type)){
-                            mvd_cache[ 1 ][0]=
-                            mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx;
-                            mvd_cache[ 1 ][1]=
-                            mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy;
-                        }else if(IS_SUB_8X4(sub_mb_type)){
-                            mvd_cache[ 1 ][0]=  mpx;
-                            mvd_cache[ 1 ][1]= mpy;
-                        }else if(IS_SUB_4X8(sub_mb_type)){
-                            mvd_cache[ 8 ][0]= mpx;
-                            mvd_cache[ 8 ][1]= mpy;
-                        }
-                        mvd_cache[ 0 ][0]= mpx;
-                        mvd_cache[ 0 ][1]= mpy;
-                    }
-                }else{
-                    fill_rectangle(ec->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2);
-                }
-            }
-        }
-    } else if( IS_DIRECT(mb_type) ) {
-        mb_type |= MB_TYPE_16x16;
-        fill_rectangle(ec->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2);
-        fill_rectangle(ec->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2);
-        dct8x8_allowed &= s->direct_8x8_inference_flag;
-    } else {
-        int list, i;
-        if(IS_16X16(mb_type)){
-            for(list=0; list<s->list_count; list++){
-                if(IS_DIR(mb_type, 0, list)){
-                    int ref;
-                    if(s->ref_count[list] > 1){
-                        ref= decode_cabac_mb_ref(ec, s, c, list, 0);
-                        if(ref >= s->ref_count[list]){
-                            av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]);
-                            return -1;
-                        }
-                    }else
-                        ref=0;
-                    m->ref_index[list][0]= ref;
-                    fill_rectangle(&ec->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
-                }
-            }
-            for(list=0; list<s->list_count; list++){
-                if(IS_DIR(mb_type, 0, list)){
-                    int mpx,mpy;
-                    DECODE_CABAC_MB_MVD( ec, c, list, 0)
-
-                    fill_rectangle(ec->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2);
-                }
-
-            }
-        }
-        else if(IS_16X8(mb_type)){
-            for(list=0; list<s->list_count; list++){
-                for(i=0; i<2; i++){
-                    if(IS_DIR(mb_type, i, list)){
-                        int ref;
-                        if(s->ref_count[list] > 1){
-                            ref= decode_cabac_mb_ref(ec, s, c, list, 8*i );
-                            if(ref >= s->ref_count[list]){
-                                av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]);
-                                return -1;
-                            }
-                        }else
-                            ref=0;
-                        m->ref_index[list][i]= ref;
-                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
-                    }else{
-                        m->ref_index[list][i]= LIST_NOT_USED;
-                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
-                    }
-                }
-            }
-            for(list=0; list<s->list_count; list++){
-                for(i=0; i<2; i++){
-                    if(IS_DIR(mb_type, i, list)){
-                        int mpx,mpy;
-                        DECODE_CABAC_MB_MVD( ec, c, list, 8*i)
-
-                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2);
-                    }else{
-                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2);
-                    }
-                }
-            }
-        }else{
-            assert(IS_8X16(mb_type));
-            for(list=0; list<s->list_count; list++){
-                for(i=0; i<2; i++){
-                    if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                        int ref;
-                        if(s->ref_count[list] > 1){
-                            ref= decode_cabac_mb_ref(ec, s, c, list, 4*i );
-                            if(ref >= s->ref_count[list]){
-                                av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]);
-                                return -1;
-                            }
-                        }else
-                            ref=0;
-                        m->ref_index[list][i]= ref;
-                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
-                    }else{
-                        m->ref_index[list][i]= LIST_NOT_USED;
-                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
-                    }
-                }
-            }
-            for(list=0; list<s->list_count; list++){
-                for(i=0; i<2; i++){
-                    if(IS_DIR(mb_type, i, list)){
-                        int mpx,mpy;
-                        DECODE_CABAC_MB_MVD( ec, c, list, 4*i)
-
-                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2);
-                    }else{
-                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2);
-                    }
-                }
-            }
-        }
-    }
-
-    if( IS_INTER( mb_type ) ||(IS_DIRECT(mb_type))) {
-        ec->chroma_pred_mode[mb_x] = 0;
-        write_back_motion( ec, s, mb_type );
-    }
-
-    if( !IS_INTRA16x16( mb_type ) ) {
-        cbp  = decode_cabac_mb_cbp_luma( ec, c);
-		cbp |= decode_cabac_mb_cbp_chroma( ec, c ) << 4;
-    }
-
-    ec->cbp[mb_x] = m->cbp = cbp;
-
-    if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
-        int t = get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] );
-        mb_type |= MB_TYPE_8x8DCT * t;
-    }
-    m->mb_type = ec->mb_type[mb_x] = mb_type;
-
-    if( cbp || IS_INTRA16x16( mb_type ) ) {
-        const uint8_t *scan, *scan8x8, *dc_scan;
-        const uint32_t *qmul;
-
-
-        if (s->transform_bypass && ec->curr_qscale){
-            scan8x8= ff_zigzag_direct;
-            scan= zigzag_scan;
-        }else{
-            scan8x8= ec->zigzag_scan8x8;
-            scan= ec->zigzag_scan;
-        }
-        dc_scan= luma_dc_zigzag_scan;
-
-        // decode_cabac_mb_dqp
-        if(get_cabac_noinline(c, &c->cabac_state[60 + (ec->last_qscale_diff != 0)])){
-            int val = 1;
-            int ctx= 2;
-
-            while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) {
-                ctx= 3;
-                val++;
-                if(val > 102){ //prevent infinite loop
-                    av_log(AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", m->mb_x, m->mb_y);
-                    return -1;
-                }
-            }
-
-            if( val&0x01 )
-                val=   (val + 1)>>1 ;
-            else
-                val= -((val + 1)>>1);
-            ec->last_qscale_diff = val;
-            ec->curr_qscale += val;
-            if(((unsigned)ec->curr_qscale) > 51){
-                if(ec->curr_qscale<0) ec->curr_qscale+= 52;
-                else            ec->curr_qscale-= 52;
-            }
-            ec->chroma_qp[0] = get_chroma_qp( s, 0, ec->curr_qscale);
-            ec->chroma_qp[1] = get_chroma_qp( s, 1, ec->curr_qscale);
-        }else
-            ec->last_qscale_diff=0;
-
-        memset(m->mb, 0, 16*16 * sizeof(DCTELEM));
-        if( IS_INTRA16x16( mb_type ) ) {
-            int i;
-
-            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
-            decode_cabac_residual_dc( ec, s, c, m->mb, 0, 0, dc_scan, 16);
-            qmul = ec->dequant4_coeff[0][ec->curr_qscale];
-            if( cbp&15 ) {
-                for( i = 0; i < 16; i++ ) {
-                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
-                    decode_cabac_residual_nondc( ec, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15);
-                }
-            } else {
-                fill_rectangle(&ec->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
-            }
-            h264_luma_dc_dequant_idct_c(m->mb, qmul[0]);
-        } else {
-
-            int i8x8, i4x4;
-            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
-                if( cbp & (1<<i8x8) ) {
-                    if( IS_8x8DCT(mb_type) ) {
-                        decode_cabac_residual_nondc(ec, s, c, m->mb + 64*i8x8, 5, 4*i8x8,
-                            scan8x8, ec->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][ec->curr_qscale], 64);
-                    } else {
-                        qmul = ec->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][ec->curr_qscale];
-                        for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
-                            const int index = 4*i8x8 + i4x4;
-                            //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
-//START_TIMER
-                            decode_cabac_residual_nondc(ec, s, c, m->mb + 16*index, 2, index, scan, qmul, 16);
-//STOP_TIMER("decode_residual")
-                        }
-                    }
-                } else {
-                    uint8_t * const nnz= &ec->non_zero_count_cache[ scan8[4*i8x8] ];
-                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
-                }
-            }
-        }
-
-        if( cbp&0x30 ){
-            memset(m->mb + 256, 0, 2*64 * sizeof(DCTELEM));
-            for( int i = 0; i < 2; i++ ) {
-                const uint32_t dequant4_coeff = ec->dequant4_coeff[IS_INTRA(mb_type) ? 1+i:4+i][ec->chroma_qp[i]][0];
-
-                //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
-                decode_cabac_residual_dc(ec, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4);
-                chroma_dc_dequant_idct_c(m->mb + 256 + 16*4*i, dequant4_coeff);
-            }
-        }
-
-        if( cbp&0x20 ) {
-            int i, j;
-            for( i = 0; i < 2; i++ ) {
-                qmul = ec->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 0:3)][ec->chroma_qp[i]];
-                for( j = 0; j < 4; j++ ) {
-                    const int index = 16 + 4 * i + j;
-                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
-                    decode_cabac_residual_nondc( ec, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15);
-                }
-            }
-        } else {
-            uint8_t * const nnz= &ec->non_zero_count_cache[0];
-            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
-        }
-
-    } else {
-        uint8_t * const nnz= &ec->non_zero_count_cache[0];
-        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
-        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
-        ec->last_qscale_diff = 0;
-    }
-
-    m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale;
-    write_back_non_zero_count(ec, s);
-
-
-    return 0;
-}
-
-void free_entropy_context(EntropyContext *ec){
-    av_freep(&ec->non_zero_count_row[0]);
-    av_freep(&ec->non_zero_count_row[1]);
-    av_freep(&ec->mvd_table[0][0]);
-    av_freep(&ec->mvd_table[0][1]);
-    av_freep(&ec->mvd_table[1][0]);
-    av_freep(&ec->mvd_table[1][1]);
-
-    av_freep(&ec->direct_table[0]);
-    av_freep(&ec->direct_table[1]);
-    av_freep(&ec->chroma_pred_mode_table[0]);
-    av_freep(&ec->chroma_pred_mode_table[1]);
-    av_freep(&ec->cbp_table[0]);
-    av_freep(&ec->cbp_table[1]);
-    av_freep(&ec->qscale_table[0]);
-    av_freep(&ec->qscale_table[1]);
-
-    av_freep(&ec->mb_type_table[0]);
-    av_freep(&ec->mb_type_table[1]);
-    av_freep(&ec->ref_index_table[0][0]);
-    av_freep(&ec->ref_index_table[0][1]);
-    av_freep(&ec->ref_index_table[1][0]);
-    av_freep(&ec->ref_index_table[1][1]);
-
-
-    av_free(ec);
-}
-
-EntropyContext *get_entropy_context(H264Context *h){
-    const int mb_height = h->mb_height;
-    const int mb_width  = h->mb_width;
-    const int mb_stride = h->mb_stride;
-
-    EntropyContext *ec = av_mallocz(sizeof(EntropyContext));
-
-    ec->mb_width = mb_width;
-    ec->mb_height = mb_height;
-    ec->b_stride  = mb_width*4;
-    ec->mb_stride = mb_stride;
-
-    FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[0], mb_stride * 8 * sizeof(uint8_t), fail)
-    FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[1], mb_stride * 8 * sizeof(uint8_t), fail)
-
-    FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][0], 16*mb_stride * sizeof(uint8_t), fail);
-    FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][1], 16*mb_stride * sizeof(uint8_t), fail);
-    FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][0], 16*mb_stride * sizeof(uint8_t), fail);
-    FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][1], 16*mb_stride * sizeof(uint8_t), fail);
-
-    FF_ALLOCZ_OR_GOTO(ec->direct_table[0], 4*mb_stride * sizeof(uint8_t) , fail);
-    FF_ALLOCZ_OR_GOTO(ec->direct_table[1], 4*mb_stride * sizeof(uint8_t) , fail);
-
-    FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[0], mb_stride * sizeof(uint8_t), fail)
-    FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[1], mb_stride * sizeof(uint8_t), fail)
-
-    FF_ALLOCZ_OR_GOTO(ec->cbp_table[0], mb_stride * sizeof(uint16_t), fail)
-    FF_ALLOCZ_OR_GOTO(ec->cbp_table[1], mb_stride * sizeof(uint16_t), fail)
-
-    FF_ALLOCZ_OR_GOTO(ec->qscale_table[0], mb_stride * sizeof(uint8_t) , fail)
-    FF_ALLOCZ_OR_GOTO(ec->qscale_table[1], mb_stride * sizeof(uint8_t) , fail)
-
-    FF_ALLOCZ_OR_GOTO(ec->mb_type_table[0] , (mb_stride+1) * sizeof(uint32_t), fail)
-    FF_ALLOCZ_OR_GOTO(ec->mb_type_table[1] , (mb_stride+1) * sizeof(uint32_t), fail)
-
-    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][0], 4*mb_stride * sizeof(int8_t), fail)
-    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][0], 4*mb_stride * sizeof(int8_t), fail)
-    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][1], 4*mb_stride * sizeof(int8_t), fail)
-    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][1], 4*mb_stride * sizeof(int8_t), fail)
-
-    ec->zigzag_scan = h->zigzag_scan;
-    ec->zigzag_scan8x8 = h->zigzag_scan8x8;
-
-    return ec;
-fail:
-    free_entropy_context(ec);
-    return NULL;
-}
-
-void init_entropy_buf(EntropyContext *ec, H264Slice *s, int line){
-    int top = (line+1)%2;
-    int cur = line%2;
-
-    ec->non_zero_count_top      = ec->non_zero_count_row[top];
-    ec->non_zero_count          = ec->non_zero_count_row[cur];
-    ec->mvd_top[0]              = ec->mvd_table[0][top];
-    ec->mvd[0]                  = ec->mvd_table[0][cur];
-    ec->mvd_top[1]              = ec->mvd_table[1][top];
-    ec->mvd[1]                  = ec->mvd_table[1][cur];
-    ec->direct_top              = ec->direct_table[top];
-    ec->direct                  = ec->direct_table[cur];
-    ec->chroma_pred_mode_top    = ec->chroma_pred_mode_table[top];
-    ec->chroma_pred_mode        = ec->chroma_pred_mode_table[cur];
-    ec->cbp_top                 = ec->cbp_table[top];
-    ec->cbp                     = ec->cbp_table[cur];
-    ec->qscale_top              = ec->qscale_table[top] +1;
-    ec->qscale                  = ec->qscale_table[cur] +1;
-    ec->mb_type_top             = ec->mb_type_table[top]+1;
-    ec->mb_type                 = ec->mb_type_table[cur]+1;
-    ec->ref_index_top[0]        = ec->ref_index_table[0][top];
-    ec->ref_index_top[1]        = ec->ref_index_table[1][top];
-    ec->ref_index[0]            = ec->ref_index_table[0][cur];
-    ec->ref_index[1]            = ec->ref_index_table[1][cur];
-
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_entropy.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_entropy.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,20 +0,0 @@
-#ifndef H264_CABAC_H
-#define H264_CABAC_H
-
-#include "h264_types.h"
-#include "cabac.h"
-
-/**
- * decodes a CABAC coded macroblock
- * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
- */
-
-int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c);
-void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c);
-
-int init_entropy_buf(EntropyContext *ec, H264Slice *s, int line);
-EntropyContext * get_entropy_context(H264Context *h);
-void init_dequant_tables(H264Slice *s, EntropyContext *ec);
-void free_entropy_context(EntropyContext *ec);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_idct.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_idct.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,270 +0,0 @@
-/*
- * H.264 IDCT
- * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 IDCT.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "dsputil.h"
-#include "h264_data.h"
-
-static av_always_inline void idct_internal(uint8_t *dst, DCTELEM *block, int stride, int block_stride, int shift, int add){
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    block[0] += 1<<(shift-1);
-
-    for(i=0; i<4; i++){
-        const int z0=  block[0 + block_stride*i]     +  block[2 + block_stride*i];
-        const int z1=  block[0 + block_stride*i]     -  block[2 + block_stride*i];
-        const int z2= (block[1 + block_stride*i]>>1) -  block[3 + block_stride*i];
-        const int z3=  block[1 + block_stride*i]     + (block[3 + block_stride*i]>>1);
-
-        block[0 + block_stride*i]= z0 + z3;
-        block[1 + block_stride*i]= z1 + z2;
-        block[2 + block_stride*i]= z1 - z2;
-        block[3 + block_stride*i]= z0 - z3;
-    }
-
-    for(i=0; i<4; i++){
-        const int z0=  block[i + block_stride*0]     +  block[i + block_stride*2];
-        const int z1=  block[i + block_stride*0]     -  block[i + block_stride*2];
-        const int z2= (block[i + block_stride*1]>>1) -  block[i + block_stride*3];
-        const int z3=  block[i + block_stride*1]     + (block[i + block_stride*3]>>1);
-
-        dst[i + 0*stride]= cm[ add*dst[i + 0*stride] + ((z0 + z3) >> shift) ];
-        dst[i + 1*stride]= cm[ add*dst[i + 1*stride] + ((z1 + z2) >> shift) ];
-        dst[i + 2*stride]= cm[ add*dst[i + 2*stride] + ((z1 - z2) >> shift) ];
-        dst[i + 3*stride]= cm[ add*dst[i + 3*stride] + ((z0 - z3) >> shift) ];
-    }
-}
-
-void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride){
-    idct_internal(dst, block, stride, 4, 6, 1);
-}
-
-void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block){
-    idct_internal(dst, block, stride, 8, 3, 1);
-}
-
-void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block){
-    idct_internal(dst, block, stride, 8, 3, 0);
-}
-
-void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    block[0] += 32;
-
-    for( i = 0; i < 8; i++ )
-    {
-        const int a0 =  block[0+i*8] + block[4+i*8];
-        const int a2 =  block[0+i*8] - block[4+i*8];
-        const int a4 = (block[2+i*8]>>1) - block[6+i*8];
-        const int a6 = (block[6+i*8]>>1) + block[2+i*8];
-
-        const int b0 = a0 + a6;
-        const int b2 = a2 + a4;
-        const int b4 = a2 - a4;
-        const int b6 = a0 - a6;
-
-        const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1);
-        const int a3 =  block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
-        const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
-        const int a7 =  block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
-
-        const int b1 = (a7>>2) + a1;
-        const int b3 =  a3 + (a5>>2);
-        const int b5 = (a3>>2) - a5;
-        const int b7 =  a7 - (a1>>2);
-
-        block[0+i*8] = b0 + b7;
-        block[7+i*8] = b0 - b7;
-        block[1+i*8] = b2 + b5;
-        block[6+i*8] = b2 - b5;
-        block[2+i*8] = b4 + b3;
-        block[5+i*8] = b4 - b3;
-        block[3+i*8] = b6 + b1;
-        block[4+i*8] = b6 - b1;
-    }
-    for( i = 0; i < 8; i++ )
-    {
-        const int a0 =  block[i+0*8] + block[i+4*8];
-        const int a2 =  block[i+0*8] - block[i+4*8];
-        const int a4 = (block[i+2*8]>>1) - block[i+6*8];
-        const int a6 = (block[i+6*8]>>1) + block[i+2*8];
-
-        const int b0 = a0 + a6;
-        const int b2 = a2 + a4;
-        const int b4 = a2 - a4;
-        const int b6 = a0 - a6;
-
-        const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
-        const int a3 =  block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
-        const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
-        const int a7 =  block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
-
-        const int b1 = (a7>>2) + a1;
-        const int b3 =  a3 + (a5>>2);
-        const int b5 = (a3>>2) - a5;
-        const int b7 =  a7 - (a1>>2);
-
-        dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ];
-        dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ];
-        dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ];
-        dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ];
-        dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ];
-        dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ];
-        dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ];
-        dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
-    }
-}
-
-// assumes all AC coefs are 0
-void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
-    int i, j;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-    int dc = (block[0] + 32) >> 6;
-    for( j = 0; j < 4; j++ )
-    {
-        for( i = 0; i < 4; i++ )
-            dst[i] = cm[ dst[i] + dc ];
-        dst += stride;
-    }
-}
-
-void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
-    int i, j;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-    int dc = (block[0] + 32) >> 6;
-    for( j = 0; j < 8; j++ )
-    {
-        for( i = 0; i < 8; i++ )
-            dst[i] = cm[ dst[i] + dc ];
-        dst += stride;
-    }
-}
-
-void ff_h264_idct_add16_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i++){
-        int nnz = nnzc[ scan8[i] ];
-        if(nnz){
-            if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride);
-            else                      idct_internal        (dst + block_offset[i], block + i*16, stride, 4, 6, 1);
-        }
-    }
-}
-
-void ff_h264_idct_add16intra_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i++){
-        if(nnzc[ scan8[i] ]) idct_internal        (dst + block_offset[i], block + i*16, stride, 4, 6, 1);
-        else if(block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride);
-    }
-}
-
-void ff_h264_idct8_add4_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i+=4){
-        int nnz = nnzc[ scan8[i] ];
-        if(nnz){
-            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_c(dst + block_offset[i], block + i*16, stride);
-            else                      ff_h264_idct8_add_c   (dst + block_offset[i], block + i*16, stride);
-        }
-    }
-}
-
-void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=16; i<16+8; i++){
-        if(nnzc[ scan8[i] ])
-            ff_h264_idct_add_c   (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
-        else if(block[i*16])
-            ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
-    }
-}
-
-/**
-* IDCT transforms the 16 dc values and dequantizes them.
-* @param qp quantization parameter
-*/
-void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul){
-	#define stride 16
-	int i;
-	int temp[16]; //FIXME check if this is a good idea
-	static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
-	static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
-
-	//return;
-	for(i=0; i<4; i++){
-		const int offset= y_offset[i];
-		const int z0= block[offset+stride*0] + block[offset+stride*4];
-		const int z1= block[offset+stride*0] - block[offset+stride*4];
-		const int z2= block[offset+stride*1] - block[offset+stride*5];
-		const int z3= block[offset+stride*1] + block[offset+stride*5];
-
-		temp[4*i+0]= z0+z3;
-		temp[4*i+1]= z1+z2;
-		temp[4*i+2]= z1-z2;
-		temp[4*i+3]= z0-z3;
-	}
-
-	for(i=0; i<4; i++){
-		const int offset= x_offset[i];
-		const int z0= temp[4*0+i] + temp[4*2+i];
-		const int z1= temp[4*0+i] - temp[4*2+i];
-		const int z2= temp[4*1+i] - temp[4*3+i];
-		const int z3= temp[4*1+i] + temp[4*3+i];
-
-		block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
-		block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
-		block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
-		block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
-	}
-}
-
-#undef xStride
-#undef stride
-
-void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul){
-	const int stride= 16*2;
-	const int xStride= 16;
-	int a,b,c,d,e;
-
-	a= block[stride*0 + xStride*0];
-	b= block[stride*0 + xStride*1];
-	c= block[stride*1 + xStride*0];
-	d= block[stride*1 + xStride*1];
-
-	e= a-b;
-	a= a+b;
-	b= c-d;
-	c= c+d;
-
-	block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
-	block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
-	block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
-	block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_idct.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_idct.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-#ifndef H264_IDCT_H
-#define H264_IDCT_H
-
-#include "avcodec.h"
-
-void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
-void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
-void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
-void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
-void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
-void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
-void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul);
-void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_mc.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_mc.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,272 +0,0 @@
-#include "h264_types.h"
-#include "h264_data.h"
-
-static inline void mc_dir_part(MBRecContext *d, MBRecState *mrs, H264Mb *m, DecodedPicture *pic, int n, int square,
-							   int chroma_height, int delta, int list,uint8_t *dest_y,
-							   uint8_t *dest_cb, uint8_t *dest_cr, int src_x_offset, int src_y_offset,
-							   qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
-	const int mx= mrs->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
-	const int my= mrs->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
-	const int luma_xy= (mx&3) + ((my&3)<<2);
-	const int pic_width  = 16*d->mb_width;
-	const int pic_height = 16*d->mb_height;
-
-	uint8_t *src_y, *src_cb, *src_cr;
-	int ymx= mx>>2;
-	int ymy= my>>2;
-	int cmy= my>>3;
-	int cmx= mx>>3;
-
-	//truncate the motion vectors references
-	if(ymy>= pic_height+2){
-		ymy=pic_height+1;
-	}else if(ymy <=-19){
-		ymy=-18;
-	}
-	if(ymx>= pic_width+2){
-		ymx= pic_width+1;
-	}else if(ymx<=-19){
-		ymx=-19;
-	}
-
-	src_y = pic->data[0] + ymx + ymy*d->linesize;
-	qpix_op[luma_xy](dest_y, src_y, d->linesize); //FIXME try variable height perhaps?
-	if(!square){
-		qpix_op[luma_xy](dest_y + delta, src_y + delta, d->linesize);
-	}
-
-	if(cmy >= pic_height>>1){
-		cmy = (pic_height>>1) -1;
-	}else if(cmy<=-9){
-		cmy=-8;
-	}
-	if(cmx >= pic_width>>1){
-		cmx = (pic_width>>1) -1;
-	}else if(cmx<=-9){
-		cmx=-8;
-	}
-
-	src_cb= pic->data[1] + cmx + cmy*d->uvlinesize;
-	src_cr= pic->data[2] + cmx + cmy*d->uvlinesize;
-
-	chroma_op(dest_cb, src_cb, d->uvlinesize, chroma_height, mx&7, my&7);
-	chroma_op(dest_cr, src_cr, d->uvlinesize, chroma_height, mx&7, my&7);
-}
-
-static inline void mc_part_std(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta,
-								uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-								int x_offset, int y_offset,
-								qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
-								qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
-								int list0, int list1){
-	qpel_mc_func *qpix_op=  qpix_put;
-	h264_chroma_mc_func chroma_op= chroma_put;
-
-	dest_y  += 2*x_offset + 2*y_offset*d->  linesize;
-	dest_cb +=   x_offset +   y_offset*d->uvlinesize;
-	dest_cr +=   x_offset +   y_offset*d->uvlinesize;
-	x_offset += 8*m->mb_x;
-	y_offset += 8*m->mb_y;
-
-	if(list0){
-		DecodedPicture *ref= s->dp_ref_list[0][ mrs->ref_cache[0][ scan8[n] ] ];
-		mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, 0,
-					dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op);
-
-		qpix_op=  qpix_avg;
-		chroma_op= chroma_avg;
-	}
-
-	if(list1){
-		DecodedPicture *ref= s->dp_ref_list[1][ mrs->ref_cache[1][ scan8[n] ] ];
-		mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, 1,
-					dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op);
-	}
-}
-
-static inline void mc_part_weighted(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta,
-									uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-									int x_offset, int y_offset,
-									qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
-									h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
-									h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
-									int list0, int list1){
-	dest_y  += 2*x_offset + 2*y_offset*d->  linesize;
-	dest_cb +=   x_offset +   y_offset*d->uvlinesize;
-	dest_cr +=   x_offset +   y_offset*d->uvlinesize;
-	x_offset += 8*m->mb_x;
-	y_offset += 8*m->mb_y;
-
-	if(list0 && list1){
-		/* don't optimize for luma-only case, since B-frames usually
-		* use implicit weights => chroma too. */
-		uint8_t *tmp_y  = d->scratchpad_y  + 2*x_offset +16 ;
-		uint8_t *tmp_cb = d->scratchpad_cb + x_offset + 8;
-		uint8_t *tmp_cr = d->scratchpad_cr + x_offset + 8;
-
-/*
-		uint8_t *tmp_cb = d->scratchpad;
-		uint8_t *tmp_cr = d->scratchpad + 8;
-		uint8_t *tmp_y  = d->scratchpad + 8*d->uvlinesize;*/
-		int refn0 = mrs->ref_cache[0][ scan8[n] ];
-		int refn1 = mrs->ref_cache[1][ scan8[n] ];
-
-		mc_dir_part(d, mrs, m, s->dp_ref_list[0][refn0], n, square, chroma_height, delta, 0,
-					dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put);
-		mc_dir_part(d, mrs, m, s->dp_ref_list[1][refn1], n, square, chroma_height, delta, 1,
-					tmp_y, tmp_cb, tmp_cr, x_offset, y_offset, qpix_put, chroma_put);
-
-		if(s->use_weight == 2){
-			int weight0 = s->implicit_weight[refn0][refn1][m->mb_y&1];
-			int weight1 = 64 - weight0;
-			luma_weight_avg(  dest_y,  tmp_y,  d->  linesize, 5, weight0, weight1, 0);
-			chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, 5, weight0, weight1, 0);
-			chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, 5, weight0, weight1, 0);
-		}else{
-			luma_weight_avg(dest_y, tmp_y, d->linesize, s->luma_log2_weight_denom,
-							s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0],
-							s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]);
-			chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, s->chroma_log2_weight_denom,
-							s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0],
-							s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]);
-			chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, s->chroma_log2_weight_denom,
-							s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0],
-							s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]);
-		}
-	}else{
-		int list = list1 ? 1 : 0;
-		int refn = mrs->ref_cache[list][ scan8[n] ];
-		DecodedPicture *ref= s->dp_ref_list[list][refn];
-		mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, list,
-					dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put);
-
-		luma_weight_op(dest_y, d->linesize, s->luma_log2_weight_denom,
-						s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]);
-		if(s->use_weight_chroma){
-			chroma_weight_op(dest_cb, d->uvlinesize, s->chroma_log2_weight_denom,
-							s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]);
-			chroma_weight_op(dest_cr, d->uvlinesize, s->chroma_log2_weight_denom,
-							s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]);
-		}
-	}
-}
-
-static inline void mc_part(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta,
-							uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-							int x_offset, int y_offset,
-							qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
-							qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
-							h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-							int list0, int list1){
-	if((s->use_weight==2 && list0 && list1
-		&& (s->implicit_weight[ mrs->ref_cache[0][scan8[n]] ][ mrs->ref_cache[1][scan8[n]] ][m->mb_y&1] != 32))
-		|| s->use_weight==1)
-		mc_part_weighted(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
-						x_offset, y_offset, qpix_put, chroma_put,
-						weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
-	else
-		mc_part_std(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
-					x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
-}
-
-static inline void prefetch_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int list){
-	/* fetch pixels for estimated mv 4 macroblocks ahead
-	* optimized for 64byte cache lines */
-	const int refn = mrs->ref_cache[list][scan8[0]];
-
-	if(refn >= 0){
-		const int mx= (mrs->mv_cache[list][scan8[0]][0]>>2) + 16*m->mb_x + 8;
-		const int my= (mrs->mv_cache[list][scan8[0]][1]>>2) + 16*m->mb_y;
-		uint8_t **src= s->dp_ref_list[list][refn]->data;
-		int off= mx + (my + (m->mb_x&3)*4)*d->linesize + 64;
-
-		d->dsp.prefetch(src[0]+off, d->linesize, 4);
-		off= (mx>>1) + ((my>>1) + (m->mb_x&7))*d->uvlinesize + 64;
-		d->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
-	}
-}
-
-void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-					qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
-					qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
-					h264_weight_func *weight_op, h264_biweight_func *weight_avg){
-	const int mb_type= m->mb_type;
-	assert(IS_INTER(mb_type));
-
-	if (mb_type & MB_TYPE_L0)
-		prefetch_motion(d, mrs, s, m, 0);
-	if (mb_type & MB_TYPE_L1)
-		prefetch_motion(d, mrs, s, m, 1);
-
-	if(IS_16X16(mb_type)){
-		mc_part(d, mrs, s, m, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
-				qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
-				weight_op, weight_avg,
-				IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
-	}else if(IS_16X8(mb_type)){
-		mc_part(d, mrs, s, m, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
-				qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-				&weight_op[1], &weight_avg[1],
-				IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
-		mc_part(d, mrs, s, m, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
-				qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-				&weight_op[1], &weight_avg[1],
-				IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
-	}else if(IS_8X16(mb_type)){
-		mc_part(d, mrs, s, m, 0, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 0, 0,
-				qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-				&weight_op[2], &weight_avg[2],
-				IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
-		mc_part(d, mrs, s, m, 4, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 4, 0,
-				qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-				&weight_op[2], &weight_avg[2],
-				IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
-	}else{
-		int i;
-
-		assert(IS_8X8(mb_type));
-
-		for(i=0; i<4; i++){
-			const int sub_mb_type= m->sub_mb_type[i];
-			const int n= 4*i;
-			int x_offset= (i&1)<<2;
-			int y_offset= (i&2)<<1;
-
-			if(IS_SUB_8X8(sub_mb_type)){
-				mc_part(d, mrs, s, m, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
-						qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-						&weight_op[3], &weight_avg[3],
-						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-			}else if(IS_SUB_8X4(sub_mb_type)){
-				mc_part(d, mrs, s, m, n, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
-						qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-						&weight_op[4], &weight_avg[4],
-						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-				mc_part(d, mrs, s, m, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
-						qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-						&weight_op[4], &weight_avg[4],
-						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-			}else if(IS_SUB_4X8(sub_mb_type)){
-				mc_part(d, mrs, s, m, n, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
-						qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-						&weight_op[5], &weight_avg[5],
-						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-				mc_part(d, mrs, s, m, n+1, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
-						qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-						&weight_op[5], &weight_avg[5],
-						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-			}else{
-				int j;
-				assert(IS_SUB_4X4(sub_mb_type));
-				for(j=0; j<4; j++){
-					int sub_x_offset= x_offset + 2*(j&1);
-					int sub_y_offset= y_offset +   (j&2);
-					mc_part(d, mrs, s, m, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
-							qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-							&weight_op[6], &weight_avg[6],
-							IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-				}
-			}
-		}
-	}
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_mc.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_mc.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-#ifndef H264_MC_H
-#define H264_MC_H
-
-#include "dsputil.h"
-#include "h264_types.h"
-
-void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-					qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
-					qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
-					h264_weight_func *weight_op, h264_biweight_func *weight_avg);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_misc.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_misc.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,944 +0,0 @@
-#include "config.h"
-
-#include "h264_types.h"
-
-#include <unistd.h>
-#include <sys/resource.h>
-#include <sys/time.h>
-#include <time.h>
-#include <pthread.h>
-#undef NDEBUG
-#include <assert.h>
-
-#if HAVE_LIBSDL2
-#include <SDL2/SDL.h>
-#if HAVE_LIBSDL_TTF
-#include <SDL/SDL_ttf.h>
-#endif
-#endif
-
-void start_timer(H264Context *h, int stage){
-    clock_gettime(CLOCK_REALTIME, &h->start_time[stage]);
-}
-
-void stop_timer(H264Context *h, int stage){
-    clock_gettime(CLOCK_REALTIME, &h->end_time[stage]);
-    double time = (double) 1.e3*(h->end_time[stage].tv_sec - h->start_time[stage].tv_sec) + 1.e-6*(h->end_time[stage].tv_nsec - h->start_time[stage].tv_nsec);
-    h->last_time [stage]  = time;
-    h->total_time[stage] += time;
-}
-
-void init_sb_entry(H264Context *h, SliceBufferEntry *sbe){
-    sbe->mbs = av_malloc(h->mb_width*h->mb_height* sizeof(H264Mb));
-    sbe->initialized = 1;
-}
-
-void free_sb_entry(SliceBufferEntry *sbe){
-    av_free(sbe->mbs);
-    av_freep(&sbe->gb.raw);
-    if (sbe->gb.rbsp)
-        av_freep(&sbe->gb.rbsp);
-    sbe->initialized = 0;
-}
-
-SliceBufferEntry *get_sb_entry(H264Context *h){
-    SliceBufferEntry *sb = NULL;
-
-    pthread_mutex_lock(&h->lock[PARSE]);
-    while (h->free_sb_cnt<=0)
-        pthread_cond_wait(&h->cond[PARSE], &h->lock[PARSE]);
-    /* use first free picture */
-    for(int i=0; i<h->sb_size; i++){
-        if(h->sb[i].state==0){
-            sb= &h->sb[i];
-            sb->state=1;
-            sb->lines_taken=0;
-            sb->lines_total=h->mb_height;
-            break;
-        }
-    }
-    h->free_sb_cnt--;
-
-    pthread_mutex_unlock(&h->lock[PARSE]);
-
-    memset (&sb->slice, 0, sizeof(H264Slice));
-
-    return sb;
-}
-
-void release_sb_entry(H264Context *h, SliceBufferEntry *sb){
-    pthread_mutex_lock(&h->lock[PARSE]);
-
-    sb->state = 0;
-    h->free_sb_cnt++;
-    pthread_cond_signal(&h->cond[PARSE]);
-
-    pthread_mutex_unlock(&h->lock[PARSE]);
-}
-
-int init_dpb_entry(H264Context *h, DecodedPicture *pic, H264Slice *s, int width, int height){
-    int i;
-
-    s->curr_pic=pic;
-    pic->poc = s->poc;
-    pic->key_frame = s->key_frame;
-    pic->mmco_reset = s->mmco_reset;
-    pic->reference = s->nal_ref_idc? 3:1;
-    pic->cpn = s->coded_pic_num;
-
-    if(pic->data[0]==NULL) {
-        int size[3] = {0};
-
-        width+= EDGE_WIDTH*2;
-        height+= EDGE_WIDTH*2;
-
-        pic->linesize[0]= width;
-        pic->linesize[1]=  pic->linesize[2] = width>>1;
-
-        size[0] = width*height;
-        size[1] = size[2] = width*height>>2;
-
-        for(i=0; i<3; i++){
-            pic->base[i]= av_malloc(size[i]);
-        }
-
-        pic->data[0] = pic->base[0] + (pic->linesize[0]*EDGE_WIDTH) + EDGE_WIDTH;
-        pic->data[1] = pic->base[1] + (pic->linesize[1]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1);
-        pic->data[2] = pic->base[2] + (pic->linesize[2]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1);
-    }
-
-    const int big_mb_num= h->mb_stride*(h->mb_height+1) + 1; //the +1 is needed so memset(,,stride*height) does not sig11
-    const int mb_array_size= h->mb_stride*h->mb_height;
-    const int b4_array_size= h->b4_stride*h->mb_height*4;
-
-    if(pic->mb_type_base==NULL){
-        FF_ALLOCZ_OR_GOTO(pic->mb_type_base , big_mb_num * sizeof(uint32_t), fail)
-        pic->mb_type= pic->mb_type_base + h->mb_stride+1;
-
-        for(int i=0; i<2; i++){
-            FF_ALLOCZ_OR_GOTO(pic->motion_val_base[i], 2 * (b4_array_size+4)  * sizeof(int16_t), fail)
-            pic->motion_val[i]= pic->motion_val_base[i]+4;
-            FF_ALLOCZ_OR_GOTO(pic->ref_index[i], 4*mb_array_size * sizeof(uint8_t), fail)
-        }
-        FF_ALLOCZ_OR_GOTO(pic->intra4x4_pred_mode, h->mb_width*h->mb_height * 4* sizeof(int8_t), fail)
-    }
-
-    return 0;
-    fail:
-    return -1;
-}
-
-void free_dp(DecodedPicture *pic){
-    if(pic->base[0]){
-        for (int i=0; i<3; i++){
-            av_free(pic->base[i]);
-            pic->data[i]= NULL;
-        }
-    }
-    if (pic->mb_type_base){
-        av_free(pic->mb_type_base);
-        pic->mb_type= NULL;
-        for(int i=0; i<2; i++){
-            av_free(pic->motion_val_base[i]);
-            av_free(pic->ref_index[i]);
-        }
-        av_free(pic->intra4x4_pred_mode);
-    }
-}
-
-DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s){
-    DecodedPicture *dp = NULL;
-
-    pthread_mutex_lock(&h->lock[REORDER2]);
-    while (h->free_dpb_cnt<=0){
-    #if OMPSS
-        assert(0);
-    #endif
-        pthread_cond_wait(&h->cond[REORDER2], &h->lock[REORDER2]);
-    }
-    /* use first free picture */
-    for(int i=0; i<h->max_dpb_cnt; i++){
-        if(h->dpb[i].reference==0){
-            dp= &h->dpb[i];
-            break;
-        }
-    }
-    assert(dp);
-    init_dpb_entry(h, dp, s, h->width, h->height);
-    h->free_dpb_cnt--;
-    h->acdpb_cnt++; //debug
-    pthread_mutex_unlock(&h->lock[REORDER2]);
-
-    return dp;
-}
-
-void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode){
-    pthread_mutex_lock(&h->lock[REORDER2]);
-    pic->reference &= ~mode;
-    if (pic->reference == 0){
-        h->free_dpb_cnt++;
-        h->reldpb_cnt++; //debug
-        pthread_cond_signal(&h->cond[REORDER2]);
-    }
-    pthread_mutex_unlock(&h->lock[REORDER2]);
-}
-
-
-/**
-*   Extends the edges of a macroblock line.
-*/
-void draw_edges(MBRecContext *d, H264Slice *s, int line){
-    int i;
-    int mb_width=d->mb_width;
-    int mb_height=d->mb_height;
-    int last = (line+1 == mb_height);
-    int lines = last?16:12;
-    int linesize = d->linesize;
-    int uvlinesize = d->uvlinesize;
-    uint8_t *y = s->curr_pic->data[0] + 16*line*linesize;
-    uint8_t *cb = s->curr_pic->data[1] + 8*line*uvlinesize;
-    uint8_t *cr = s->curr_pic->data[2] + 8*line*uvlinesize;
-
-    for (i=-4; i<lines; i++){
-        memset(y + i*linesize - EDGE_WIDTH, y[i*linesize], EDGE_WIDTH);
-        memset(y + i*linesize + mb_width*16, y[i*linesize +mb_width*16 -1], EDGE_WIDTH);
-    }
-    for (i=-2; i<lines/2; i++){
-        memset(cb + i*uvlinesize - EDGE_WIDTH/2, cb[i*uvlinesize], EDGE_WIDTH/2);
-        memset(cb + i*uvlinesize + mb_width*8, cb[i*uvlinesize +mb_width*8 -1], EDGE_WIDTH/2);
-        memset(cr + i*uvlinesize - EDGE_WIDTH/2, cr[i*uvlinesize], EDGE_WIDTH/2);
-        memset(cr + i*uvlinesize + mb_width*8, cr[i*uvlinesize +mb_width*8 -1], EDGE_WIDTH/2);
-    }
-
-    if (line==0){
-        y -= EDGE_WIDTH;
-        cb -= EDGE_WIDTH/2;
-        cr -= EDGE_WIDTH/2;
-        for (i=1; i<=21; i++){
-            memcpy(y -i*linesize, y, linesize);
-        }
-        for (i=1; i<=9; i++){
-            memcpy(cb -i*uvlinesize, cb, uvlinesize);
-            memcpy(cr -i*uvlinesize, cr, uvlinesize);
-        }
-    }else if (last){
-        y += -EDGE_WIDTH + 15*linesize;
-        cb += -EDGE_WIDTH/2 + 7*uvlinesize;
-        cr += -EDGE_WIDTH/2 + 7*uvlinesize;
-        for (i=1; i<=21; i++){
-            memcpy(y +i*linesize, y, linesize);
-        }
-        for (i=1; i<=9; i++){
-            memcpy(cb +i*uvlinesize, cb, uvlinesize);
-            memcpy(cr +i*uvlinesize, cr, uvlinesize);
-        }
-    }
-}
-
-static int64_t timer_start;
-int64_t av_gettime(void) {
-    struct timeval tv;
-    gettimeofday(&tv,NULL);
-    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
-}
-
-void av_start_timer(){
-    timer_start = av_gettime();
-}
-
-void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose) {
-    static int64_t last_time = -1;
-    static int64_t last_frame_number = 0;
-    float t=0, t2=0;
-    int64_t cur_time=0;
-    
-    if (!is_last_report) {    
-        /* display the report every 0.5 seconds */
-        cur_time = av_gettime();
-        if (last_time == -1) {
-            last_time = cur_time;
-            return;
-        }
-        if ((cur_time - last_time) < 500000)
-            return;
-        t = (cur_time-timer_start) / 1000000.0;
-        t2 = (cur_time-last_time) / 1000000.0;        
-    }
-
-    if (verbose){
-        fprintf(stderr, "frame=%5d avgfps=%3d curfps=%3d\r", frame_number, (int)(frame_number/t+0.5), (int)((frame_number - last_frame_number)/t2+0.5) );
-        fflush(stderr);
-    }
-    last_frame_number = frame_number;
-    last_time = cur_time;
-
-    if (is_last_report){
-        t = (av_gettime()-timer_start) / 1000000.0;
-        fprintf(stderr, "%c[2Kframe=%5d avgfps=%3d\r", 27, frame_number, (int)(frame_number/t+0.5));
-        fprintf(stderr, "\n");
-        fprintf(stderr, "video:%1.0fkB\n", video_size/1024.0);
-        fflush(stderr);
-    }
-}
-
-/* Sort B-frames into display order */
-static DecodedPicture *get_reordered_picture(OutputContext *w, int flush){
-    int i;
-    int out_idx = 0;
-    DecodedPicture *out = w->delayed_pic[0];
-
-    if (!out)
-        return NULL;
-
-    for(i=1; w->delayed_pic[i] && !w->delayed_pic[i]->key_frame && !w->delayed_pic[i]->mmco_reset; i++){
-        if(w->delayed_pic[i]->poc < out->poc){
-            out = w->delayed_pic[i];
-            out_idx = i;
-        }
-    }
-
-    if(w->dp_cnt > MAX_DELAYED_PIC_COUNT || flush) {
-        for(i=out_idx; w->delayed_pic[i]; i++)
-            w->delayed_pic[i] = w->delayed_pic[i+1];
-        w->dp_cnt--;
-        return out;
-    }
-    return NULL;
-}
-
-/**
-*  Remove the extra borders, and places the three parts of the image after each other.
-*/
-static int raw_encode(const DecodedPicture* src, int width, int height, unsigned char *dest) {
-    int i, j;
-/** To write entire image including extra borders*/
-//  int w = src->linesize[0];
-//  int h = height+64;
-//  int w2 = w>>1;
-//  int h2 = h>>1;
-//     int data_planes=3;
-//     int size = w * h + 2 *w2*h2;
-//     const unsigned char* s;
-//     for (i=0; i<data_planes; i++) {
-//         if (i == 1) {
-//             w = w2;
-//             h = h2;
-//         }
-//         s = src->base[i];
-//         for(j=0; j<h; j++) {
-//             memcpy(dest, s, src->linesize[i]);
-//             dest += w;
-//             s += src->linesize[i];
-//         }
-//     }
-
-    int w = (width*8 + 7)/8;
-    int h = height;
-    int w2 =((width >>1) * 8 + 7) / 8;
-    int h2 = ((height+1) >>1); //not sure about +1
-    int data_planes=3;
-    int size = w * h + 2 *w2*h2;
-    const unsigned char* s;
-
-
-    for (i=0; i<data_planes; i++) {
-        if (i == 1) {
-            w = w2;
-            h = h2;
-        }
-        s = src->data[i];
-        for(j=0; j<h; j++) {
-            memcpy(dest, s, w);
-            dest += w;
-            s += src->linesize[i];
-        }
-    }
-    return size;
-}
-
-#ifdef HAVE_LIBSDL2
-static SDL_Texture *get_next_texture(H264Context *h, int side){
-    SDLTextureQueue *sdlq = &h->sdlq;
-    SDL_Texture *texture;
-    pthread_mutex_lock (&sdlq->sdl_lock);
-    if (side ){ //send
-        while (sdlq->ready >= sdlq->size)
-            pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock);
-        texture = sdlq->queue[sdlq->fi];
-        sdlq->fi++; sdlq->fi %= sdlq->size;
-    } else { //recv
-        while (sdlq->ready <= 0 && !sdlq->exit)
-            pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock);
-
-        if (sdlq->ready == 0 && sdlq->exit){
-            texture = NULL;
-        }else{
-            texture = sdlq->queue[sdlq->fo];
-            sdlq->fo++; sdlq->fo %= sdlq->size;
-        }
-    }
-    pthread_mutex_unlock(&sdlq->sdl_lock);
-
-    return texture;
-}
-
-static void signal_texture(H264Context *h, int side){
-    SDLTextureQueue *sdlq = &h->sdlq;
-    pthread_mutex_lock (&sdlq->sdl_lock);
-    if (side)
-        sdlq->ready++;
-    else
-        sdlq->ready--;
-    pthread_cond_signal(&sdlq->sdl_cond);
-    pthread_mutex_unlock(&sdlq->sdl_lock);
-}
-
-void signal_sdl_exit(H264Context *h){
-    SDLTextureQueue *sdlq = &h->sdlq;
-    pthread_mutex_lock (&sdlq->sdl_lock);
-    sdlq->exit=1;
-    pthread_cond_signal(&sdlq->sdl_cond);
-    pthread_mutex_unlock(&sdlq->sdl_lock);
-}
-
-static void display_frame(H264Context *h, OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height, int dropable){
-    static int64_t last_time = -1;
-    int64_t cur_time;
-//     SDLContext *sdlc = h->sdlc;
-    uint8_t *iyuv_pixels;
-    int pitch;
-
-
-    if (last_time == -1){
-        last_time = av_gettime();
-    }
-
-    
-    /* do not display frames that are less than 8.125 ms apart (120fps)*/
-    if (dropable){
-        cur_time = av_gettime();
-
-        if ((cur_time - last_time) < 8125)
-            return;
-
-        last_time =cur_time;
-    }
-
-    if(in_picture){
-        
-        SDL_Texture *texture= get_next_texture(h, 1);
-
-        SDL_LockTexture( texture, NULL, (void **)&iyuv_pixels, &pitch );
-
-        raw_encode(in_picture, frame_width, frame_height, iyuv_pixels);
-
-        signal_texture(h, 1);
-    }
-}
-#endif
-
-// TODO: Parallelize the raw_encode (either split frame or over frames)
-static void do_video_out(OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height) {
-    int size=0;
-    //remove extra borders
-
-    if(in_picture)
-        size= raw_encode(in_picture, frame_width, frame_height, w->bit_buffer);
-
-    if (size < 0) {
-        fprintf(stderr, "Video encoding failed\n");
-    }else {
-        if (write(fd, w->bit_buffer, size)<0)
-            fprintf(stderr, "Write frame failed\n");
-    }
-
-    w->video_size += size;
-}
-
-DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height) {
-    DecodedPicture *out;
-
-    if (pic){
-        oc->delayed_pic[oc->dp_cnt++]=pic;
-        out = get_reordered_picture(oc, 0);
-    }else{
-        out = get_reordered_picture(oc, 1);
-    }
-
-    if (out){
-        if (fd){
-            do_video_out(oc, fd, out, frame_width, frame_height);
-        }else{
-#ifdef HAVE_LIBSDL2
-            if (h->display){
-                display_frame(h, oc, fd, out, frame_width, frame_height, !(pic==NULL));
-            }
-#endif
-        }
-        oc->frame_number++;
-    }
-
-    return out;
-}
-
-OutputContext *get_output_context(H264Context *h){
-    const int frame_width=h->frame_width;
-    const int frame_height=h->frame_height;
-    const int frame_size = frame_width*frame_height;
-
-    OutputContext *oc = av_mallocz(sizeof(OutputContext));
-    oc->bit_buffer_size= FFMAX(1024*256, frame_size*2); // oversize a little bit to allow extra border write
-    oc->bit_buffer=  av_mallocz(oc->bit_buffer_size);
-
-    return oc;
-}
-
-void free_output_context(OutputContext *oc){
-
-    av_free(oc->bit_buffer);
-    av_free(oc);
-}
-
-SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height){
-    SuperMBContext *smbc = av_mallocz(sizeof(SuperMBContext));
-
-    smbc->smb_width = smb_width;
-    smbc->smb_height = smb_height;
-
-    smbc->nsmb_height = h->mb_height / smbc->smb_height +  (h->mb_height%smbc->smb_height ? 1:0);    //only need one extra if mb_height was not dividable
-    smbc->nsmb_width  = h->mb_width / smbc->smb_width;
-    while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width )
-        smbc->nsmb_width++;
-
-    smbc->nsmb_3dheight= smbc->nsmb_height - ((h->mb_height/2)/smbc->smb_height +1); //assuming max motion vector of half the height
-
-    smbc->smbs[0] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask));
-    smbc->smbs[1] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask));
-    for (int y=0, i=0; i<smbc->nsmb_height; i++, y+=smbc->smb_height){
-        for (int x=0, j=0; j<smbc->nsmb_width; j++, x+=smbc->smb_width){
-            smbc->smbs[0][i*smbc->nsmb_width +j].smb_y = y;
-            smbc->smbs[0][i*smbc->nsmb_width +j].smb_x = x;
-            smbc->smbs[1][i*smbc->nsmb_width +j].smb_y = y;
-            smbc->smbs[1][i*smbc->nsmb_width +j].smb_x = x;
-        }
-    }
-
-    smbc->refcount = 1;
-
-    return smbc;
-}
-
-void freeSuperMBContext(SuperMBContext *smbc){
-    av_free(smbc->smbs[0]);
-    av_free(smbc->smbs[1]);
-    av_free(smbc);
-}
-
-SuperMBContext * acquire_smbc(H264Context *h ){
-    SuperMBContext *smbc;
-
-    pthread_mutex_lock (&h->smb_lock);
-    smbc = h->smbc;
-    smbc->refcount++;
-    pthread_mutex_unlock(&h->smb_lock);
-    return smbc;
-}
-
-void release_smbc(H264Context *h, SuperMBContext *smbc){
-    pthread_mutex_lock (&h->smb_lock);
-    smbc->refcount--;
-    if (smbc->refcount==0){
-        freeSuperMBContext(smbc);
-    }
-    pthread_mutex_unlock(&h->smb_lock);
-
-}
-
-
-#ifdef HAVE_LIBSDL2
-
-// #if OMPSS
-static void draw_sb_border(H264Context *h, uint32_t *rgba_pixels, int smb_x, int smb_y){
-    int mb_width = h->mb_width;
-    int mb_height = h->mb_height;
-    int width = h->frame_width;
-    int height = h->frame_height;
-
-    int mb_x = smb_x * h->smb_width;
-    int mb_y = smb_y * h->smb_height;
-
-    uint32_t pix= 0x0000FFC0;
-
-    for (int k=0, i=mb_y; i< mb_y + h->smb_height; i++, k++){
-        for (int l=0, j=mb_x -k ; j< mb_x - k + h->smb_width; j++, l++){
-            //outside frame
-            if (i<0 || i>=mb_height || j<0 || j>=mb_width) {
-                continue;
-            }
-
-            //draw top
-            if (i==0 || k==0 || l==0){
-                int mx = j*16;
-                int my = i*16;
-                uint32_t *top = rgba_pixels + my*width + mx;
-                int endx = mx+16 < width? 16: width-mx;
-
-                for (int x = 0; x<endx; x++){
-                    top[x] = pix;
-                }
-            }
-
-            //draw bottom
-            if (i==mb_height-1 || k==h->smb_height-1 || l==h->smb_width-1){
-                int mx = j*16;
-                int my = i*16 + 15; my = my < height ? my: height-1;
-                uint32_t *bottom = rgba_pixels + my*width + mx;
-                int endx = mx+16 < width? 16: width-mx;
-
-                for (int x = 0; x<endx; x++){
-                    bottom[x] = pix;
-                }
-            }
-
-            //draw left
-            if (j==0 || l==0 ){
-                int mx = j*16;
-                int my = i*16;
-                uint32_t *left = rgba_pixels + my*width + mx;
-                int endy = my +16 < height ? 16: height - my;
-
-                for (int y = 0; y<endy; y++){
-                    left[y*width] = pix;
-                }
-            }
-
-            //draw right
-            if (j==mb_width -1 || l==h->smb_width-1 ){
-                int mx = j*16 + 15; mx = mx < width ? mx: width-1;
-                int my = i*16;
-                uint32_t *right = rgba_pixels + my*width + mx;
-                int endy = my +16 < height ? 16: height - my;
-
-                for (int y = 0; y<endy; y++){
-                    right[y*width] = pix;
-                }
-            }
-        }
-    }
-}
-
-static void draw_sbmap (H264Context *h, SuperMBContext *smbc, SDLContext *sdlc){
-    int pitch;
-    uint32_t *rgba_pixels;
-    SDL_Texture *sbmap= sdlc->sbmap_texture;
-
-    SDL_LockTexture( sbmap, NULL, (void **)&rgba_pixels, &pitch );
-
-    memset (rgba_pixels, 0, pitch * h->height);
-    for (int i=0; i< smbc->nsmb_height; i++){
-        for (int j=0; j< smbc->nsmb_width; j++){
-            draw_sb_border(h, rgba_pixels, j, i);
-        }
-    }
-
-    SDL_UnlockTexture( sbmap );
-}
-// #endif
-
-// static void calc_sb_sizes (H264Context *h, SuperMBContext *smbc){
-//     smbc->smb_height = h->smb_height;
-//     smbc->smb_width = h->smb_width;
-//
-//     smbc->nsmb_height = h->mb_height / smbc->smb_height +  (h->mb_height%smbc->smb_height ? 1:0);    //only need one extra if mb_height was not dividable
-//     smbc->nsmb_width  = h->mb_width / smbc->smb_width;
-//     while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width )
-//         smbc->nsmb_width++;
-// }
-
-
-static void handle_key_event(H264Context *h, SDLContext *sdlc, SDL_Keysym keysym){
-    int arrow=0;
-
-    switch (keysym.sym){
-        case SDLK_ESCAPE:
-            if (sdlc->fullscreen){
-                SDL_SetWindowFullscreen(sdlc->window, SDL_FALSE);
-                sdlc->fullscreen = 0;
-            }
-            break;
-        case SDLK_SPACE:
-            pthread_mutex_lock(&h->sdl_lock);
-            sdlc->pause = !sdlc->pause;
-            pthread_cond_signal(&h->sdl_cond);
-            pthread_mutex_unlock(&h->sdl_lock);
-            break;
-        case SDLK_f:
-            if (!sdlc->fullscreen){
-                if (keysym.mod == KMOD_LCTRL){
-//                     SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full);
-                    SDL_SetWindowFullscreen(sdlc->window, SDL_TRUE);
-
-                    sdlc->fullscreen = 1;
-                }
-            }
-            break;
-        case SDLK_m:
-            sdlc->showmap = !sdlc->showmap;
-            break;
-        case SDLK_UP:
-            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height < h->mb_height && h->smb_height < h->smb_width){
-                h->smb_height++;
-                arrow =1;
-            }
-            break;
-        case SDLK_DOWN:
-            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height > 1 ){
-                h->smb_height--;
-                arrow =1;
-            }
-            break;
-        case SDLK_LEFT:
-            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width > 1 && h->smb_width > h->smb_height){
-                h->smb_width--;
-                arrow =1;
-            }
-            break;
-        case SDLK_RIGHT:
-            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width < h->mb_width){
-                h->smb_width++;
-                arrow =1;
-            }
-            break;
-    }
-
-    if (arrow){
-        SuperMBContext *smbc = getSuperMBContext(h, h->smb_width, h->smb_height);
-        pthread_mutex_lock(&h->smb_lock);
-        h->smbc->refcount--;
-        if (h->smbc->refcount == 0)
-            freeSuperMBContext(h->smbc);
-        h->smbc = smbc;
-        sdlc->updatemap =1;
-        pthread_mutex_unlock(&h->smb_lock);
-    }
-}
-
-void handle_window_event(H264Context *h, SDLContext *sdlc, SDL_WindowEvent winevent){
-    SDL_Rect nrect;
-    switch (winevent.event){
-        case SDL_WINDOWEVENT_RESIZED:
-
-            sdlc->win_w =  winevent.data1;
-            sdlc->win_h =  winevent.data2;
-
-            double aspect = (double) sdlc->win_w/ sdlc->win_h;
-            if ( aspect < sdlc->aspect){
-                double r = (double) sdlc->win_w / sdlc->rect.w;
-                double h = (double) sdlc->rect.h * r;
-
-                nrect.y = lrint(( (double) sdlc->win_h - h)/2);
-                nrect.h = lrint(h);
-
-                nrect.x=0;
-                nrect.w= sdlc->win_w;
-
-            }else {
-                double r = (double) sdlc->win_h / sdlc->rect.h;
-                double w = (double) sdlc->rect.w * r;
-
-                nrect.x = lrint(( (double) sdlc->win_w - w)/2);
-                nrect.w = lrint(w);
-
-                nrect.y=0;
-                nrect.h= sdlc->win_h;
-            }
-            //prob better to lock
-            sdlc->win_rect = nrect;
-            sdlc->resized=1;
-            break;
-    }
-}
-
-void *sdl_event_listen_thread(void *arg){
-    H264Context *h = (H264Context *) arg;
-    SDLContext *sdlc = h->sdlc;
-    SDL_Event event;
-
-    while ( SDL_WaitEvent(&event) ) {
-        switch (event.type) {
-            case SDL_KEYDOWN:
-                handle_key_event(h, sdlc, event.key.keysym);
-                break;
-            case SDL_WINDOWEVENT:
-                handle_window_event(h, sdlc, event.window);
-                break;
-            case SDL_QUIT:
-                h->quit=1;
-                goto finish;
-        }
-    }
-finish:
-    pthread_exit(NULL);
-    return NULL;
-}
-
-//XInitThreads not called in SDL2 library, causes crash
-//remove in future when fixed ...
-#include <X11/Xlib.h>
-
-SDLContext *get_SDL_context(H264Context *h){
-    const int frame_width=h->frame_width;
-    const int frame_height=h->frame_height;
-
-    SDLContext *sdlc = av_mallocz(sizeof(SDLContext));
-    sdlc->display = h->display;
-    sdlc->fullscreen = h->fullscreen;
-
-    sdlc->aspect = (double) frame_width / (double) frame_height;
-    sdlc->rect.x =0;
-    sdlc->rect.y =0;
-    sdlc->rect.w =frame_width;
-    sdlc->rect.h =frame_height;
-
-    XInitThreads(); //workaround
-
-    // Initializes the video subsystem
-    if (SDL_Init(SDL_INIT_VIDEO) < 0) {
-        fprintf(stderr, "Unable to init SDL: %s\n", SDL_GetError());
-        #undef exit
-        exit(-1);
-    }
-    SDL_SetHint("SDL_HINT_RENDER_SCALE_QUALITY", "best");
-    SDL_SetHint("SDL_HINT_RENDER_OPENGL_SHADERS", "1");
-
-    SDL_GetDesktopDisplayMode(0, &sdlc->full);
-    sdlc->full.format = SDL_PIXELFORMAT_IYUV;
-
-    sdlc->wind = sdlc->full;
-    if (sdlc->wind.w > frame_width) sdlc->wind.w = frame_width;
-    if (sdlc->wind.h > frame_height) sdlc->wind.h = frame_height;
-
-    sdlc->win_rect.x =0;
-    sdlc->win_rect.y =0;
-    sdlc->win_rect.w =sdlc->wind.w;
-    sdlc->win_rect.h =sdlc->wind.h;
-
-    if (sdlc->fullscreen){
-        sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED,  SDL_WINDOWPOS_UNDEFINED, sdlc->full.w, sdlc->full.h, SDL_WINDOW_FULLSCREEN|SDL_WINDOW_SHOWN|SDL_WINDOW_RESIZABLE);
-        SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full);
-    } else {
-        sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED,  SDL_WINDOWPOS_UNDEFINED, sdlc->wind.w, sdlc->wind.h, SDL_WINDOW_RESIZABLE|SDL_WINDOW_SHOWN);
-        SDL_SetWindowDisplayMode (sdlc->window, &sdlc->wind);
-    }
-
-    sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_ACCELERATED);
-//     sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_SOFTWARE);
-
-    h->sdlq.queue[0] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height);
-    h->sdlq.queue[1] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height);
-
-    sdlc->sbmap_texture = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height);
-    SDL_SetTextureBlendMode(sdlc->sbmap_texture, SDL_BLENDMODE_BLEND);
-    sdlc->updatemap = 1;
-
-#if HAVE_LIBSDL_TTF
-    //not working with SDL 2.0, try again in future when supported
-    if(TTF_Init()==-1) {
-        printf("TTF_Init: %s\n", TTF_GetError());
-        exit(2);
-    }
-
-    // Load a font
-    TTF_Font *font;
-    font = TTF_OpenFont("/usr/share/fonts/truetype/freefont/FreeSans.ttf", 24);
-    if (font == NULL)
-    {
-        printf("TTF_OpenFont() Failed: %s\n", TTF_GetError());
-        TTF_Quit();
-        exit(1);
-    }
-#endif
-    
-    pthread_create(&sdlc->listen_thread, NULL, sdl_event_listen_thread, h);
-
-    return sdlc;
-
-}
-
-void free_SDL_context(H264Context *h){
-    SDLContext *sdlc = h->sdlc;
-    pthread_join(sdlc->listen_thread, NULL);
-
-#if HAVE_LIBSDL_TTF
-    TTF_Quit();
-#endif
-    SDL_DestroyTexture(h->sdlq.queue[0]);
-    SDL_DestroyTexture(h->sdlq.queue[1]);
-    SDL_DestroyTexture(sdlc->sbmap_texture);
-    SDL_DestroyRenderer(sdlc->renderer);
-    SDL_DestroyWindow(sdlc->window);
-    SDL_Quit();
-
-}
-
-void *sdl_thread(void *arg){
-    H264Context *h = (H264Context *) arg;
-
-    SDLContext *sdlc = get_SDL_context(h);
-    h->sdlc = sdlc;
-
-    signal_texture(h, 0);
-    signal_texture(h, 0);
-
-    SDL_Texture *texture;
-    for (;;){
-        pthread_mutex_lock(&h->sdl_lock);
-        while (sdlc->pause){
-            pthread_cond_wait(&h->sdl_cond, &h->sdl_lock);
-        }
-        pthread_mutex_unlock(&h->sdl_lock);
-
-        texture = get_next_texture(h, 0);
-        if (texture == NULL)
-            break;
-        
-        SDL_UnlockTexture(texture);
-
-        //clear if resized
-        if (sdlc->resized){
-            // KDE bug prob, reset viewport change after resize from max
-            SDL_RenderSetViewport(sdlc->renderer, NULL);
-            SDL_SetRenderDrawColor(sdlc->renderer, 0, 0, 0, 255);
-            SDL_RenderClear(sdlc->renderer);
-            sdlc->resized = 0;
-        }
-
-        SDL_RenderCopy(sdlc->renderer, texture, &sdlc->rect, &sdlc->win_rect);
-
-        if (sdlc->showmap){
-            if (sdlc->updatemap){
-                SuperMBContext *smbc;
-                pthread_mutex_lock (&h->smb_lock);
-                smbc = h->smbc;
-                smbc->refcount++;
-                sdlc->updatemap=0;
-                pthread_mutex_unlock(&h->smb_lock);
-
-                draw_sbmap(h, smbc, sdlc);
-
-                release_smbc(h, smbc);
-            }
-            SDL_RenderCopy(sdlc->renderer, sdlc->sbmap_texture, &sdlc->rect, &sdlc->win_rect);
-        }
-
-        SDL_RenderPresent(sdlc->renderer);
-        signal_texture(h, 0);
-    }
-
-    free_SDL_context(h);
-
-    pthread_exit(NULL);
-    return NULL;
-}
-#endif
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_misc.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_misc.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-#ifndef H264_MISC_H
-#define H264_MISC_H
-
-#include "avcodec.h"
-#include "h264_types.h"
-
-void start_timer(H264Context *h, int stage);
-void stop_timer(H264Context *h, int stage);
-
-void init_sb_entry(H264Context *h, SliceBufferEntry *sbe);
-void free_sb_entry(SliceBufferEntry *sb);
-SliceBufferEntry *get_sb_entry(H264Context *h);
-void release_sb_entry(H264Context *h, SliceBufferEntry *sb);
-
-DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s);
-void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode);
-
-void draw_edges(MBRecContext *d, H264Slice *s, int line);
-
-int ff_init_slice(NalContext *n, H264Slice *s);
-void free_picture(PictureInfo *pic);
-void free_dp(DecodedPicture *pic);
-
-void av_start_timer();
-int copyEDtoH264Slice(H264Slice *ms, H264Slice *es);
-void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose);
-
-int ff_alloc_picture_info(NalContext *n, H264Slice *s, PictureInfo *pic);
-DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height);
-OutputContext *get_output_context(H264Context *h);
-void free_output_context(OutputContext *oc);
-
-void freeSuperMBContext(SuperMBContext *smbc);
-SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height);
-void release_smbc(H264Context *h, SuperMBContext *smbc);
-SuperMBContext * acquire_smbc(H264Context *h );
-
-#if HAVE_LIBSDL2
-void signal_sdl_exit(H264Context *h);
-void *sdl_thread(void *arg);
-SDLContext *get_SDL_context(H264Context *h);
-void free_SDL_context(SDLContext *sdlc);
-#endif
-
-/**
-* gets the chroma qp.
-*/
-static inline int get_chroma_qp(H264Slice *s, int t, int qscale){
-    return s->pps.chroma_qp_table[t][qscale];
-}
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_nal.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_nal.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,628 +0,0 @@
-#include "h264_types.h"
-#include "h264_data.h"
-
-#include "golomb.h"
-#include "h264_sei.h"
-#include "h264_refs.h"
-#include "h264_ps.h"
-#include "h264_pred_mode.h"
-#include "h264_misc.h"
-
-static int ff_h264_decode_rbsp_trailing(const uint8_t *src){
-    int v= *src;
-    int r;
-
-    for(r=1; r<9; r++){
-        if(v&1) return r;
-        v>>=1;
-    }
-    return 0;
-}
-
-static int pred_weight_table(H264Slice *s, GetBitContext *gb){
-    int luma_def, chroma_def;
-
-    s->use_weight= 0;
-    s->use_weight_chroma= 0;
-    s->luma_log2_weight_denom= get_ue_golomb(gb);
-    s->chroma_log2_weight_denom= get_ue_golomb(gb);
-    luma_def = 1<<s->luma_log2_weight_denom;
-    chroma_def = 1<<s->chroma_log2_weight_denom;
-
-    for(int list=0; list<2; list++){
-        for(int i=0; i<s->ref_count[list]; i++){
-            int luma_weight_flag, chroma_weight_flag;
-
-            luma_weight_flag= get_bits1(gb);
-            if(luma_weight_flag){
-                s->luma_weight[i][list][0]= get_se_golomb(gb);
-                s->luma_weight[i][list][1]= get_se_golomb(gb);
-                if(   s->luma_weight[i][list][0] != luma_def
-                    || s->luma_weight[i][list][1] != 0) {
-                    s->use_weight= 1;
-                }
-            }else{
-                s->luma_weight[i][list][0]= luma_def;
-                s->luma_weight[i][list][1]= 0;
-            }
-
-            chroma_weight_flag= get_bits1(gb);
-            if(chroma_weight_flag){
-                int j;
-                for(j=0; j<2; j++){
-                    s->chroma_weight[i][list][j][0]= get_se_golomb(gb);
-                    s->chroma_weight[i][list][j][1]= get_se_golomb(gb);
-                    if(   s->chroma_weight[i][list][j][0] != chroma_def
-                    || s->chroma_weight[i][list][j][1] != 0) {
-                        s->use_weight_chroma= 1;
-                    }
-                }
-            }else{
-                int j;
-                for(j=0; j<2; j++){
-                    s->chroma_weight[i][list][j][0]= chroma_def;
-                    s->chroma_weight[i][list][j][1]= 0;
-                }
-            }
-        }
-        if(s->slice_type_nos != FF_B_TYPE) break;
-    }
-    s->use_weight= s->use_weight || s->use_weight_chroma;
-    return 0;
-}
-
-/**
-* Initialize implicit_weight table.
-*/
-static void implicit_weight_table(H264Slice *s){
-    int ref0, ref1, cur_poc, ref_start, ref_count0, ref_count1;
-
-    cur_poc = s->poc;
-    if(   s->ref_count[0] == 1 && s->ref_count[1] == 1  && s->ref_list[0][0]->poc + s->ref_list[1][0]->poc == 2*cur_poc){
-        s->use_weight= 0;
-        s->use_weight_chroma= 0;
-        return;
-    }
-    ref_start= 0;
-    ref_count0= s->ref_count[0];
-    ref_count1= s->ref_count[1];
-
-    s->use_weight= 2;
-    s->use_weight_chroma= 2;
-    s->luma_log2_weight_denom= 5;
-    s->chroma_log2_weight_denom= 5;
-
-    for(ref0=ref_start; ref0 < ref_count0; ref0++){
-        int poc0 = s->ref_list[0][ref0]->poc;
-        for(ref1=ref_start; ref1 < ref_count1; ref1++){
-            int poc1 = s->ref_list[1][ref1]->poc;
-            int td = av_clip(poc1 - poc0, -128, 127);
-            int w= 32;
-            if(td){
-                int tb = av_clip(cur_poc - poc0, -128, 127);
-                int tx = (16384 + (FFABS(td) >> 1)) / td;
-                int dist_scale_factor = (tb*tx + 32) >> 8;
-                if(dist_scale_factor >= -64 && dist_scale_factor <= 128)
-                    w = 64 - dist_scale_factor;
-            }
-            s->implicit_weight[ref0][ref1][0]=
-            s->implicit_weight[ref0][ref1][1]= w;
-        }
-    }
-}
-
-/**
-* instantaneous decoder refresh.
-*/
-static void idr(NalContext *n, H264Slice *s){
-    ff_h264_remove_all_refs(n, s);
-    n->prev_frame_num= 0;
-    n->prev_frame_num_offset= 0;
-    n->poc_offset +=  (n->prev_poc_msb<<16) + n->prev_poc_lsb;
-    n->prev_poc_msb=
-    n->prev_poc_lsb= 0;
-}
-
-static int init_poc(NalContext *n, H264Slice *s, GetBitContext *gb){
-    const int max_frame_num= 1<<n->sps.log2_max_frame_num;
-    int frame_poc;
-
-    if(n->sps.poc_type==0){
-        n->poc_lsb= get_bits(gb, n->sps.log2_max_poc_lsb);
-    }
-
-    if(n->sps.poc_type==1 && !n->sps.delta_pic_order_always_zero_flag){
-        n->delta_poc= get_se_golomb(gb);
-    }
-
-    n->frame_num_offset= n->prev_frame_num_offset;
-    if(n->frame_num < n->prev_frame_num)
-        n->frame_num_offset += max_frame_num;
-
-    if(n->sps.poc_type==0){
-        const int max_poc_lsb= 1<<n->sps.log2_max_poc_lsb;
-
-        if(n->poc_lsb < n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb >= max_poc_lsb/2)
-            n->poc_msb = n->prev_poc_msb + max_poc_lsb;
-        else if(n->poc_lsb > n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb < -max_poc_lsb/2)
-            n->poc_msb = n->prev_poc_msb - max_poc_lsb;
-        else
-            n->poc_msb = n->prev_poc_msb;
-
-        frame_poc = n->poc_msb + n->poc_lsb;
-    }else if(n->sps.poc_type==1){
-        int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
-        int i;
-
-        if(n->sps.poc_cycle_length != 0)
-            abs_frame_num = n->frame_num_offset + n->frame_num;
-        else
-            abs_frame_num = 0;
-
-        if(s->nal_ref_idc==0 && abs_frame_num > 0)
-            abs_frame_num--;
-
-        expected_delta_per_poc_cycle = 0;
-        for(i=0; i < n->sps.poc_cycle_length; i++)
-            expected_delta_per_poc_cycle += n->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
-
-        if(abs_frame_num > 0){
-            int poc_cycle_cnt          = (abs_frame_num - 1) / n->sps.poc_cycle_length;
-            int frame_num_in_poc_cycle = (abs_frame_num - 1) % n->sps.poc_cycle_length;
-
-            expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
-            for(i = 0; i <= frame_num_in_poc_cycle; i++)
-                expectedpoc = expectedpoc + n->sps.offset_for_ref_frame[ i ];
-        } else
-            expectedpoc = 0;
-        if(s->nal_ref_idc == 0)
-            expectedpoc = expectedpoc + n->sps.offset_for_non_ref_pic;
-        frame_poc = expectedpoc + n->delta_poc;
-    }else{
-        int poc= 2*(n->frame_num_offset + n->frame_num);
-        if(!s->nal_ref_idc)
-            poc--;
-        frame_poc= poc;
-    }
-    s->current_picture_info->poc= s->poc = frame_poc + n->poc_offset;
-    s->coded_pic_num = n->coded_pic_num++;
-
-    return 0;
-}
-
-static void ref2frame(NalContext *n, H264Slice *s){
-    for(int j=0; j<s->list_count; j++){
-        int *ref2frm= s->ref2frm[j];
-
-        ref2frm[0]=
-        ref2frm[1]= -1;
-
-        for(int i=0; i<s->ref_count[j]; i++){
-            ref2frm[i+2]= 15;
-            if(s->ref_list[j][i]->cpn >=0){
-                int k;
-                for(k=0; k<n->short_ref_count; k++){
-                    if(n->short_ref[k]->cpn == s->ref_list[j][i]->cpn){
-                        ref2frm[i+2]= k;
-                        break;
-                    }
-                }
-            }
-        }
-    }
-}
-
-/**
-* decodes a slice header.
-* This will also call MPV_common_init() and frame_start() as needed.
-*
-* @param h h264context
-* @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
-*
-* @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
-*/
-static int decode_slice_header(NalContext *n, H264Slice *s, GetBitContext *gb){
-    unsigned int first_mb_in_slice;
-    unsigned int pps_id;
-    int num_ref_idx_active_override_flag;
-    unsigned int slice_type, tmp;
-
-    first_mb_in_slice= get_ue_golomb(gb);
-    (void) first_mb_in_slice;
-
-    slice_type= get_ue_golomb_31(gb);
-    if(slice_type > 9){
-        av_log(AV_LOG_ERROR, "slice type too large (%d)\n", s->slice_type);
-        return -1;
-    }
-    if(slice_type > 4)
-        slice_type -= 5;
-
-    slice_type= golomb_to_pict_type[ slice_type ];
-
-    s->slice_type= slice_type;
-    s->slice_type_nos= slice_type & 3;
-    s->current_picture_info->slice_type_nos = s->slice_type_nos;
-    s->current_picture_info->reference= s->nal_ref_idc? 2:0;
-    s->key_frame = s->slice_type == FF_I_TYPE;
-
-    pps_id= get_ue_golomb(gb);
-
-    if(pps_id>=MAX_PPS_COUNT){
-        av_log(AV_LOG_ERROR, "pps_id out of range\n");
-        return -1;
-    }
-    if(!n->pps_buffers[pps_id]) {
-        av_log(AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
-        return -1;
-    }
-    s->pps= *n->pps_buffers[pps_id];
-
-    if(!n->sps_buffers[s->pps.sps_id]) {
-        av_log(AV_LOG_ERROR, "non-existing SPS %u referenced\n", s->pps.sps_id);
-        return -1;
-    }
-    n->sps = *n->sps_buffers[s->pps.sps_id];
-
-    n->mb_width= n->sps.mb_width;
-    n->mb_height= n->sps.mb_height;
-
-    int chroma444 = (n->sps.chroma_format_idc == 3);
-    n->width = 16*n->mb_width - (2>>chroma444)*FFMIN(n->sps.crop_right, (8<<chroma444)-1);
-    if(n->sps.frame_mbs_only_flag)
-        n->height= 16*n->mb_height - (2>>chroma444)*FFMIN(n->sps.crop_bottom, (8<<chroma444)-1);
-    else
-        n->height= 16*n->mb_height - (4>>chroma444)*FFMIN(n->sps.crop_bottom, (8<<chroma444)-1);
-
-    s->direct_8x8_inference_flag = n->sps.direct_8x8_inference_flag;
-    s->transform_bypass = n->sps.transform_bypass;
-
-    n->frame_num= get_bits(gb, n->sps.log2_max_frame_num);
-    if(n->frame_num !=  n->prev_frame_num && n->frame_num != (n->prev_frame_num+1)%(1<<n->sps.log2_max_frame_num)){
-        av_log(AV_LOG_ERROR, "unexpected frame_num \n");
-    }
-
-    s->current_picture_info->frame_num= n->frame_num; //FIXME frame_num cleanup
-    n->max_pic_num= 1<< n->sps.log2_max_frame_num;
-
-    if(s->nal_unit_type == NAL_IDR_SLICE){
-        get_ue_golomb(gb); /* idr_pic_id */
-    }
-
-    init_poc(n, s, gb);
-
-    if(s->pps.redundant_pic_cnt_present){
-        n->redundant_pic_count= get_ue_golomb(gb);
-    }
-
-    //set defaults, might be overridden a few lines later
-    s->ref_count[0]= s->pps.ref_count[0];
-    s->ref_count[1]= s->pps.ref_count[1];
-
-    if(s->slice_type_nos != FF_I_TYPE){
-        if(s->slice_type_nos == FF_B_TYPE){
-            s->direct_spatial_mv_pred= get_bits1(gb);
-        }
-        num_ref_idx_active_override_flag= get_bits1(gb);
-
-        if(num_ref_idx_active_override_flag){
-            s->ref_count[0]= get_ue_golomb(gb) + 1;
-            if(s->slice_type_nos==FF_B_TYPE)
-                s->ref_count[1]= get_ue_golomb(gb) + 1;
-
-            if(s->ref_count[0]-1 > 32-1 || s->ref_count[1]-1 > 32-1){
-                av_log(AV_LOG_ERROR, "reference overflow\n");
-                s->ref_count[0]= s->ref_count[1]= 1;
-                return -1;
-            }
-        }
-        if(s->slice_type_nos == FF_B_TYPE)
-            s->list_count= 2;
-        else
-            s->list_count= 1;
-    }else
-        s->list_count= 0;
-
-
-    if(s->slice_type_nos!=FF_I_TYPE){
-        ff_h264_fill_default_ref_list(n, s);
-        ff_h264_decode_ref_pic_list_reordering(n, s, gb);
-        ref2frame(n, s);
-
-        for(int i=0; i<2; i++){
-            for(int j=0; j<s->ref_count[i]; j++){
-                if (s->ref_list[i][j]==NULL || s->ref_list[i][j]->reference < 2) // Don't know why sometimes the ref_count=1 while there are no references
-                    s->ref_list_cpn[i][j] = -1;
-                else
-                    s->ref_list_cpn[i][j] = s->ref_list[i][j]->cpn;
-            }
-        }
-    }
-
-    if(   (s->pps.weighted_pred          && s->slice_type_nos == FF_P_TYPE )
-    ||  (s->pps.weighted_bipred_idc==1 && s->slice_type_nos== FF_B_TYPE ) ){
-        pred_weight_table(s, gb);
-    }
-    else if(s->pps.weighted_bipred_idc==2 && s->slice_type_nos== FF_B_TYPE){
-        implicit_weight_table( s);
-    }else {
-        s->use_weight = 0;
-    }
-
-    if(s->nal_ref_idc){
-        ff_h264_ref_pic_marking(n, s, gb);
-        n->prev_poc_msb= n->poc_msb;
-        n->prev_poc_lsb= n->poc_lsb;
-    }
-
-    n->prev_frame_num_offset= n->frame_num_offset;
-    n->prev_frame_num= n->frame_num;
-
-    if(s->slice_type_nos != FF_B_TYPE){
-        s->ip_id= n->ip_id++;
-    }
-
-    if(s->slice_type_nos==FF_B_TYPE && !s->direct_spatial_mv_pred){
-        ff_h264_direct_dist_scale_factor(s);
-    }
-    ff_h264_direct_ref_list_init(s);
-
-
-    if( s->slice_type_nos != FF_I_TYPE && s->pps.cabac ){
-        tmp = get_ue_golomb_31(gb);
-        if(tmp > 2){
-            av_log(AV_LOG_ERROR, "cabac_init_idc overflow\n");
-            return -1;
-        }
-        s->cabac_init_idc= tmp;
-    }
-
-    tmp = s->pps.init_qp + get_se_golomb(gb);
-    if(tmp>51){
-        av_log(AV_LOG_ERROR, "QP %u out of range\n", tmp);
-        return -1;
-    }
-    s->qscale= tmp;
-
-    //FIXME qscale / qp ... stuff
-    if(s->slice_type == FF_SP_TYPE){
-        get_bits1(gb); /* sp_for_switch_flag */
-    }
-    if(s->slice_type==FF_SP_TYPE || s->slice_type == FF_SI_TYPE){
-        get_se_golomb(gb); /* slice_qs_delta */
-    }
-
-    s->slice_alpha_c0_offset = 52;
-    s->slice_beta_offset = 52;
-    if( s->pps.deblocking_filter_parameters_present ) {
-        tmp= get_ue_golomb_31(gb);
-        if(tmp > 1){
-            av_log(AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
-            return -1;
-        }
-
-        if(tmp < 2)
-            tmp^= 1; // 1<->0
-
-        if( tmp ) {
-            s->slice_alpha_c0_offset += get_se_golomb(gb) << 1;
-            s->slice_beta_offset     += get_se_golomb(gb) << 1;
-            if( (unsigned) s->slice_alpha_c0_offset > 104U
-            ||(unsigned) s->slice_beta_offset    > 104U){
-                av_log(AV_LOG_ERROR, "deblocking filter parameters %d %d out of range\n", s->slice_alpha_c0_offset, s->slice_beta_offset);
-                return -1;
-            }
-        }
-    }
-
-    s->qp_thresh= 15 + 52 - FFMIN(s->slice_alpha_c0_offset, s->slice_beta_offset) - FFMAX3(0, s->pps.chroma_qp_index_offset[0], s->pps.chroma_qp_index_offset[1]);
-
-    return 0;
-}
-
-PictureInfo *get_pib_entry(NalContext *nc, int coded_pic_num){
-    PictureInfo *pic = NULL;
-
-    for(int i=0; i<MAX_REF_PIC_COUNT+1; i++){
-        if(nc->picture[i].reference==0){
-            pic= &nc->picture[i];
-            break;
-        }
-    }
-    pic->cpn = coded_pic_num;
-
-    return pic;
-}
-
-int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb1){
-    GetBitContext *gb = gb1;
-    uint8_t *buf = gb1->raw;
-    int buf_size = gb1->buf_size;
-    int next_avc = buf_size;
-    int buf_index=0;
-    uint8_t *dst=NULL;
-//     gb->raw = gb1->raw;
-//     gb->rbsp = NULL;
-    s->release_cnt=0;
-    ff_h264_reset_sei(n);
-
-    s->current_picture_info = get_pib_entry(n, n->coded_pic_num);
-
-    for(;;){
-        int consumed;
-        int dst_length;
-        int bit_length;
-        const uint8_t *ptr;
-        int err;
-
-        if (buf_index >= buf_size){
-            break;
-        } else {
-            // start code prefix search
-            for(; buf_index + 3 < buf_size; buf_index++){
-                // This should always succeed in the first iteration.
-                if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
-                    break;
-            }
-            if(buf_index+3 >= buf_size) break;
-            buf_index+=3;
-        }
-
-        {
-            int length = next_avc - buf_index;
-            int i, si, di;
-            uint8_t *src= buf+buf_index;
-            //    src[0]&0x80;                //forbidden bit
-            s->nal_ref_idc= src[0]>>5;
-            s->nal_unit_type= src[0]&0x1F;
-
-            src++; length--;
-
-            for(i=0; i+1<length; i+=2){
-                if(src[i]) continue;
-                if(i>0 && src[i-1]==0) i--;
-                if(i+2<length && src[i+1]==0 && src[i+2]<=3){
-                    if(src[i+2]!=3){
-                        /* startcode, so we must be past the end */
-                        length=i;
-                    }
-                    break;
-                }
-            }
-
-            if(i>=length-1){ //no escaped 0
-                dst_length= length;
-                consumed= length+1; //+1 for the header
-                ptr=src;
-            }else{
-                av_fast_malloc(&gb->rbsp, &gb->rbsp_size, length+FF_INPUT_BUFFER_PADDING_SIZE);
-                dst = gb->rbsp;
-//                 if (dst){
-//                     av_free(dst);
-//                 }
-//                 dst = av_malloc(length+FF_INPUT_BUFFER_PADDING_SIZE);
-
-                if (dst == NULL){
-                    return -1;
-                }
-
-                //printf("decoding esc\n");
-                memcpy(dst, src, i);
-                si=di=i;
-                while(si+2<length){
-                    //remove escapes (very rare 1:2^22)
-                    if(src[si+2]>3){
-                        dst[di++]= src[si++];
-                        dst[di++]= src[si++];
-                    }else if(src[si]==0 && src[si+1]==0){
-                        if(src[si+2]==3){ //escape
-                            dst[di++]= 0;
-                            dst[di++]= 0;
-                            si+=3;
-                            continue;
-                        }else //next start code
-                            goto nsc;
-                    }
-
-                    dst[di++]= src[si++];
-                }
-                while(si<length)
-                    dst[di++]= src[si++];
-                nsc:
-
-                memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
-
-                dst_length= di;
-                consumed= si + 1;//+1 for the header
-                //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
-                ptr=dst;
-//                 gb->rbsp=ptr;
-            }
-        }
-        if (ptr==NULL || dst_length < 0){
-            return -1;
-        }
-
-        //error prevention, should not touch dst_length
-        while(ptr[dst_length - 1] == 0 && dst_length > 0)
-            dst_length--;
-
-        bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(ptr + dst_length - 1));
-        buf_index += consumed;
-
-        err = 0;
-        init_get_bits(gb, ptr, bit_length);
-        switch(s->nal_unit_type){
-            case NAL_IDR_SLICE:
-                idr(n, s); //FIXME ensure we don't loose some frames if there is reordering
-            case NAL_SLICE:
-                if((err = decode_slice_header(n, s, gb)))
-                    break;
-                s->key_frame |= (s->nal_unit_type == NAL_IDR_SLICE) || (n->sei_recovery_frame_cnt >= 0);
-                break;
-            case NAL_DPA:
-            case NAL_DPB:
-            case NAL_DPC:
-                av_log(AV_LOG_ERROR,"no slices/data partitioning support\n");
-                break;
-            case NAL_SEI:
-                ff_h264_decode_sei(n, gb);
-                break;
-            case NAL_SPS:
-                ff_h264_decode_seq_parameter_set(n, gb);
-                break;
-            case NAL_PPS:
-                ff_h264_decode_picture_parameter_set(n, gb, bit_length);
-                break;
-            case NAL_AUD:
-            case NAL_END_SEQUENCE:
-            case NAL_END_STREAM:
-            case NAL_FILLER_DATA:
-            case NAL_SPS_EXT:
-            case NAL_AUXILIARY_SLICE:
-                break;
-            default:
-                av_log(AV_LOG_ERROR, "Unknown NAL code: %d (%d bits)\n", s->nal_unit_type, bit_length);
-        }
-        if (err < 0)
-            av_log(AV_LOG_ERROR, "decode_slice_header error\n");
-
-    }
-
-    return buf_index;
-}
-
-NalContext *get_nal_context(int width, int height){
-    const int mb_height = (height + 15) / 16;
-    const int mb_width  = (width  + 15) / 16;
-    const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16
-
-    NalContext *nc = av_mallocz(sizeof(NalContext));
-    nc->width = width;
-    nc->height = height;
-    nc->mb_height = mb_height;
-    nc->mb_width  = mb_width;
-    nc->b4_stride = mb_width*4 + 1;
-    nc->mb_stride = mb_stride;
-    nc->outputed_poc = INT_MIN;
-
-    for(int i=0; i<16; i++){
-        nc->picture[i].cpn =-1;
-    }
-
-    return nc;
-}
-
-void free_nal_context(NalContext *nc){
-    for(int i = 0; i < MAX_SPS_COUNT; i++){
-        if (nc->sps_buffers[i]){
-            av_free( nc->sps_buffers[i]);
-        }
-    }
-    for(int i = 0; i < MAX_PPS_COUNT; i++){
-        if (nc->pps_buffers[i]){
-            av_free( nc->pps_buffers[i]);
-        }
-    }
-    av_free(nc);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_nal.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_nal.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,11 +0,0 @@
-#ifndef H264_NAL_H
-#define H264_NAL_H
-
-#include "avcodec.h"
-#include "h264_types.h"
-
-int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb);
-NalContext *get_nal_context(int width, int height);
-void free_nal_context(NalContext *nc);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_numa.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_numa.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-
-#include <pthread.h>
-#include "h264.h"
-#include "malloc.h"
-
-/*
-* Pthread version with affinity lock for ED and MBD threads. Deprecated
-*/
-int av_transcode_pthread_affinity(int ifile, int ofile, int frame_width, int frame_height, h264_options *opts) {
-	H264Context *h;
-	pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr;
-
-	h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height, opts);	
-	timer_start = av_gettime();
-
-	pthread_create(&read_thr, NULL, read_thread, h);
-	pthread_create(&parsenal_thr, NULL, parsenal_thread, h);
-	pthread_create(&entropy_thr, NULL, entropy_IPB_thread, h);
-	pthread_create(&mbdec_thr, NULL, mbdec_thread, h);
-	pthread_create(&write_thr, NULL, write_thread, h);
-
-
-	pthread_join(read_thr, NULL);
-	pthread_join(parsenal_thr, NULL);
-	pthread_join(entropy_thr, NULL);
-	pthread_join(mbdec_thr, NULL);
-	pthread_join(write_thr, NULL);
-
-	/* finished ! */
-	ff_h264_decode_end(h);
-
-	return 0;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_ompss.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_ompss.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,400 +0,0 @@
-/*
-* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
-* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
-*
-* This file is part of FFmpeg.
-*
-* FFmpeg is free software; you can redistribute it and/or
-* modify it under the terms of the GNU Lesser General Public
-* License as published by the Free Software Foundation; either
-* version 2.1 of the License, or (at your option) any later version.
-*
-* FFmpeg is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-* Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public
-* License along with FFmpeg; if not, write to the Free Software
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-*/
-#include "h264_types.h"
-#include "h264_parser.h"
-#include "h264_nal.h"
-#include "h264_entropy.h"
-#include "h264_rec.h"
-#include "h264_pred_mode.h"
-#include "h264_misc.h"
-// #undef NDEBUG
-#include <assert.h>
-
-#pragma omp task inout(*pc, *nc) output(*sbe)
-static void parse_task(H264Context *h, ParserContext *pc, NalContext *nc, SliceBufferEntry *sbe){
-    H264Slice *s;
-
-    if (!sbe->initialized){
-        init_sb_entry(h, sbe);
-        sbe->lines_total=h->mb_height;
-    }
-
-    av_read_frame_internal(pc, &sbe->gb);
-    s = &sbe->slice;
-
-    decode_nal_units(nc, s, &sbe->gb);
-}
-
-#pragma omp task inout(*ec) inout(*sbe)
-static void decode_slice_entropy_task(H264Context *h, EntropyContext *ec, SliceBufferEntry *sbe){
-    int i,j;
-    H264Slice *s = &sbe->slice;
-    GetBitContext *gb = &sbe->gb;
-    H264Mb *mbs = sbe->mbs;
-//     GetBitContext *gb = s->gb;
-    CABACContext *c = &ec->c;
-
-    if( !s->pps.cabac ){
-        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
-        return ;
-    }
-
-    init_dequant_tables(s, ec);
-    ec->curr_qscale = s->qscale;
-    ec->last_qscale_diff = 0;
-    ec->chroma_qp[0] = get_chroma_qp((H264Slice *) s, 0, s->qscale);
-    ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale);
-
-    /* realign */
-    align_get_bits( gb );
-    /* init cabac */
-    ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8);
-
-    ff_h264_init_cabac_states(ec, s, c);
-
-    for(j=0; j<ec->mb_height; j++){
-        init_entropy_buf(ec, s, j);
-        for(i=0; i<ec->mb_width; i++){
-            int eos,ret;
-            H264Mb *m = &mbs[i + j*ec->mb_width];
-            m->mb_x=i;
-            m->mb_y=j;
-            ec->m = m;
-
-            ret = ff_h264_decode_mb_cabac(ec, s, c);
-            eos = get_cabac_terminate( c);
-            (void) eos;
-            if( ret < 0 || c->bytestream > c->bytestream_end + 2) {
-                av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream);
-                return ;
-            }
-        }
-    }
-}
-
-static void decode_super_mb_block(MBRecContext *d, H264Slice *s, SuperMBContext *smbc, H264Mb *mbs, int smb_x, int smb_y){
-    MBRecState mrs;
-//     memset(&mrs, 0, sizeof(MBRecState));
-
-    for (int k=0, i= smb_y; i< smb_y + smbc->smb_height; i++, k++){
-        init_mbrec_context(d, &mrs, s, i);
-        for (int j= smb_x -k ; j< smb_x - k + smbc->smb_width; j++){
-            if (i< d->mb_height && j >= 0 && j < d->mb_width){
-                h264_decode_mb_internal (d, &mrs, s, &mbs[i*d->mb_width+j]);
-            }
-        }
-    }
-}
-
-#pragma omp task input(*d, *sbe, *ml, *mur) inout(*m)
-static void decode_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml,
-SuperMBTask *mur, SuperMBTask *m){
-    H264Slice *s = &sbe->slice;
-    H264Mb *mbs = sbe->mbs;
-    decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y);
-}
-
-#pragma omp task input(*d, *sbe) inout(*sm)
-static void draw_edges_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *sm, int line){
-    H264Slice *s = &sbe->slice;
-    for (int i=line*smbc->smb_height; i< (line+1)*smbc->smb_height && i< d->mb_height; i++)
-        draw_edges(d, s, i);
-}
-
-static void decode_mb_in_slice(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){
-    int i,j;
-
-    SuperMBContext *smbc = acquire_smbc(h);
-    int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width;
-    SuperMBTask *smbs = smbc->smbs[0];
-
-    SuperMBTask *sm=NULL, *sml, *smur;
-    for(j=0; j< smb_height; j++){
-        for(i=0; i< smb_width; i++){
-            sm = smbs + j*smb_width + i;
-            sml  = sm - ((i > 0) ? 1: 0);
-            smur = sm + (((i < smb_width-1) && (j >0))  ? -smb_width+1: 0);
-            decode_super_mb_task(d, sbe, smbc, sml, smur, sm);
-        }
-        draw_edges_task(d, sbe, smbc, sm, j);
-    }
-    #pragma omp taskwait on(*sm)
-
-    release_smbc(h, smbc);
-}
-
-#pragma omp task inout(*d) inout(*sbe)
-static void decode_slice_mb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){
-    H264Slice *s = &sbe->slice;
-
-    for (int i=0; i<2; i++){
-        for(int j=0; j< s->ref_count[i]; j++){
-            if (s->ref_list_cpn[i][j] ==-1)
-                continue;
-            int k;
-            for (k=0; k< h->max_dpb_cnt; k++){
-                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
-                    s->dp_ref_list[i][j] = &h->dpb[k];
-                    break;
-                }
-            }
-        }
-    }
-
-    #pragma omp critical (dpb)
-    get_dpb_entry(h, s);
-
-    if (!h->no_mbd){
-        decode_mb_in_slice (h, d, sbe);
-    }
-
-    for (int i=0; i<s->release_cnt; i++){
-        for(int j=0; j<h->max_dpb_cnt; j++){
-            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
-                #pragma omp critical (dpb)
-                release_dpb_entry(h, &h->dpb[j], 2);
-                break;
-            }
-        }
-    }
-    s->release_cnt=0;
-}
-
-// for static 3d wave
-/*-------------------------------------------------------------------------------*/
-#pragma omp task input(*d, *sbe, *ml, *mur, *mprev) inout(*m)
-static void decode_3dwave_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml,
-SuperMBTask *mur, SuperMBTask *mprev, SuperMBTask *m){
-    H264Slice *s = &sbe->slice;
-    H264Mb *mbs = sbe->mbs;
-
-    decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y);
-}
-
-// int init_ref_count=0;
-#pragma omp task inout(*d, *sbe, *init)
-static void init_ref_list_and_get_dpb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe, int *init){
-    H264Slice *s = &sbe->slice;
-    for (int i=0; i<2; i++){
-        for(int j=0; j< s->ref_count[i]; j++){
-            if (s->ref_list_cpn[i][j] ==-1)
-                continue;
-            int k;
-            for (k=0; k<h->max_dpb_cnt; k++){
-                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
-                    s->dp_ref_list[i][j] = &h->dpb[k];
-                    break;
-                }
-            }
-        }
-    }
-
-    #pragma omp critical (dpb)
-    get_dpb_entry(h, s);
-
-}
-
-static SuperMBTask* add_decode_slice_3dwave_tasks(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc){
-    int i,j;
-    
-    int smb_3d_height =smbc->nsmb_3dheight;
-    int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width;
-    int smb_diff_prev = smb_height - smb_3d_height;
-    SuperMBTask *sm=NULL, *sml, *smur, *smprev;
-
-    SuperMBTask *smbs = smbc->smbs[smbc->index++]; smbc->index%=2; 
-    SuperMBTask *smbs_prev = smbc->smbs[smbc->index]; // index rotates -> next == prev
-    
-    for(j=0; j<smb_3d_height ; j++){
-        for(i=0; i< smb_width; i++){
-            sm = smbs + j*smb_width + i;
-            sml  = sm - ((i > 0) ? 1: 0);
-            smur = sm + (((i < smb_width-1) && (j >0))  ? -smb_width+1: 0);
-            smprev = smbs_prev + (j + smb_diff_prev+1)*smb_width -1;
-            decode_3dwave_super_mb_task(d, sbe, smbc, sml, smur, smprev, sm);
-        }
-        draw_edges_task(d, sbe, smbc, sm, j);
-    }
-
-    for(; j< smb_height; j++){
-        for(i=0; i< smb_width; i++){
-            sm = smbs + j*smb_width + i;
-            sml  = sm - ((i > 0) ? 1: 0);
-            smur = sm + (((i < smb_width-1) && (j >0))  ? -smb_width+1: 0);
-            decode_super_mb_task(d, sbe, smbc, sml, smur, sm);
-        }
-        draw_edges_task(d, sbe, smbc, sm, j);
-    }
-    return sm;
-}
-
-#pragma omp task inout(*d, *sbe, *release) input (*lastsmb)
-static void release_ref_list_task(H264Context *h, SuperMBContext *smbc, MBRecContext *d, SliceBufferEntry *sbe, SuperMBTask *lastsmb, int *release){
-    H264Slice *s = &sbe->slice;
-    for (int i=0; i<s->release_cnt; i++){
-        for(int j=0; j<h->max_dpb_cnt; j++){
-            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
-                #pragma omp critical (dpb)
-                release_dpb_entry(h, &h->dpb[j], 2);
-                break;
-            }
-        }
-    }
-    s->release_cnt=0;
-
-    release_smbc(h, smbc);
-    
-}
-
-// static void decode_mb_static_3dwave(H264Context *h, int mb_height, int mb_width, MBRecContext *d, H264Slice *s, H264Mb *mbs, SuperMBTask *smbs, SuperMBTask *smbs_prev){
-//
-// }
-/*-------------------------------------------------------------------------------*/
-//end for static 3d wave
-
-#pragma omp task inout (*oc) input(*sbe)
-static void output_task(H264Context *h, OutputContext *oc, SliceBufferEntry *sbe){
-    DecodedPicture* out =output_frame(h, oc, sbe->slice.curr_pic, h->ofile, h->frame_width, h->frame_height);
-    if (out){
-        #pragma omp critical (dpb)
-        release_dpb_entry(h, out, 1);
-    }
-    print_report(oc->frame_number, oc->video_size, 0, h->verbose);
-}
-
-/*
-* The following code is the main loop of the file converter
-*/
-int h264_decode_ompss( H264Context *h) {
-    const int bufs = h->pipe_bufs;
-
-    ParserContext *pc;
-    NalContext *nc;
-    EntropyContext *ec[bufs];
-    MBRecContext *rc[2];
-    OutputContext *oc;
-    SliceBufferEntry *sbe;
-    SuperMBContext *smbc;
-
-    DecodedPicture *out;
-    int frames=0;
-
-#if HAVE_LIBSDL2
-    pthread_t sdl_thr;
-    if (h->display){
-        pthread_create(&sdl_thr, NULL, sdl_thread, h);
-    }
-#endif
-    sbe= av_mallocz(sizeof(SliceBufferEntry) * bufs);
-
-
-    pc = get_parse_context(h->ifile);
-    nc = get_nal_context(h->width, h->height);
-
-    for(int i=0; i<bufs; i++){
-        ec[i] = get_entropy_context( h );
-    }
-
-    for(int i=0; i<2; i++){
-        rc[i] = get_mbrec_context(h);
-    }
-
-    oc = get_output_context( h );
-
-    av_start_timer();
-    int k=0; int init, release;
-    if (h->static_3d && bufs < h->num_frames ){
-        int num_pre_ed =0;
-        for (num_pre_ed=0; num_pre_ed< bufs -1 && !pc->final_frame; num_pre_ed++){
-            parse_task( h, pc, nc, &sbe[k%bufs] );
-            decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]);
-            #pragma omp taskwait on(*pc)
-            k++;
-        }
-
-        while(!pc->final_frame && frames++ < h->num_frames && !h->quit){
-            parse_task( h, pc, nc, &sbe[k%bufs] );
-            decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]);
-
-            k++;
-
-            init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init);
-            smbc = acquire_smbc(h);
-            SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc);
-            release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release);
-
-            output_task (h, oc, &sbe[k%bufs]);
-            #pragma omp taskwait on(*pc)
-        }
-
-        for (int i=0; i< num_pre_ed; i++){
-            k++;
-            init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init);
-            smbc = acquire_smbc(h);
-            SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc);
-            release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release);
-
-            output_task (h, oc, &sbe[k%bufs]);
-        }
-
-    } else {
-        while(!pc->final_frame && frames++ < h->num_frames && !h->quit){
-            parse_task( h, pc, nc, &sbe[k%bufs] );
-
-            decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]);
-
-            decode_slice_mb_task(h, rc[0], &sbe[k%bufs]);
-
-            output_task (h, oc, &sbe[k%bufs]);
-            #pragma omp taskwait on(*pc)
-            k++;
-        }
-    }
-    #pragma omp taskwait
-
-    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ;
-
-    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
-    h->num_frames = oc->frame_number;
-    /* finished ! */
-
-    free_parse_context(pc);
-    free_nal_context  (nc);
-    free_output_context(oc);
-    for (int i=0; i<bufs; i++){
-        free_sb_entry(&sbe[i]);
-        free_entropy_context(ec[i]);
-    }
-    av_free(sbe);
-
-    for (int i=0; i<2; i++){
-        free_mbrec_context(rc[i]);
-    }
-
-#if HAVE_LIBSDL2
-    if (h->display){
-        signal_sdl_exit(h);
-        pthread_join(sdl_thr, NULL);
-    }
-#endif
-
-    return 0;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_parser.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_parser.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,224 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... parser
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 parser.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include <unistd.h>
-
-#include "golomb.h"
-#include "libavutil/error.h"
-#include "h264_types.h"
-
-#undef NDEBUG
-#include <assert.h>
-
-#define END_NOT_FOUND (-100)
-
-static int ff_h264_find_frame_end(ParserContext *s, const uint8_t *buf, int buf_size)
-{
-    int i;
-    uint32_t state;
-
-    state= s->state;
-    if(state>13)
-        state= 7;
-
-    for(i=0; i<buf_size; i++){
-        if(state==7){
-        /* we check i<buf_size instead of i+3/7 because its simpler
-         * and there should be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end
-         */
-            while(i<buf_size && !((~*(const uint64_t*)(buf+i) & (*(const uint64_t*)(buf+i) - 0x0101010101010101ULL)) & 0x8080808080808080ULL))
-                i+=8;
-
-            for(; i<buf_size; i++){
-                if(!buf[i]){
-                    state=2;
-                    break;
-                }
-            }
-        }else if(state<=2){
-            if(buf[i]==1)   state^= 5; //2->7, 1->4, 0->5
-            else if(buf[i]) state = 7;
-            else            state>>=1; //2->1, 1->0, 0->0
-        }else if(state<=5){
-            int v= buf[i] & 0x1F;
-            if(v==6 || v==7 || v==8 || v==9){
-                if(s->frame_start_found){
-                    i++;
-                    goto found;
-                }
-            }else if(v==1 || v==2 || v==5){
-                if(s->frame_start_found){
-                    state+=8;
-                    continue;
-                }else
-                    s->frame_start_found = 1;
-            }
-            state= 7;
-        }else{
-            if(buf[i] & 0x80)
-                goto found;
-            state= 7;
-        }
-    }
-    s->state= state;
-    return END_NOT_FOUND;
-
-found:
-    s->state=7;
-    s->frame_start_found= 0;
-    return i-(state&5);
-}
-
-static int ff_combine_frame(ParserContext *s, GetBitContext *gb, int next, uint8_t **buf, int *buf_size)
-{
-    int i;
-    /* Copy overread bytes from last frame into buffer. */
-    for(i =0; s->overread_cnt>0; s->overread_cnt--, i++){
-        gb->raw[s->index++]= s->overread[i];
-    }
-
-    /* EOF - END_NOT_FOUND means no next frame start is found in current partial read. If buf_size of the partial read is 0 we are at EOF */
-    if(!*buf_size && next == END_NOT_FOUND){
-        next= 0;
-    }
-    s->last_index= s->index;
-
-    /* copy into buffer end return */
-    if(next == END_NOT_FOUND){
-        gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, (*buf_size) + s->index + FF_INPUT_BUFFER_PADDING_SIZE);
-        memcpy(&gb->raw[s->index], *buf, *buf_size);
-        s->index += *buf_size;
-        return -1;
-    }
-
-    ///end found
-    *buf_size=  s->index + next;
-    /* append to buffer */
-
-    gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, next + s->index + FF_INPUT_BUFFER_PADDING_SIZE);
-    memcpy(&gb->raw[s->index], *buf, next + FF_INPUT_BUFFER_PADDING_SIZE );
-    s->index = 0;
-
-    /* store overread bytes */
-    for(i=0; next < 0; next++, i++){
-        s->state = (s->state<<8) | gb->raw[s->last_index + next];
-        s->overread[i] = gb->raw[s->last_index + next];
-        s->overread_cnt++;
-    }
-
-    return 0;
-}
-
-static int h264_parse(ParserContext *s, GetBitContext *gb,
-                      uint8_t *buf, int buf_size)
-{
-    int next;
-
-    next= ff_h264_find_frame_end(s, buf, buf_size);
-
-    if (ff_combine_frame(s, gb, next, &buf, &buf_size) < 0) {
-        gb->buf_size = 0;
-        return buf_size;
-    }
-
-    if(next<0 && next != END_NOT_FOUND){
-        assert(s->last_index + next >= 0 );
-        ff_h264_find_frame_end(s, &gb->raw[s->last_index + next], -next); //update state
-    }
-
-    gb->buf_size = buf_size;
-    return next;
-}
-
-static int ff_raw_read_partial_packet(ParserContext *pc)
-{
-    int len= -1;
-
-    if (!pc->eof_reached){
-        len = read( pc->ifile, pc->data, pc->buffer_size);
-//         printf("read task %d\t%d\n", pc->ifile, len); fflush(NULL);
-        if (len < pc->buffer_size) {
-            pc->eof_reached = 1;
-        }
-    }
-
-    return len;
-}
-
-void av_read_frame_internal(ParserContext *pc, GetBitContext *gb){
-    int len;
-    uint8_t dummy_buf[FF_INPUT_BUFFER_PADDING_SIZE]={0};
-    av_fast_malloc(&gb->raw, &gb->alloc_size, 2048+FF_INPUT_BUFFER_PADDING_SIZE);
-
-    //Parsing is performed before read, since there are ussually leftovers from parsing the previous frame.
-    for(;;) {
-        if (pc->cur_len>0){
-            len = h264_parse(pc, gb, pc->cur_ptr, pc->cur_len);
-            if (len<0)
-                len =0;
-            //* increment read pointer */
-            pc->cur_ptr += len;
-            pc->cur_len -= len;
-
-            if (gb->buf_size) {
-                break;
-            }
-        }
-
-        //check for ret and not parser->eof_reached as one "read" can contain more than 1 frame
-        pc->size= ff_raw_read_partial_packet(pc);
-        if (pc->size < 0) {
-            pc->final_frame =1;
-            /* return the last frames, if any */
-            h264_parse(pc, gb, dummy_buf, 0);
-            break;
-        }
-        pc->cur_ptr = pc->data;
-        pc->cur_len = pc->size;
-    }
-
-    assert(gb->raw!=NULL);
-
-}
-
-ParserContext *get_parse_context(int ifile){
-    ParserContext *pc = av_mallocz(sizeof(ParserContext));
-    pc->buffer_size = 2048;
-    pc->final_frame = 0;
-    pc->cur_len= 0;
-    pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE);
-    pc->size = 2048;
-    pc->eof_reached =0;
-    pc->ifile = ifile;
-
-    return pc;
-}
-
-void free_parse_context(ParserContext *pc){
-    av_free(pc->data);
-    av_free(pc);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_parser.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_parser.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-#ifndef H264_PARSER_H
-#define H264_PARSER_H
-
-#include "h264_types.h"
-
-void av_read_frame_internal(ParserContext *pc, GetBitContext *gb);
-ParserContext *get_parse_context(int ifile);
-void free_parse_context(ParserContext *pc);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pred.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_pred.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,945 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 prediction functions.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "avcodec.h"
-#include "h264_pred.h"
-//#include "dsputil.h"
-
-static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const uint32_t a= ((uint32_t*)(src-stride))[0];
-    ((uint32_t*)(src+0*stride))[0]= a;
-    ((uint32_t*)(src+1*stride))[0]= a;
-    ((uint32_t*)(src+2*stride))[0]= a;
-    ((uint32_t*)(src+3*stride))[0]= a;
-}
-
-static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101;
-    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101;
-    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101;
-    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101;
-}
-
-static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
-                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
-
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
-}
-
-static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
-
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
-}
-
-static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
-
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
-}
-
-static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
-}
-
-
-#define LOAD_TOP_RIGHT_EDGE\
-    const int av_unused t4= topright[0];\
-    const int av_unused t5= topright[1];\
-    const int av_unused t6= topright[2];\
-    const int av_unused t7= topright[3];\
-
-#define LOAD_DOWN_LEFT_EDGE\
-    const int av_unused l4= src[-1+4*stride];\
-    const int av_unused l5= src[-1+5*stride];\
-    const int av_unused l6= src[-1+6*stride];\
-    const int av_unused l7= src[-1+7*stride];\
-
-#define LOAD_LEFT_EDGE\
-    const int av_unused l0= src[-1+0*stride];\
-    const int av_unused l1= src[-1+1*stride];\
-    const int av_unused l2= src[-1+2*stride];\
-    const int av_unused l3= src[-1+3*stride];\
-
-#define LOAD_TOP_EDGE\
-    const int av_unused t0= src[ 0-1*stride];\
-    const int av_unused t1= src[ 1-1*stride];\
-    const int av_unused t2= src[ 2-1*stride];\
-    const int av_unused t3= src[ 3-1*stride];\
-
-static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int lt= src[-1-1*stride];
-    LOAD_TOP_EDGE
-    LOAD_LEFT_EDGE
-
-    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
-    src[0+2*stride]=
-    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
-    src[0+1*stride]=
-    src[1+2*stride]=
-    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
-    src[0+0*stride]=
-    src[1+1*stride]=
-    src[2+2*stride]=
-    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
-    src[1+0*stride]=
-    src[2+1*stride]=
-    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
-    src[2+0*stride]=
-    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
-    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
-}
-
-static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
-    LOAD_TOP_EDGE
-    LOAD_TOP_RIGHT_EDGE
-//    LOAD_LEFT_EDGE
-
-    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
-    src[1+0*stride]=
-    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
-    src[2+0*stride]=
-    src[1+1*stride]=
-    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
-    src[3+0*stride]=
-    src[2+1*stride]=
-    src[1+2*stride]=
-    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
-    src[3+1*stride]=
-    src[2+2*stride]=
-    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
-    src[3+2*stride]=
-    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
-    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
-}
-
-static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int lt= src[-1-1*stride];
-    LOAD_TOP_EDGE
-    LOAD_LEFT_EDGE
-
-    src[0+0*stride]=
-    src[1+2*stride]=(lt + t0 + 1)>>1;
-    src[1+0*stride]=
-    src[2+2*stride]=(t0 + t1 + 1)>>1;
-    src[2+0*stride]=
-    src[3+2*stride]=(t1 + t2 + 1)>>1;
-    src[3+0*stride]=(t2 + t3 + 1)>>1;
-    src[0+1*stride]=
-    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
-    src[1+1*stride]=
-    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
-    src[2+1*stride]=
-    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
-    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
-    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
-    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
-}
-
-static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
-    LOAD_TOP_EDGE
-    LOAD_TOP_RIGHT_EDGE
-
-    src[0+0*stride]=(t0 + t1 + 1)>>1;
-    src[1+0*stride]=
-    src[0+2*stride]=(t1 + t2 + 1)>>1;
-    src[2+0*stride]=
-    src[1+2*stride]=(t2 + t3 + 1)>>1;
-    src[3+0*stride]=
-    src[2+2*stride]=(t3 + t4+ 1)>>1;
-    src[3+2*stride]=(t4 + t5+ 1)>>1;
-    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
-    src[1+1*stride]=
-    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
-    src[2+1*stride]=
-    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
-    src[3+1*stride]=
-    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
-    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
-}
-
-static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    LOAD_LEFT_EDGE
-
-    src[0+0*stride]=(l0 + l1 + 1)>>1;
-    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
-    src[2+0*stride]=
-    src[0+1*stride]=(l1 + l2 + 1)>>1;
-    src[3+0*stride]=
-    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
-    src[2+1*stride]=
-    src[0+2*stride]=(l2 + l3 + 1)>>1;
-    src[3+1*stride]=
-    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
-    src[3+2*stride]=
-    src[1+3*stride]=
-    src[0+3*stride]=
-    src[2+2*stride]=
-    src[2+3*stride]=
-    src[3+3*stride]=l3;
-}
-
-
-static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
-	(void) topright;
-    const int lt= src[-1-1*stride];
-    LOAD_TOP_EDGE
-    LOAD_LEFT_EDGE
-
-    src[0+0*stride]=
-    src[2+1*stride]=(lt + l0 + 1)>>1;
-    src[1+0*stride]=
-    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
-    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
-    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
-    src[0+1*stride]=
-    src[2+2*stride]=(l0 + l1 + 1)>>1;
-    src[1+1*stride]=
-    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
-    src[0+2*stride]=
-    src[2+3*stride]=(l1 + l2+ 1)>>1;
-    src[1+2*stride]=
-    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
-    src[0+3*stride]=(l2 + l3 + 1)>>1;
-    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
-}
-
-static void pred16x16_vertical_c(uint8_t *src, int stride){
-    int i;
-    const uint32_t a= ((uint32_t*)(src-stride))[0];
-    const uint32_t b= ((uint32_t*)(src-stride))[1];
-    const uint32_t c= ((uint32_t*)(src-stride))[2];
-    const uint32_t d= ((uint32_t*)(src-stride))[3];
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]= a;
-        ((uint32_t*)(src+i*stride))[1]= b;
-        ((uint32_t*)(src+i*stride))[2]= c;
-        ((uint32_t*)(src+i*stride))[3]= d;
-    }
-}
-
-static void pred16x16_horizontal_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101;
-    }
-}
-
-static void pred16x16_dc_c(uint8_t *src, int stride){
-    int i, dc=0;
-
-    for(i=0;i<16; i++){
-        dc+= src[-1+i*stride];
-    }
-
-    for(i=0;i<16; i++){
-        dc+= src[i-stride];
-    }
-
-    dc= 0x01010101*((dc + 16)>>5);
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= dc;
-    }
-}
-
-static void pred16x16_left_dc_c(uint8_t *src, int stride){
-    int i, dc=0;
-
-    for(i=0;i<16; i++){
-        dc+= src[-1+i*stride];
-    }
-
-    dc= 0x01010101*((dc + 8)>>4);
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= dc;
-    }
-}
-
-static void pred16x16_top_dc_c(uint8_t *src, int stride){
-    int i, dc=0;
-
-    for(i=0;i<16; i++){
-        dc+= src[i-stride];
-    }
-    dc= 0x01010101*((dc + 8)>>4);
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= dc;
-    }
-}
-
-static void pred16x16_128_dc_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
-    }
-}
-
-static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){
-  int i, j, k;
-  int a;
-  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-  const uint8_t * const src0 = src+7-stride;
-  const uint8_t *src1 = src+8*stride-1;
-  const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
-  int H = src0[1] - src0[-1];
-  int V = src1[0] - src2[ 0];
-  for(k=2; k<=8; ++k) {
-    src1 += stride; src2 -= stride;
-    H += k*(src0[k] - src0[-k]);
-    V += k*(src1[0] - src2[ 0]);
-  }
-  if(svq3){
-    H = ( 5*(H/4) ) / 16;
-    V = ( 5*(V/4) ) / 16;
-
-    /* required for 100% accuracy */
-    i = H; H = V; V = i;
-  }else if(rv40){
-    H = ( H + (H>>2) ) >> 4;
-    V = ( V + (V>>2) ) >> 4;
-  }else{
-    H = ( 5*H+32 ) >> 6;
-    V = ( 5*V+32 ) >> 6;
-  }
-
-  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
-  for(j=16; j>0; --j) {
-    int b = a;
-    a += V;
-    for(i=-16; i<0; i+=4) {
-      src[16+i] = cm[ (b    ) >> 5 ];
-      src[17+i] = cm[ (b+  H) >> 5 ];
-      src[18+i] = cm[ (b+2*H) >> 5 ];
-      src[19+i] = cm[ (b+3*H) >> 5 ];
-      b += 4*H;
-    }
-    src += stride;
-  }
-}
-
-static void pred16x16_plane_c(uint8_t *src, int stride){
-    pred16x16_plane_compat_c(src, stride, 0, 0);
-}
-
-
-static void pred8x8_vertical_c(uint8_t *src, int stride){
-    int i;
-    const uint32_t a= ((uint32_t*)(src-stride))[0];
-    const uint32_t b= ((uint32_t*)(src-stride))[1];
-
-    for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= a;
-        ((uint32_t*)(src+i*stride))[1]= b;
-    }
-}
-
-static void pred8x8_horizontal_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101;
-    }
-}
-
-static void pred8x8_128_dc_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
-    }
-}
-
-static void pred8x8_left_dc_c(uint8_t *src, int stride){
-    int i;
-    int dc0, dc2;
-
-    dc0=dc2=0;
-    for(i=0;i<4; i++){
-        dc0+= src[-1+i*stride];
-        dc2+= src[-1+(i+4)*stride];
-    }
-    dc0= 0x01010101*((dc0 + 2)>>2);
-    dc2= 0x01010101*((dc2 + 2)>>2);
-
-    for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= dc0;
-    }
-    for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= dc2;
-    }
-}
-
-
-static void pred8x8_top_dc_c(uint8_t *src, int stride){
-    int i;
-    int dc0, dc1;
-
-    dc0=dc1=0;
-    for(i=0;i<4; i++){
-        dc0+= src[i-stride];
-        dc1+= src[4+i-stride];
-    }
-    dc0= 0x01010101*((dc0 + 2)>>2);
-    dc1= 0x01010101*((dc1 + 2)>>2);
-
-    for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc1;
-    }
-    for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc1;
-    }
-}
-
-static void pred8x8_dc_c(uint8_t *src, int stride){
-    int i;
-    int dc0, dc1, dc2, dc3;
-
-    dc0=dc1=dc2=0;
-    for(i=0;i<4; i++){
-        dc0+= src[-1+i*stride] + src[i-stride];
-        dc1+= src[4+i-stride];
-        dc2+= src[-1+(i+4)*stride];
-    }
-    dc3= 0x01010101*((dc1 + dc2 + 4)>>3);
-    dc0= 0x01010101*((dc0 + 4)>>3);
-    dc1= 0x01010101*((dc1 + 2)>>2);
-    dc2= 0x01010101*((dc2 + 2)>>2);
-
-    for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc1;
-    }
-    for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc2;
-        ((uint32_t*)(src+i*stride))[1]= dc3;
-    }
-}
-
-//the following 4 function should not be optimized!
-static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){
-    pred8x8_top_dc_c(src, stride);
-    pred4x4_dc_c(src, NULL, stride);
-}
-
-static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){
-    pred8x8_dc_c(src, stride);
-    pred4x4_top_dc_c(src, NULL, stride);
-}
-
-static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){
-    pred8x8_left_dc_c(src, stride);
-    pred4x4_128_dc_c(src + 4*stride    , NULL, stride);
-    pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride);
-}
-
-static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){
-    pred8x8_left_dc_c(src, stride);
-    pred4x4_128_dc_c(src    , NULL, stride);
-    pred4x4_128_dc_c(src + 4, NULL, stride);
-}
-
-static void pred8x8_plane_c(uint8_t *src, int stride){
-  int j, k;
-  int a;
-  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-  const uint8_t * const src0 = src+3-stride;
-  const uint8_t *src1 = src+4*stride-1;
-  const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
-  int H = src0[1] - src0[-1];
-  int V = src1[0] - src2[ 0];
-  for(k=2; k<=4; ++k) {
-    src1 += stride; src2 -= stride;
-    H += k*(src0[k] - src0[-k]);
-    V += k*(src1[0] - src2[ 0]);
-  }
-  H = ( 17*H+16 ) >> 5;
-  V = ( 17*V+16 ) >> 5;
-
-  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
-  for(j=8; j>0; --j) {
-    int b = a;
-    a += V;
-    src[0] = cm[ (b    ) >> 5 ];
-    src[1] = cm[ (b+  H) >> 5 ];
-    src[2] = cm[ (b+2*H) >> 5 ];
-    src[3] = cm[ (b+3*H) >> 5 ];
-    src[4] = cm[ (b+4*H) >> 5 ];
-    src[5] = cm[ (b+5*H) >> 5 ];
-    src[6] = cm[ (b+6*H) >> 5 ];
-    src[7] = cm[ (b+7*H) >> 5 ];
-    src += stride;
-  }
-}
-
-#define SRC(x,y) src[(x)+(y)*stride]
-#define PL(y) \
-    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_LEFT \
-    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
-                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
-    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
-    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
-
-#define PT(x) \
-    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_TOP \
-    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
-                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
-    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
-    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
-                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
-
-#define PTR(x) \
-    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_TOPRIGHT \
-    int t8, t9, t10, t11, t12, t13, t14, t15; \
-    if(has_topright) { \
-        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
-        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
-    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
-
-#define PREDICT_8x8_LOAD_TOPLEFT \
-    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
-
-#define PREDICT_8x8_DC(v) \
-    int y; \
-    for( y = 0; y < 8; y++ ) { \
-        ((uint32_t*)src)[0] = \
-        ((uint32_t*)src)[1] = v; \
-        src += stride; \
-    }
-
-static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-	(void) has_topleft; (void) has_topright;
-    PREDICT_8x8_DC(0x80808080);
-}
-
-static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-	(void) has_topleft; (void) has_topright;
-    PREDICT_8x8_LOAD_LEFT;
-    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
-    PREDICT_8x8_DC(dc);
-}
-
-static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-    PREDICT_8x8_LOAD_TOP;
-    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
-    PREDICT_8x8_DC(dc);
-}
-
-static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-    PREDICT_8x8_LOAD_LEFT;
-    PREDICT_8x8_LOAD_TOP;
-    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
-                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
-    PREDICT_8x8_DC(dc);
-}
-
-static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-	(void) has_topleft; (void) has_topright;
-    PREDICT_8x8_LOAD_LEFT;
-#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
-               ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
-    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
-#undef ROW
-}
-
-static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-    int y;
-    PREDICT_8x8_LOAD_TOP;
-    src[0] = t0;
-    src[1] = t1;
-    src[2] = t2;
-    src[3] = t3;
-    src[4] = t4;
-    src[5] = t5;
-    src[6] = t6;
-    src[7] = t7;
-    for( y = 1; y < 8; y++ )
-        *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
-}
-
-static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_TOPRIGHT;
-    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
-    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
-    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
-    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
-    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
-    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
-    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
-    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
-    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
-    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
-    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
-    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
-    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
-    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
-    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
-}
-
-static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_LEFT;
-    PREDICT_8x8_LOAD_TOPLEFT;
-    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
-    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
-    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
-    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
-    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
-    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
-    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
-    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
-    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
-    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
-    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
-    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
-    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
-    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
-    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
-}
-
-static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_LEFT;
-    PREDICT_8x8_LOAD_TOPLEFT;
-    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
-    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
-    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
-    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
-    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
-    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
-    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
-    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
-    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
-    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
-    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
-    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
-    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
-    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
-    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
-    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
-    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
-    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
-    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
-    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
-    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
-    SRC(7,0)= (t6 + t7 + 1) >> 1;
-}
-
-static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_LEFT;
-    PREDICT_8x8_LOAD_TOPLEFT;
-    SRC(0,7)= (l6 + l7 + 1) >> 1;
-    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
-    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
-    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
-    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
-    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
-    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
-    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
-    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
-    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
-    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
-    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
-    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
-    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
-    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
-    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
-    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
-    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
-    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
-    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
-    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
-    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
-}
-
-static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-    PREDICT_8x8_LOAD_TOP;
-    PREDICT_8x8_LOAD_TOPRIGHT;
-    SRC(0,0)= (t0 + t1 + 1) >> 1;
-    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
-    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
-    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
-    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
-    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
-    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
-    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
-    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
-    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
-    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
-    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
-    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
-    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
-    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
-    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
-    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
-    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
-    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
-    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
-    SRC(7,6)= (t10 + t11 + 1) >> 1;
-    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
-}
-
-static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride){
-	(void) has_topleft; (void) has_topright;
-    PREDICT_8x8_LOAD_LEFT;
-    SRC(0,0)= (l0 + l1 + 1) >> 1;
-    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
-    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
-    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
-    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
-    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
-    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
-    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
-    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
-    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
-    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
-    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
-    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
-    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
-    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
-    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
-    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
-    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
-}
-#undef PREDICT_8x8_LOAD_LEFT
-#undef PREDICT_8x8_LOAD_TOP
-#undef PREDICT_8x8_LOAD_TOPLEFT
-#undef PREDICT_8x8_LOAD_TOPRIGHT
-#undef PREDICT_8x8_DC
-#undef PTR
-#undef PT
-#undef PL
-#undef SRC
-
-static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
-    int i;
-    pix -= stride;
-    for(i=0; i<4; i++){
-        uint8_t v = pix[0];
-        pix[1*stride]= v += block[0];
-        pix[2*stride]= v += block[4];
-        pix[3*stride]= v += block[8];
-        pix[4*stride]= v +  block[12];
-        pix++;
-        block++;
-    }
-}
-
-static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
-    int i;
-    for(i=0; i<4; i++){
-        uint8_t v = pix[-1];
-        pix[0]= v += block[0];
-        pix[1]= v += block[1];
-        pix[2]= v += block[2];
-        pix[3]= v +  block[3];
-        pix+= stride;
-        block+= 4;
-    }
-}
-
-static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
-    int i;
-    pix -= stride;
-    for(i=0; i<8; i++){
-        uint8_t v = pix[0];
-        pix[1*stride]= v += block[0];
-        pix[2*stride]= v += block[8];
-        pix[3*stride]= v += block[16];
-        pix[4*stride]= v += block[24];
-        pix[5*stride]= v += block[32];
-        pix[6*stride]= v += block[40];
-        pix[7*stride]= v += block[48];
-        pix[8*stride]= v +  block[56];
-        pix++;
-        block++;
-    }
-}
-
-static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
-    int i;
-    for(i=0; i<8; i++){
-        uint8_t v = pix[-1];
-        pix[0]= v += block[0];
-        pix[1]= v += block[1];
-        pix[2]= v += block[2];
-        pix[3]= v += block[3];
-        pix[4]= v += block[4];
-        pix[5]= v += block[5];
-        pix[6]= v += block[6];
-        pix[7]= v +  block[7];
-        pix+= stride;
-        block+= 8;
-    }
-}
-
-static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
-    int i;
-    for(i=0; i<16; i++)
-        pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride);
-}
-
-static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
-    int i;
-    for(i=0; i<16; i++)
-        pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride);
-}
-
-static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
-    int i;
-    for(i=0; i<4; i++)
-        pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride);
-}
-
-static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
-    int i;
-    for(i=0; i<4; i++)
-        pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride);
-}
-
- 
-/**
- * Sets the intra prediction function pointers.
- */
-void ff_h264_pred_init(H264PredContext *h){
-
-    h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
-    h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
-    h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
-    h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
-    h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
-    h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
-    h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
-    h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
-    h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
-    h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
-    h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
-    h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
-
-    h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
-    h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
-    h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
-    h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
-    h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
-    h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
-    h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
-    h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
-    h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
-    h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
-    h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
-    h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
-
-    h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
-    h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
-    h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
-
-    h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
-    h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
-    h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
-    h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= pred8x8_mad_cow_dc_l0t;
-    h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= pred8x8_mad_cow_dc_0lt;
-    h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= pred8x8_mad_cow_dc_l00;
-    h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= pred8x8_mad_cow_dc_0l0;
-
-    h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
-
-    h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
-    h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
-    h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
-    h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
-
-    h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
-
-    h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
-    h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
-    h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
-
-    //special lossless h/v prediction for h264
-    h->pred4x4_add  [VERT_PRED   ]= pred4x4_vertical_add_c;
-    h->pred4x4_add  [ HOR_PRED   ]= pred4x4_horizontal_add_c;
-    h->pred8x8l_add [VERT_PRED   ]= pred8x8l_vertical_add_c;
-    h->pred8x8l_add [ HOR_PRED   ]= pred8x8l_horizontal_add_c;
-    h->pred8x8_add  [VERT_PRED8x8]= pred8x8_vertical_add_c;
-    h->pred8x8_add  [ HOR_PRED8x8]= pred8x8_horizontal_add_c;
-    h->pred16x16_add[VERT_PRED8x8]= pred16x16_vertical_add_c;
-    h->pred16x16_add[ HOR_PRED8x8]= pred16x16_horizontal_add_c;
-
-    if (HAVE_NEON) ff_h264_pred_init_arm(h);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pred.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_pred.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,90 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 prediction functions.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#ifndef AVCODEC_H264PRED_H
-#define AVCODEC_H264PRED_H
-
-#include "libavutil/common.h"
-#include "dsputil.h"
-
-/**
- * Prediction types
- */
-//@{
-#define VERT_PRED             0
-#define HOR_PRED              1
-#define DC_PRED               2
-#define DIAG_DOWN_LEFT_PRED   3
-#define DIAG_DOWN_RIGHT_PRED  4
-#define VERT_RIGHT_PRED       5
-#define HOR_DOWN_PRED         6
-#define VERT_LEFT_PRED        7
-#define HOR_UP_PRED           8
-
-#define LEFT_DC_PRED          9
-#define TOP_DC_PRED           10
-#define DC_128_PRED           11
-
-#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN   12
-#define HOR_UP_PRED_RV40_NODOWN           13
-#define VERT_LEFT_PRED_RV40_NODOWN        14
-
-#define DC_PRED8x8            0
-#define HOR_PRED8x8           1
-#define VERT_PRED8x8          2
-#define PLANE_PRED8x8         3
-
-#define LEFT_DC_PRED8x8       4
-#define TOP_DC_PRED8x8        5
-#define DC_128_PRED8x8        6
-
-#define ALZHEIMER_DC_L0T_PRED8x8 7
-#define ALZHEIMER_DC_0LT_PRED8x8 8
-#define ALZHEIMER_DC_L00_PRED8x8 9
-#define ALZHEIMER_DC_0L0_PRED8x8 10
-//@}
-
-/**
- * Context for storing H.264 prediction functions
- */
-typedef struct H264PredContext{
-    void (*pred4x4  [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
-    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
-    void (*pred8x8  [4+3+4])(uint8_t *src, int stride);
-    void (*pred16x16[4+3])(uint8_t *src, int stride);
-
-    void (*pred4x4_add  [2])(uint8_t *pix/*align  4*/, const DCTELEM *block/*align 16*/, int stride);
-    void (*pred8x8l_add [2])(uint8_t *pix/*align  8*/, const DCTELEM *block/*align 16*/, int stride);
-    void (*pred8x8_add  [3])(uint8_t *pix/*align  8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
-    void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
-}H264PredContext;
-
-void ff_h264_pred_init(H264PredContext *h);
-void ff_h264_pred_init_arm(H264PredContext *h);
-
-
-#endif /* AVCODEC_H264PRED_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1013 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 direct mb/block decoding.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "dsputil.h"
-#include "avcodec.h"
-#include "h264_data.h"
-#include "h264.h"
-#include "rectangle.h"
-
-//#undef NDEBUG
-#include <assert.h>
-
-static const uint8_t left_block_options[4][16]={
-    {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
-    {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
-    {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
-    {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
-};
-
-
-// static void check_cache_copy(MBRecContext *mrc, H264Slice *s, H264Mb *m){
-//     for (int list=0; list<2; list++){
-//         for (int i=0; i<40; i++){
-//             assert (m->ref_cache[list][i] == m->ref_cache_copy[list][i]);
-//             assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy[list][i][0]);
-//             assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy[list][i][1]);
-//         }
-//     }
-// }
-
-// static void check_cache_copy2(MBRecContext *mrc, H264Slice *s, H264Mb *m){
-//     for (int list=0; list<2; list++){
-//         for (int i=0; i<40; i++){
-//             assert (m->ref_cache[list][i] == m->ref_cache_copy2[list][i]);
-//             assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy2[list][i][0]);
-//             assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy2[list][i][1]);
-//         }
-//     }
-// }
-
-static void fill_decode_caches_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
-    int topleft_type, top_type, topright_type, left_type;
-    const uint8_t * left_block= left_block_options[0];
-    const int mb_x = m->mb_x;
-    int i;
-
-    mrs->top_type  = mrs->mb_type_top[mb_x  ];
-    mrs->left_type = mrs->mb_type    [mb_x-1];
-
-    topleft_type = mrs->mb_type_top[mb_x-1];
-    top_type     = mrs->mb_type_top[mb_x  ];
-    topright_type= mrs->mb_type_top[mb_x+1];
-    left_type    = mrs->mb_type    [mb_x-1];
-
-    int type_mask= s->pps.constrained_intra_pred ? 1 : -1;
-
-    if(!IS_SKIP(mb_type)){
-//         memset(mrc->non_zero_count_cache, 0, sizeof(mrc->non_zero_count_cache));
-        AV_COPY32(&mrs->non_zero_count_cache[4+8*1], &m->non_zero_count[ 0]);
-        AV_COPY32(&mrs->non_zero_count_cache[4+8*2], &m->non_zero_count[ 4]);
-        AV_COPY32(&mrs->non_zero_count_cache[4+8*3], &m->non_zero_count[ 8]);
-        AV_COPY32(&mrs->non_zero_count_cache[4+8*4], &m->non_zero_count[12]);
-
-        for (int i=0; i<2; i++) {
-            mrs->non_zero_count_cache[8*1 + 8*i + 1] = m->non_zero_count[16 + i*2   ];
-            mrs->non_zero_count_cache[8*1 + 8*i + 2] = m->non_zero_count[16 + i*2 +1];
-            mrs->non_zero_count_cache[8*4 + 8*i + 1] = m->non_zero_count[20 + i*2   ];
-            mrs->non_zero_count_cache[8*4 + 8*i + 2] = m->non_zero_count[20 + i*2 +1];
-        }
-
-        if(IS_INTRA(mb_type)){
-//             memset(mrc->intra4x4_pred_mode_cache, 0, sizeof(mrc->intra4x4_pred_mode_cache));
-
-            mrs->topleft_samples_available=
-            mrs->top_samples_available=
-            mrs->left_samples_available= 0xFFFF;
-            mrs->topright_samples_available= 0xEEEA;
-
-            if(!(top_type & type_mask)){
-                mrs->topleft_samples_available= 0xB3FF;
-                mrs->top_samples_available= 0x33FF;
-                mrs->topright_samples_available= 0x26EA;
-            }
-
-            if(!(left_type & type_mask)){
-                mrs->topleft_samples_available&= 0xDF5F;
-                mrs->left_samples_available&= 0x5F5F;
-            }
-
-            if(!(topleft_type & type_mask))
-                mrs->topleft_samples_available&= 0x7FFF;
-
-            if(!(topright_type & type_mask))
-                mrs->topright_samples_available&= 0xFBFF;
-
-            if(IS_INTRA4x4(mb_type)){
-                if(IS_INTRA4x4(top_type)){
-                    AV_COPY32(mrs->intra4x4_pred_mode_cache+4+8*0, &mrs->intra4x4_pred_mode_top[4*mb_x]);
-                }else{
-                    mrs->intra4x4_pred_mode_cache[4+8*0]=
-                    mrs->intra4x4_pred_mode_cache[5+8*0]=
-                    mrs->intra4x4_pred_mode_cache[6+8*0]=
-                    mrs->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask);
-                }
-
-                if(IS_INTRA4x4(left_type)){
-#if OMPSS
-                    mrs->intra4x4_pred_mode_cache[3+8*1]= m->intra4x4_pred_mode_left[0];
-                    mrs->intra4x4_pred_mode_cache[3+8*2]= m->intra4x4_pred_mode_left[1];
-                    mrs->intra4x4_pred_mode_cache[3+8*3]= m->intra4x4_pred_mode_left[2];
-                    mrs->intra4x4_pred_mode_cache[3+8*4]= m->intra4x4_pred_mode_left[3];
-#else
-                    mrs->intra4x4_pred_mode_cache[3+8*1]= mrs->intra4x4_pred_mode_left[0];
-                    mrs->intra4x4_pred_mode_cache[3+8*2]= mrs->intra4x4_pred_mode_left[1];
-                    mrs->intra4x4_pred_mode_cache[3+8*3]= mrs->intra4x4_pred_mode_left[2];
-                    mrs->intra4x4_pred_mode_cache[3+8*4]= mrs->intra4x4_pred_mode_left[3];
-#endif
-                }else{
-                    mrs->intra4x4_pred_mode_cache[3+8*1]=
-                    mrs->intra4x4_pred_mode_cache[3+8*2]=
-                    mrs->intra4x4_pred_mode_cache[3+8*3]=
-                    mrs->intra4x4_pred_mode_cache[3+8*4]= 2 - 3*!(left_type & type_mask);
-                }
-            }
-        }
-    }
-
-    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
-        int list;
-
-//         memset(mrs->mv_cache, 0, sizeof(mrs->mv_cache));
-//         memset(mrs->ref_cache, 0, sizeof(mrs->ref_cache));
-
-        mrs->ref_cache[0][scan8[5 ]+1] = mrs->ref_cache[0][scan8[7 ]+1] = mrs->ref_cache[0][scan8[13]+1] =
-        mrs->ref_cache[1][scan8[5 ]+1] = mrs->ref_cache[1][scan8[7 ]+1] = mrs->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
-
-        for(list=0; list<s->list_count; list++){
-            if(!USES_LIST(mb_type, list)){
-                continue;
-            }
-            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
-
-            if(USES_LIST(top_type, list)){
-                const int b_xy= 4*mb_x + 3*mrc->b_stride;
-                AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]);
-                    mrs->ref_cache[list][scan8[0] + 0 - 1*8]=
-                    mrs->ref_cache[list][scan8[0] + 1 - 1*8]= mrs->ref_index_top[list][4*mb_x + 2];
-                    mrs->ref_cache[list][scan8[0] + 2 - 1*8]=
-                    mrs->ref_cache[list][scan8[0] + 3 - 1*8]= mrs->ref_index_top[list][4*mb_x + 3];
-            }else{
-                AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]);
-                AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
-            }
-
-            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
-                for(i=0; i<2; i++){
-                    int cache_idx = scan8[0] - 1 + i*2*8;
-                    if(USES_LIST(left_type, list)){
-                        const int b_xy= 4*(mb_x-1) + 3;
-                        const int b8_x= 4*(mb_x-1) + 1;
-                        AV_COPY32(mrs->mv_cache[list][cache_idx  ], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[0+i*2]]);
-                        AV_COPY32(mrs->mv_cache[list][cache_idx+8], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[1+i*2]]);
-                        mrs->ref_cache[list][cache_idx  ]= mrs->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
-                        mrs->ref_cache[list][cache_idx+8]= mrs->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
-                    }else{
-                        AV_ZERO32(mrs->mv_cache [list][cache_idx  ]);
-                        AV_ZERO32(mrs->mv_cache [list][cache_idx+8]);
-                        mrs->ref_cache[list][cache_idx  ]=
-                        mrs->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
-                    }
-                }
-            }else{
-                if(USES_LIST(left_type, list)){
-                    const int b_x = 4*(mb_x-1) + 3;
-                    const int b8_x= 4*(mb_x-1) + 1;
-                    AV_COPY32(mrs->mv_cache[list][scan8[0] - 1], mrs->motion_val[list][b_x + mrc->b_stride*left_block[0]]);
-                    mrs->ref_cache[list][scan8[0] - 1]= mrs->ref_index[list][b8_x + (left_block[0]&~1)];
-                }else{
-                    AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1]);
-                    mrs->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-                }
-            }
-
-            if(USES_LIST(topright_type, list)){
-                const int b_xy= 4*(mb_x+1) + 3*mrc->b_stride;
-                AV_COPY32(mrs->mv_cache[list][scan8[0] + 4 - 1*8], mrs->motion_val_top[list][b_xy]);
-                mrs->ref_cache[list][scan8[0] + 4 - 1*8]= mrs->ref_index_top[list][4*(mb_x+1) + 2];
-            }else{
-                AV_ZERO32(mrs->mv_cache [list][scan8[0] + 4 - 1*8]);
-                mrs->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-            }
-            if(mrs->ref_cache[list][scan8[0] + 4 - 1*8] < 0){
-                int topleft_partition= -1;
-                if(USES_LIST(topleft_type, list)){
-                    const int b_xy = 4*(mb_x-1) + 3 + mrc->b_stride + (topleft_partition & 2*mrc->b_stride);
-                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
-                    AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 - 1*8], mrs->motion_val_top[list][b_xy]);
-                    mrs->ref_cache[list][scan8[0] - 1 - 1*8]= mrs->ref_index_top[list][b8_x];
-                }else{
-                    AV_ZERO32(mrs->mv_cache[list][scan8[0] - 1 - 1*8]);
-                    mrs->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
-                }
-            }
-
-            if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)))
-                continue;
-
-            if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) {
-                mrs->ref_cache[list][scan8[4 ]] =
-                mrs->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
-                AV_ZERO32(mrs->mv_cache [list][scan8[4 ]]);
-                AV_ZERO32(mrs->mv_cache [list][scan8[12]]);
-            }
-        }
-    }
-}
-
-static inline void write_back_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
-    const int b_stride = mrc->b_stride;
-    const int b_x = 4*m->mb_x; //try mb2b(8)_xy
-    const int b8_x= 4*m->mb_x;
-    int list;
-
-    if(!USES_LIST(mb_type, 0))
-        fill_rectangle(&mrs->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
-
-    for(list=0; list<s->list_count; list++){
-        int y;
-        int16_t (*mv_dst)[2];
-        int16_t (*mv_src)[2];
-
-        if(!USES_LIST(mb_type, list))
-            continue;
-
-        mv_dst   = &mrs->motion_val[list][b_x];
-        mv_src   = &mrs->mv_cache[list][scan8[0]];
-        for(y=0; y<4; y++){
-            AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
-        }
-
-        {
-            int8_t *ref_index = &mrs->ref_index[list][b8_x];
-            ref_index[0+0*2]= mrs->ref_cache[list][scan8[0]];
-            ref_index[1+0*2]= mrs->ref_cache[list][scan8[4]];
-            ref_index[0+1*2]= mrs->ref_cache[list][scan8[8]];
-            ref_index[1+1*2]= mrs->ref_cache[list][scan8[12]];
-        }
-    }
-}
-
-
-/**
-* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
-*/
-static int check_intra4x4_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){
-    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
-    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
-    int i;
-
-    if(!(mrs->top_samples_available&0x8000)){
-        for(i=0; i<4; i++){
-            int status= top[ mrs->intra4x4_pred_mode_cache[scan8[0] + i] ];
-            if(status<0){
-                av_log(AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y);
-                return -1;
-            } else if(status){
-                mrs->intra4x4_pred_mode_cache[scan8[0] + i]= status;
-            }
-        }
-    }
-
-    if((mrs->left_samples_available&0x8888)!=0x8888){
-        static const int mask[4]={0x8000,0x2000,0x80,0x20};
-        for(i=0; i<4; i++){
-            if(!(mrs->left_samples_available&mask[i])){
-                int status= left[ mrs->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
-                if(status<0){
-                    av_log(AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y);
-                    return -1;
-                } else if(status){
-                    mrs->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
-* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
-*/
-static int check_intra_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mode){
-    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
-    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
-
-    if(mode > 6) {
-        av_log(AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y);
-        return -1;
-    }
-
-    if(!(mrs->top_samples_available&0x8000)){
-        mode= top[ mode ];
-        if(mode<0){
-            av_log(AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y);
-            return -1;
-        }
-    }
-
-    if((mrs->left_samples_available&0x8080) != 0x8080){
-        mode= left[ mode ];
-        if(mrs->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
-            mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(mrs->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
-        }
-        if(mode<0){
-            av_log(AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y);
-            return -1;
-        }
-    }
-    return mode;
-}
-
-/**
- * gets the predicted intra4x4 prediction mode.
- */
-static inline int pred_intra_mode(MBRecContext *mrc, MBRecState *mrs, int n){
-    const int index8= scan8[n];
-    const int left= mrs->intra4x4_pred_mode_cache[index8 - 1];
-    const int top = mrs->intra4x4_pred_mode_cache[index8 - 8];
-    const int min= FFMIN(left, top);
-
-    if(min<0) return DC_PRED;
-    else      return min;
-}
-
-static void write_back_intra_pred_mode_rec(MBRecContext *mrc, MBRecState *mrs, H264Mb *m, int mb_x){
-    int8_t *mode= &mrs->intra4x4_pred_mode[4*mb_x];
-
-    AV_COPY32(mode, mrs->intra4x4_pred_mode_cache + 4 + 8*4);
-#if OMPSS
-    if (m->mb_x < mrc->mb_width-1){
-        H264Mb *mr= m+1;
-        mode = mr->intra4x4_pred_mode_left;
-        mode[0]= mrs->intra4x4_pred_mode_cache[7+8*1];
-        mode[1]= mrs->intra4x4_pred_mode_cache[7+8*2];
-        mode[2]= mrs->intra4x4_pred_mode_cache[7+8*3];
-        mode[3]= mrs->intra4x4_pred_mode_cache[7+8*4];
-    }
-#else
-    mode = mrs->intra4x4_pred_mode_left;
-    mode[0]= mrs->intra4x4_pred_mode_cache[7+8*1];
-    mode[1]= mrs->intra4x4_pred_mode_cache[7+8*2];
-    mode[2]= mrs->intra4x4_pred_mode_cache[7+8*3];
-    mode[3]= mrs->intra4x4_pred_mode_cache[7+8*4];
-#endif
-}
-
-static void pred_spatial_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){
-    int b4_stride = mrc->b_stride;
-    const int mb_x = m->mb_x;
-    int mb_type_col[2];
-    const int16_t (*l1mv0)[2], (*l1mv1)[2];
-    const int8_t *l1ref0, *l1ref1;
-    const int is_b8x8 = IS_8X8(*mb_type);
-    unsigned int sub_mb_type= MB_TYPE_L0L1;
-    int i8, i4;
-    int ref[2];
-    int mv[2];
-    int list;
-
-    //assert(h->ref_list[1][0].reference&3);
-
-#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
-
-    /* ref = min(neighbors) */
-    for(list=0; list<2; list++){
-        int left_ref = mrs->ref_cache[list][scan8[0] - 1];
-        int top_ref  = mrs->ref_cache[list][scan8[0] - 8];
-        int refc = mrs->ref_cache[list][scan8[0] - 8 + 4];
-        const int16_t *C= mrs->mv_cache[list][ scan8[0] - 8 + 4];
-        if(refc == PART_NOT_AVAILABLE){
-            refc = mrs->ref_cache[list][scan8[0] - 8 - 1];
-            C    = mrs->mv_cache[list][scan8[0] - 8 - 1];
-        }
-        ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc);
-        if(ref[list] >= 0){
-            //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
-            const int16_t * const A= mrs->mv_cache[list][ scan8[0] - 1 ];
-            const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ];
-
-            int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
-            if(match_count > 1){ //most common
-                mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]),
-                                     mid_pred(A[1], B[1], C[1]) );
-            }else {
-                assert(match_count==1);
-                if(left_ref==ref[list]){
-                    mv[list]= AV_RN32A(A);
-                }else if(top_ref==ref[list]){
-                    mv[list]= AV_RN32A(B);
-                }else{
-                    mv[list]= AV_RN32A(C);
-                }
-            }
-        }else{
-            int mask= ~(MB_TYPE_L0 << (2*list));
-            mv[list] = 0;
-            ref[list] = -1;
-            if(!is_b8x8)
-                *mb_type &= mask;
-            sub_mb_type &= mask;
-        }
-    }
-
-    if(ref[0] < 0 && ref[1] < 0){
-        ref[0] = ref[1] = 0;
-        if(!is_b8x8)
-            *mb_type |= MB_TYPE_L0L1;
-        sub_mb_type |= MB_TYPE_L0L1;
-    }
-
-    if(!(is_b8x8|mv[0]|mv[1])){
-        fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
-        fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
-        fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
-        fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
-        *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
-        return;
-    }
-
-    mb_type_col[0] =
-    mb_type_col[1] = mrs->list1_mb_type[mb_x];
-
-    sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
-    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
-        *mb_type   |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */
-    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
-        *mb_type   |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
-    }else{
-        if(!s->direct_8x8_inference_flag){
-            /* FIXME save sub mb types from previous frames (or derive from MVs)
-            * so we know exactly what block size to use */
-            sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */
-        }
-        *mb_type   |= MB_TYPE_8x8;
-    }
-
-    l1mv0  = (void *) &mrs->list1_motion_val[0][4*mb_x];
-    l1mv1  = (void *) &mrs->list1_motion_val[1][4*mb_x];
-    l1ref0 = &mrs->list1_ref_index [0][4*mb_x];
-    l1ref1 = &mrs->list1_ref_index [1][4*mb_x];
-//     if(!b8_stride){
-//         if(m->mb_y&1){
-//             l1ref0 += 2;
-//             l1ref1 += 2;
-//             l1mv0  +=  2*b4_stride;
-//             l1mv1  +=  2*b4_stride;
-//         }
-//     }
-
-    if(IS_16X16(*mb_type)){
-        int a,b;
-
-        fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
-        fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
-        if(!IS_INTRA(mb_type_col[0]) && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
-            || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
-            ))){
-            a=b=0;
-            if(ref[0] > 0)
-                a= mv[0];
-            if(ref[1] > 0)
-                b= mv[1];
-        }else{
-            a= mv[0];
-            b= mv[1];
-        }
-        fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
-        fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
-    }else{
-        int n=0;
-        for(i8=0; i8<4; i8++){
-            const int x8 = i8&1;
-            const int y8 = i8>>1;
-
-            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
-                continue;
-            m->sub_mb_type[i8] = sub_mb_type;
-
-            fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4);
-            fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4);
-            fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
-            fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
-
-            /* col_zero_flag */
-            if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 ))
-                ){
-                const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1;
-                if(IS_SUB_8X8(sub_mb_type)){
-                    const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
-                    if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
-                        if(ref[0] == 0)
-                            fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
-                        if(ref[1] == 0)
-                            fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
-                        n+=4;
-                    }
-                }else{
-                    int k=0;
-                    for(i4=0; i4<4; i4++){
-                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
-                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
-                            if(ref[0] == 0)
-                                AV_ZERO32(mrs->mv_cache[0][scan8[i8*4+i4]]);
-                            if(ref[1] == 0)
-                                AV_ZERO32(mrs->mv_cache[1][scan8[i8*4+i4]]);
-                            k++;
-                        }
-                    }
-                    if(!(k&3))
-                        m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8;
-                    n+=k;
-                }
-            }
-        }
-        if(!is_b8x8 && !(n&15)){
-            *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
-        }
-    }
-}
-
-static void pred_temp_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){
-    const int mb_x = m->mb_x;
-    int b4_stride = mrc->b_stride;
-    int mb_type_col[2];
-    const int16_t (*l1mv0)[2], (*l1mv1)[2];
-    const int8_t *l1ref0, *l1ref1;
-    const int is_b8x8 = IS_8X8(*mb_type);
-    unsigned int sub_mb_type;
-    int i8, i4;
-    const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]};
-    const int *dist_scale_factor = s->dist_scale_factor;
-
-    mb_type_col[0] =
-    mb_type_col[1] = mrs->list1_mb_type[mb_x];
-
-    sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
-    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
-        *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
-    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
-        *mb_type   |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
-    }else{
-        if(!s->direct_8x8_inference_flag){
-            /* FIXME save sub mb types from previous frames (or derive from MVs)
-            * so we know exactly what block size to use */
-            sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
-        }
-        *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
-    }
-
-    l1mv0  = (void *) &mrs->list1_motion_val[0][4*mb_x];
-    l1mv1  = (void *) &mrs->list1_motion_val[1][4*mb_x];
-    l1ref0 = &mrs->list1_ref_index [0][4*mb_x];
-    l1ref1 = &mrs->list1_ref_index [1][4*mb_x];
-
-    /* one-to-one mv scaling */
-    if(IS_16X16(*mb_type)){
-        int ref, mv0, mv1;
-
-        fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
-        if(IS_INTRA(mb_type_col[0])){
-            ref=mv0=mv1=0;
-        }else{
-            const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
-            : map_col_to_list0[1][l1ref1[0]];
-            const int scale = dist_scale_factor[ref0];
-            const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
-            int mv_l0[2];
-            mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
-            mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
-            ref= ref0;
-            mv0= pack16to32(mv_l0[0],mv_l0[1]);
-            mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
-        }
-        fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
-        fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
-        fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
-    }else{
-        for(i8=0; i8<4; i8++){
-            const int x8 = i8&1;
-            const int y8 = i8>>1;
-            int ref0, scale;
-            const int16_t (*l1mv)[2]= l1mv0;
-
-            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
-                continue;
-            m->sub_mb_type[i8] = sub_mb_type;
-            fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
-            if(IS_INTRA(mb_type_col[0])){
-                fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
-                fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
-                fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
-                continue;
-            }
-
-            ref0 = l1ref0[i8];
-            if(ref0 >= 0)
-                ref0 = map_col_to_list0[0][ref0 ];
-            else{
-                ref0 = map_col_to_list0[1][l1ref1[i8]];
-                l1mv= l1mv1;
-            }
-            scale = dist_scale_factor[ref0];
-
-            fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
-            if(IS_SUB_8X8(sub_mb_type)){
-                const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
-                int mx = (scale * mv_col[0] + 128) >> 8;
-                int my = (scale * mv_col[1] + 128) >> 8;
-                fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
-                fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
-            }else
-            for(i4=0; i4<4; i4++){
-                const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
-                int16_t *mv_l0 = mrs->mv_cache[0][scan8[i8*4+i4]];
-                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
-                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
-                AV_WN32A(mrs->mv_cache[1][scan8[i8*4+i4]],
-                    pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]));
-            }
-        }
-    }
-}
-
-void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){
-    if(s->direct_spatial_mv_pred){
-        pred_spatial_direct_motion_rec(mrc, mrs, s, m, mb_type);
-    }else{
-        pred_temp_direct_motion_rec(mrc, mrs, s, m, mb_type);
-    }
-}
-
-static inline int fetch_diagonal_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, const int16_t **C, int i, int list, int part_width){
-    const int topright_ref= mrs->ref_cache[list][ i - 8 + part_width ];
-
-    if(topright_ref != PART_NOT_AVAILABLE){
-        *C= mrs->mv_cache[list][ i - 8 + part_width ];
-        return topright_ref;
-    }else{
-        *C= mrs->mv_cache[list][ i - 8 - 1 ];
-        return mrs->ref_cache[list][ i - 8 - 1 ];
-    }
-}
-
-/**
- * gets the predicted MV.
- * @param n the block index
- * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
- * @param mx the x component of the predicted motion vector
- * @param my the y component of the predicted motion vector
- */
-static inline void pred_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int part_width, int list, int ref, int * const mx, int * const my){
-    const int index8= scan8[n];
-    const int top_ref=      mrs->ref_cache[list][ index8 - 8 ];
-    const int left_ref=     mrs->ref_cache[list][ index8 - 1 ];
-    const int16_t * const A= mrs->mv_cache[list][ index8 - 1 ];
-    const int16_t * const B= mrs->mv_cache[list][ index8 - 8 ];
-    const int16_t * C;
-    int diagonal_ref, match_count;
-
-    assert(part_width==1 || part_width==2 || part_width==4);
-
-/* mv_cache
-  B . . A T T T T
-  U . . L . . , .
-  U . . L . . . .
-  U . . L . . , .
-  . . . L . . . .
-*/
-
-    diagonal_ref= fetch_diagonal_mv(mrc, mrs, s, &C, index8, list, part_width);
-    match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
-
-    if(match_count > 1){ //most common
-        *mx= mid_pred(A[0], B[0], C[0]);
-        *my= mid_pred(A[1], B[1], C[1]);
-    }else if(match_count==1){
-        if(left_ref==ref){
-            *mx= A[0];
-            *my= A[1];
-        }else if(top_ref==ref){
-            *mx= B[0];
-            *my= B[1];
-        }else{
-            *mx= C[0];
-            *my= C[1];
-        }
-    }else{
-        if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
-            *mx= A[0];
-            *my= A[1];
-        }else{
-            *mx= mid_pred(A[0], B[0], C[0]);
-            *my= mid_pred(A[1], B[1], C[1]);
-        }
-    }
-
-}
-
-/**
- * gets the directionally predicted 16x8 MV.
- * @param n the block index
- * @param mx the x component of the predicted motion vector
- * @param my the y component of the predicted motion vector
- */
-static inline void pred_16x8_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){
-    if(n==0){
-        const int top_ref=      mrs->ref_cache[list][ scan8[0] - 8 ];
-        const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ];
-
-        if(top_ref == ref){
-            *mx= B[0];
-            *my= B[1];
-            return;
-        }
-    }else{
-        const int left_ref=     mrs->ref_cache[list][ scan8[8] - 1 ];
-        const int16_t * const A= mrs->mv_cache[list][ scan8[8] - 1 ];
-
-        if(left_ref == ref){
-            *mx= A[0];
-            *my= A[1];
-            return;
-        }
-    }
-
-    //RARE
-    pred_motion(mrc, mrs, s, n, 4, list, ref, mx, my);
-}
-
-/**
- * gets the directionally predicted 8x16 MV.
- * @param n the block index
- * @param mx the x component of the predicted motion vector
- * @param my the y component of the predicted motion vector
- */
-static inline void pred_8x16_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){
-    if(n==0){
-        const int left_ref=      mrs->ref_cache[list][ scan8[0] - 1 ];
-        const int16_t * const A=  mrs->mv_cache[list][ scan8[0] - 1 ];
-
-        if(left_ref == ref){
-            *mx= A[0];
-            *my= A[1];
-            return;
-        }
-    }else{
-        const int16_t * C;
-        int diagonal_ref;
-
-        diagonal_ref= fetch_diagonal_mv(mrc, mrs, s, &C, scan8[4], list, 2);
-        if(diagonal_ref == ref){
-            *mx= C[0];
-            *my= C[1];
-            return;
-        }
-    }
-
-    //RARE
-    pred_motion(mrc, mrs, s, n, 2, list, ref, mx, my);
-}
-
-static inline void pred_pskip_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb * m, int * const mx, int * const my){
-    const int top_ref = mrs->ref_cache[0][ scan8[0] - 8 ];
-    const int left_ref= mrs->ref_cache[0][ scan8[0] - 1 ];
-
-    if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
-       || !( top_ref | AV_RN32A(mrs->mv_cache[0][ scan8[0] - 8 ]))
-       || !(left_ref | AV_RN32A(mrs->mv_cache[0][ scan8[0] - 1 ]))){
-
-        *mx = *my = 0;
-        return;
-    }
-
-    pred_motion(mrc, mrs, s, 0, 4, 0, 0, mx, my);
-
-    return;
-}
-
-#define ADD_MVD(list) \
-{ \
-    mx += m->mvd[list][mp][0]; \
-    my += m->mvd[list][mp][1]; \
-    mp++; \
-}
-
-int pred_motion_mb_rec (MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){
-    int mp=0;
-    int mb_type = m->mb_type;
-    const int mb_x = m->mb_x;
-
-//     mrc->m =m;
-
-    fill_decode_caches_rec(mrc, mrs, s, m, mb_type);
-    if (IS_SKIP(mb_type)){
-        mb_type=0;
-
-        if( s->slice_type_nos == FF_B_TYPE )
-        {
-            mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
-            ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type);
-        }
-        else
-        {
-            int mx, my;
-
-            mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; //FIXME check required
-            pred_pskip_motion(mrc, mrs, s, m, &mx, &my);
-            fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
-            fill_rectangle(mrs->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
-        }
-
-        write_back_motion_rec(mrc, mrs, s, m, mb_type);
-        m->mb_type = mrs->mb_type[mb_x]= mb_type;
-        return 0;
-    }
-
-
-    if (IS_INTRA_PCM(mb_type)){
-        mrs->mb_type[mb_x] =  mb_type;
-        return 0;
-    }
-    else if (IS_INTRA(mb_type)){
-        int i, pred_mode;
-
-        if( IS_INTRA4x4( mb_type ) ) {
-            if ( IS_8x8DCT(mb_type) ) {
-                for( i = 0; i < 16; i+=4 ) {
-                    int pred = pred_intra_mode(mrc, mrs, i );
-                    int mode = m->intra4x4_pred_mode[i];
-
-                    mode = mode < 0 ?  pred : mode + ( mode >= pred );
-                    fill_rectangle( &mrs->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
-                }
-            } else {
-                for( i = 0; i < 16; i++ ) {
-                    int pred = pred_intra_mode(mrc, mrs, i );
-                    int mode = m->intra4x4_pred_mode[i];
-                    mode = mode < 0 ?  pred : mode + ( mode >= pred );
-                    mrs->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
-                }
-            }
-            write_back_intra_pred_mode_rec(mrc, mrs, m, mb_x);
-            if( check_intra4x4_pred_mode(mrc, mrs, s, m) < 0 ) return -1;
-        } else {
-            m->intra16x16_pred_mode= check_intra_pred_mode(mrc, mrs, s, m, m->intra16x16_pred_mode );
-            if( m->intra16x16_pred_mode < 0 ) return -1;
-        }
-
-        pred_mode = m->chroma_pred_mode;
-        pred_mode= check_intra_pred_mode( mrc, mrs, s, m, pred_mode );
-        if( pred_mode < 0 ) return -1;
-        m->chroma_pred_mode= pred_mode;
-
-    }
-    else if (IS_8X8(mb_type)){
-        int i, j, list;
-
-        if( s->slice_type_nos == FF_B_TYPE ) {
-            if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] |
-                            m->sub_mb_type[2] | m->sub_mb_type[3]) ) {
-                ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type);
-                mrs->ref_cache[0][scan8[4]] =
-                mrs->ref_cache[1][scan8[4]] =
-                mrs->ref_cache[0][scan8[12]] =
-                mrs->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
-            }
-        }
-
-        for(list=0; list<s->list_count; list++){
-            for(i=0; i<4; i++){
-                if(IS_DIRECT(m->sub_mb_type[i])){
-                    mrs->ref_cache[list][ scan8[4*i]   ]=mrs->ref_cache[list][ scan8[4*i]+1 ];
-                    continue;
-                } else {
-                    mrs->ref_cache[list][ scan8[4*i]   ]=mrs->ref_cache[list][ scan8[4*i]+1 ]=
-                    mrs->ref_cache[list][ scan8[4*i]+8 ]=mrs->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i];
-
-                    if(IS_DIR(m->sub_mb_type[i], 0, list) ){
-                        const int sub_mb_type= m->sub_mb_type[i];
-                        const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
-
-                        int sub_partition_count = IS_SUB_8X8(sub_mb_type) ? 1 : (IS_SUB_4X4(sub_mb_type)? 4 :2);
-                        for(j=0; j<sub_partition_count; j++){
-                            int mx, my;
-                            const int index= 4*i + block_width*j;
-                            int16_t (* mv_cache)[2]= &mrs->mv_cache[list][ scan8[index]];
-                            pred_motion(mrc, mrs, s, index, block_width, list, mrs->ref_cache[list][ scan8[index] ], &mx, &my);
-
-                            ADD_MVD(list)
-
-                            if(IS_SUB_8X8(sub_mb_type)){
-                                mv_cache[ 1 ][0]=
-                                mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
-                                mv_cache[ 1 ][1]=
-                                mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
-                            }else if(IS_SUB_8X4(sub_mb_type)){
-                                mv_cache[ 1 ][0]= mx;
-                                mv_cache[ 1 ][1]= my;
-                            }else if(IS_SUB_4X8(sub_mb_type)){
-                                mv_cache[ 8 ][0]= mx;
-                                mv_cache[ 8 ][1]= my;
-                            }
-                            mv_cache[ 0 ][0]= mx;
-                            mv_cache[ 0 ][1]= my;
-                        }
-                    }else{
-                        fill_rectangle(mrs->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4);
-                    }
-                }
-            }
-        }
-    } else if( IS_DIRECT(mb_type) ) {
-        mb_type &= ~MB_TYPE_16x16;  //FIXME not nice
-        ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type);
-    }
-    else {
-        int list, i;
-        if(IS_16X16(mb_type)){
-            for(list=0; list<s->list_count; list++){
-                if(IS_DIR(mb_type, 0, list)){
-                    int ref;
-                    int mx,my;
-
-                    ref = m->ref_index[list][0];
-                    fill_rectangle(&mrs->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
-                    pred_motion(mrc, mrs, s, 0, 4, list, mrs->ref_cache[list][ scan8[0] ], &mx, &my);
-                    ADD_MVD(list)
-                    fill_rectangle(mrs->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
-                }
-            }
-        }
-        else if(IS_16X8(mb_type)){
-            for(list=0; list<s->list_count; list++){
-                for(i=0; i<2; i++){
-                    if(IS_DIR(mb_type, i, list)){
-                        int ref;
-                        int mx,my;
-                        ref = m->ref_index[list][i];
-                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
-
-                        pred_16x8_motion(mrc, mrs, s, 8*i, list, mrs->ref_cache[list][scan8[0] + 16*i], &mx, &my);
-                        ADD_MVD(list)
-
-                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
-                    }else{
-                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
-                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
-                    }
-                }
-            }
-
-        }else{
-            assert(IS_8X16(mb_type));
-
-            for(list=0; list<s->list_count; list++){
-                for(i=0; i<2; i++){
-                    if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                        int ref;
-                        int mx,my;
-                        ref = m->ref_index[list][i];
-                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
-                        pred_8x16_motion(mrc, mrs, s, i*4, list, mrs->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
-                        ADD_MVD(list)
-                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
-                    }else{
-                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
-                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
-                    }
-                }
-            }
-        }
-    }
-
-    if (IS_INTER(mb_type)||(IS_DIRECT(mb_type)))
-        write_back_motion_rec(mrc, mrs, s, m, mb_type);
-    m->mb_type = mrs->mb_type[mb_x]= mb_type;
-
-    return 0;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-#ifndef H264_DIRECT_H
-#define H264_DIRECT_H
-
-#include "h264_types.h"
-
-void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int *mb_type);
-int pred_motion_mb_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m);
-
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_ps.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_ps.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,462 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... parameter set decoding
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 parameter set decoding.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "dsputil.h"
-#include "avcodec.h"
-#include "h264_types.h"
-#include "h264_data.h"
-#include "golomb.h"
-
-
-//#undef NDEBUG
-#include <assert.h>
-
-static const int pixel_aspect[17][2]={
- {0, 1},
- {1, 1},
- {12, 11},
- {10, 11},
- {16, 11},
- {40, 33},
- {24, 11},
- {20, 11},
- {32, 11},
- {80, 33},
- {18, 11},
- {15, 11},
- {64, 33},
- {160,99},
- {4, 3},
- {3, 2},
- {2, 1},
-};
-
-const uint8_t ff_h264_chroma_qp[52]={
-    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,
-   12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,
-   28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37,
-   37,38,38,38,39,39,39,39
-};
-
-static const uint8_t default_scaling4[2][16]={
-{   6,13,20,28,
-   13,20,28,32,
-   20,28,32,37,
-   28,32,37,42
-},{
-   10,14,20,24,
-   14,20,24,27,
-   20,24,27,30,
-   24,27,30,34
-}};
-
-static const uint8_t default_scaling8[2][64]={
-{   6,10,13,16,18,23,25,27,
-   10,11,16,18,23,25,27,29,
-   13,16,18,23,25,27,29,31,
-   16,18,23,25,27,29,31,33,
-   18,23,25,27,29,31,33,36,
-   23,25,27,29,31,33,36,38,
-   25,27,29,31,33,36,38,40,
-   27,29,31,33,36,38,40,42
-},{
-    9,13,15,17,19,21,22,24,
-   13,13,17,19,21,22,24,25,
-   15,17,19,21,22,24,25,27,
-   17,19,21,22,24,25,27,28,
-   19,21,22,24,25,27,28,30,
-   21,22,24,25,27,28,30,32,
-   22,24,25,27,28,30,32,33,
-   24,25,27,28,30,32,33,35
-}};
-
-static inline int decode_hrd_parameters(GetBitContext *gb, SPS *sps){
-    int cpb_count, i;
-    cpb_count = get_ue_golomb_31(gb) + 1;
-
-    if(cpb_count > 32){
-        av_log(AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
-        return -1;
-    }
-
-    get_bits(gb, 4); /* bit_rate_scale */
-    get_bits(gb, 4); /* cpb_size_scale */
-    for(i=0; i<cpb_count; i++){
-        get_ue_golomb(gb); /* bit_rate_value_minus1 */
-        get_ue_golomb(gb); /* cpb_size_value_minus1 */
-        get_bits1(gb);     /* cbr_flag */
-    }
-    sps->initial_cpb_removal_delay_length = get_bits(gb, 5) + 1;
-    sps->cpb_removal_delay_length = get_bits(gb, 5) + 1;
-    sps->dpb_output_delay_length = get_bits(gb, 5) + 1;
-    sps->time_offset_length = get_bits(gb, 5);
-    sps->cpb_cnt = cpb_count;
-    return 0;
-}
-
-static inline int decode_vui_parameters(GetBitContext *gb, SPS *sps){
-    int aspect_ratio_info_present_flag;
-    unsigned int aspect_ratio_idc;
-
-    aspect_ratio_info_present_flag= get_bits1(gb);
-
-    if( aspect_ratio_info_present_flag ) {
-        aspect_ratio_idc= get_bits(gb, 8);
-        if( aspect_ratio_idc == EXTENDED_SAR ) {
-            sps->num= get_bits(gb, 16);
-            sps->den= get_bits(gb, 16);
-        }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(int[2])){
-            //sps->sar=  pixel_aspect[aspect_ratio_idc];
-        }else{
-            av_log( AV_LOG_ERROR, "illegal aspect ratio idc %d\n", aspect_ratio_idc);
-         //   return -1;
-        }
-    }else{
-        sps->num=
-        sps->den= 0;
-    }
-
-    if(get_bits1(gb)){      /* overscan_info_present_flag */
-        get_bits1(gb);      /* overscan_appropriate_flag */
-    }
-
-    sps->video_signal_type_present_flag = get_bits1(gb);
-    if(sps->video_signal_type_present_flag){
-        get_bits(gb, 3);    /* video_format */
-        sps->full_range = get_bits1(gb); /* video_full_range_flag */
-
-        sps->colour_description_present_flag = get_bits1(gb);
-        if(sps->colour_description_present_flag){
-            sps->color_primaries = get_bits(gb, 8); /* colour_primaries */
-            sps->color_trc       = get_bits(gb, 8); /* transfer_characteristics */
-            sps->colorspace      = get_bits(gb, 8); /* matrix_coefficients */
-            if (sps->color_primaries >= AVCOL_PRI_NB)
-                sps->color_primaries  = AVCOL_PRI_UNSPECIFIED;
-            if (sps->color_trc >= AVCOL_TRC_NB)
-                sps->color_trc  = AVCOL_TRC_UNSPECIFIED;
-            if (sps->colorspace >= AVCOL_SPC_NB)
-                sps->colorspace  = AVCOL_SPC_UNSPECIFIED;
-        }
-    }
-
-    if(get_bits1(gb)){      /* chroma_location_info_present_flag */
-        av_log(AV_LOG_ERROR, "chroma_location_info_present_flag found, but not supported\n");
-        (void) (get_ue_golomb(gb)+1);  /* chroma_sample_location_type_top_field */
-        (void) get_ue_golomb(gb);  /* chroma_sample_location_type_bottom_field */
-    }
-
-    sps->timing_info_present_flag = get_bits1(gb);
-    if(sps->timing_info_present_flag){
-        sps->num_units_in_tick = get_bits_long(gb, 32);
-        sps->time_scale = get_bits_long(gb, 32);
-        if(!sps->num_units_in_tick || !sps->time_scale){
-            av_log(AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick);
-            return -1;
-        }
-        sps->fixed_frame_rate_flag = get_bits1(gb);
-    }
-
-    sps->nal_hrd_parameters_present_flag = get_bits1(gb);
-    if(sps->nal_hrd_parameters_present_flag)
-        if(decode_hrd_parameters(gb, sps) < 0)
-            return -1;
-    sps->vcl_hrd_parameters_present_flag = get_bits1(gb);
-    if(sps->vcl_hrd_parameters_present_flag)
-        if(decode_hrd_parameters(gb, sps) < 0)
-            return -1;
-    if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
-        get_bits1(gb);     /* low_delay_hrd_flag */
-    sps->pic_struct_present_flag = get_bits1(gb);
-
-    sps->bitstream_restriction_flag = get_bits1(gb);
-    if(sps->bitstream_restriction_flag){
-        get_bits1(gb);     /* motion_vectors_over_pic_boundaries_flag */
-        get_ue_golomb(gb); /* max_bytes_per_pic_denom */
-        get_ue_golomb(gb); /* max_bits_per_mb_denom */
-        get_ue_golomb(gb); /* log2_max_mv_length_horizontal */
-        get_ue_golomb(gb); /* log2_max_mv_length_vertical */
-        sps->num_reorder_frames= get_ue_golomb(gb);
-        get_ue_golomb(gb); /*max_dec_frame_buffering*/
-
-        if(sps->num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
-            av_log(AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
-            return -1;
-        }
-    }
-
-    return 0;
-}
-
-static void decode_scaling_list(GetBitContext *gb, uint8_t *factors, int size, const uint8_t *jvt_list, const uint8_t *fallback_list){
-    int i, last = 8, next = 8;
-    const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
-    if(!get_bits1(gb)) /* matrix not written, we use the predicted one */
-        memcpy(factors, fallback_list, size*sizeof(uint8_t));
-    else
-    for(i=0;i<size;i++){
-        if(next)
-            next = (last + get_se_golomb(gb)) & 0xff;
-        if(!i && !next){ /* matrix not written, we use the preset one */
-            memcpy(factors, jvt_list, size*sizeof(uint8_t));
-            break;
-        }
-        last = factors[scan[i]] = next ? next : last;
-    }
-}
-
-static void decode_scaling_matrices(GetBitContext *gb, SPS *sps, PPS *pps, int is_sps,
-                                   uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
-    int fallback_sps = !is_sps && sps->scaling_matrix_present;
-    const uint8_t *fallback[4] = {
-        fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
-        fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
-        fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
-        fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
-    };
-    if(get_bits1(gb)){
-        sps->scaling_matrix_present |= is_sps;
-        decode_scaling_list(gb, scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
-        decode_scaling_list(gb, scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
-        decode_scaling_list(gb, scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
-        decode_scaling_list(gb, scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
-        decode_scaling_list(gb, scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
-        decode_scaling_list(gb, scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
-        if(is_sps || pps->transform_8x8_mode){
-            decode_scaling_list(gb, scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
-            decode_scaling_list(gb, scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
-        }
-    }
-}
-
-int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb){
-    int profile_idc, level_idc;
-    unsigned int sps_id;
-    int i;
-    SPS *sps;
-
-    profile_idc= get_bits(gb, 8);
-    get_bits1(gb);   //constraint_set0_flag
-    get_bits1(gb);   //constraint_set1_flag
-    get_bits1(gb);   //constraint_set2_flag
-    get_bits1(gb);   //constraint_set3_flag
-    get_bits(gb, 4); // reserved
-    level_idc= get_bits(gb, 8);
-    sps_id= get_ue_golomb_31(gb);
-
-    if(sps_id >= MAX_SPS_COUNT) {
-        av_log(AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
-        return -1;
-    }
-    if (!n->sps_buffers[sps_id])
-        n->sps_buffers[sps_id]= av_mallocz(sizeof(SPS));
-        
-    sps = n->sps_buffers[sps_id];
-    if(sps == NULL)
-        return -1;
-
-    sps->profile_idc= profile_idc;
-    sps->level_idc= level_idc;
-
-    memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
-    memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
-    sps->scaling_matrix_present = 0;
-
-    if(sps->profile_idc >= 100){ //high profile
-        sps->chroma_format_idc= get_ue_golomb_31(gb);
-        if(sps->chroma_format_idc == 3)
-            sps->residual_color_transform_flag = get_bits1(gb);
-        sps->bit_depth_luma   = get_ue_golomb(gb) + 8;
-        sps->bit_depth_chroma = get_ue_golomb(gb) + 8;
-        sps->transform_bypass = get_bits1(gb);
-        decode_scaling_matrices(gb, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
-    }else{
-        sps->chroma_format_idc= 1;
-        sps->bit_depth_luma   = 8;
-        sps->bit_depth_chroma = 8;
-    }
-
-    sps->log2_max_frame_num= get_ue_golomb(gb) + 4;
-    sps->poc_type= get_ue_golomb_31(gb);
-
-    if(sps->poc_type == 0){ //FIXME #define
-        sps->log2_max_poc_lsb= get_ue_golomb(gb) + 4;
-    } else if(sps->poc_type == 1){//FIXME #define
-        sps->delta_pic_order_always_zero_flag= get_bits1(gb);
-        sps->offset_for_non_ref_pic= get_se_golomb(gb);
-        sps->offset_for_top_to_bottom_field= get_se_golomb(gb);
-        sps->poc_cycle_length                = get_ue_golomb(gb);
-
-        if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
-            av_log(AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
-            goto fail;
-        }
-
-        for(i=0; i<sps->poc_cycle_length; i++)
-            sps->offset_for_ref_frame[i]= get_se_golomb(gb);
-    }else if(sps->poc_type != 2){
-        av_log(AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
-        goto fail;
-    }
-
-    sps->ref_frame_count= get_ue_golomb_31(gb);
-    if(sps->ref_frame_count >= 32){
-        av_log(AV_LOG_ERROR, "too many reference frames\n");
-        goto fail;
-    }
-    sps->gaps_in_frame_num_allowed_flag= get_bits1(gb);
-    sps->mb_width = get_ue_golomb(gb) + 1;
-    sps->mb_height= get_ue_golomb(gb) + 1;
-
-
-    sps->frame_mbs_only_flag= get_bits1(gb);
-    if(!sps->frame_mbs_only_flag){
-        av_log(AV_LOG_ERROR, "MBAFF support not included\n");
-        get_bits1(gb);
-    }else
-        sps->mb_aff= 0;
-
-    sps->direct_8x8_inference_flag= get_bits1(gb);
-    if(!sps->frame_mbs_only_flag && !sps->direct_8x8_inference_flag){
-        av_log(AV_LOG_ERROR, "This stream was generated by a broken encoder, invalid 8x8 inference\n");
-        goto fail;
-    }
-
-    sps->crop= get_bits1(gb);
-    if(sps->crop){
-		sps->crop_left = get_ue_golomb(gb);
-		sps->crop_right = get_ue_golomb(gb);
-		sps->crop_top = get_ue_golomb(gb);
-		sps->crop_bottom= get_ue_golomb(gb);
-		if(sps->crop_left || sps->crop_top){
-			av_log( AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
-		}
-		if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
-			av_log( AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
-		}
-	}else {
-	
-		sps->crop_left  =
-		sps->crop_right =
-		sps->crop_top   =
-		sps->crop_bottom= 0;
-	}
-
-    sps->vui_parameters_present_flag= get_bits1(gb);
-    if( sps->vui_parameters_present_flag )
-        if (decode_vui_parameters(gb, sps) < 0)
-            goto fail;
-
-    
-    n->sps = *sps;
-
-    if( sps->bitstream_restriction_flag){
-        n->has_b_frames = sps->num_reorder_frames;
-    }
-    else
-        n->has_b_frames= MAX_DELAYED_PIC_COUNT;
-
-    return 0;
-fail:
-    av_free(sps);
-    return -1;
-}
-
-static void
-build_qp_table(PPS *pps, int t, int index)
-{
-    int i;
-    for(i = 0; i < 52; i++)
-        pps->chroma_qp_table[t][i] = ff_h264_chroma_qp[av_clip(i + index, 0, 51)];
-}
-
-int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length){
-    unsigned int pps_id= get_ue_golomb(gb);
-    PPS *pps;
-
-    if(pps_id >= MAX_PPS_COUNT) {
-        av_log(AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
-        return -1;
-    }
-    if (!n->pps_buffers[pps_id])
-        n->pps_buffers[pps_id]= av_mallocz(sizeof(PPS));
-    pps = n->pps_buffers[pps_id];
-    if(pps == NULL)
-        return -1;
-    pps->sps_id= get_ue_golomb_31(gb);
-    if((unsigned)pps->sps_id>=MAX_SPS_COUNT || n->sps_buffers[pps->sps_id] == NULL){
-        av_log(AV_LOG_ERROR, "sps_id out of range\n");
-        goto fail;
-    }
-
-    pps->cabac= get_bits1(gb);
-    pps->pic_order_present= get_bits1(gb);
-    if(pps->pic_order_present){        
-        av_log(AV_LOG_ERROR, "no interlaces support\n");
-    }
-    pps->slice_group_count= get_ue_golomb(gb) + 1;
-    if(pps->slice_group_count > 1 ){
-        pps->mb_slice_group_map_type= get_ue_golomb(gb);
-        av_log(AV_LOG_ERROR, "multiple slices not supported\n");
-    }
-    pps->ref_count[0]= get_ue_golomb(gb) + 1;
-    pps->ref_count[1]= get_ue_golomb(gb) + 1;
-    if(pps->ref_count[0]> 32 || pps->ref_count[1]> 32){
-        av_log(AV_LOG_ERROR, "reference overflow (pps)\n");
-        goto fail;
-    }
-
-    pps->weighted_pred= get_bits1(gb);
-    pps->weighted_bipred_idc= get_bits(gb, 2);
-    pps->init_qp= get_se_golomb(gb) + 26;
-    pps->init_qs= get_se_golomb(gb) + 26;
-    pps->chroma_qp_index_offset[0]= get_se_golomb(gb);
-    pps->deblocking_filter_parameters_present= get_bits1(gb);
-    pps->constrained_intra_pred= get_bits1(gb);
-    pps->redundant_pic_cnt_present = get_bits1(gb);
-
-    pps->transform_8x8_mode= 0;
-    memcpy(pps->scaling_matrix4, n->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
-    memcpy(pps->scaling_matrix8, n->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
-
-    if(get_bits_count(gb) < bit_length){
-        pps->transform_8x8_mode= get_bits1(gb);
-        decode_scaling_matrices(gb, n->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
-        pps->chroma_qp_index_offset[1]= get_se_golomb(gb); //second_chroma_qp_index_offset
-    } else {
-        pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
-    }
-
-    build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
-    build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
-    if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
-        pps->chroma_qp_diff= 1;
-
-    return 0;
-fail:
-    av_free(pps);
-    return -1;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_ps.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_ps.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-#ifndef H264_PS_H
-#define H264_PS_H
-
-#include "h264_types.h"
-
-int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb);
-int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pthread.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_pthread.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,604 +0,0 @@
-#include "config.h"
-
-#include "h264_types.h"
-#include "h264_parser.h"
-#include "h264_nal.h"
-#include "h264_entropy.h"
-#include "h264_rec.h"
-#include "h264_misc.h"
-// #undef NDEBUG
-#include <assert.h>
-#include <pthread.h>
-
-#define XOANON 1
-
-#ifdef XOANON
-static int ed_rec_affinity[40] = { 0,  4,  8, 12, 16, 20, 24, 28, 32, 36,
-                                   1,  5,  9, 13, 17, 21, 25, 29, 33, 37,
-                                   2,  6, 10, 14, 18, 22, 26, 30, 34, 38,
-                                   3,  7, 11, 15, 19, 23, 27, 31, 35, 39 };
-static int ed_rec_smt_aff[80]  = { 0,  40,  4, 44,  8, 48, 12, 52, 16, 56, 20, 60, 24, 64, 28, 68, 32, 72, 36, 76,
-                                   1,  41,  5, 45,  9, 49, 13, 53, 17, 57, 21, 61, 25, 65, 29, 69, 33, 73, 37, 77,
-                                   2,  42,  6, 46, 10, 50, 14, 54, 18, 58, 22, 62, 26, 66, 30, 70, 34, 74, 38, 78,
-                                   3,  43,  7, 47, 11, 51, 15, 55, 19, 59, 23, 63, 27, 67, 31, 71, 35, 75, 39, 79 };
-#else
-static int ed_rec_affinity[10] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9};
-static int ed_rec_smt_aff[20] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, };
-#endif
-
-static int frames=0;
-
-static void notify_one_worker(H264Context *h){
-    pthread_mutex_lock(&h->task_lock);
-    pthread_cond_signal(&h->task_cond);
-    pthread_mutex_unlock(&h->task_lock);
-}
-
-static void notify_all_workers(H264Context *h){
-    pthread_mutex_lock(&h->task_lock);
-    pthread_cond_broadcast(&h->task_cond);
-    pthread_mutex_unlock(&h->task_lock);
-}
-
-static void push_sbe (SliceBufferQueue *sbq, SliceBufferEntry *sbe, int notify ){
-    pthread_mutex_lock(&sbq->lock);
-    while (sbq->cnt >= sbq->size)
-        pthread_cond_wait(&sbq->cond, &sbq->lock);
-    sbq->queue[sbq->fi] = sbe;
-    sbq->cnt++;
-    sbq->fi++; sbq->fi %= sbq->size;
-    if (notify)
-        pthread_cond_signal(&sbq->cond);
-    pthread_mutex_unlock(&sbq->lock);
-}
-
-static SliceBufferEntry* pop_sbe (SliceBufferQueue *sbq, int block){
-    SliceBufferEntry *sbe=NULL;
-
-    pthread_mutex_lock(&sbq->lock);
-    if (block){
-        while (sbq->cnt <= 0)
-            pthread_cond_wait(&sbq->cond, &sbq->lock);
-    }else {
-        if (sbq->cnt <= 0)
-            goto nonblock;
-    }
-    sbe = sbq->queue[sbq->fo];
-    sbq->cnt--;
-    sbq->fo++; sbq->fo %= sbq->size;
-    pthread_cond_signal(&sbq->cond);
-nonblock:
-    pthread_mutex_unlock(&sbq->lock);
-
-    return sbe;
-}
-
-// static void push_rle (RingLineQueue *rlq, SliceBufferEntry *sbe, int line, int notify){
-//
-//     //check for free slots
-//     pthread_mutex_lock(&rlq->wslock);
-//     while (rlq->free <= 0){
-//         pthread_cond_wait(&rlq->wscond, &rlq->wslock);
-//     }
-//     //free slot is available, decrement one in this lock
-//     rlq->free--;
-//     pthread_mutex_unlock(&rlq->wslock);
-//
-//     pthread_mutex_lock(&rlq->swlock);
-//     rlq->queue[rlq->fi]->sbe=sbe;
-//     rlq->queue[rlq->fi]->line=line;
-//     rlq->queue[rlq->fi]->mb_cnt=0;
-//     rlq->fi++; rlq->fi %= rlq->size;
-//     rlq->ready++;
-//     if(notify)
-//         pthread_cond_signal(&rlq->swcond);
-//     pthread_mutex_unlock(&rlq->swlock);
-// }
-
-// static RingLineEntry* pop_rle (RingLineQueue *rlq, int block){
-//     RingLineEntry *rle=NULL;
-//
-//     pthread_mutex_lock(&rlq->swlock);
-//     if (block){
-//         while (rlq->ready <= 0)
-//             pthread_cond_wait(&rlq->swcond, &rlq->swlock);
-//     }else {
-//         if (rlq->ready <= 0)
-//             goto nonblock;
-//     }
-//     rle = rlq->queue[rlq->fo];
-//     rlq->fo++; rlq->fo %= rlq->size;
-//     rlq->ready--;
-// nonblock:
-//     pthread_mutex_unlock(&rlq->swlock);
-//
-//     return rle;
-// }
-//
-// static void rel_rle (RingLineQueue *rlq){
-//     pthread_mutex_lock(&rlq->wslock);
-//     rlq->free++;
-//     pthread_cond_signal(&rlq->wscond);
-//     pthread_mutex_unlock(&rlq->wslock);
-// }
-
-static RingLineEntry* pop_rle (SliceBufferQueue *sbq, RingLineQueue *rlq, int *has_token){
-    RingLineEntry *rle=NULL;
-    SliceBufferEntry *sbe=NULL;
-    int line=-1;
-
-    pthread_mutex_lock(&sbq->lock);
-    if (sbq->cnt <= 0)
-        goto unlock;
-    sbe = sbq->queue[sbq->fo];
-    line = sbe->lines_taken;
-
-
-    pthread_mutex_lock(&rlq->swlock);
-    if (!*has_token){
-        if (rlq->free <= 0)
-            goto unlock2;
-        rlq->free--;
-        *has_token=1;
-    }
-    rle = rlq->queue[rlq->fo];
-    rlq->fo++; rlq->fo %= rlq->size;
-    rle->sbe=sbe;
-    rle->line = line;
-    rle->mb_cnt =0;
-    if (++sbe->lines_taken >= sbe->lines_total){
-        sbq->cnt--;
-        sbq->fo++; sbq->fo %= sbq->size;
-        pthread_cond_signal(&sbq->cond);
-    }
-unlock2:
-    pthread_mutex_unlock(&rlq->swlock);
-unlock:
-    pthread_mutex_unlock(&sbq->lock);
-
-
-    return rle;
-}
-
-static void rel_rle (RingLineQueue *rlq, int *rec_token){
-    pthread_mutex_lock(&rlq->swlock);
-    rlq->free++;
-    *rec_token=0;
-//     pthread_cond_signal(&rlq->swcond);
-    pthread_mutex_unlock(&rlq->swlock);
-
-}
-
-//get either a entropy or a line reconstruct task
-static void pop_next_task(H264Context *h, SliceBufferEntry **psbe, RingLineEntry **prle, int *rec_token){
-
-    pthread_mutex_lock(&h->task_lock);
-
-    for(;;){
-        if ( (*psbe = pop_sbe(&h->sb_q[ENTROPY], 0)) ){
-            if (*rec_token){
-                rel_rle(&h->rl_q, rec_token);
-                pthread_cond_signal(&h->task_cond);
-            }
-            break;
-        }
-        else if ( (*prle = pop_rle(&h->sb_q[MBDEC], &h->rl_q, rec_token)) )
-            break;
-        pthread_cond_wait(&h->task_cond, &h->task_lock);
-    }
-
-    pthread_mutex_unlock(&h->task_lock);
-}
-
-void *parse_thread(void *arg){
-    H264Context *h = (H264Context *) arg;
-    ParserContext *pc = get_parse_context(h->ifile);
-    NalContext *nc = get_nal_context(h->width, h->height);
-    H264Slice *s;
-    SliceBufferEntry *sbe = NULL;
-
-    while(!pc->final_frame && frames++ <h->num_frames && !h->quit){
-        sbe = get_sb_entry(h);
-
-        av_read_frame_internal(pc, &sbe->gb);
-        s = &sbe->slice;
-
-        decode_nal_units(nc, s, &sbe->gb);
-
-        push_sbe(&h->sb_q[ENTROPY], sbe, 0);
-        notify_one_worker(h);
-    }
-
-    if (!h->no_mbd){
-        sbe = get_sb_entry(h);
-        sbe->state=-1;
-        sbe->slice.coded_pic_num=nc->coded_pic_num;
-        sbe->lines_total=h->threads;
-
-        push_sbe(&h->sb_q[REORDER], sbe, 1);
-    }else{
-        for (int i=0; i<h->threads; i++){
-            sbe = get_sb_entry(h);
-            sbe->state=-1;
-            push_sbe(&h->sb_q[ENTROPY], sbe, 1);
-            notify_one_worker(h);
-        }
-    }
-    free_nal_context(nc);
-    free_parse_context(pc);
-
-    pthread_exit(NULL);
-    return NULL;
-}
-
-int decode_slice_entropy(EntropyContext *ec, SliceBufferEntry *sbe){
-    int i,j;
-    H264Slice *s = &sbe->slice;
-    GetBitContext *gb = &sbe->gb;
-    CABACContext *c = &ec->c;
-    H264Mb *mbs = sbe->mbs;
-
-    if( !s->pps.cabac ){
-        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
-        return -1;
-    }
-
-    init_dequant_tables(s, ec);
-    ec->curr_qscale = s->qscale;
-    ec->last_qscale_diff = 0;
-    ec->chroma_qp[0] = get_chroma_qp( s, 0, s->qscale);
-    ec->chroma_qp[1] = get_chroma_qp( s, 1, s->qscale);
-
-    /* realign */
-    align_get_bits( gb );
-    /* init cabac */
-    ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8);
-
-    ff_h264_init_cabac_states(ec, s, c);
-
-    for(j=0; j<ec->mb_height; j++){
-        init_entropy_buf(ec, s, j);
-        for(i=0; i<ec->mb_width; i++){
-            int eos,ret;
-            H264Mb *m = &mbs[i + j*ec->mb_width];
-            //memset(m, 0, sizeof(H264Mb));
-            m->mb_x=i;
-            m->mb_y=j;
-            ec->m = m;
-
-            ret = ff_h264_decode_mb_cabac(ec, s, c);
-            eos = get_cabac_terminate( c); (void) eos;
-
-            if( ret < 0 || c->bytestream > c->bytestream_end + 2) {
-                av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream);
-                return -1;
-            }
-        }
-    }
-
-    return 0;
-}
-
-static int decode_slice_mb(MBRecContext *d, RingLineEntry *rle, int frames){
-    SliceBufferEntry *sbe= rle->sbe;
-    H264Slice *s = &sbe->slice;
-    H264Mb *mbs = sbe->mbs;
-
-    int mb_width= d->mb_width;
-    int i;
-    const int line = rle->line;
-
-    init_mbrec_context(d, d->mrs, s, line);
-
-    H264Mb *m = &mbs[line*mb_width];
-    d->top=rle->prev_line->top;
-    d->top_next=rle->top;
-
-//     assert(rle->mb_cnt ==0);
-    for(i=0; i< mb_width; i++){
-        if (frames || line>0){
-            while (rle->mb_cnt >= rle->prev_line->mb_cnt -1);
-        }
-        h264_decode_mb_internal( d, d->mrs, s, &m[i]);
-        rle->mb_cnt++;
-    }
-    draw_edges(d, s, line);
-
-    return 0;
-}
-
-// static int decode_slice_mb_static(MBRecContext *d, H264Slice *s, RLThreadContext *r, RLThreadContext *rp,  int frames){
-//     int mb_height= d->mb_height;
-//     int mb_width= d->mb_width;
-//     int thread_num = r->thread_num;
-//     int thread_total = r->thread_total;
-//     int i;
-//     int j = thread_num;
-//
-//     r->mb_cnt=frames* mb_height*mb_width;
-//     for(; j<mb_height; j+=thread_total){
-//         H264Mb *m = &s->mbs[j*mb_width];
-//         for(i=0; i< mb_width; i++){
-//             if (j>0){
-//                 while (r->mb_cnt- (thread_num? 0:mb_width) >= rp->mb_cnt-1);
-//             }
-//             h264_decode_mb_internal(d, s, m++);
-//             r->mb_cnt++;
-//         }
-//         draw_edges(d, s, j);
-//     }
-//     return 0;
-// }
-
-static void *ed_rec_thread(void *arg){
-    H264Context *h =  (H264Context*) arg;
-    EntropyContext *ec=NULL;
-    MBRecContext *mrc=NULL;
-
-    RingLineEntry *rle=NULL;
-    SliceBufferEntry *sbe=NULL;
-    H264Slice *s;
-    int rec_token=0;
-
-    if (!h->no_mbd){
-        mrc = get_mbrec_context(h);
-    }
-    ec = get_entropy_context(h);
-
-    for(;;){
-        pop_next_task(h, &sbe, &rle, &rec_token);
-        if (sbe){
-            if (h->no_mbd && sbe->state<0){
-                break;
-            }
-            if (!sbe->initialized){
-                init_sb_entry(h, sbe);
-            }
-            decode_slice_entropy(ec, sbe);
-
-            if (h->no_mbd){
-                release_sb_entry(h, sbe);
-                sbe=NULL;
-            } else {
-                push_sbe(&h->sb_q[REORDER], sbe, 1);
-            }
-        } else if (rle){
-            if (rle->sbe->state<0)
-                break;
-            s = &rle->sbe->slice;
-
-            decode_slice_mb(mrc, rle, s->coded_pic_num);
-
-            if (rle->line == h->mb_height-1){
-                push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1);
-            }
-            rle->mb_cnt++;
-        }
-    }
-
-    //make sure threads quit in order of rle assignment
-    if (!h->no_mbd){
-        while (rle->prev_line->mb_cnt <= h->mb_width);
-        rel_rle(&h->rl_q, &rec_token);
-        notify_one_worker(h);
-        rle->mb_cnt = h->mb_width +1;
-        if (rle->line == h->threads-1){
-            push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1);
-        }
-
-        free_mbrec_context(mrc);
-    }
-
-    free_entropy_context(ec);
-
-    pthread_exit(NULL);
-    return NULL;
-}
-
-static void *reorder_thread(void *arg){
-    H264Context *h = (H264Context *) arg;
-    int i;
-    SliceBufferEntry *reorder[h->sb_size];
-    SliceBufferEntry *sbe, *next_sbe;
-    H264Slice *s;
-    int reorder_cnt=0;
-    unsigned next_pic_num=0;
-
-    for(;;){
-
-        sbe = pop_sbe(&h->sb_q[REORDER], 1);
-
-        s = &sbe->slice;
-        for(i=reorder_cnt; i>0; i--){
-            if (s->coded_pic_num < reorder[i-1]->slice.coded_pic_num)
-                break;
-            reorder[i]=reorder[i-1];
-        }
-        reorder[i]=sbe;
-
-        while(reorder_cnt>=0){
-            if (next_pic_num!=reorder[reorder_cnt]->slice.coded_pic_num){
-                break;
-            }
-            next_sbe = reorder[reorder_cnt];
-            H264Slice *es = &next_sbe->slice;
-
-            if (next_sbe->state<0)
-                goto end;
-
-            for (int i=0; i<2; i++){
-                for(int j=0; j< es->ref_count[i]; j++){
-                    if (es->ref_list_cpn[i][j] ==-1)
-                        continue;
-                    int k;
-                    for (k=0; k<h->max_dpb_cnt; k++){
-                        if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == es->ref_list_cpn[i][j]){
-                            es->dp_ref_list[i][j] = &h->dpb[k];
-                            break;
-                        }
-                    }
-                }
-            }
-            next_sbe->dp = get_dpb_entry(h, es);
-
-            push_sbe(&h->sb_q[MBDEC], next_sbe, 0);
-            notify_all_workers(h);
-
-//             for (int i=0; i< h->mb_height; i++){
-//                 push_rle(&h->rl_q, next_sbe, i, 0);
-//                 notify_one_worker(h);
-//             }
-
-
-            next_pic_num++;
-            reorder_cnt--;
-        }
-        reorder_cnt++;
-    }
-
-end:
-    {
-        push_sbe(&h->sb_q[MBDEC], next_sbe, 0);
-        notify_all_workers(h);
-        if (h->no_mbd){
-            push_sbe(&h->sb_q[OUTPUT], next_sbe, 1);
-        }
-//         for (int i=0; i< h->threads; i++){
-//             push_rle(&h->rl_q, next_sbe, i, 0);
-//             notify_one_worker(h);
-//         }
-    }
-
-    pthread_exit(NULL);
-    return NULL;
-}
-
-void create_ed_rec_threads(H264Context *h){
-    cpu_set_t cpuset;
-    int* aff;
-
-    if (h->setaff){
-        aff = h->smt ? ed_rec_smt_aff : ed_rec_affinity ;
-        for (int i=0; i<h->threads; i++){
-            pthread_attr_init(&h->ed_rec_attr[i]);
-            CPU_ZERO(&cpuset);
-            CPU_SET(aff[i], &cpuset);
-            pthread_attr_setaffinity_np(&h->ed_rec_attr[i], sizeof(cpu_set_t), &cpuset);
-            pthread_create(&h->ed_rec_thr[i], &h->ed_rec_attr[i], ed_rec_thread, h);
-        }
-    } else {
-        for (int i=0; i<h->threads; i++){
-            pthread_create(&h->ed_rec_thr[i], NULL, ed_rec_thread, h);
-        }
-    }
-}
-
-void join_ed_rec_threads(H264Context *h){
-    for (int i=0; i< h->threads; i++){
-        pthread_join(h->ed_rec_thr[i], NULL);
-    }
-}
-
-void *output_thread(void *arg){
-    H264Context *h = (H264Context *) arg;
-
-    OutputContext *oc = get_output_context( h );
-
-    SliceBufferEntry *sbe = NULL;
-    H264Slice *s=NULL;
-    for(;;) {
-        DecodedPicture *out, *dp;
-        sbe = pop_sbe(&h->sb_q[OUTPUT], 1);
-
-        if (sbe->state <0)
-            break;
-
-        s = &sbe->slice;
-        for (int i=0; i<s->release_cnt; i++){
-            for(int j=0; j<h->max_dpb_cnt; j++){
-                if(h->dpb[j].cpn== s->release_ref_cpn[i]){
-                    release_dpb_entry(h, &h->dpb[j], 2);
-                    break;
-                }
-            }
-        }
-
-        dp=sbe->dp;
-        release_sb_entry(h, sbe);
-
-        out =output_frame(h, oc, dp, h->ofile, h->frame_width, h->frame_height);
-        if (out){
-            release_dpb_entry(h, out, 1);
-        }
-
-        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
-
-    }
-    /* at the end of stream, we must flush the decoder buffers */
-    while (output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height));
-    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
-
-    free_output_context(oc);
-
-    pthread_exit(NULL);
-    return NULL;
-}
-
-/*
-* The following code is the main loop of the file converter
-*/
-int h264_decode_pthread(H264Context *h) {
-    pthread_t parse_thr, reorder_thr, output_thr;
-
-    av_start_timer();
-
-    pthread_create(&parse_thr, NULL, parse_thread, h);
-    if (!h->no_mbd){
-        pthread_create(&reorder_thr, NULL, reorder_thread, h);
-        pthread_create(&output_thr, NULL, output_thread, h);
-    }
-#if HAVE_LIBSDL2
-    pthread_t sdl_thr;
-    if (h->display){
-        pthread_create(&sdl_thr, NULL, sdl_thread, h);
-    }
-#endif
-    create_ed_rec_threads(h);
-
-
-    if (h->rl_side_touch){
-        pthread_mutex_lock(&h->ilock);
-        while (h->init_threads< h->threads)
-            pthread_cond_wait(&h->icond, &h->ilock);
-        pthread_mutex_unlock(&h->ilock);
-
-        pthread_mutex_lock(&h->tlock);
-        h->touch_start =1;
-        pthread_cond_broadcast(&h->tcond);
-        pthread_mutex_unlock(&h->tlock);
-
-        pthread_mutex_lock(&h->tdlock);
-        while (h->touch_done < h->threads)
-            pthread_cond_wait(&h->tdcond, &h->tdlock);
-        pthread_mutex_unlock(&h->tdlock);
-
-        pthread_mutex_lock(&h->slock);
-        h->start =1;
-        pthread_cond_broadcast(&h->scond);
-        pthread_mutex_unlock(&h->slock);
-    }
-    join_ed_rec_threads(h);
-    pthread_join(parse_thr, NULL);
-    if (!h->no_mbd){
-        pthread_join(reorder_thr, NULL);
-        pthread_join(output_thr, NULL);
-    }
-#if HAVE_LIBSDL2
-    if (h->display)
-        signal_sdl_exit(h);
-        pthread_join(sdl_thr, NULL);
-#endif
-
-
-    return 0;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pthread.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_pthread.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-#ifndef H264_PTHREAD_H
-#define H264_PTHREAD_H
-
-#include "h264_types.h"
-
-int decode_B_slice_entropy(EntropyContext *ec, EDSlice *s, EDThreadContext *eb, EDThreadContext *eb_prev);
-int decode_slice_entropy(EntropyContext *hc, EDSlice *s);
-
-void *read_thread(void *arg);
-void *parsenal_thread(void *arg);
-void *mbrec_thread(void *arg);
-void *write_thread(void *arg);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_rec.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_rec.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,412 +0,0 @@
-#include "config.h"
-
-#include "dsputil.h"
-#include "h264_types.h"
-#include "h264_data.h"
-#include "h264_mc.h"
-#include "h264_deblock.h"
-#include "h264_pred_mode.h"
-//#undef NDEBUG
-#include <assert.h>
-
-void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line){
-    DecodedPicture *pic = s->curr_pic;
-    int mb_stride = mrc->mb_stride;
-    int mb_width = mrc->mb_width;
-    mrs->mb_type_top = pic->mb_type + (line -1)*mb_stride;
-    mrs->mb_type = pic->mb_type + line*mb_stride;
-    mrs->ref_index_top[0] = pic->ref_index[0] + 4*(line -1)*mb_stride;
-    mrs->ref_index_top[1] = pic->ref_index[1] + 4*(line -1)*mb_stride;
-    mrs->ref_index[0] = pic->ref_index[0] + 4*line*mb_stride;
-    mrs->ref_index[1] = pic->ref_index[1] + 4*line*mb_stride;
-
-    mrs->motion_val_top[0] = pic->motion_val[0] + 4*mb_width*4*(line-1);
-    mrs->motion_val_top[1] = pic->motion_val[1] + 4*mb_width*4*(line-1);
-    mrs->motion_val[0] = pic->motion_val[0] + 4*mb_width*4*line;
-    mrs->motion_val[1] = pic->motion_val[1] + 4*mb_width*4*line;
-
-    mrs->intra4x4_pred_mode_top = pic->intra4x4_pred_mode + 4*mb_width*(line-1);
-    mrs->intra4x4_pred_mode = pic->intra4x4_pred_mode + 4*mb_width*line;
-
-    mrs->non_zero_count_top = pic->non_zero_count + 8*mb_width*(line-1);
-    mrs->non_zero_count = pic->non_zero_count + 8*mb_width*line;
-
-    if (s->slice_type_nos == FF_B_TYPE){
-        mrs->list1_mb_type = s->dp_ref_list[1][0]->mb_type + line*mb_stride;
-        mrs->list1_ref_index[0]  = s->dp_ref_list[1][0]->ref_index[0] + 4*line*mb_stride;
-        mrs->list1_ref_index[1]  = s->dp_ref_list[1][0]->ref_index[1] + 4*line*mb_stride;
-        mrs->list1_motion_val[0] = s->dp_ref_list[1][0]->motion_val[0] + 4*mb_width*4*line;
-        mrs->list1_motion_val[1] = s->dp_ref_list[1][0]->motion_val[1] + 4*mb_width*4*line;
-    }
-
-}
-
-#if OMPSS
-static void backup_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
-    int i;
-    uint8_t * top_border_y1 = m->top_border;
-    uint8_t * top_border_y2 = m->top_border + 8;
-    uint8_t * top_border_cb = m->top_border + 16;
-    uint8_t * top_border_cr = m->top_border + 24;
-    uint8_t * top_border_next = m->top_border_next;
-
-    src_y  -=   linesize;
-    src_cb -= uvlinesize;
-    src_cr -= uvlinesize;
-
-    m->left_border[0]= m->top_border[15];
-    for(i=1; i<17 ; i++){
-        m->left_border[i]= src_y[15 + i*linesize];
-    }
-
-    *(uint64_t*)(top_border_y1)   = *(uint64_t*)(src_y +  16*linesize);
-    *(uint64_t*)(top_border_next) = *(uint64_t*)(src_y +  16*linesize);
-    *(uint64_t*)(top_border_y2)   = *(uint64_t*)(src_y +8+16*linesize);
-
-    m->left_border[17]= m->top_border[16+7];
-    m->left_border[17+9]= m->top_border[24+7];
-    for(i=1; i<9; i++){
-        m->left_border[17  +i]= src_cb[7+i*uvlinesize];
-        m->left_border[17+9+i]= src_cr[7+i*uvlinesize];
-    }
-    *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize);
-    *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize);
-}
-
-static void xchg_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
-    int temp8, i;
-    uint64_t temp64;
-
-    uint8_t * top_border_y1 = m->top_border;
-    uint8_t * top_border_y2 = m->top_border + 8;
-    uint8_t * top_border_cb = m->top_border + 16;
-    uint8_t * top_border_cr = m->top_border + 24;
-    uint8_t * top_border_next = m->top_border_next;
-
-    int deblock_left;
-    int deblock_top;
-
-    deblock_left = (m->mb_x > 0);
-    deblock_top =  (m->mb_y > 0);
-
-    src_y  -= (  linesize + 1);
-    src_cb -= (uvlinesize + 1);
-    src_cr -= (uvlinesize + 1);
-
-    #define XCHG(a,b,t,xchg)\
-    t= a;\
-    if(xchg)\
-        a= b;\
-    b= t;
-
-    if(deblock_left){
-        for(i = !deblock_top; i<16; i++){
-            XCHG(m->left_border[i], src_y [i*  linesize], temp8, xchg);
-        }
-        XCHG(m->left_border[i], src_y [i*  linesize], temp8, 1);
-
-        for(i = !deblock_top; i<8; i++){
-            XCHG(m->left_border[17  +i], src_cb[i*uvlinesize], temp8, xchg);
-            XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, xchg);
-        }
-        XCHG(m->left_border[17  +i], src_cb[i*uvlinesize], temp8, 1);
-        XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, 1);
-    }
-
-    if(deblock_top){
-        XCHG(*(uint64_t*)(top_border_y1)  , *(uint64_t*)(src_y +1), temp64, xchg);
-        XCHG(*(uint64_t*)(top_border_y2)  , *(uint64_t*)(src_y +9), temp64, 1);
-        XCHG(*(uint64_t*)(top_border_next), *(uint64_t*)(src_y +17), temp64, 1);
-
-        XCHG(*(uint64_t*)(top_border_cb)  , *(uint64_t*)(src_cb+1), temp64, 1);
-        XCHG(*(uint64_t*)(top_border_cr)  , *(uint64_t*)(src_cr+1), temp64, 1);
-    }
-}
-#else
-
-static void backup_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
-    int i;
-    uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y;
-    uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb;
-    uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr;
-
-    uint8_t* left_border_y = d->left.unfiltered_y;
-    uint8_t* left_border_cb = d->left.unfiltered_cb;
-    uint8_t* left_border_cr = d->left.unfiltered_cr;
-
-    src_y  -=   linesize;
-    src_cb -= uvlinesize;
-    src_cr -= uvlinesize;
-
-    // There are two lines saved, the line above the top macroblock of a pair,
-    // and the line above the bottom macroblock
-    left_border_y[0] = top_border_y[15];
-    for(i=1; i<17; i++){
-        left_border_y[i] = src_y[15+i*  linesize];
-    }
-    *(uint64_t*)(top_border_y   )   = *(uint64_t*)(src_y +  16*linesize);
-    *(uint64_t*)(top_border_y +8)   = *(uint64_t*)(src_y +8+16*linesize);
-
-    left_border_cb[0] = top_border_cb[7];
-    left_border_cr[0] = top_border_cr[7];
-    for(i=1; i<9; i++){
-        left_border_cb[i] = src_cb[7+i*uvlinesize];
-        left_border_cr[i] = src_cr[7+i*uvlinesize];
-    }
-    *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize);
-    *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize);
-}
-
-static void xchg_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
-
-    int temp8, i;
-    uint64_t temp64;
-    int deblock_left;
-    int deblock_top;
-
-    uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y;
-    uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb;
-    uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr;
-    uint8_t* top_border_y_next = d->top[m->mb_x +1].unfiltered_y;
-
-    uint8_t* left_border_y = d->left.unfiltered_y;
-    uint8_t* left_border_cb = d->left.unfiltered_cb;
-    uint8_t* left_border_cr = d->left.unfiltered_cr;
-
-    deblock_left = (m->mb_x > 0);
-    deblock_top =  (m->mb_y > 0);
-
-    src_y  -= (  linesize + 1);
-    src_cb -= (uvlinesize + 1);
-    src_cr -= (uvlinesize + 1);
-
-    #define XCHG(a,b,t,xchg)\
-    t= a;\
-    if(xchg)\
-        a= b;\
-    b= t;
-
-    if(deblock_left){
-        for(i = !deblock_top; i<16; i++){
-            XCHG(left_border_y[i], src_y [i*  linesize], temp8, xchg);
-        }
-        XCHG(left_border_y[i], src_y [i*  linesize], temp8, 1);
-
-        for(i = !deblock_top; i<8; i++){
-            XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg);
-            XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg);
-        }
-        XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1);
-        XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1);
-    }
-
-    if(deblock_top){
-        XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg);
-        XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1);
-        if(m->mb_x+1 < d->mb_width){
-            XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1);
-        }
-        XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1);
-        XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1);
-    }
-}
-
-#endif
-
-void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m){
-    int i;
-    const int mb_x= m->mb_x;
-    const int mb_y= m->mb_y;
-    int *block_offset = d->block_offset;
-
-    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
-
-    int linesize   = d->linesize;
-    int uvlinesize = d->uvlinesize;
-
-    uint8_t *dest_y  = s->curr_pic->data[0] + (mb_x + mb_y * linesize  ) * 16;
-    uint8_t *dest_cb = s->curr_pic->data[1] + (mb_x + mb_y * uvlinesize) * 8;
-    uint8_t *dest_cr = s->curr_pic->data[2] + (mb_x + mb_y * uvlinesize) * 8;
-
-    pred_motion_mb_rec (d, mrs, s, m);
-
-    const int mb_type= m->mb_type;
-
-    d->dsp.prefetch(dest_y + (m->mb_x&3)*4*linesize + 64, d->linesize, 4);
-    d->dsp.prefetch(dest_cb + (m->mb_x&7)*uvlinesize + 64, dest_cr - dest_cb, 2);
-
-    if(IS_INTRA(mb_type)){
-#if OMPSS
-        xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
-#else
-        xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
-#endif
-
-        d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cb, uvlinesize);
-        d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cr, uvlinesize);
-
-        if(IS_INTRA4x4(mb_type)){
-            if(IS_8x8DCT(mb_type)){
-                idct_dc_add = d->hdsp.h264_idct8_dc_add;
-                idct_add    = d->hdsp.h264_idct8_add;
-
-                for(i=0; i<16; i+=4){
-                    uint8_t * const ptr= dest_y + block_offset[i];
-                    const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ];
-
-                    const int nnz = mrs->non_zero_count_cache[ scan8[i] ];
-                    d->hpc.pred8x8l[ dir ](ptr, (mrs->topleft_samples_available<<i)&0x8000,
-                                                (mrs->topright_samples_available<<i)&0x4000, linesize);
-                    if(nnz){
-                        if(nnz == 1 && m->mb[i*16])
-                            idct_dc_add(ptr, m->mb + i*16, linesize);
-                        else
-                            idct_add   (ptr, m->mb + i*16, linesize);
-                    }
-                }
-            }else{
-                idct_dc_add = d->hdsp.h264_idct_dc_add;
-                idct_add    = d->hdsp.h264_idct_add;
-
-                for(i=0; i<16; i++){
-                    uint8_t * const ptr= dest_y + block_offset[i];
-                    const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ];
-                    uint8_t *topright;
-                    int nnz, tr;
-                    if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
-                        const int topright_avail= (mrs->topright_samples_available<<i)&0x8000;
-                        assert(mb_y || linesize <= block_offset[i]);
-                        if(!topright_avail){
-                            tr= ptr[3 - linesize]*0x01010101;
-                            topright= (uint8_t*) &tr;
-                        }else
-                            topright= ptr + 4 - linesize;
-                    }else
-                        topright= NULL;
-
-                    d->hpc.pred4x4[ dir ](ptr, topright, linesize);
-                    nnz = mrs->non_zero_count_cache[ scan8[i] ];
-                    if(nnz){
-                        if(nnz == 1 && m->mb[i*16])
-                            idct_dc_add(ptr, m->mb + i*16, linesize);
-                        else
-                            idct_add   (ptr, m->mb + i*16, linesize);
-                    }
-                }
-            }
-        }else{
-            d->hpc.pred16x16[ m->intra16x16_pred_mode ](dest_y , linesize);
-        }
-#if OMPSS
-        xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
-#else
-        xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
-#endif
-    }else {
-        hl_motion(d, mrs, s, m, dest_y, dest_cb, dest_cr,
-                    d->hdsp.qpel_put, d->dsp.put_h264_chroma_pixels_tab,
-                    d->hdsp.qpel_avg, d->dsp.avg_h264_chroma_pixels_tab,
-                    d->hdsp.weight_h264_pixels_tab, d->hdsp.biweight_h264_pixels_tab);
-    }
-
-    if(!IS_INTRA4x4(mb_type)){
-
-        if(IS_INTRA16x16(mb_type)){
-
-            d->hdsp.h264_idct_add16intra(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache);
-
-        }else if(m->cbp&15){
-
-            if(IS_8x8DCT(mb_type)){
-                d->hdsp.h264_idct8_add4(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache);
-            }else{
-                d->hdsp.h264_idct_add16(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache);
-            }
-        }
-    }
-
-    if(m->cbp&0x30){
-        uint8_t *dest[2] = {dest_cb, dest_cr};
-
-        idct_add = d->hdsp.h264_idct_add;
-        idct_dc_add = d->hdsp.h264_idct_dc_add;
-        for(i=16; i<16+8; i++){
-            if(mrs->non_zero_count_cache[ scan8[i] ])
-                idct_add   (dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize);
-            else if(m->mb[i*16])
-                idct_dc_add(dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize);
-        }
-    }
-
-#if OMPSS
-    backup_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
-    if (mb_x+1 <d->mb_width){
-        H264Mb *mr = m+1;
-        memcpy(mr->left_border, m->left_border, sizeof(m->left_border));
-    }
-    if (mb_y +1 <d->mb_height){
-        H264Mb *md = m + d->mb_width;
-        memcpy(md->top_border, m->top_border, sizeof(m->top_border));
-        if (mb_x>0){
-            H264Mb *mdl = m + d->mb_width -1;
-            memcpy(mdl->top_border_next, m->top_border_next, sizeof(m->top_border_next));
-        }
-    }
-#else
-    backup_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
-    if (mb_y +1 <d->mb_height && d->top_next != d->top){
-        memcpy(&d->top_next[mb_x],&d->top[mb_x], sizeof(TopBorder));
-    }
-#endif
-
-    ff_h264_filter_mb(d, mrs, s, m, dest_y, dest_cb, dest_cr);
-}
-
-MBRecContext *get_mbrec_context(H264Context *h){
-    MBRecContext *d = av_mallocz(sizeof(MBRecContext));
-
-    ff_h264dsp_init(&d->hdsp);
-    ff_h264_pred_init(&d->hpc);
-    dsputil_init(&d->dsp);
-
-#if !OMPSS
-    d->mrs = av_mallocz(sizeof(MBRecState));
-#endif
-    d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab;
-    d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab;
-    d->mb_height = h->mb_height;
-    d->mb_width  = h->mb_width;
-    d->mb_stride  = h->mb_stride;
-    d->b_stride  = h->b_stride;
-    d->height = h->height;
-    d->width  = h->width;
-    d->linesize = h->width + EDGE_WIDTH*2;
-    d->uvlinesize = d->linesize>>1;
-
-    d->scratchpad_y = av_malloc(d->linesize*16*sizeof(uint8_t));
-    d->scratchpad_cb= av_malloc(d->uvlinesize*8*sizeof(uint8_t));
-    d->scratchpad_cr= av_malloc(d->uvlinesize*8*sizeof(uint8_t));
-
-    for (int i=0; i<16; i++){
-        d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3);
-    }
-    for (int i=0; i<4; i++){
-        d->block_offset[16+i]=
-        d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3);
-    }
-
-
-
-    return d;
-}
-
-void free_mbrec_context(MBRecContext *d){
-#if !OMPSS
-    av_free(d->mrs);
-#endif
-    av_free(d->scratchpad_y);
-    av_free(d->scratchpad_cb);
-    av_free(d->scratchpad_cr);
-    av_free(d);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_rec.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_rec.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-#ifndef H264_REC_H
-#define H264_REC_H
-
-#include "h264_types.h"
-
-MBRecContext *get_mbrec_context(H264Context *h);
-void free_mbrec_context( MBRecContext *d);
-void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m);
-
-void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_refs.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_refs.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,461 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... reference picture handling
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10  reference picture handling.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "dsputil.h"
-#include "h264_types.h"
-#include "golomb.h"
-
-//#undef NDEBUG
-#include <assert.h>
-
-static int build_def_list(PictureInfo **def, PictureInfo **in, int len, int is_long){
-    int i[2]={0};
-    int index=0;
-
-    while(i[0]<len || i[1]<len){
-        while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference)))
-            i[0]++;
-        while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & 0)))
-            i[1]++;
-        if(i[0] < len){
-            in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
-            def[index++]= in[ i[0]++ ];
-        }
-        if(i[1] < len){
-            in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
-            def[index++]= in[ i[1]++ ];
-        }
-    }
-
-    return index;
-}
-
-static int add_sorted(PictureInfo **sorted, PictureInfo **src, int len, int limit, int dir){
-    int i, best_poc;
-    int out_i= 0;
-
-    for(;;){
-        best_poc= dir ? INT_MIN : INT_MAX;
-
-        for(i=0; i<len; i++){
-            const int poc= src[i]->poc;
-            if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
-                best_poc= poc;
-                sorted[out_i]= src[i];
-            }
-        }
-        if(best_poc == (dir ? INT_MIN : INT_MAX))
-            break;
-        limit= sorted[out_i++]->poc - dir;
-    }
-    return out_i;
-}
-
-int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s){
-    int i,len;
-
-    if(s->slice_type_nos==FF_B_TYPE){
-        PictureInfo *sorted[32];
-        int cur_poc, list;
-        int lens[2];
-
-        cur_poc= s->poc;
-
-        for(list= 0; list<2; list++){
-            len= add_sorted(sorted, n->short_ref, n->short_ref_count, cur_poc, !list);
-            len+=add_sorted(sorted+len, n->short_ref, n->short_ref_count, cur_poc, list);
-            assert(len<=32);
-            len= build_def_list(s->ref_list[list], sorted, len, 0);
-            len+=build_def_list(s->ref_list[list] +len, n->long_ref, 16 , 1);
-            assert(len<=32);
-
-            for(int i=len; i<s->ref_count[list]; i++)
-                s->ref_list[list][i] = NULL;
-
-            lens[list]= len;
-        }
-
-        if(lens[0] == lens[1] && lens[1] > 1){
-            for(i=0; s->ref_list[0][i]->poc == s->ref_list[1][i]->poc && i<lens[0]; i++);
-
-			if(i == lens[0])
-				FFSWAP(PictureInfo *, s->ref_list[1][0], s->ref_list[1][1]);
-        }
-    }else{
-        len = build_def_list(s->ref_list[0], n->short_ref, n->short_ref_count, 0);
-        len+= build_def_list(s->ref_list[0] +len, n->long_ref, 16, 1);
-        assert(len <= 32);
-        for(i=len; i<s->ref_count[0]; i++)
-            s->ref_list[0][i] = NULL;
-    }
-
-    return 0;
-}
-
-/**
-* print short term list
-*/
-static void print_short_term(NalContext *n) {
-    av_log(AV_LOG_DEBUG, "short term list:\n");
-    for(int i=0; i<n->short_ref_count; i++){
-        PictureInfo *pic= n->short_ref[i];
-        av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d ref:%d \n", i, pic->frame_num, pic->poc, pic->reference);
-    }
-}
-
-/**
-* print long term list
-*/
-static void print_long_term(NalContext *n) {
-    uint32_t i;
-
-    av_log(AV_LOG_DEBUG, "long term list:\n");
-    for(i = 0; i < 16; i++){
-        PictureInfo *pic= n->long_ref[i];
-        if (pic) {
-            av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d\n", i, pic->frame_num, pic->poc);
-        }
-    }
-}
-
-int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb){
-    int list, index;
-
-    print_short_term(n);
-    print_long_term(n);
-
-    for(list=0; list<s->list_count; list++){
-
-        if(get_bits1(gb)){
-            int frame_num = n->frame_num;
-            unsigned int abs_diff_pic_num;
-            for(index=0; ; index++){
-                unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(gb);
-                int i=0;
-                PictureInfo *ref = NULL;
-
-                if(reordering_of_pic_nums_idc==3){
-                    break;
-                }
-                if(index >= s->ref_count[list]){
-                    av_log(AV_LOG_ERROR, "reference count overflow\n");
-                    return -1;
-                }
-
-                if (reordering_of_pic_nums_idc>2){
-                    av_log(AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
-                    return -1;
-                }
-
-                if (reordering_of_pic_nums_idc<2){
-                    //av_log(AV_LOG_ERROR, "long term pic not supported\n");
-
-                    abs_diff_pic_num= get_ue_golomb(gb) + 1;
-                    if(abs_diff_pic_num > (unsigned) n->max_pic_num){
-                        av_log(AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
-                        return -1;
-                    }
-
-                    if(reordering_of_pic_nums_idc == 0)
-                        frame_num-= abs_diff_pic_num;
-                    else
-                        frame_num+= abs_diff_pic_num;
-                    frame_num &= n->max_pic_num - 1;
-
-                    for(i= 0 ; i<n->short_ref_count; i++){
-                        ref = n->short_ref[i];
-                        if(ref->frame_num == frame_num && ref->reference){
-                            break;
-                        }
-                    }
-                    ref->pic_id= frame_num;
-                }else{
-                    int long_idx;
-                    long_idx= get_ue_golomb(gb); //long_term_pic_idx
-
-                    if(long_idx>31){
-                        av_log(AV_LOG_ERROR, "long_term_pic_idx overflow\n");
-                        return -1;
-                    }
-                    ref = n->long_ref[long_idx];
-                    assert(!(ref && !ref->reference));
-                    if(ref && (ref->reference)){
-                        ref->pic_id= long_idx;
-                        assert(ref->long_ref);
-                    }else{
-                        av_log(AV_LOG_ERROR, "reference picture missing during reorder\n");
-                    }
-                }
-
-                if (i >= n->short_ref_count) {
-                    av_log(AV_LOG_ERROR, "reference picture missing during reorder\n");
-                    return -1;
-                } else {
-                    for(i=index; i+1 <s->ref_count[list]; i++){
-
-//                         if(ref->frame_num == s->ref_list[list][i]->frame_num)
-//                            break;
-                        ///there is probably no need for a separate pic_id and frame_num
-						if (s->ref_list[list][i]){
-
-							if(ref->long_ref == s->ref_list[list][i]->long_ref && ref->pic_id == s->ref_list[list][i]->pic_id)
-								break;
-						}
-                    }
-                    for(; i > index; i--){
-                        s->ref_list[list][i]= s->ref_list[list][i-1];
-                    }
-                    s->ref_list[list][index]= ref;
-                }
-            }
-        }
-    }
-
-//     //Check if everything went well
-//     for(list=0; list<s->list_count; list++){
-// 		//printf("ref_count %d list %d\n", s->ref_count[list], list);
-//         for(index= 0; index < s->ref_count[list]; index++){
-// 			//printf("%d\n", s->ref_list[list][index]->pic_id);
-//             if(!s->ref_list[list][index]->data[0]){
-//                 av_log(AV_LOG_ERROR, "Missing reference picture\n");
-//                 return -1;
-//             }
-//         }
-//     }
-
-    return 0;
-}
-
-static PictureInfo *find_short(NalContext *n, int frame_num){
-    int i;
-    for(i=0; i<n->short_ref_count; i++){
-        if(n->short_ref[i]->frame_num == frame_num) {
-            return n->short_ref[i];
-        }
-    }
-    return NULL;
-}
-
-static int remove_short(NalContext *n, H264Slice *s, int frame_num, int release){
-    int i;
-
-    for (i=0; i<n->short_ref_count; i++){
-        if (n->short_ref[i]->frame_num == frame_num){
-            if (release){
-                s->release_ref_cpn[s->release_cnt++] = n->short_ref[i]->cpn;
-                n->short_ref[i]->reference &= ~2;
-            }
-            n->short_ref[i] = NULL;
-            if (--n->short_ref_count)
-                memmove(&n->short_ref[i], &n->short_ref[i+1], (n->short_ref_count - i)*sizeof(PictureInfo *));
-            return 0;
-        }
-    }
-    return -1;
-}
-
-static void remove_long(NalContext *n, H264Slice *s, int i){
-
-    if (n->long_ref[i]){
-        s->release_ref_cpn[s->release_cnt++] = n->long_ref[i]->cpn;
-        n->long_ref[i]->reference &= ~2;
-        n->long_ref[i]->long_ref = 0;
-        n->long_ref_count--;
-        n->long_ref[i] = NULL;
-    }
-}
-
-void ff_h264_remove_all_refs(NalContext *n, H264Slice *s){
-    int i;
-
-    while (n->short_ref[0])
-        remove_short(n, s, n->short_ref[0]->frame_num, 1);
-
-    for(i=0; i<16; i++){
-        remove_long(n, s, i);
-    }
-    assert(n->short_ref_count==0);
-    assert(n->long_ref_count==0);
-}
-
-int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb){
-
-    if(s->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
-        get_bits1(gb); //get_bits1(gb) -1; //broken link
-        if(get_bits1(gb)){
-            av_log(AV_LOG_ERROR, "MMCO_LONG reference management not supported\n");
-        }
-    }else{
-        if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
-            int i,j;
-            for(i= 0; i<MAX_MMCO_COUNT; i++) {
-                PictureInfo *pic;
-                int short_pic_num=0;
-                unsigned int long_arg=0;
-                MMCOOpcode opcode= get_ue_golomb_31(gb);
-
-                if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
-                    short_pic_num= (n->frame_num - get_ue_golomb(gb) - 1) & (n->max_pic_num - 1);
-                }
-                if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
-                    long_arg= get_ue_golomb_31(gb);
-                    if(long_arg >= 16){
-                        av_log(AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
-                        return -1;
-                    }
-                }
-
-                if(opcode > (unsigned)MMCO_LONG){
-                    av_log(AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
-                    return -1;
-                }
-                if(opcode == MMCO_END)
-                    break;
-
-                switch (opcode){
-                    case MMCO_SHORT2UNUSED:
-                        remove_short(n, s, short_pic_num, 1);
-                        break;
-                    case MMCO_SHORT2LONG:
-                        pic = find_short(n, short_pic_num);
-                        if (n->long_ref[long_arg] != pic)
-                            remove_long(n, s, long_arg);
-                        remove_short(n, s, short_pic_num, 0);
-                        n->long_ref[long_arg]= pic;
-                        if (pic){
-                            pic->long_ref=1;
-                            n->long_ref[long_arg]= pic;
-                            n->long_ref_count++;
-                        }
-                        break;
-                    case MMCO_LONG2UNUSED:
-                        assert(n->long_ref[long_arg]);
-                        remove_long(n, s, long_arg);
-                        break;
-                    case MMCO_SET_MAX_LONG:
-                        for(j=long_arg; j<16; j++)
-                            remove_long(n, s, j);
-                        break;
-                    case MMCO_RESET:
-                        while(n->short_ref_count)
-                            remove_short(n, s, n->short_ref[0]->frame_num, 1);
-
-                        for(j=0; j < 16; j++)
-                            remove_long(n, s, j);
-
-                        s->current_picture_info->poc=
-                        s->poc =
-                        n->poc_lsb=
-                        n->poc_msb=
-                        n->frame_num=
-                        s->current_picture_info->frame_num= 0;
-                        break;
-					case MMCO_END:
-					case MMCO_LONG:
-						break;
-                }
-            }
-        }else{// sliding window ref picture marking
-            if(n->short_ref_count == n->sps.ref_frame_count) {
-                s->release_ref_cpn[s->release_cnt++] = n->short_ref[n->short_ref_count - 1]->cpn;
-                n->short_ref[n->short_ref_count - 1]->reference &= ~2;
-                n->short_ref[ n->short_ref_count - 1 ] =NULL;
-                n->short_ref_count--;
-            }
-        }
-    }
-
-    if(n->short_ref_count)
-        memmove(&n->short_ref[1], &n->short_ref[0], n->short_ref_count*sizeof(PictureInfo *));
-
-    n->short_ref[0]= s->current_picture_info;
-    n->short_ref_count++;
-
-    return 0;
-}
-
-static int get_scale_factor(H264Slice *s, int poc, int poc1, int i){
-    int poc0 = s->ref_list[0][i]->poc;
-    int td = av_clip(poc1 - poc0, -128, 127);
-    if(td == 0 || s->ref_list[0][i]->long_ref){
-        return 256;
-    }else{
-        int tb = av_clip(poc - poc0, -128, 127);
-        int tx = (16384 + (FFABS(td) >> 1)) / td;
-        return av_clip((tb*tx + 32) >> 6, -1024, 1023);
-    }
-}
-
-void ff_h264_direct_dist_scale_factor(H264Slice *s){
-    const int poc = s->current_picture_info->poc;
-    const int poc1 = s->ref_list[1][0]->poc;
-
-    for(int i=0; i<s->ref_count[0]; i++){
-        s->dist_scale_factor[i] = get_scale_factor(s, poc, poc1, i);
-    }
-}
-
-static void fill_colmap(H264Slice *s, int map[2][16], int list){
-    PictureInfo * const ref1 = s->ref_list[1][0];
-    int old_ref, rfield;
-
-    /* bogus; fills in for missing frames */
-    memset(map[list], 0, sizeof(map[list]));
-
-    for(rfield=0; rfield<2; rfield++){
-        for(old_ref=0; old_ref < ref1->ref_count[list]; old_ref++){
-            int poc = ref1->ref_poc[list][old_ref];
-
-            for(int j=0; j<s->ref_count[0]; j++){
-                if(s->ref_list[0][j]->poc == poc){
-                    map[list][old_ref] = j;
-                    break;
-                }
-            }
-        }
-    }
-}
-
-void ff_h264_direct_ref_list_init(H264Slice *s){
-    PictureInfo * const cur = s->current_picture_info;
-    int list;
-
-    for(list=0; list<2; list++){
-        cur->ref_count[list] = s->ref_count[list];
-        for(int j=0; j<s->ref_count[list]; j++){
-            cur->ref_poc[list][j] = s->ref_list[list][j] ? s->ref_list[list][j]->poc : 0;
-        }
-    }
-
-    if(s->slice_type_nos != FF_B_TYPE || s->direct_spatial_mv_pred)
-        return;
-
-    for(list=0; list<2; list++){
-        fill_colmap(s, s->map_col_to_list0, list);
-    }
-}
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_refs.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_refs.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-#ifndef H264_REFS_H
-#define H264_REFS_H
-
-#include "avcodec.h"
-#include "h264_types.h"
-
-int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s);
-int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb);
-void ff_h264_remove_all_refs(NalContext *n, H264Slice *s);
-int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb);
-void ff_h264_direct_ref_list_init(H264Slice *s);
-void ff_h264_direct_dist_scale_factor(H264Slice *s);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_sei.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_sei.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,191 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... sei decoding
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 sei decoding.
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "avcodec.h"
-#include "h264_types.h"
-#include "golomb.h"
-
-//#undef NDEBUG
-#include <assert.h>
-
-static const uint8_t sei_num_clock_ts_table[9]={
-    1,  1,  1,  2,  2,  3,  3,  2,  3
-};
-
-void ff_h264_reset_sei(NalContext *n) {
-    n->sei_recovery_frame_cnt       = -1;
-    n->sei_dpb_output_delay         =  0;
-    n->sei_cpb_removal_delay        = -1;
-    n->sei_buffering_period_present =  0;
-}
-
-static int decode_picture_timing(NalContext *n, GetBitContext *gb){
-    if(n->sps.nal_hrd_parameters_present_flag || n->sps.vcl_hrd_parameters_present_flag){
-        n->sei_cpb_removal_delay = get_bits(gb, n->sps.cpb_removal_delay_length);
-        n->sei_dpb_output_delay = get_bits(gb, n->sps.dpb_output_delay_length);
-    }
-    if(n->sps.pic_struct_present_flag){
-        unsigned int i, num_clock_ts;
-        n->sei_pic_struct = get_bits(gb, 4);
-        n->sei_ct_type    = 0;
-
-        if (n->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
-            return -1;
-
-        num_clock_ts = sei_num_clock_ts_table[n->sei_pic_struct];
-
-        for (i = 0 ; i < num_clock_ts ; i++){
-            if(get_bits(gb, 1)){                  /* clock_timestamp_flag */
-                unsigned int full_timestamp_flag;
-                n->sei_ct_type |= 1<<get_bits(gb, 2);
-                skip_bits(gb, 1);                 /* nuit_field_based_flag */
-                skip_bits(gb, 5);                 /* counting_type */
-                full_timestamp_flag = get_bits(gb, 1);
-                skip_bits(gb, 1);                 /* discontinuity_flag */
-                skip_bits(gb, 1);                 /* cnt_dropped_flag */
-                skip_bits(gb, 8);                 /* n_frames */
-                if(full_timestamp_flag){
-                    skip_bits(gb, 6);             /* seconds_value 0..59 */
-                    skip_bits(gb, 6);             /* minutes_value 0..59 */
-                    skip_bits(gb, 5);             /* hours_value 0..23 */
-                }else{
-                    if(get_bits(gb, 1)){          /* seconds_flag */
-                        skip_bits(gb, 6);         /* seconds_value range 0..59 */
-                        if(get_bits(gb, 1)){      /* minutes_flag */
-                            skip_bits(gb, 6);     /* minutes_value 0..59 */
-                            if(get_bits(gb, 1))   /* hours_flag */
-                                skip_bits(gb, 5); /* hours_value 0..23 */
-                        }
-                    }
-                }
-                if(n->sps.time_offset_length > 0)
-                    skip_bits(gb, n->sps.time_offset_length); /* time_offset */
-            }
-        }
-    }
-    return 0;
-}
-
-static int decode_unregistered_user_data(GetBitContext *gb, int size){
-    char user_data[16+256];
-    int e, build, i;
-
-    if(size<16)
-        return -1;
-
-    for(i=0; i<(int) sizeof(user_data)-1 && i<size; i++){
-        user_data[i]= get_bits(gb, 8);
-    }
-
-    user_data[i]= 0;
-    e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
-    (void) e;
-    for(; i<size; i++)
-        skip_bits(gb, 8);
-
-    return 0;
-}
-
-static int decode_recovery_point(NalContext *n, GetBitContext *gb){
-
-    n->sei_recovery_frame_cnt = get_ue_golomb(gb);
-    skip_bits(gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
-
-    return 0;
-}
-
-static int decode_buffering_period(NalContext *n, GetBitContext *gb){
-    unsigned int sps_id;
-    int sched_sel_idx;
-    SPS *sps;
-
-    sps_id = get_ue_golomb_31(gb);
-    if(sps_id > 31 || !n->sps_buffers[sps_id]) {
-        av_log(AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
-        return -1;
-    }
-    sps = n->sps_buffers[sps_id];
-
-    // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
-    if (sps->nal_hrd_parameters_present_flag) {
-        for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
-            n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length);
-            skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
-        }
-    }
-    if (sps->vcl_hrd_parameters_present_flag) {
-        for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
-            n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length);
-            skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
-        }
-    }
-
-    n->sei_buffering_period_present = 1;
-    return 0;
-}
-
-int ff_h264_decode_sei(NalContext *n, GetBitContext *gb){
-    while(get_bits_count(gb) + 16 < gb->size_in_bits){
-        int size, type;
-
-        type=0;
-        do{
-            type+= show_bits(gb, 8);
-        }while(get_bits(gb, 8) == 255);
-
-        size=0;
-        do{
-            size+= show_bits(gb, 8);
-        }while(get_bits(gb, 8) == 255);
-
-        switch(type){
-        case SEI_TYPE_PIC_TIMING: // Picture timing SEI
-            if(decode_picture_timing(n, gb) < 0)
-                return -1;
-            break;
-        case SEI_TYPE_USER_DATA_UNREGISTERED:
-            if(decode_unregistered_user_data(gb, size) < 0)
-                return -1;
-            break;
-        case SEI_TYPE_RECOVERY_POINT:
-            if(decode_recovery_point(n, gb) < 0)
-                return -1;
-            break;
-        case SEI_BUFFERING_PERIOD:
-            if(decode_buffering_period(n, gb) < 0)
-                return -1;
-            break;
-        default:
-            skip_bits(gb, 8*size);
-        }
-
-        //FIXME check bits here
-        align_get_bits(gb);
-    }
-
-    return 0;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_sei.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_sei.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-#ifndef H264_SEI_H
-#define H264_SEI_H
-
-int ff_h264_decode_sei(NalContext *n, GetBitContext *gb);
-void ff_h264_reset_sei(NalContext *n);
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_seq.c
--- a/ffmpeg_smp/h264dec/libavcodec/h264_seq.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,220 +0,0 @@
-/*
-* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
-* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
-*
-* This file is part of FFmpeg.
-*
-* FFmpeg is free software; you can redistribute it and/or
-* modify it under the terms of the GNU Lesser General Public
-* License as published by the Free Software Foundation; either
-* version 2.1 of the License, or (at your option) any later version.
-*
-* FFmpeg is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-* Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public
-* License along with FFmpeg; if not, write to the Free Software
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-*/
-#include "h264_types.h"
-#include "h264_parser.h"
-#include "h264_nal.h"
-#include "h264_entropy.h"
-#include "h264_rec.h"
-#include "h264_pred_mode.h"
-#include "h264_misc.h"
-// #undef NDEBUG
-#include <assert.h>
-
-static int decode_slice_entropy_seq(H264Context *h, EntropyContext *ec, H264Slice *s, GetBitContext *gb, H264Mb *mbs){
-    int i,j;
-//     GetBitContext *gb = s->gb;
-    CABACContext *c = &ec->c;
-
-    if( !s->pps.cabac ){
-        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
-        return -1;
-    }
-
-    init_dequant_tables(s, ec);
-    ec->curr_qscale = s->qscale;
-    ec->last_qscale_diff = 0;
-    ec->chroma_qp[0] = get_chroma_qp((H264Slice *) s, 0, s->qscale);
-    ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale);
-
-    /* realign */
-    align_get_bits( gb );
-    /* init cabac */
-    ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8);
-
-    ff_h264_init_cabac_states(ec, s, c);
-
-    for(j=0; j<ec->mb_height; j++){
-        init_entropy_buf(ec, s, j);
-        for(i=0; i<ec->mb_width; i++){
-            int eos,ret;
-            H264Mb *m = &mbs[i + j*ec->mb_width];
-            //memset(m, 0, sizeof(H264Mb));
-            m->mb_x=i;
-            m->mb_y=j;
-            ec->m = m;
-
-            ret = ff_h264_decode_mb_cabac(ec, s, c);
-            eos = get_cabac_terminate( c);
-            (void) eos;
-            if( ret < 0 || c->bytestream > c->bytestream_end + 2) {
-                av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream);
-                return -1;
-            }
-        }
-    }
-
-//     av_freep(&s->gb.raw);
-//     if (s->gb.rbsp)
-//         av_freep(&s->gb.rbsp);
-
-    return 0;
-}
-
-
-
-/**
-*   Sequential version
-*/
-static void decode_slice_mb_seq(H264Context *h, MBRecContext *d, H264Slice *s2, H264Mb *mbs){
-
-    for (int i=0; i<2; i++){
-        for(int j=0; j< s2->ref_count[i]; j++){
-            if (s2->ref_list_cpn[i][j] ==-1)
-                continue;
-            int k;
-            for (k=0; k<h->max_dpb_cnt; k++){
-                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s2->ref_list_cpn[i][j]){
-                    s2->dp_ref_list[i][j] = &h->dpb[k];
-                    break;
-                }
-            }
-        }
-    }
-
-    get_dpb_entry(h, s2);
-
-    if (!h->no_mbd){
-        for(int j=0; j<d->mb_height; j++){
-            init_mbrec_context(d, d->mrs, s2, j);
-            if (h->profile) printf("\n[MBREC LINE %d ", j);
-            for(int i=0; i<d->mb_width; i++){
-
-                if ((i & 0x7) == 0) start_timer(h, REC);
-                H264Mb *m = &mbs[i + j*d->mb_width];
-                if (h->profile==2)
-                    pred_motion_mb_rec (d, d->mrs, s2, m);
-                else{
-                    h264_decode_mb_internal(d, d->mrs, s2, m);
-                }
-                stop_timer(h, REC);
-            }
-            draw_edges(d, s2, j);
-
-        }
-    }
-
-    for (int i=0; i<s2->release_cnt; i++){
-        for(int j=0; j<h->max_dpb_cnt; j++){
-            if(h->dpb[j].cpn== s2->release_ref_cpn[i]){
-                release_dpb_entry(h, &h->dpb[j], 2);
-                break;
-            }
-        }
-    }
-    s2->release_cnt=0;
-}
-
-/*
-* The following code is the main loop of the file converter
-*/
-int h264_decode_seq( H264Context *h) {
-    ParserContext *pc;
-    NalContext *nc;
-    EntropyContext *ec;
-    MBRecContext *rc;
-    OutputContext *oc;
-
-    H264Slice slice, *s=&slice;
-    H264Mb *mbs;
-    DecodedPicture *out;
-    int frames=0;
-
-#if HAVE_LIBSDL2
-    pthread_t sdl_thr;
-    if (h->display){
-        pthread_create(&sdl_thr, NULL, sdl_thread, h);
-    }
-#endif
-    
-    pc = get_parse_context(h->ifile);
-    nc = get_nal_context(h->width, h->height);
-
-    memset(s, 0, sizeof(H264Slice));
-    mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb));
-
-    ec = get_entropy_context( h );
-    rc = get_mbrec_context(h);
-    rc->top_next = rc->top = av_malloc( h->mb_width * sizeof(TopBorder));
-
-    oc = get_output_context( h );
-
-    av_start_timer();
-    GetBitContext gb = {0,};
-    while(!pc->final_frame && frames++ < h->num_frames && !h->quit){
-        if (h->profile) start_timer(h, FRONT);
-        av_read_frame_internal(pc, &gb);
-        decode_nal_units(nc, s, &gb);
-        if (h->profile) stop_timer(h, FRONT);
-//         memset(s->mbs, 0, sizeof(H264Mb)*ec->mb_width*ec->mb_height);
-        if (h->profile) start_timer(h, ED);
-        decode_slice_entropy_seq(h, ec, s, &gb, mbs);
-        if (h->profile) stop_timer(h, ED);
-
-        if (h->profile) start_timer(h, REC);
-        decode_slice_mb_seq(h, rc, s, mbs);
-        if (h->profile) stop_timer(h, REC);
-
-        out =output_frame(h, oc, s->curr_pic, h->ofile, h->frame_width, h->frame_height);
-        if (out){
-            release_dpb_entry(h, out, 1);
-        }
-
-        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
-        if (h->profile == 3){
-            printf("[ENTROPY %.3fms] [MBREC %.3fms]\n", h->last_time[ED] , h->last_time[REC]);
-        }
-    }
-    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ;
-    
-    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
-    h->num_frames = oc->frame_number;
-    /* finished ! */
-    av_freep(&mbs);
-    av_freep(&gb.raw);
-    if (gb.rbsp)
-        av_freep(&gb.rbsp);
-    av_freep(&rc->top);
-
-    free_parse_context(pc);
-    free_nal_context  (nc);
-    free_entropy_context(ec);
-    free_mbrec_context(rc);
-    free_output_context(oc);
-
-#if HAVE_LIBSDL2
-    if (h->display){
-        signal_sdl_exit(h);
-        pthread_join(sdl_thr, NULL);
-    }
-#endif
-    
-    return 0;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_types.h
--- a/ffmpeg_smp/h264dec/libavcodec/h264_types.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,658 +0,0 @@
-#ifndef H264_TYPES_H
-#define H264_TYPES_H
-
-#include "config.h"
-#ifdef HAVE_LIBSDL2
-#include <SDL2/SDL.h>
-#endif
-
-#include <pthread.h>
-#include "avcodec.h"
-#include "cabac.h"
-#include "h264_dsp.h"
-#include "h264_pred.h"
-#include "get_bits.h"
-
-
-#define MAX_REF_PIC_COUNT 16
-#define MAX_DELAYED_PIC_COUNT 16
-
-#define MAX_THREADS 80
-
-//#define MAX_PIC_COUNT (4*(MAX_REF_PIC_COUNT+MAX_DELAYED_PIC_COUNT))
-
-#define DPB_SIZE 33
-
-
-//potsdam machine 8xX7560 without HT
-// static int edb_affinity [16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-// static int edip_affinity[8] =  {16, 17, 18, 19, 20, 21, 22, 23};
-//
-// static int mbd_affinity[8][5] = {	{24, 32, 40, 48, 56},
-// 							{25, 33, 41, 49, 57},
-// 							{26, 34, 42, 50, 58},
-// 							{27, 35, 43, 51, 59},
-// 							{28, 36, 44, 52, 60},
-// 							{29, 37, 45, 53, 61},
-// 							{30, 38, 46, 54, 62},
-// 							{31, 39, 47, 55, 63}, };
-
-// static int edb_affinity [22] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 58, 59, 60, 61 ,62, 63};
-// static int edip_affinity[10] =  {16, 17, 18, 19, 20, 21, 22, 23, 56, 57 };
-//
-// static int mbd_affinity[8][5] = {	{24, 32, 40, 48, 56},
-// 							{25, 33, 41, 49, 57},
-// 							{26, 34, 42, 50, 58},
-// 							{27, 35, 43, 51, 59},
-// 							{28, 36, 44, 52, 60},
-// 							{29, 37, 45, 53, 61},
-// 							{30, 38, 46, 54, 62},
-// 							{31, 39, 47, 55, 63}, };
-// //4 socket
-// static int edip_affinity[5] = {0, 1, 2, 3, 56};
-// static int edb_affinity [12] = {8, 9, 10, 11, 16, 17, 18, 19, 59, 58, 57, 51};
-//
-// static int mbd_affinity[4][5] = { {24, 32, 40, 48, 56},
-// {25, 33, 41, 49, 57},
-// {26, 34, 42, 50, 58},
-// {27, 35, 43, 51, 59}, };
-
-// static int edip_affinity[3] = {0, 1, 49};
-// static int edb_affinity [6] = {8, 9, 16, 17, 56, 57};
-//
-// static int mbd_affinity[2][5] = { {24, 32, 40, 48, 56},
-// {25, 33, 41, 49, 57}};
-
-// static int edip_affinity[2] = {0, 8};
-// static int edb_affinity [3] = {16, 24, 56};
-//
-// static int mbd_affinity[1][4] = { {32, 40, 48, 56},
-// };
-
-/// for ducks_take_off_2160p
-// static int edip_affinity[2] = {0, 8};
-// static int edb_affinity [3] = {16, 24, 32};
-//
-// static int mbd_affinity[1][4] = {{ 40, 48, 56, 32}};
-
-// static int edip_affinity[3] = {0, 1, 57};
-// static int edb_affinity [7] = {8, 9, 16, 17, 24, 25, 56};
-//
-// static int mbd_affinity[2][4] = { {32, 40, 48, 56},
-// {33, 41, 49, 57}};
-
-//4 socket
-// static int edip_affinity[6]  = {0, 1, 2, 3, 59};
-// static int edb_affinity [14] = {8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 58, 57};
-//
-// static int mbd_affinity[4][4] = { {32, 40, 48, 56},
-// {33, 41, 49, 57},
-// {34, 42, 50, 58},
-// {35, 43, 51, 59}, };
-
-
-// static int edb_affinity [29] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 59, 60, 61, 62, 63};
-// static int edip_affinity[11] =  {24, 25, 26, 27, 28, 29, 30, 31, 63, 62, 61};
-//
-// static int mbd_affinity[8][4] = {{32, 40, 48, 56},
-// 							{33, 41, 49, 57},
-// 							{34, 42, 50, 58},
-// 							{35, 43, 51, 59},
-// 							{36, 44, 52, 60},
-// 							{37, 45, 53, 61},
-// 							{38, 46, 54, 62},
-// 							{39, 47, 55, 63}, };
-
-//potsdam machine 4xX7550 with HT
-// int edip_affinity[16] = {0, 8, 16, 24, 	1, 9, 17, 25, 	2, 10, 18, 26,	3, 11, 19, 27 };
-// int edb_affinity [16] = {1, 9, 17, 25, 	2, 10, 18, 26, 	6, 14, 22, 30,	7, 15, 23, 31 };
-// int edip_affinity[16] = {58, 50, 42, 34, 	1, 9, 17, 25, 	2, 10, 18, 26,	3, 11, 19, 27 };
-// int edb_affinity [16] = {57, 49, 41, 33, 	56, 48, 40, 32, 	6, 14, 22, 30,	7, 15, 23, 31 };
-// //int edb_affinity [16] = {4, 12, 20, 28, 5, 13, 21, 29, 	6, 14, 22, 30,	7, 15, 23, 31 };
-// //mb threads affinity on logical cores moving back to keep inteference with ed threads low
-// int mbd_affinity[4][8] = {	{63, 62, 61, 60, 59, 58, 57, 56},
-// 							{55, 54, 53, 52, 51, 50, 49, 48},
-// 							{47, 46, 45, 44, 43, 42, 41, 40},
-// 							{39, 38, 37, 36, 35, 34, 33, 32},
-// 							};
-
-
-// static int edip_affinity[2] = {0, 2};
-// static int edb_affinity [4] = {1, 3, 2, 5};
-//
-// static int mbd_affinity[1][4] = {{ 4, 6, 7, 5}};
-
-enum{
-    PARSE=0,
-    ENTROPY,
-    REORDER,
-    REORDER2,   //second mutex-cond pair used in reorder_thread
-    MBDEC,
-    OUTPUT,
-    STAGES
-};
-
-//adhoc for profiling
-enum{
-    TOTAL=0,
-    FRONT,
-    ED,
-    REC,
-    PROFILE_STAGES
-};
-
-/* bit input */
-/* buffer, buffer_end and size_in_bits must be present and used by every reader */
-
-/* frame parsing */
-typedef struct ParserContext {
-    //int64_t offset;      ///< byte offset from starting packet start
-    int ifile;
-    int ofile;
-    int buffer_size;
-    int eof_reached;
-
-    uint8_t *data;
-    int   size;
-    uint8_t *cur_ptr;
-    int cur_len;
-
-    int64_t frame_offset; /* offset of the current frame */
-    int64_t cur_offset; /* current offset (incremented by each av_parser_parse()) */
-    int64_t next_frame_offset; /* offset of the next frame */
-    int pict_type;
-    int repeat_pict;     //frame_duration = (1 + repeat_pict) * time_base. It is used by codecs like H.264 to display telecined material.
-    int key_frame;  //Set by parser to 1 for key frames and 0 for non-key frames.
-    int64_t pos;     // Byte position of currently parsed frame in stream.
-    int64_t last_pos;  //Previous frame byte position.
-    int final_frame;
-
-    uint8_t overread[5];
-    int overread_cnt;           ///< the number of bytes which where irreversibly read from the next frame
-    int index;
-    int last_index;
-    int frame_start_found;
-    uint32_t state;             ///< contains the last few bytes in MSB order
-} ParserContext;
-
-typedef struct NalContext {
-
-    SPS *sps_buffers[MAX_SPS_COUNT];
-    PPS *pps_buffers[MAX_PPS_COUNT];
-    SPS sps; ///< current sps
-
-    PictureInfo picture[16 + 1];  ///< Ref pic buffer used for deriving lists. Later linked with pic in dpb.
-    PictureInfo *release_ref[MAX_MMCO_COUNT];
-    PictureInfo *short_ref[32];
-    PictureInfo *long_ref[32];
-    int long_ref_count;  ///< number of actual long term references
-    int short_ref_count; ///< number of actual short term references
-
-    //POC stuff
-    uint32_t coded_pic_num;
-    int poc_lsb;
-    int poc_msb;
-    uint32_t poc_offset;
-    int delta_poc;
-    int frame_num;
-    int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
-    int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
-    int frame_num_offset;         ///< for POC type 2
-    int prev_frame_num_offset;    ///< for POC type 2
-    int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
-
-    int max_pic_num;
-    int redundant_pic_count;
-    int outputed_poc;
-    int ip_id;
-//   int b8_stride;             ///< 2*mb_width+1 used for some 8x8 block arrays to allow simple addressing
-    int b4_stride;             ///< 4*mb_width+1 used for some 4x4 block arrays to allow simple addressing
-    int mb_stride;             ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11
-    int mb_width;
-    int mb_height;
-    int width;
-    int height;
-
-    int has_b_frames;
-    //pic_struct in picture timing SEI message
-    SEI_PicStructType sei_pic_struct;
-    // Bit set of clock types for fields/frames in picture timing SEI message. For each found ct_type, appropriate bit is set (e.g., bit 1 for interlaced).
-    int sei_ct_type;
-    // dpb_output_delay in picture timing SEI message, see H.264 C.2.2
-    int sei_dpb_output_delay;
-    //cpb_removal_delay in picture timing SEI message, see H.264 C.1.2
-    int sei_cpb_removal_delay;
-    //recovery_frame_cnt from SEI message
-    int sei_recovery_frame_cnt;
-    // Timestamp stuff
-    int sei_buffering_period_present;  ///< Buffering period SEI flag
-    int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs
-
-} NalContext;
-
-typedef struct EntropyContext{
-    CABACContext c;
-
-    H264Mb *m;
-    int top_cbp;
-    int left_cbp;
-    int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct
-
-    uint32_t top_type;
-    uint32_t left_type;
-    uint32_t topright_type;
-    uint32_t topleft_type;
-
-    int curr_qscale;
-    int chroma_qp[2]; //QPc
-    int last_qscale_diff;
-
-    uint32_t dequant4_buffer[6][52][16];
-    uint32_t dequant8_buffer[2][52][64];
-    uint32_t (*dequant4_coeff[6])[16];
-    uint32_t (*dequant8_coeff[2])[64];
-
-//     uint8_t (*non_zero_count_top)[32];
-//     uint8_t (*non_zero_count)[32];
-//     uint8_t (*non_zero_count_row[2])[32];
-
-    uint8_t (*non_zero_count_top)[8];
-    uint8_t (*non_zero_count)[8];
-    uint8_t (*non_zero_count_row[2])[8];
-    DECLARE_ALIGNED(8, uint8_t, non_zero_count_left[8]);
-
-    uint8_t (*mvd_top[2])[2];
-    uint8_t (*mvd[2])[2];
-    uint8_t (*mvd_table[2][2])[2];
-
-    uint8_t *direct_top;
-    uint8_t *direct;
-    uint8_t *direct_table[2];
-
-    uint8_t *chroma_pred_mode_top;
-    uint8_t *chroma_pred_mode;
-    uint8_t *chroma_pred_mode_table[2];
-
-    uint16_t *cbp_top;
-    uint16_t *cbp;
-    uint16_t *cbp_table[2];
-
-    int8_t *qscale_top;
-    int8_t *qscale;
-    int8_t *qscale_table[2];
-
-    int8_t *ref_index_top[2];
-    int8_t *ref_index[2];
-    int8_t *ref_index_table[2][2];
-
-    uint32_t *mb_type_top;
-    uint32_t *mb_type;
-    uint32_t *mb_type_table[2];
-
-    int b_stride;
-    int mb_stride;
-    int mb_width;
-    int mb_height;
-
-    uint8_t *zigzag_scan;
-    uint8_t *zigzag_scan8x8;
-    uint8_t direct_cache[5*8];
-
-    DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]);
-    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2];
-    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8];
-    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
-    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2];
-
-} EntropyContext;
-
-typedef struct H264Slice {
-    PPS pps;                   ///< current pps
-    PictureInfo* current_picture_info;
-    DecodedPicture* curr_pic;
-    int slice_num;
-
-    int release_ref_cpn[MAX_MMCO_COUNT];
-    int release_cnt;
-
-    int qp_thresh;      ///< QP threshold to skip loopfilter
-    int use_weight;
-    int use_weight_chroma;
-    int luma_log2_weight_denom;
-    int chroma_log2_weight_denom;
-
-    int16_t luma_weight[16][2][2];
-    int16_t chroma_weight[16][2][2][2];
-    int16_t implicit_weight[16][16][2];
-
-    //poc number of ref_list int ref_poc[2][16]
-    //In edslice this must becom Picture Info
-    int ref_list_cpn[2][16];
-    PictureInfo *ref_list[2][16];         ///Reordered version of default_ref_list according to picture reordering in slice header
-    DecodedPicture *dp_ref_list[2][16];
-    int ref_count[2];   ///< counts frames or fields, depending on current mb mode
-
-    int slice_type;
-    int slice_type_nos;
-    int slice_alpha_c0_offset;
-    int slice_beta_offset;
-    int direct_8x8_inference_flag;
-
-    uint8_t list_count;
-    uint32_t coded_pic_num;
-
-    int poc;
-    int key_frame;
-    int mmco_reset; //FIXME not used?
-
-    ///stuff only needed for nal/entropy decoding
-//     H264Mb *m;
-//     GetBitContext *gb;
-    int ip_id;
-    int transform_bypass;
-    int direct_spatial_mv_pred;
-    int map_col_to_list0[2][16];
-    int dist_scale_factor[16];
-
-    int cabac_init_idc;
-    int nal_ref_idc;
-    int nal_unit_type;
-
-    int ref2frm[2][64];  ///< reference to frame number lists, the first 2 are for -2,-1
-
-    int qscale;
-
-} H264Slice;
-
-typedef struct {
-    H264Slice slice;
-    H264Mb *mbs;
-    DecodedPicture *dp;
-    GetBitContext gb;
-
-    int lines_taken;
-    int lines_total;
-    int state;       // 0 free, 1 in use //1 wait for entropy, 2 wait for reconstruct.
-    int initialized;
-} SliceBufferEntry;
-
-typedef struct RingLineEntry{
-    union{
-    DECLARE_ALIGNED(64, volatile int32_t, mb_cnt);
-    DECLARE_ALIGNED(64, int32_t, pad[16]);
-    };
-    SliceBufferEntry *sbe;
-    int id;
-    int line;
-    TopBorder *top;
-    struct RingLineEntry *prev_line;
-
-} RingLineEntry;
-
-// #if OMPSS
-typedef struct SuperMBTask{
-    int smb_x;
-    int smb_y;
-} SuperMBTask;
-
-typedef struct SuperMBContext{
-    int nsmb_width;             //number of super macroblocks in picture width
-    int nsmb_height;            //number of super macroblocks in picture height
-    int nsmb_3dheight;          //number of super macroblocks in picture height - max motion vertical vector
-    int smb_width;              //width of a super macroblock
-    int smb_height;             //height of a super macroblock
-    int refcount;
-    int index;
-    SuperMBTask *smbs[2];
-} SuperMBContext;
-// #endif
-
-//scratchpad for decoding a macroblock
-typedef struct MBRecState{
-    int8_t *ref_index_top[2];
-    int8_t *ref_index[2];
-    int16_t (*motion_val_top[2])[2];
-    int16_t (*motion_val[2])[2];
-    uint32_t *mb_type_top;
-    uint32_t *mb_type;
-
-    int8_t *list1_ref_index[2];
-    int16_t (*list1_motion_val[2])[2];
-    uint32_t *list1_mb_type;
-
-    int8_t *intra4x4_pred_mode_top;
-    int8_t *intra4x4_pred_mode;
-#if !OMPSS
-    int8_t intra4x4_pred_mode_left[4];
-#endif
-    int8_t *non_zero_count_top;
-    int8_t *non_zero_count;
-//     int8_t non_zero_count_left[8];
-
-
-    unsigned int topleft_samples_available;
-    unsigned int topright_samples_available;
-    unsigned int top_samples_available;
-    unsigned int left_samples_available;
-
-    int top_type;
-    int left_type;
-
-    DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]);
-    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2];
-    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8];
-    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
-    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2];
-
-    DECLARE_ALIGNED(8, int16_t, bS)[2][4][4];
-    uint8_t edges[2];
-
-}MBRecState ;
-
-typedef struct MBRecContext{
-    DSPContext dsp;             ///< pointers for accelerated dsp functions
-    H264DSPContext hdsp;
-    H264PredContext hpc;
-
-    MBRecState *mrs;
-    RingLineEntry *rle;         //debug
-
-    uint8_t *scratchpad_y;      ///implemented different on Cell
-    uint8_t *scratchpad_cb;     ///implemented different on Cell
-    uint8_t *scratchpad_cr;     ///implemented different on Cell
-
-    int linesize;
-    int uvlinesize;
-    int mb_width;
-    int mb_height;
-    int mb_stride;
-    int b_stride;
-    int width;
-    int height;
-
-#if !OMPSS   // not used in OMPSS
-    LeftBorder left;
-    TopBorder *top;
-    TopBorder *top_next; 	// next line top border
-#endif
-    /*
-    .UU.YYYY
-    .UU.YYYY
-    .vv.YYYY
-    .VV.YYYY
-    */
-
-    // block_offset[ 0..23] for frame macroblocks
-    int block_offset[16+8];
-
-} MBRecContext;
-
-#ifdef HAVE_LIBSDL2
-typedef struct SDLContext{
-    int display;
-    int fullscreen;
-    pthread_t listen_thread;
-
-    SDL_DisplayMode full;
-    SDL_DisplayMode wind;
-
-    
-    SDL_Renderer *renderer;
-    SDL_Rect rect;
-    SDL_Rect win_rect;
-    SDL_Window *window;
-    double aspect;
-    int win_w;
-    int win_h;
-    int resized;
-    
-    SDL_Texture *sbmap_texture;
-    int showmap;
-    int updatemap;
-    int pause;
-    
-} SDLContext;
-#endif
-
-typedef struct OutputContext {
-    int bit_buffer_size;
-    uint8_t *bit_buffer;
-    uint64_t video_size;
-    int frame_number;
-    DecodedPicture *delayed_pic[DPB_SIZE];
-    int dp_cnt;
-
-} OutputContext;
-
-typedef struct {
-    pthread_mutex_t lock;
-    pthread_cond_t cond;
-    SliceBufferEntry **queue;
-    int size;
-    int cnt;
-    int fi;
-    int fo;
-} SliceBufferQueue;
-
-typedef struct {
-    pthread_mutex_t wslock;
-    pthread_cond_t wscond;
-    pthread_mutex_t swlock;
-    pthread_cond_t swcond;
-    RingLineEntry **queue;
-    int size;
-    int ready;
-    int free;
-    int fi;
-    int fo;
-} RingLineQueue;
-
-#if HAVE_LIBSDL2
-typedef struct {
-    pthread_mutex_t sdl_lock;
-    pthread_cond_t sdl_cond;
-    SDL_Texture **queue;
-    int size;
-    int ready;
-    int fi;
-    int fo;
-    int exit;
-} SDLTextureQueue;
-#endif
-/**
-* H264Context
-*/
-typedef struct H264Context{
-    SliceBufferQueue sb_q[STAGES];
-    RingLineQueue rl_q;
-
-    pthread_mutex_t lock[STAGES];
-    pthread_cond_t cond[STAGES];
-
-    pthread_mutex_t task_lock;
-    pthread_cond_t task_cond;
-
-    pthread_attr_t ed_rec_attr[MAX_THREADS];
-    pthread_t ed_rec_thr[MAX_THREADS];
-
-    int init_threads;
-    pthread_mutex_t ilock;
-    pthread_cond_t icond;
-
-    const char *file_name;
-    int profile;
-    int start;
-    int touch_start;
-    int setaff;
-    int touch_done;
-    int rl_side_touch;
-    int statmbd;
-    pthread_mutex_t slock;
-    pthread_cond_t scond;
-    pthread_mutex_t tlock;
-    pthread_cond_t tcond;
-    pthread_mutex_t tdlock;
-    pthread_cond_t tdcond;
-
-    int ed_ppe_threads;
-    int threads;
-    int smt;
-
-    int acdpb_cnt;  //debug
-    int reldpb_cnt;
-    
-    int sb_size;
-    SliceBufferEntry *sb;               ///< Slice Syntax Buffer
-    int free_sb_cnt;
-    int slice_bufs;
-
-    int max_dpb_cnt;
-    DecodedPicture *dpb;       ///< Decoded Picture Buffer
-    int free_dpb_cnt;
-
-    int ifile;
-    int ofile;
-    int frame_width;
-    int frame_height;
-    int num_frames;
-    int width;
-    int height;
-    int mb_width;
-    int mb_height;
-    int mb_stride;          ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11
-    int b4_stride;
-    int b_stride;
-
-    int smb_height;
-    int smb_width;
-    pthread_mutex_t smb_lock;
-    pthread_cond_t sdl_cond;
-    pthread_mutex_t sdl_lock;
-    SuperMBContext *smbc;
-    
-    int wave_order;
-    int static_3d;
-    int pipe_bufs;
-
-    //shared tables used in entropy decoding
-    uint8_t zigzag_scan[16];
-    uint8_t zigzag_scan8x8[64];
-
-    int verbose;
-    int no_mbd;
-    int display;
-    int fullscreen;
-    int quit;
-#ifdef HAVE_LIBSDL2
-    SDLTextureQueue sdlq;
-    SDLContext *sdlc;
-#endif
-     
-    struct timespec start_time[PROFILE_STAGES];
-    struct timespec end_time[PROFILE_STAGES];
-    double last_time[PROFILE_STAGES];
-    double total_time[PROFILE_STAGES];
-
-}H264Context;
-
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/mathops.h
--- a/ffmpeg_smp/h264dec/libavcodec/mathops.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,145 +0,0 @@
-/*
- * simple math operations
- * Copyright (c) 2001, 2002 Fabrice Bellard
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef AVCODEC_MATHOPS_H
-#define AVCODEC_MATHOPS_H
-
-#include "libavutil/common.h"
-#include "libavutil/internal.h"
-
-#if   ARCH_ARM
-#   include "arm/mathops.h"
-#elif ARCH_PPC
-#   include "ppc/mathops.h"
-#elif ARCH_X86
-#   include "x86/mathops.h"
-#endif
-
-/* generic implementation */
-
-#ifndef MULL
-#   define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s))
-#endif
-
-#ifndef MULH
-//gcc 3.4 creates an incredibly bloated mess out of this
-//#    define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32)
-
-static av_always_inline int MULH(int a, int b){
-    return ((int64_t)(a) * (int64_t)(b))>>32;
-}
-#endif
-
-#ifndef UMULH
-static av_always_inline unsigned UMULH(unsigned a, unsigned b){
-    return ((uint64_t)(a) * (uint64_t)(b))>>32;
-}
-#endif
-
-#ifndef MUL64
-#   define MUL64(a,b) ((int64_t)(a) * (int64_t)(b))
-#endif
-
-#ifndef MAC64
-#   define MAC64(d, a, b) ((d) += MUL64(a, b))
-#endif
-
-#ifndef MLS64
-#   define MLS64(d, a, b) ((d) -= MUL64(a, b))
-#endif
-
-/* signed 16x16 -> 32 multiply add accumulate */
-#ifndef MAC16
-#   define MAC16(rt, ra, rb) rt += (ra) * (rb)
-#endif
-
-/* signed 16x16 -> 32 multiply */
-#ifndef MUL16
-#   define MUL16(ra, rb) ((ra) * (rb))
-#endif
-
-#ifndef MLS16
-#   define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb))
-#endif
-
-/* median of 3 */
-#ifndef mid_pred
-#define mid_pred mid_pred
-static inline av_const int mid_pred(int a, int b, int c)
-{
-#if 0
-    int t= (a-b)&((a-b)>>31);
-    a-=t;
-    b+=t;
-    b-= (b-c)&((b-c)>>31);
-    b+= (a-b)&((a-b)>>31);
-
-    return b;
-#else
-    if(a>b){
-        if(c>b){
-            if(c>a) b=a;
-            else    b=c;
-        }
-    }else{
-        if(b>c){
-            if(c>a) b=c;
-            else    b=a;
-        }
-    }
-    return b;
-#endif
-}
-#endif
-
-#ifndef sign_extend
-static inline av_const int sign_extend(int val, unsigned bits)
-{
-    return (val << (INT_BIT - bits)) >> (INT_BIT - bits);
-}
-#endif
-
-#ifndef zero_extend
-static inline av_const unsigned zero_extend(unsigned val, unsigned bits)
-{
-    return (val << (INT_BIT - bits)) >> (INT_BIT - bits);
-}
-#endif
-
-#ifndef COPY3_IF_LT
-#define COPY3_IF_LT(x, y, a, b, c, d)\
-if ((y) < (x)) {\
-    (x) = (y);\
-    (a) = (b);\
-    (c) = (d);\
-}
-#endif
-
-#ifndef NEG_SSR32
-#   define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s)))
-#endif
-
-#ifndef NEG_USR32
-#   define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s)))
-#endif
-
-#endif /* AVCODEC_MATHOPS_H */
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.c
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,619 +0,0 @@
-/*
- * Copyright (c) 2002 Brian Foley
- * Copyright (c) 2002 Dieter Shirley
- * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-#include "libavcodec/dsputil.h"
-#include "dsputil_ppc.h"
-#include "util_altivec.h"
-#include "types_altivec.h"
-#include "dsputil_altivec.h"
-
-
-static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
-{
-    int i;
-    vector unsigned char perm, bytes, *pixv;
-    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
-    vector signed short shorts;
-
-    for (i = 0; i < 8; i++) {
-        // Read potentially unaligned pixels.
-        // We're reading 16 pixels, and actually only want 8,
-        // but we simply ignore the extras.
-        perm = vec_lvsl(0, pixels);
-        pixv = (vector unsigned char *) pixels;
-        bytes = vec_perm(pixv[0], pixv[1], perm);
-
-        // convert the bytes into shorts
-        shorts = (vector signed short)vec_mergeh(zero, bytes);
-
-        // save the data to the block, we assume the block is 16-byte aligned
-        vec_st(shorts, i*16, (vector signed short*)block);
-
-        pixels += line_size;
-    }
-}
-
-static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
-        const uint8_t *s2, int stride)
-{
-    int i;
-    vector unsigned char perm, bytes, *pixv;
-    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
-    vector signed short shorts1, shorts2;
-
-    for (i = 0; i < 4; i++) {
-        // Read potentially unaligned pixels
-        // We're reading 16 pixels, and actually only want 8,
-        // but we simply ignore the extras.
-        perm = vec_lvsl(0, s1);
-        pixv = (vector unsigned char *) s1;
-        bytes = vec_perm(pixv[0], pixv[1], perm);
-
-        // convert the bytes into shorts
-        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
-
-        // Do the same for the second block of pixels
-        perm = vec_lvsl(0, s2);
-        pixv = (vector unsigned char *) s2;
-        bytes = vec_perm(pixv[0], pixv[1], perm);
-
-        // convert the bytes into shorts
-        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
-
-        // Do the subtraction
-        shorts1 = vec_sub(shorts1, shorts2);
-
-        // save the data to the block, we assume the block is 16-byte aligned
-        vec_st(shorts1, 0, (vector signed short*)block);
-
-        s1 += stride;
-        s2 += stride;
-        block += 8;
-
-
-        // The code below is a copy of the code above... This is a manual
-        // unroll.
-
-        // Read potentially unaligned pixels
-        // We're reading 16 pixels, and actually only want 8,
-        // but we simply ignore the extras.
-        perm = vec_lvsl(0, s1);
-        pixv = (vector unsigned char *) s1;
-        bytes = vec_perm(pixv[0], pixv[1], perm);
-
-        // convert the bytes into shorts
-        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
-
-        // Do the same for the second block of pixels
-        perm = vec_lvsl(0, s2);
-        pixv = (vector unsigned char *) s2;
-        bytes = vec_perm(pixv[0], pixv[1], perm);
-
-        // convert the bytes into shorts
-        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
-
-        // Do the subtraction
-        shorts1 = vec_sub(shorts1, shorts2);
-
-        // save the data to the block, we assume the block is 16-byte aligned
-        vec_st(shorts1, 0, (vector signed short*)block);
-
-        s1 += stride;
-        s2 += stride;
-        block += 8;
-    }
-}
-
-
-static void clear_block_altivec(DCTELEM *block) {
-    LOAD_ZERO;
-    vec_st(zero_s16v,   0, block);
-    vec_st(zero_s16v,  16, block);
-    vec_st(zero_s16v,  32, block);
-    vec_st(zero_s16v,  48, block);
-    vec_st(zero_s16v,  64, block);
-    vec_st(zero_s16v,  80, block);
-    vec_st(zero_s16v,  96, block);
-    vec_st(zero_s16v, 112, block);
-}
-
-
-
-/* next one assumes that ((line_size % 16) == 0) */
-void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
-    register vector unsigned char pixelsv1, pixelsv2;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    register vector unsigned char perm = vec_lvsl(0, pixels);
-    int i;
-    register int line_size_2 = line_size << 1;
-    register int line_size_3 = line_size + line_size_2;
-    register int line_size_4 = line_size << 2;
-
-POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
-// hand-unrolling the loop by 4 gains about 15%
-// mininum execution time goes from 74 to 60 cycles
-// it's faster than -funroll-loops, but using
-// -funroll-loops w/ this is bad - 74 cycles again.
-// all this is on a 7450, tuning for the 7450
-#if 0
-    for (i = 0; i < h; i++) {
-        pixelsv1 = vec_ld(0, pixels);
-        pixelsv2 = vec_ld(16, pixels);
-        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
-               0, block);
-        pixels+=line_size;
-        block +=line_size;
-    }
-#else
-    for (i = 0; i < h; i += 4) {
-        pixelsv1  = vec_ld( 0, pixels);
-        pixelsv2  = vec_ld(15, pixels);
-        pixelsv1B = vec_ld(line_size, pixels);
-        pixelsv2B = vec_ld(15 + line_size, pixels);
-        pixelsv1C = vec_ld(line_size_2, pixels);
-        pixelsv2C = vec_ld(15 + line_size_2, pixels);
-        pixelsv1D = vec_ld(line_size_3, pixels);
-        pixelsv2D = vec_ld(15 + line_size_3, pixels);
-        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
-               0, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
-               line_size, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
-               line_size_2, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
-               line_size_3, (unsigned char*)block);
-        pixels+=line_size_4;
-        block +=line_size_4;
-    }
-#endif
-POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
-}
-
-/* next one assumes that ((line_size % 16) == 0) */
-#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
-void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
-    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
-    register vector unsigned char perm = vec_lvsl(0, pixels);
-    int i;
-
-POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
-
-    for (i = 0; i < h; i++) {
-        pixelsv1 = vec_ld( 0, pixels);
-        pixelsv2 = vec_ld(16,pixels);
-        blockv = vec_ld(0, block);
-        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
-        blockv = vec_avg(blockv,pixelsv);
-        vec_st(blockv, 0, (unsigned char*)block);
-        pixels+=line_size;
-        block +=line_size;
-    }
-
-POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
-}
-
-/* next one assumes that ((line_size % 8) == 0) */
-static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
-{
-POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
-    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
-    int i;
-
-POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
-
-   for (i = 0; i < h; i++) {
-       /* block is 8 bytes-aligned, so we're either in the
-          left block (16 bytes-aligned) or in the right block (not) */
-       int rightside = ((unsigned long)block & 0x0000000F);
-
-       blockv = vec_ld(0, block);
-       pixelsv1 = vec_ld( 0, pixels);
-       pixelsv2 = vec_ld(16, pixels);
-       pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
-
-       if (rightside) {
-           pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
-       } else {
-           pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
-       }
-
-       blockv = vec_avg(blockv, pixelsv);
-
-       vec_st(blockv, 0, block);
-
-       pixels += line_size;
-       block += line_size;
-   }
-
-POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
-}
-
-/* next one assumes that ((line_size % 8) == 0) */
-static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
-    register int i;
-    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2;
-    register vector unsigned short pixelssum1, pixelssum2, temp3;
-    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
-    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
-
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
-    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
-                         (vector unsigned short)pixelsv2);
-    pixelssum1 = vec_add(pixelssum1, vctwo);
-
-POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
-    for (i = 0; i < h ; i++) {
-        int rightside = ((unsigned long)block & 0x0000000F);
-        blockv = vec_ld(0, block);
-
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
-        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
-                             (vector unsigned short)pixelsv2);
-        temp3 = vec_add(pixelssum1, pixelssum2);
-        temp3 = vec_sra(temp3, vctwo);
-        pixelssum1 = vec_add(pixelssum2, vctwo);
-        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
-
-        if (rightside) {
-            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
-        } else {
-            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
-        }
-
-        vec_st(blockv, 0, block);
-
-        block += line_size;
-        pixels += line_size;
-    }
-
-POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
-}
-
-/* next one assumes that ((line_size % 8) == 0) */
-static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
-    register int i;
-    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2;
-    register vector unsigned short pixelssum1, pixelssum2, temp3;
-    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
-    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
-    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
-
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
-    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
-                         (vector unsigned short)pixelsv2);
-    pixelssum1 = vec_add(pixelssum1, vcone);
-
-POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
-    for (i = 0; i < h ; i++) {
-        int rightside = ((unsigned long)block & 0x0000000F);
-        blockv = vec_ld(0, block);
-
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
-        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
-                             (vector unsigned short)pixelsv2);
-        temp3 = vec_add(pixelssum1, pixelssum2);
-        temp3 = vec_sra(temp3, vctwo);
-        pixelssum1 = vec_add(pixelssum2, vcone);
-        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
-
-        if (rightside) {
-            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
-        } else {
-            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
-        }
-
-        vec_st(blockv, 0, block);
-
-        block += line_size;
-        pixels += line_size;
-    }
-
-POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
-}
-
-/* next one assumes that ((line_size % 16) == 0) */
-static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
-{
-POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
-    register int i;
-    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char blockv, temp1, temp2;
-    register vector unsigned short temp3, temp4,
-        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
-    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
-    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
-
-POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
-
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv3 = vec_mergel(vczero, pixelsv1);
-    pixelsv4 = vec_mergel(vczero, pixelsv2);
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
-    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
-                         (vector unsigned short)pixelsv4);
-    pixelssum3 = vec_add(pixelssum3, vctwo);
-    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
-                         (vector unsigned short)pixelsv2);
-    pixelssum1 = vec_add(pixelssum1, vctwo);
-
-    for (i = 0; i < h ; i++) {
-        blockv = vec_ld(0, block);
-
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv3 = vec_mergel(vczero, pixelsv1);
-        pixelsv4 = vec_mergel(vczero, pixelsv2);
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
-
-        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
-                             (vector unsigned short)pixelsv4);
-        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
-                             (vector unsigned short)pixelsv2);
-        temp4 = vec_add(pixelssum3, pixelssum4);
-        temp4 = vec_sra(temp4, vctwo);
-        temp3 = vec_add(pixelssum1, pixelssum2);
-        temp3 = vec_sra(temp3, vctwo);
-
-        pixelssum3 = vec_add(pixelssum4, vctwo);
-        pixelssum1 = vec_add(pixelssum2, vctwo);
-
-        blockv = vec_packsu(temp3, temp4);
-
-        vec_st(blockv, 0, block);
-
-        block += line_size;
-        pixels += line_size;
-    }
-
-POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
-}
-
-/* next one assumes that ((line_size % 16) == 0) */
-static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
-{
-POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
-    register int i;
-    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char blockv, temp1, temp2;
-    register vector unsigned short temp3, temp4,
-        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
-    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
-    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
-    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
-
-POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
-
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv3 = vec_mergel(vczero, pixelsv1);
-    pixelsv4 = vec_mergel(vczero, pixelsv2);
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
-    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
-                         (vector unsigned short)pixelsv4);
-    pixelssum3 = vec_add(pixelssum3, vcone);
-    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
-                         (vector unsigned short)pixelsv2);
-    pixelssum1 = vec_add(pixelssum1, vcone);
-
-    for (i = 0; i < h ; i++) {
-        blockv = vec_ld(0, block);
-
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv3 = vec_mergel(vczero, pixelsv1);
-        pixelsv4 = vec_mergel(vczero, pixelsv2);
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
-
-        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
-                             (vector unsigned short)pixelsv4);
-        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
-                             (vector unsigned short)pixelsv2);
-        temp4 = vec_add(pixelssum3, pixelssum4);
-        temp4 = vec_sra(temp4, vctwo);
-        temp3 = vec_add(pixelssum1, pixelssum2);
-        temp3 = vec_sra(temp3, vctwo);
-
-        pixelssum3 = vec_add(pixelssum4, vcone);
-        pixelssum1 = vec_add(pixelssum2, vcone);
-
-        blockv = vec_packsu(temp3, temp4);
-
-        vec_st(blockv, 0, block);
-
-        block += line_size;
-        pixels += line_size;
-    }
-
-POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
-}
-
-/* next one assumes that ((line_size % 8) == 0) */
-static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
-    register int i;
-    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2, blocktemp;
-    register vector unsigned short pixelssum1, pixelssum2, temp3;
-
-    register const vector unsigned char vczero = (const vector unsigned char)
-                                        vec_splat_u8(0);
-    register const vector unsigned short vctwo = (const vector unsigned short)
-                                        vec_splat_u16(2);
-
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
-    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
-                         (vector unsigned short)pixelsv2);
-    pixelssum1 = vec_add(pixelssum1, vctwo);
-
-POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
-    for (i = 0; i < h ; i++) {
-        int rightside = ((unsigned long)block & 0x0000000F);
-        blockv = vec_ld(0, block);
-
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
-        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
-                             (vector unsigned short)pixelsv2);
-        temp3 = vec_add(pixelssum1, pixelssum2);
-        temp3 = vec_sra(temp3, vctwo);
-        pixelssum1 = vec_add(pixelssum2, vctwo);
-        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
-
-        if (rightside) {
-            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
-        } else {
-            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
-        }
-
-        blockv = vec_avg(blocktemp, blockv);
-        vec_st(blockv, 0, block);
-
-        block += line_size;
-        pixels += line_size;
-    }
-
-POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
-}
-
-void dsputil_init_altivec(DSPContext* c)
-{
-    c->diff_pixels = diff_pixels_altivec;
-    c->get_pixels = get_pixels_altivec;
-    c->clear_block = clear_block_altivec;
-
-    c->put_pixels_tab[0][0] = put_pixels16_altivec;
-    /* the two functions do the same thing, so use the same code */
-    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
-    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
-    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
-    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
-    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
-    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
-    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
-    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
-
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.h
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2002 Brian Foley
- * Copyright (c) 2002 Dieter Shirley
- * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H
-#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H
-
-#include <stdint.h>
-#include "libavcodec/dsputil.h"
-
-void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-int has_altivec(void);
-
-void fdct_altivec(int16_t *block);
-void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
-                  int x16, int y16, int rounder);
-void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
-void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
-
-void ff_vp3_idct_altivec(DCTELEM *block);
-void ff_vp3_idct_put_altivec(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_vp3_idct_add_altivec(uint8_t *dest, int line_size, DCTELEM *block);
-
-void dsputil_h264_init_ppc(DSPContext* c);
-
-void dsputil_init_altivec(DSPContext* c);
-//void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
-//void float_init_altivec(DSPContext* c, AVCodecContext *avctx);
-//void int_init_altivec(DSPContext* c, AVCodecContext *avctx);
-
-#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.c
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2002 Brian Foley
- * Copyright (c) 2002 Dieter Shirley
- * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-#include "dsputil_ppc.h"
-#include "dsputil_altivec.h"
-
-static void prefetch_ppc(void *mem, int stride, int h)
-{
-    register const uint8_t *p = mem;
-    do {
-        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
-        p+= stride;
-    } while(--h);
-}
-
-void dsputil_init_ppc(DSPContext* c)
-{
-    c->prefetch = prefetch_ppc;
-
-#if HAVE_ALTIVEC
-	dsputil_h264_init_ppc(c);	
-	dsputil_init_altivec(c);
-
-	c->idct_put = idct_put_altivec;
-	c->idct_add = idct_add_altivec;
-
-#endif /* HAVE_ALTIVEC */
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.h
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,154 +0,0 @@
-/*
- * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_PPC_DSPUTIL_PPC_H
-#define AVCODEC_PPC_DSPUTIL_PPC_H
-
-#include "config.h"
-
-#if CONFIG_POWERPC_PERF
-void powerpc_display_perf_report(void);
-/* the 604* have 2, the G3* have 4, the G4s have 6,
-   and the G5 are completely different (they MUST use
-   ARCH_PPC64, and let's hope all future 64 bis PPC
-   will use the same PMCs... */
-#define POWERPC_NUM_PMC_ENABLED 6
-/* if you add to the enum below, also add to the perfname array
-   in dsputil_ppc.c */
-enum powerpc_perf_index {
-    altivec_fft_num = 0,
-    altivec_gmc1_num,
-    altivec_dct_unquantize_h263_num,
-    altivec_fdct,
-    altivec_idct_add_num,
-    altivec_idct_put_num,
-    altivec_put_pixels16_num,
-    altivec_avg_pixels16_num,
-    altivec_avg_pixels8_num,
-    altivec_put_pixels8_xy2_num,
-    altivec_put_no_rnd_pixels8_xy2_num,
-    altivec_put_pixels16_xy2_num,
-    altivec_put_no_rnd_pixels16_xy2_num,
-    altivec_hadamard8_diff8x8_num,
-    altivec_hadamard8_diff16_num,
-    altivec_avg_pixels8_xy2_num,
-    powerpc_clear_blocks_dcbz32,
-    powerpc_clear_blocks_dcbz128,
-    altivec_put_h264_chroma_mc8_num,
-    altivec_avg_h264_chroma_mc8_num,
-    altivec_put_h264_qpel16_h_lowpass_num,
-    altivec_avg_h264_qpel16_h_lowpass_num,
-    altivec_put_h264_qpel16_v_lowpass_num,
-    altivec_avg_h264_qpel16_v_lowpass_num,
-    altivec_put_h264_qpel16_hv_lowpass_num,
-    altivec_avg_h264_qpel16_hv_lowpass_num,
-    powerpc_perf_total
-};
-enum powerpc_data_index {
-    powerpc_data_min = 0,
-    powerpc_data_max,
-    powerpc_data_sum,
-    powerpc_data_num,
-    powerpc_data_total
-};
-extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
-
-#if !ARCH_PPC64
-#define POWERP_PMC_DATATYPE unsigned long
-#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a))
-#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a))
-#if (POWERPC_NUM_PMC_ENABLED > 2)
-#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a))
-#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a))
-#else
-#define POWERPC_GET_PMC3(a) do {} while (0)
-#define POWERPC_GET_PMC4(a) do {} while (0)
-#endif
-#if (POWERPC_NUM_PMC_ENABLED > 4)
-#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a))
-#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a))
-#else
-#define POWERPC_GET_PMC5(a) do {} while (0)
-#define POWERPC_GET_PMC6(a) do {} while (0)
-#endif
-#else /* ARCH_PPC64 */
-#define POWERP_PMC_DATATYPE unsigned long long
-#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a))
-#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a))
-#if (POWERPC_NUM_PMC_ENABLED > 2)
-#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a))
-#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a))
-#else
-#define POWERPC_GET_PMC3(a) do {} while (0)
-#define POWERPC_GET_PMC4(a) do {} while (0)
-#endif
-#if (POWERPC_NUM_PMC_ENABLED > 4)
-#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a))
-#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a))
-#else
-#define POWERPC_GET_PMC5(a) do {} while (0)
-#define POWERPC_GET_PMC6(a) do {} while (0)
-#endif
-#endif /* ARCH_PPC64 */
-#define POWERPC_PERF_DECLARE(a, cond)       \
-    POWERP_PMC_DATATYPE                     \
-        pmc_start[POWERPC_NUM_PMC_ENABLED], \
-        pmc_stop[POWERPC_NUM_PMC_ENABLED],  \
-        pmc_loop_index;
-#define POWERPC_PERF_START_COUNT(a, cond) do { \
-    POWERPC_GET_PMC6(pmc_start[5]); \
-    POWERPC_GET_PMC5(pmc_start[4]); \
-    POWERPC_GET_PMC4(pmc_start[3]); \
-    POWERPC_GET_PMC3(pmc_start[2]); \
-    POWERPC_GET_PMC2(pmc_start[1]); \
-    POWERPC_GET_PMC1(pmc_start[0]); \
-    } while (0)
-#define POWERPC_PERF_STOP_COUNT(a, cond) do { \
-    POWERPC_GET_PMC1(pmc_stop[0]);            \
-    POWERPC_GET_PMC2(pmc_stop[1]);            \
-    POWERPC_GET_PMC3(pmc_stop[2]);            \
-    POWERPC_GET_PMC4(pmc_stop[3]);            \
-    POWERPC_GET_PMC5(pmc_stop[4]);            \
-    POWERPC_GET_PMC6(pmc_stop[5]);            \
-    if (cond) {                               \
-        for(pmc_loop_index = 0;               \
-            pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
-            pmc_loop_index++) {               \
-            if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) {  \
-                POWERP_PMC_DATATYPE diff =                                \
-                  pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index];   \
-                if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
-                    perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
-                if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \
-                    perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \
-                perfdata[pmc_loop_index][a][powerpc_data_sum] += diff;    \
-                perfdata[pmc_loop_index][a][powerpc_data_num] ++;         \
-            }                                 \
-        }                                     \
-    }                                         \
-} while (0)
-#else /* CONFIG_POWERPC_PERF */
-// those are needed to avoid empty statements.
-#define POWERPC_PERF_DECLARE(a, cond)        int altivec_placeholder __attribute__ ((unused))
-#define POWERPC_PERF_START_COUNT(a, cond)    do {} while (0)
-#define POWERPC_PERF_STOP_COUNT(a, cond)     do {} while (0)
-#endif /* CONFIG_POWERPC_PERF */
-
-#endif /*  AVCODEC_PPC_DSPUTIL_PPC_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/h264_altivec.c
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/h264_altivec.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1021 +0,0 @@
-/*
- * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-#include "libavcodec/h264_data.h"
-#include "libavcodec/h264_dsp.h"
-
-#include "dsputil_ppc.h"
-#include "dsputil_altivec.h"
-#include "util_altivec.h"
-#include "types_altivec.h"
-
-#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
-#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
-
-#define OP_U8_ALTIVEC                          PUT_OP_U8_ALTIVEC
-#define PREFIX_h264_chroma_mc8_altivec         put_h264_chroma_mc8_altivec
-#define PREFIX_no_rnd_vc1_chroma_mc8_altivec   put_no_rnd_vc1_chroma_mc8_altivec
-#define PREFIX_h264_chroma_mc8_num             altivec_put_h264_chroma_mc8_num
-#define PREFIX_h264_qpel16_h_lowpass_altivec   put_h264_qpel16_h_lowpass_altivec
-#define PREFIX_h264_qpel16_h_lowpass_num       altivec_put_h264_qpel16_h_lowpass_num
-#define PREFIX_h264_qpel16_v_lowpass_altivec   put_h264_qpel16_v_lowpass_altivec
-#define PREFIX_h264_qpel16_v_lowpass_num       altivec_put_h264_qpel16_v_lowpass_num
-#define PREFIX_h264_qpel16_hv_lowpass_altivec  put_h264_qpel16_hv_lowpass_altivec
-#define PREFIX_h264_qpel16_hv_lowpass_num      altivec_put_h264_qpel16_hv_lowpass_num
-#include "h264_template_altivec.c"
-#undef OP_U8_ALTIVEC
-#undef PREFIX_h264_chroma_mc8_altivec
-#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
-#undef PREFIX_h264_chroma_mc8_num
-#undef PREFIX_h264_qpel16_h_lowpass_altivec
-#undef PREFIX_h264_qpel16_h_lowpass_num
-#undef PREFIX_h264_qpel16_v_lowpass_altivec
-#undef PREFIX_h264_qpel16_v_lowpass_num
-#undef PREFIX_h264_qpel16_hv_lowpass_altivec
-#undef PREFIX_h264_qpel16_hv_lowpass_num
-
-#define OP_U8_ALTIVEC                          AVG_OP_U8_ALTIVEC
-#define PREFIX_h264_chroma_mc8_altivec         avg_h264_chroma_mc8_altivec
-#define PREFIX_no_rnd_vc1_chroma_mc8_altivec   avg_no_rnd_vc1_chroma_mc8_altivec
-#define PREFIX_h264_chroma_mc8_num             altivec_avg_h264_chroma_mc8_num
-#define PREFIX_h264_qpel16_h_lowpass_altivec   avg_h264_qpel16_h_lowpass_altivec
-#define PREFIX_h264_qpel16_h_lowpass_num       altivec_avg_h264_qpel16_h_lowpass_num
-#define PREFIX_h264_qpel16_v_lowpass_altivec   avg_h264_qpel16_v_lowpass_altivec
-#define PREFIX_h264_qpel16_v_lowpass_num       altivec_avg_h264_qpel16_v_lowpass_num
-#define PREFIX_h264_qpel16_hv_lowpass_altivec  avg_h264_qpel16_hv_lowpass_altivec
-#define PREFIX_h264_qpel16_hv_lowpass_num      altivec_avg_h264_qpel16_hv_lowpass_num
-#include "h264_template_altivec.c"
-#undef OP_U8_ALTIVEC
-#undef PREFIX_h264_chroma_mc8_altivec
-#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
-#undef PREFIX_h264_chroma_mc8_num
-#undef PREFIX_h264_qpel16_h_lowpass_altivec
-#undef PREFIX_h264_qpel16_h_lowpass_num
-#undef PREFIX_h264_qpel16_v_lowpass_altivec
-#undef PREFIX_h264_qpel16_v_lowpass_num
-#undef PREFIX_h264_qpel16_hv_lowpass_altivec
-#undef PREFIX_h264_qpel16_hv_lowpass_num
-
-#define H264_MC(OPNAME, SIZE, CODETYPE) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
-    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
-    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
-    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
-}\
-
-static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
-                                    const uint8_t * src2, int dst_stride,
-                                    int src_stride1, int h)
-{
-    int i;
-    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
-
-    mask_ = vec_lvsl(0, src2);
-
-    for (i = 0; i < h; i++) {
-
-        tmp1 = vec_ld(i * src_stride1, src1);
-        mask = vec_lvsl(i * src_stride1, src1);
-        tmp2 = vec_ld(i * src_stride1 + 15, src1);
-
-        a = vec_perm(tmp1, tmp2, mask);
-
-        tmp1 = vec_ld(i * 16, src2);
-        tmp2 = vec_ld(i * 16 + 15, src2);
-
-        b = vec_perm(tmp1, tmp2, mask_);
-
-        tmp1 = vec_ld(0, dst);
-        mask = vec_lvsl(0, dst);
-        tmp2 = vec_ld(15, dst);
-
-        d = vec_avg(a, b);
-
-        edges = vec_perm(tmp2, tmp1, mask);
-
-        align = vec_lvsr(0, dst);
-
-        tmp2 = vec_perm(d, edges, align);
-        tmp1 = vec_perm(edges, d, align);
-
-        vec_st(tmp2, 15, dst);
-        vec_st(tmp1, 0 , dst);
-
-        dst += dst_stride;
-    }
-}
-
-static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
-                                    const uint8_t * src2, int dst_stride,
-                                    int src_stride1, int h)
-{
-    int i;
-    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
-
-    mask_ = vec_lvsl(0, src2);
-
-    for (i = 0; i < h; i++) {
-
-        tmp1 = vec_ld(i * src_stride1, src1);
-        mask = vec_lvsl(i * src_stride1, src1);
-        tmp2 = vec_ld(i * src_stride1 + 15, src1);
-
-        a = vec_perm(tmp1, tmp2, mask);
-
-        tmp1 = vec_ld(i * 16, src2);
-        tmp2 = vec_ld(i * 16 + 15, src2);
-
-        b = vec_perm(tmp1, tmp2, mask_);
-
-        tmp1 = vec_ld(0, dst);
-        mask = vec_lvsl(0, dst);
-        tmp2 = vec_ld(15, dst);
-
-        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
-
-        edges = vec_perm(tmp2, tmp1, mask);
-
-        align = vec_lvsr(0, dst);
-
-        tmp2 = vec_perm(d, edges, align);
-        tmp1 = vec_perm(edges, d, align);
-
-        vec_st(tmp2, 15, dst);
-        vec_st(tmp1, 0 , dst);
-
-        dst += dst_stride;
-    }
-}
-
-/* Implemented but could be faster
-#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
-#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
- */
-
-H264_MC(put_, 16, altivec)
-H264_MC(avg_, 16, altivec)
-
-
-/****************************************************************************
- * IDCT transform:
- ****************************************************************************/
-
-#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)               \
-    /* 1st stage */                                               \
-    vz0 = vec_add(vb0,vb2);       /* temp[0] = Y[0] + Y[2] */     \
-    vz1 = vec_sub(vb0,vb2);       /* temp[1] = Y[0] - Y[2] */     \
-    vz2 = vec_sra(vb1,vec_splat_u16(1));                          \
-    vz2 = vec_sub(vz2,vb3);       /* temp[2] = Y[1].1/2 - Y[3] */ \
-    vz3 = vec_sra(vb3,vec_splat_u16(1));                          \
-    vz3 = vec_add(vb1,vz3);       /* temp[3] = Y[1] + Y[3].1/2 */ \
-    /* 2nd stage: output */                                       \
-    va0 = vec_add(vz0,vz3);       /* x[0] = temp[0] + temp[3] */  \
-    va1 = vec_add(vz1,vz2);       /* x[1] = temp[1] + temp[2] */  \
-    va2 = vec_sub(vz1,vz2);       /* x[2] = temp[1] - temp[2] */  \
-    va3 = vec_sub(vz0,vz3)        /* x[3] = temp[0] - temp[3] */
-
-#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
-    b0 = vec_mergeh( a0, a0 ); \
-    b1 = vec_mergeh( a1, a0 ); \
-    b2 = vec_mergeh( a2, a0 ); \
-    b3 = vec_mergeh( a3, a0 ); \
-    a0 = vec_mergeh( b0, b2 ); \
-    a1 = vec_mergel( b0, b2 ); \
-    a2 = vec_mergeh( b1, b3 ); \
-    a3 = vec_mergel( b1, b3 ); \
-    b0 = vec_mergeh( a0, a2 ); \
-    b1 = vec_mergel( a0, a2 ); \
-    b2 = vec_mergeh( a1, a3 ); \
-    b3 = vec_mergel( a1, a3 )
-
-#define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                      \
-    vdst_orig = vec_ld(0, dst);                               \
-    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);          \
-    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst);         \
-    va = vec_add(va, vdst_ss);                                \
-    va_u8 = vec_packsu(va, zero_s16v);                        \
-    va_u32 = vec_splat((vec_u32)va_u8, 0);                  \
-    vec_ste(va_u32, element, (uint32_t*)dst);
-
-static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
-{
-    vec_s16 va0, va1, va2, va3;
-    vec_s16 vz0, vz1, vz2, vz3;
-    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
-    vec_u8 va_u8;
-    vec_u32 va_u32;
-    vec_s16 vdst_ss;
-    const vec_u16 v6us = vec_splat_u16(6);
-    vec_u8 vdst, vdst_orig;
-    vec_u8 vdst_mask = vec_lvsl(0, dst);
-    int element = ((unsigned long)dst & 0xf) >> 2;
-    LOAD_ZERO;
-
-    block[0] += 32;  /* add 32 as a DC-level for rounding */
-
-    vtmp0 = vec_ld(0,block);
-    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
-    vtmp2 = vec_ld(16,block);
-    vtmp3 = vec_sld(vtmp2, vtmp2, 8);
-
-    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
-    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
-    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
-
-    va0 = vec_sra(va0,v6us);
-    va1 = vec_sra(va1,v6us);
-    va2 = vec_sra(va2,v6us);
-    va3 = vec_sra(va3,v6us);
-
-    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
-    dst += stride;
-    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
-    dst += stride;
-    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
-    dst += stride;
-    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
-}
-
-#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7) {\
-    /*        a0  = SRC(0) + SRC(4); */ \
-    vec_s16 a0v = vec_add(s0, s4);    \
-    /*        a2  = SRC(0) - SRC(4); */ \
-    vec_s16 a2v = vec_sub(s0, s4);    \
-    /*        a4  =           (SRC(2)>>1) - SRC(6); */ \
-    vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6);    \
-    /*        a6  =           (SRC(6)>>1) + SRC(2); */ \
-    vec_s16 a6v = vec_add(vec_sra(s6, onev), s2);    \
-    /*        b0  =         a0 + a6; */ \
-    vec_s16 b0v = vec_add(a0v, a6v);  \
-    /*        b2  =         a2 + a4; */ \
-    vec_s16 b2v = vec_add(a2v, a4v);  \
-    /*        b4  =         a2 - a4; */ \
-    vec_s16 b4v = vec_sub(a2v, a4v);  \
-    /*        b6  =         a0 - a6; */ \
-    vec_s16 b6v = vec_sub(a0v, a6v);  \
-    /* a1 =  SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
-    /*        a1 =             (SRC(5)-SRC(3)) -  (SRC(7)  +  (SRC(7)>>1)); */ \
-    vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
-    /* a3 =  SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
-    /*        a3 =             (SRC(7)+SRC(1)) -  (SRC(3)  +  (SRC(3)>>1)); */ \
-    vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
-    /* a5 =  SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
-    /*        a5 =             (SRC(7)-SRC(1)) +   SRC(5) +   (SRC(5)>>1); */ \
-    vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
-    /*        a7 =                SRC(5)+SRC(3) +  SRC(1) +   (SRC(1)>>1); */ \
-    vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
-    /*        b1 =                  (a7>>2)  +  a1; */ \
-    vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \
-    /*        b3 =          a3 +        (a5>>2); */ \
-    vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \
-    /*        b5 =                  (a3>>2)  -   a5; */ \
-    vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \
-    /*        b7 =           a7 -        (a1>>2); */ \
-    vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
-    /* DST(0,    b0 + b7); */ \
-    d0 = vec_add(b0v, b7v); \
-    /* DST(1,    b2 + b5); */ \
-    d1 = vec_add(b2v, b5v); \
-    /* DST(2,    b4 + b3); */ \
-    d2 = vec_add(b4v, b3v); \
-    /* DST(3,    b6 + b1); */ \
-    d3 = vec_add(b6v, b1v); \
-    /* DST(4,    b6 - b1); */ \
-    d4 = vec_sub(b6v, b1v); \
-    /* DST(5,    b4 - b3); */ \
-    d5 = vec_sub(b4v, b3v); \
-    /* DST(6,    b2 - b5); */ \
-    d6 = vec_sub(b2v, b5v); \
-    /* DST(7,    b0 - b7); */ \
-    d7 = vec_sub(b0v, b7v); \
-}
-
-#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
-    /* unaligned load */                                       \
-    vec_u8 hv = vec_ld( 0, dest );                           \
-    vec_u8 lv = vec_ld( 7, dest );                           \
-    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );  \
-    vec_s16 idct_sh6 = vec_sra(idctv, sixv);                 \
-    vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv);   \
-    vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16);  \
-    vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum);        \
-    vec_u8 edgehv;                                           \
-    /* unaligned store */                                      \
-    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
-    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
-    lv    = vec_sel( lv, bodyv, edgelv );                      \
-    vec_st( lv, 7, dest );                                     \
-    hv    = vec_ld( 0, dest );                                 \
-    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
-    hv    = vec_sel( hv, bodyv, edgehv );                      \
-    vec_st( hv, 0, dest );                                     \
- }
-
-static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
-    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
-    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
-    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
-
-    vec_u8 perm_ldv = vec_lvsl(0, dst);
-    vec_u8 perm_stv = vec_lvsr(8, dst);
-
-    const vec_u16 onev = vec_splat_u16(1);
-    const vec_u16 twov = vec_splat_u16(2);
-    const vec_u16 sixv = vec_splat_u16(6);
-
-    const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
-    LOAD_ZERO;
-
-    dct[0] += 32; // rounding for the >>6 at the end
-
-    s0 = vec_ld(0x00, (int16_t*)dct);
-    s1 = vec_ld(0x10, (int16_t*)dct);
-    s2 = vec_ld(0x20, (int16_t*)dct);
-    s3 = vec_ld(0x30, (int16_t*)dct);
-    s4 = vec_ld(0x40, (int16_t*)dct);
-    s5 = vec_ld(0x50, (int16_t*)dct);
-    s6 = vec_ld(0x60, (int16_t*)dct);
-    s7 = vec_ld(0x70, (int16_t*)dct);
-
-    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
-                     d0, d1, d2, d3, d4, d5, d6, d7);
-
-    TRANSPOSE8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7 );
-
-    IDCT8_1D_ALTIVEC(d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
-                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
-
-    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
-}
-
-static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size)
-{
-    vec_s16 dc16;
-    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
-    LOAD_ZERO;
-    DECLARE_ALIGNED(16, int, dc);
-    int i;
-
-    dc = (block[0] + 32) >> 6;
-    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
-
-    if (size == 4)
-        dc16 = vec_sld(dc16, zero_s16v, 8);
-    dcplus = vec_packsu(dc16, zero_s16v);
-    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
-
-    aligner = vec_lvsr(0, dst);
-    dcplus = vec_perm(dcplus, dcplus, aligner);
-    dcminus = vec_perm(dcminus, dcminus, aligner);
-
-    for (i = 0; i < size; i += 4) {
-        v0 = vec_ld(0, dst+0*stride);
-        v1 = vec_ld(0, dst+1*stride);
-        v2 = vec_ld(0, dst+2*stride);
-        v3 = vec_ld(0, dst+3*stride);
-
-        v0 = vec_adds(v0, dcplus);
-        v1 = vec_adds(v1, dcplus);
-        v2 = vec_adds(v2, dcplus);
-        v3 = vec_adds(v3, dcplus);
-
-        v0 = vec_subs(v0, dcminus);
-        v1 = vec_subs(v1, dcminus);
-        v2 = vec_subs(v2, dcminus);
-        v3 = vec_subs(v3, dcminus);
-
-        vec_st(v0, 0, dst+0*stride);
-        vec_st(v1, 0, dst+1*stride);
-        vec_st(v2, 0, dst+2*stride);
-        vec_st(v3, 0, dst+3*stride);
-
-        dst += 4*stride;
-    }
-}
-
-static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
-{
-    h264_idct_dc_add_internal(dst, block, stride, 4);
-}
-
-static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
-{
-    h264_idct_dc_add_internal(dst, block, stride, 8);
-}
-
-static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i++){
-        int nnz = nnzc[ scan8[i] ];
-        if(nnz){
-            if(nnz==1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
-            else                      ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
-        }
-    }
-}
-
-static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i++){
-        if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
-        else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
-    }
-}
-
-static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i+=4){
-        int nnz = nnzc[ scan8[i] ];
-        if(nnz){
-            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
-            else                      ff_h264_idct8_add_altivec   (dst + block_offset[i], block + i*16, stride);
-        }
-    }
-}
-
-static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=16; i<16+8; i++){
-        if(nnzc[ scan8[i] ])
-            ff_h264_idct_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
-        else if(block[i*16])
-            h264_idct_dc_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
-    }
-}
-
-#define transpose4x16(r0, r1, r2, r3) {      \
-    register vec_u8 r4;                    \
-    register vec_u8 r5;                    \
-    register vec_u8 r6;                    \
-    register vec_u8 r7;                    \
-                                             \
-    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
-    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
-    r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/ \
-    r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/ \
-                                             \
-    r0 = vec_mergeh(r4, r6);  /*all set 0*/  \
-    r1 = vec_mergel(r4, r6);  /*all set 1*/  \
-    r2 = vec_mergeh(r5, r7);  /*all set 2*/  \
-    r3 = vec_mergel(r5, r7);  /*all set 3*/  \
-}
-
-static inline void write16x4(uint8_t *dst, int dst_stride,
-                             register vec_u8 r0, register vec_u8 r1,
-                             register vec_u8 r2, register vec_u8 r3) {
-    DECLARE_ALIGNED(16, unsigned char, result)[64];
-    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
-    int int_dst_stride = dst_stride/4;
-
-    vec_st(r0, 0, result);
-    vec_st(r1, 16, result);
-    vec_st(r2, 32, result);
-    vec_st(r3, 48, result);
-    /* FIXME: there has to be a better way!!!! */
-    *dst_int = *src_int;
-    *(dst_int+   int_dst_stride) = *(src_int + 1);
-    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
-    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
-    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
-    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
-    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
-    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
-    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
-    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
-    *(dst_int+10*int_dst_stride) = *(src_int + 10);
-    *(dst_int+11*int_dst_stride) = *(src_int + 11);
-    *(dst_int+12*int_dst_stride) = *(src_int + 12);
-    *(dst_int+13*int_dst_stride) = *(src_int + 13);
-    *(dst_int+14*int_dst_stride) = *(src_int + 14);
-    *(dst_int+15*int_dst_stride) = *(src_int + 15);
-}
-
-/** \brief performs a 6x16 transpose of data in src, and stores it to dst
-    \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
-    out of unaligned_load() */
-#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
-    register vec_u8 r0  = unaligned_load(0,             src);            \
-    register vec_u8 r1  = unaligned_load(   src_stride, src);            \
-    register vec_u8 r2  = unaligned_load(2* src_stride, src);            \
-    register vec_u8 r3  = unaligned_load(3* src_stride, src);            \
-    register vec_u8 r4  = unaligned_load(4* src_stride, src);            \
-    register vec_u8 r5  = unaligned_load(5* src_stride, src);            \
-    register vec_u8 r6  = unaligned_load(6* src_stride, src);            \
-    register vec_u8 r7  = unaligned_load(7* src_stride, src);            \
-    register vec_u8 r14 = unaligned_load(14*src_stride, src);            \
-    register vec_u8 r15 = unaligned_load(15*src_stride, src);            \
-                                                                           \
-    r8  = unaligned_load( 8*src_stride, src);                              \
-    r9  = unaligned_load( 9*src_stride, src);                              \
-    r10 = unaligned_load(10*src_stride, src);                              \
-    r11 = unaligned_load(11*src_stride, src);                              \
-    r12 = unaligned_load(12*src_stride, src);                              \
-    r13 = unaligned_load(13*src_stride, src);                              \
-                                                                           \
-    /*Merge first pairs*/                                                  \
-    r0 = vec_mergeh(r0, r8);    /*0, 8*/                                   \
-    r1 = vec_mergeh(r1, r9);    /*1, 9*/                                   \
-    r2 = vec_mergeh(r2, r10);   /*2,10*/                                   \
-    r3 = vec_mergeh(r3, r11);   /*3,11*/                                   \
-    r4 = vec_mergeh(r4, r12);   /*4,12*/                                   \
-    r5 = vec_mergeh(r5, r13);   /*5,13*/                                   \
-    r6 = vec_mergeh(r6, r14);   /*6,14*/                                   \
-    r7 = vec_mergeh(r7, r15);   /*7,15*/                                   \
-                                                                           \
-    /*Merge second pairs*/                                                 \
-    r8  = vec_mergeh(r0, r4);   /*0,4, 8,12 set 0*/                        \
-    r9  = vec_mergel(r0, r4);   /*0,4, 8,12 set 1*/                        \
-    r10 = vec_mergeh(r1, r5);   /*1,5, 9,13 set 0*/                        \
-    r11 = vec_mergel(r1, r5);   /*1,5, 9,13 set 1*/                        \
-    r12 = vec_mergeh(r2, r6);   /*2,6,10,14 set 0*/                        \
-    r13 = vec_mergel(r2, r6);   /*2,6,10,14 set 1*/                        \
-    r14 = vec_mergeh(r3, r7);   /*3,7,11,15 set 0*/                        \
-    r15 = vec_mergel(r3, r7);   /*3,7,11,15 set 1*/                        \
-                                                                           \
-    /*Third merge*/                                                        \
-    r0 = vec_mergeh(r8,  r12);  /*0,2,4,6,8,10,12,14 set 0*/               \
-    r1 = vec_mergel(r8,  r12);  /*0,2,4,6,8,10,12,14 set 1*/               \
-    r2 = vec_mergeh(r9,  r13);  /*0,2,4,6,8,10,12,14 set 2*/               \
-    r4 = vec_mergeh(r10, r14);  /*1,3,5,7,9,11,13,15 set 0*/               \
-    r5 = vec_mergel(r10, r14);  /*1,3,5,7,9,11,13,15 set 1*/               \
-    r6 = vec_mergeh(r11, r15);  /*1,3,5,7,9,11,13,15 set 2*/               \
-    /* Don't need to compute 3 and 7*/                                     \
-                                                                           \
-    /*Final merge*/                                                        \
-    r8  = vec_mergeh(r0, r4);   /*all set 0*/                              \
-    r9  = vec_mergel(r0, r4);   /*all set 1*/                              \
-    r10 = vec_mergeh(r1, r5);   /*all set 2*/                              \
-    r11 = vec_mergel(r1, r5);   /*all set 3*/                              \
-    r12 = vec_mergeh(r2, r6);   /*all set 4*/                              \
-    r13 = vec_mergel(r2, r6);   /*all set 5*/                              \
-    /* Don't need to compute 14 and 15*/                                   \
-                                                                           \
-}
-
-// out: o = |x-y| < a
-static inline vec_u8 diff_lt_altivec ( register vec_u8 x,
-                                         register vec_u8 y,
-                                         register vec_u8 a) {
-
-    register vec_u8 diff = vec_subs(x, y);
-    register vec_u8 diffneg = vec_subs(y, x);
-    register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */
-    o = (vec_u8)vec_cmplt(o, a);
-    return o;
-}
-
-static inline vec_u8 h264_deblock_mask ( register vec_u8 p0,
-                                           register vec_u8 p1,
-                                           register vec_u8 q0,
-                                           register vec_u8 q1,
-                                           register vec_u8 alpha,
-                                           register vec_u8 beta) {
-
-    register vec_u8 mask;
-    register vec_u8 tempmask;
-
-    mask = diff_lt_altivec(p0, q0, alpha);
-    tempmask = diff_lt_altivec(p1, p0, beta);
-    mask = vec_and(mask, tempmask);
-    tempmask = diff_lt_altivec(q1, q0, beta);
-    mask = vec_and(mask, tempmask);
-
-    return mask;
-}
-
-// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
-static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
-                                       register vec_u8 p1,
-                                       register vec_u8 p2,
-                                       register vec_u8 q0,
-                                       register vec_u8 tc0) {
-
-    register vec_u8 average = vec_avg(p0, q0);
-    register vec_u8 temp;
-    register vec_u8 uncliped;
-    register vec_u8 ones;
-    register vec_u8 max;
-    register vec_u8 min;
-    register vec_u8 newp1;
-
-    temp = vec_xor(average, p2);
-    average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
-    ones = vec_splat_u8(1);
-    temp = vec_and(temp, ones);         /*(p2^avg(p0, q0)) & 1 */
-    uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
-    max = vec_adds(p1, tc0);
-    min = vec_subs(p1, tc0);
-    newp1 = vec_max(min, uncliped);
-    newp1 = vec_min(max, newp1);
-    return newp1;
-}
-
-#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                           \
-                                                                                                  \
-    const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                               \
-                                                                                                  \
-    register vec_u8 pq0bit = vec_xor(p0,q0);                                                    \
-    register vec_u8 q1minus;                                                                    \
-    register vec_u8 p0minus;                                                                    \
-    register vec_u8 stage1;                                                                     \
-    register vec_u8 stage2;                                                                     \
-    register vec_u8 vec160;                                                                     \
-    register vec_u8 delta;                                                                      \
-    register vec_u8 deltaneg;                                                                   \
-                                                                                                  \
-    q1minus = vec_nor(q1, q1);                 /* 255 - q1 */                                     \
-    stage1 = vec_avg(p1, q1minus);             /* (p1 - q1 + 256)>>1 */                           \
-    stage2 = vec_sr(stage1, vec_splat_u8(1));  /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */     \
-    p0minus = vec_nor(p0, p0);                 /* 255 - p0 */                                     \
-    stage1 = vec_avg(q0, p0minus);             /* (q0 - p0 + 256)>>1 */                           \
-    pq0bit = vec_and(pq0bit, vec_splat_u8(1));                                                    \
-    stage2 = vec_avg(stage2, pq0bit);          /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
-    stage2 = vec_adds(stage2, stage1);         /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */  \
-    vec160 = vec_ld(0, &A0v);                                                                     \
-    deltaneg = vec_subs(vec160, stage2);       /* -d */                                           \
-    delta = vec_subs(stage2, vec160);          /* d */                                            \
-    deltaneg = vec_min(tc0masked, deltaneg);                                                      \
-    delta = vec_min(tc0masked, delta);                                                            \
-    p0 = vec_subs(p0, deltaneg);                                                                  \
-    q0 = vec_subs(q0, delta);                                                                     \
-    p0 = vec_adds(p0, delta);                                                                     \
-    q0 = vec_adds(q0, deltaneg);                                                                  \
-}
-
-#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {            \
-    DECLARE_ALIGNED(16, unsigned char, temp)[16];                                             \
-    register vec_u8 alphavec;                                                              \
-    register vec_u8 betavec;                                                               \
-    register vec_u8 mask;                                                                  \
-    register vec_u8 p1mask;                                                                \
-    register vec_u8 q1mask;                                                                \
-    register vector signed   char tc0vec;                                                    \
-    register vec_u8 finaltc0;                                                              \
-    register vec_u8 tc0masked;                                                             \
-    register vec_u8 newp1;                                                                 \
-    register vec_u8 newq1;                                                                 \
-                                                                                             \
-    temp[0] = alpha;                                                                         \
-    temp[1] = beta;                                                                          \
-    alphavec = vec_ld(0, temp);                                                              \
-    betavec = vec_splat(alphavec, 0x1);                                                      \
-    alphavec = vec_splat(alphavec, 0x0);                                                     \
-    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */            \
-                                                                                             \
-    *((int *)temp) = *((int *)tc0);                                                          \
-    tc0vec = vec_ld(0, (signed char*)temp);                                                  \
-    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
-    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
-    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */         \
-    finaltc0 = vec_and((vec_u8)tc0vec, mask);     /* tc = tc0 */                           \
-                                                                                             \
-    p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
-    p1mask = vec_and(p1mask, mask);                             /* if ( |p2 - p0| < beta) */ \
-    tc0masked = vec_and(p1mask, (vec_u8)tc0vec);                                           \
-    finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
-    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
-    /*end if*/                                                                               \
-                                                                                             \
-    q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
-    q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */\
-    tc0masked = vec_and(q1mask, (vec_u8)tc0vec);                                           \
-    finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                   \
-    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                      \
-    /*end if*/                                                                               \
-                                                                                             \
-    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);                                            \
-    p1 = newp1;                                                                              \
-    q1 = newq1;                                                                              \
-}
-
-static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
-
-    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
-        register vec_u8 p2 = vec_ld(-3*stride, pix);
-        register vec_u8 p1 = vec_ld(-2*stride, pix);
-        register vec_u8 p0 = vec_ld(-1*stride, pix);
-        register vec_u8 q0 = vec_ld(0, pix);
-        register vec_u8 q1 = vec_ld(stride, pix);
-        register vec_u8 q2 = vec_ld(2*stride, pix);
-        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
-        vec_st(p1, -2*stride, pix);
-        vec_st(p0, -1*stride, pix);
-        vec_st(q0, 0, pix);
-        vec_st(q1, stride, pix);
-    }
-}
-
-static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
-
-    register vec_u8 line0, line1, line2, line3, line4, line5;
-    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
-        return;
-    readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
-    h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
-    transpose4x16(line1, line2, line3, line4);
-    write16x4(pix-2, stride, line1, line2, line3, line4);
-}
-
-static av_always_inline
-void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
-{
-    int y, aligned;
-    vec_u8 vblock;
-    vec_s16 vtemp, vweight, voffset, v0, v1;
-    vec_u16 vlog2_denom;
-    DECLARE_ALIGNED(16, int32_t, temp)[4];
-    LOAD_ZERO;
-
-    offset <<= log2_denom;
-    if(log2_denom) offset += 1<<(log2_denom-1);
-    temp[0] = log2_denom;
-    temp[1] = weight;
-    temp[2] = offset;
-
-    vtemp = (vec_s16)vec_ld(0, temp);
-    vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
-    vweight = vec_splat(vtemp, 3);
-    voffset = vec_splat(vtemp, 5);
-    aligned = !((unsigned long)block & 0xf);
-
-    for (y=0; y<h; y++) {
-        vblock = vec_ld(0, block);
-
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vblock);
-
-        if (w == 16 || aligned) {
-            v0 = vec_mladd(v0, vweight, zero_s16v);
-            v0 = vec_adds(v0, voffset);
-            v0 = vec_sra(v0, vlog2_denom);
-        }
-        if (w == 16 || !aligned) {
-            v1 = vec_mladd(v1, vweight, zero_s16v);
-            v1 = vec_adds(v1, voffset);
-            v1 = vec_sra(v1, vlog2_denom);
-        }
-        vblock = vec_packsu(v0, v1);
-        vec_st(vblock, 0, block);
-
-        block += stride;
-    }
-}
-
-static av_always_inline
-void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
-                               int weightd, int weights, int offset, int w, int h)
-{
-    int y, dst_aligned, src_aligned;
-    vec_u8 vsrc, vdst;
-    vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3;
-    vec_u16 vlog2_denom;
-    DECLARE_ALIGNED(16, int32_t, temp)[4];
-    LOAD_ZERO;
-
-    offset = ((offset + 1) | 1) << log2_denom;
-    temp[0] = log2_denom+1;
-    temp[1] = weights;
-    temp[2] = weightd;
-    temp[3] = offset;
-
-    vtemp = (vec_s16)vec_ld(0, temp);
-    vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
-    vweights = vec_splat(vtemp, 3);
-    vweightd = vec_splat(vtemp, 5);
-    voffset = vec_splat(vtemp, 7);
-    dst_aligned = !((unsigned long)dst & 0xf);
-    src_aligned = !((unsigned long)src & 0xf);
-
-    for (y=0; y<h; y++) {
-        vdst = vec_ld(0, dst);
-        vsrc = vec_ld(0, src);
-
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vdst);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vdst);
-        v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc);
-        v3 = (vec_s16)vec_mergel(zero_u8v, vsrc);
-
-        if (w == 8) {
-            if (src_aligned)
-                v3 = v2;
-            else
-                v2 = v3;
-        }
-
-        if (w == 16 || dst_aligned) {
-            v0 = vec_mladd(v0, vweightd, zero_s16v);
-            v2 = vec_mladd(v2, vweights, zero_s16v);
-
-            v0 = vec_adds(v0, voffset);
-            v0 = vec_adds(v0, v2);
-            v0 = vec_sra(v0, vlog2_denom);
-        }
-        if (w == 16 || !dst_aligned) {
-            v1 = vec_mladd(v1, vweightd, zero_s16v);
-            v3 = vec_mladd(v3, vweights, zero_s16v);
-
-            v1 = vec_adds(v1, voffset);
-            v1 = vec_adds(v1, v3);
-            v1 = vec_sra(v1, vlog2_denom);
-        }
-        vdst = vec_packsu(v0, v1);
-        vec_st(vdst, 0, dst);
-
-        dst += stride;
-        src += stride;
-    }
-}
-
-#define H264_WEIGHT(W,H) \
-static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
-    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
-}\
-static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
-    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
-}
-
-H264_WEIGHT(16,16)
-H264_WEIGHT(16, 8)
-H264_WEIGHT( 8,16)
-H264_WEIGHT( 8, 8)
-H264_WEIGHT( 8, 4)
-
-void dsputil_h264_init_ppc(DSPContext* c) {    
-	c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
-	c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
-
-#define dspfunc(PFX, IDX, NUM) \
-	c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
-	c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
-	c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
-	c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
-	c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
-	c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
-	c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
-	c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
-	c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
-	c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
-	c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
-	c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
-	c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
-	c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
-	c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
-	c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec
-
-	dspfunc(put_h264_qpel, 0, 16);
-	dspfunc(avg_h264_qpel, 0, 16);
-#undef dspfunc
-}
-
-void ff_h264dsp_init_ppc(H264DSPContext *c){
-	c->h264_idct_dc_add= h264_idct_dc_add_altivec;
-	c->h264_idct_add = ff_h264_idct_add_altivec;
-	c->h264_idct_add8 = ff_h264_idct_add8_altivec;
-	c->h264_idct_add16 = ff_h264_idct_add16_altivec;
-	c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec;
-
-	c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec;
-	c->h264_idct8_add = ff_h264_idct8_add_altivec;
-	c->h264_idct8_add4 = ff_h264_idct8_add4_altivec;
-	c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
-	c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
-
-	c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
-	c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
-	c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
-	c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
-	c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
-	c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
-	c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
-	c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
-	c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
-	c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/h264_template_altivec.c
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/h264_template_altivec.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,783 +0,0 @@
-/*
- * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-//#define DEBUG_ALIGNMENT
-#ifdef DEBUG_ALIGNMENT
-#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F));
-#else
-#define ASSERT_ALIGNED(ptr) ;
-#endif
-
-/* this code assume that stride % 16 == 0 */
-
-#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
-        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
-        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
-\
-        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
-        psum = vec_mladd(vB, vsrc1ssH, psum);\
-        psum = vec_mladd(vC, vsrc2ssH, psum);\
-        psum = vec_mladd(vD, vsrc3ssH, psum);\
-        psum = BIAS2(psum);\
-        psum = vec_sr(psum, v6us);\
-\
-        vdst = vec_ld(0, dst);\
-        ppsum = (vec_u8)vec_pack(psum, psum);\
-        vfdst = vec_perm(vdst, ppsum, fperm);\
-\
-        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
-\
-        vec_st(fsum, 0, dst);\
-\
-        vsrc0ssH = vsrc2ssH;\
-        vsrc1ssH = vsrc3ssH;\
-\
-        dst += stride;\
-        src += stride;
-
-#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
-\
-        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
-        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
-\
-        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
-        psum = vec_mladd(vE, vsrc1ssH, psum);\
-        psum = vec_sr(psum, v6us);\
-\
-        vdst = vec_ld(0, dst);\
-        ppsum = (vec_u8)vec_pack(psum, psum);\
-        vfdst = vec_perm(vdst, ppsum, fperm);\
-\
-        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
-\
-        vec_st(fsum, 0, dst);\
-\
-        dst += stride;\
-        src += stride;
-
-#define noop(a) a
-#define add28(a) vec_add(v28ss, a)
-
-static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
-                                    int stride, int h, int x, int y) {
-  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
-    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
-                        {((8 - x) * (8 - y)),
-                         ((    x) * (8 - y)),
-                         ((8 - x) * (    y)),
-                         ((    x) * (    y))};
-    register int i;
-    vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
-    LOAD_ZERO;
-    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
-    const vec_u16 v6us = vec_splat_u16(6);
-    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
-
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
-    vec_u8 vsrc0uc, vsrc1uc;
-    vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
-    vec_s16 vsrc2ssH, vsrc3ssH, psum;
-    vec_u8 vdst, ppsum, vfdst, fsum;
-
-  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
-
-    if (((unsigned long)dst) % 16 == 0) {
-        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
-                         0x14, 0x15, 0x16, 0x17,
-                         0x08, 0x09, 0x0A, 0x0B,
-                         0x0C, 0x0D, 0x0E, 0x0F};
-    } else {
-        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
-                         0x04, 0x05, 0x06, 0x07,
-                         0x18, 0x19, 0x1A, 0x1B,
-                         0x1C, 0x1D, 0x1E, 0x1F};
-    }
-
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
-
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
-
-    if (ABCD[3]) {
-        if (!loadSecond) {// -> !reallyBadAlign
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
-        } else {
-            vec_u8 vsrcDuc;
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrcDuc = vec_ld(stride + 16, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                if (reallyBadAlign)
-                    vsrc3uc = vsrcDuc;
-                else
-                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
-        }
-    } else {
-        const vec_s16 vE = vec_add(vB, vC);
-        if (ABCD[2]) { // x == 0 B == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrcDuc = vec_ld(stride + 15, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
-            }
-        } else { // y == 0 C == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrcDuc = vec_ld(15, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    if (reallyBadAlign)
-                        vsrc1uc = vsrcDuc;
-                    else
-                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
-            }
-        }
-    }
-    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
-}
-
-/* this code assume that stride % 16 == 0 */
-static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
-   DECLARE_ALIGNED(16, signed int, ABCD)[4] =
-                        {((8 - x) * (8 - y)),
-                         ((    x) * (8 - y)),
-                         ((8 - x) * (    y)),
-                         ((    x) * (    y))};
-    register int i;
-    vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
-    LOAD_ZERO;
-    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
-    const vec_u16 v6us  = vec_splat_u16(6);
-    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
-
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
-    vec_u8 vsrc0uc, vsrc1uc;
-    vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
-    vec_s16 vsrc2ssH, vsrc3ssH, psum;
-    vec_u8 vdst, ppsum, vfdst, fsum;
-
-    if (((unsigned long)dst) % 16 == 0) {
-        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
-                         0x14, 0x15, 0x16, 0x17,
-                         0x08, 0x09, 0x0A, 0x0B,
-                         0x0C, 0x0D, 0x0E, 0x0F};
-    } else {
-        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
-                         0x04, 0x05, 0x06, 0x07,
-                         0x18, 0x19, 0x1A, 0x1B,
-                         0x1C, 0x1D, 0x1E, 0x1F};
-    }
-
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
-
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
-
-    if (!loadSecond) {// -> !reallyBadAlign
-        for (i = 0 ; i < h ; i++) {
-
-
-            vsrcCuc = vec_ld(stride + 0, src);
-
-            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
-    } else {
-        vec_u8 vsrcDuc;
-        for (i = 0 ; i < h ; i++) {
-            vsrcCuc = vec_ld(stride + 0, src);
-            vsrcDuc = vec_ld(stride + 16, src);
-
-            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-            if (reallyBadAlign)
-                vsrc3uc = vsrcDuc;
-            else
-                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
-    }
-}
-
-#undef noop
-#undef add28
-#undef CHROMA_MC8_ALTIVEC_CORE
-
-/* this code assume stride % 16 == 0 */
-static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
-    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
-    register int i;
-
-    LOAD_ZERO;
-    const vec_u8 permM2 = vec_lvsl(-2, src);
-    const vec_u8 permM1 = vec_lvsl(-1, src);
-    const vec_u8 permP0 = vec_lvsl(+0, src);
-    const vec_u8 permP1 = vec_lvsl(+1, src);
-    const vec_u8 permP2 = vec_lvsl(+2, src);
-    const vec_u8 permP3 = vec_lvsl(+3, src);
-    const vec_s16 v5ss = vec_splat_s16(5);
-    const vec_u16 v5us = vec_splat_u16(5);
-    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
-    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
-
-    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-
-    register int align = ((((unsigned long)src) - 2) % 16);
-
-    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
-              srcP2A, srcP2B, srcP3A, srcP3B,
-              srcM1A, srcM1B, srcM2A, srcM2B,
-              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
-              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
-              psumA, psumB, sumA, sumB;
-
-    vec_u8 sum, vdst, fsum;
-
-    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
-
-    for (i = 0 ; i < 16 ; i ++) {
-        vec_u8 srcR1 = vec_ld(-2, src);
-        vec_u8 srcR2 = vec_ld(14, src);
-
-        switch (align) {
-        default: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = vec_perm(srcR1, srcR2, permP3);
-        } break;
-        case 11: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = srcR2;
-        } break;
-        case 12: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = srcR2;
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 13: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = srcR2;
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 14: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = srcR2;
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 15: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = srcR2;
-            srcP0 = vec_perm(srcR2, srcR3, permP0);
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        }
-
-        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
-        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
-
-        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
-        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
-
-        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
-        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
-
-        sum1A = vec_adds(srcP0A, srcP1A);
-        sum1B = vec_adds(srcP0B, srcP1B);
-        sum2A = vec_adds(srcM1A, srcP2A);
-        sum2B = vec_adds(srcM1B, srcP2B);
-        sum3A = vec_adds(srcM2A, srcP3A);
-        sum3B = vec_adds(srcM2B, srcP3B);
-
-        pp1A = vec_mladd(sum1A, v20ss, v16ss);
-        pp1B = vec_mladd(sum1B, v20ss, v16ss);
-
-        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
-        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
-
-        pp3A = vec_add(sum3A, pp1A);
-        pp3B = vec_add(sum3B, pp1B);
-
-        psumA = vec_sub(pp3A, pp2A);
-        psumB = vec_sub(pp3B, pp2B);
-
-        sumA = vec_sra(psumA, v5us);
-        sumB = vec_sra(psumB, v5us);
-
-        sum = vec_packsu(sumA, sumB);
-
-        ASSERT_ALIGNED(dst);
-        vdst = vec_ld(0, dst);
-
-        OP_U8_ALTIVEC(fsum, sum, vdst);
-
-        vec_st(fsum, 0, dst);
-
-        src += srcStride;
-        dst += dstStride;
-    }
-    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
-}
-
-/* this code assume stride % 16 == 0 */
-static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
-    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
-
-    register int i;
-
-    LOAD_ZERO;
-    const vec_u8 perm = vec_lvsl(0, src);
-    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
-    const vec_u16 v5us = vec_splat_u16(5);
-    const vec_s16 v5ss = vec_splat_s16(5);
-    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
-
-    uint8_t *srcbis = src - (srcStride * 2);
-
-    const vec_u8 srcM2a = vec_ld(0, srcbis);
-    const vec_u8 srcM2b = vec_ld(16, srcbis);
-    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcM1b = vec_ld(16, srcbis);
-    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP0b = vec_ld(16, srcbis);
-    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP1b = vec_ld(16, srcbis);
-    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP2b = vec_ld(16, srcbis);
-    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
-    //srcbis += srcStride;
-
-    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
-    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
-    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
-    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
-    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
-
-    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
-              psumA, psumB, sumA, sumB,
-              srcP3ssA, srcP3ssB,
-              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
-
-    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
-
-    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
-
-    for (i = 0 ; i < 16 ; i++) {
-        srcP3a = vec_ld(0, srcbis += srcStride);
-        srcP3b = vec_ld(16, srcbis);
-        srcP3 = vec_perm(srcP3a, srcP3b, perm);
-        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
-        //srcbis += srcStride;
-
-        sum1A = vec_adds(srcP0ssA, srcP1ssA);
-        sum1B = vec_adds(srcP0ssB, srcP1ssB);
-        sum2A = vec_adds(srcM1ssA, srcP2ssA);
-        sum2B = vec_adds(srcM1ssB, srcP2ssB);
-        sum3A = vec_adds(srcM2ssA, srcP3ssA);
-        sum3B = vec_adds(srcM2ssB, srcP3ssB);
-
-        srcM2ssA = srcM1ssA;
-        srcM2ssB = srcM1ssB;
-        srcM1ssA = srcP0ssA;
-        srcM1ssB = srcP0ssB;
-        srcP0ssA = srcP1ssA;
-        srcP0ssB = srcP1ssB;
-        srcP1ssA = srcP2ssA;
-        srcP1ssB = srcP2ssB;
-        srcP2ssA = srcP3ssA;
-        srcP2ssB = srcP3ssB;
-
-        pp1A = vec_mladd(sum1A, v20ss, v16ss);
-        pp1B = vec_mladd(sum1B, v20ss, v16ss);
-
-        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
-        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
-
-        pp3A = vec_add(sum3A, pp1A);
-        pp3B = vec_add(sum3B, pp1B);
-
-        psumA = vec_sub(pp3A, pp2A);
-        psumB = vec_sub(pp3B, pp2B);
-
-        sumA = vec_sra(psumA, v5us);
-        sumB = vec_sra(psumB, v5us);
-
-        sum = vec_packsu(sumA, sumB);
-
-        ASSERT_ALIGNED(dst);
-        vdst = vec_ld(0, dst);
-
-        OP_U8_ALTIVEC(fsum, sum, vdst);
-
-        vec_st(fsum, 0, dst);
-
-        dst += dstStride;
-    }
-    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
-}
-
-/* this code assume stride % 16 == 0 *and* tmp is properly aligned */
-static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
-    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
-    register int i;
-    LOAD_ZERO;
-    const vec_u8 permM2 = vec_lvsl(-2, src);
-    const vec_u8 permM1 = vec_lvsl(-1, src);
-    const vec_u8 permP0 = vec_lvsl(+0, src);
-    const vec_u8 permP1 = vec_lvsl(+1, src);
-    const vec_u8 permP2 = vec_lvsl(+2, src);
-    const vec_u8 permP3 = vec_lvsl(+3, src);
-    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
-    const vec_u32 v10ui = vec_splat_u32(10);
-    const vec_s16 v5ss = vec_splat_s16(5);
-    const vec_s16 v1ss = vec_splat_s16(1);
-    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
-    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
-
-    register int align = ((((unsigned long)src) - 2) % 16);
-
-    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
-              srcP2A, srcP2B, srcP3A, srcP3B,
-              srcM1A, srcM1B, srcM2A, srcM2B,
-              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
-              pp1A, pp1B, pp2A, pp2B, psumA, psumB;
-
-    const vec_u8 mperm = (const vec_u8)
-        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
-         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
-    int16_t *tmpbis = tmp;
-
-    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
-              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
-              tmpP2ssA, tmpP2ssB;
-
-    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
-              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
-              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
-              ssumAe, ssumAo, ssumBe, ssumBo;
-    vec_u8 fsum, sumv, sum, vdst;
-    vec_s16 ssume, ssumo;
-
-    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
-    src -= (2 * srcStride);
-    for (i = 0 ; i < 21 ; i ++) {
-        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-        vec_u8 srcR1 = vec_ld(-2, src);
-        vec_u8 srcR2 = vec_ld(14, src);
-
-        switch (align) {
-        default: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = vec_perm(srcR1, srcR2, permP3);
-        } break;
-        case 11: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = srcR2;
-        } break;
-        case 12: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = srcR2;
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 13: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = srcR2;
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 14: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = srcR2;
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 15: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = srcR2;
-            srcP0 = vec_perm(srcR2, srcR3, permP0);
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        }
-
-        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
-        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
-
-        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
-        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
-
-        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
-        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
-
-        sum1A = vec_adds(srcP0A, srcP1A);
-        sum1B = vec_adds(srcP0B, srcP1B);
-        sum2A = vec_adds(srcM1A, srcP2A);
-        sum2B = vec_adds(srcM1B, srcP2B);
-        sum3A = vec_adds(srcM2A, srcP3A);
-        sum3B = vec_adds(srcM2B, srcP3B);
-
-        pp1A = vec_mladd(sum1A, v20ss, sum3A);
-        pp1B = vec_mladd(sum1B, v20ss, sum3B);
-
-        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
-        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
-
-        psumA = vec_sub(pp1A, pp2A);
-        psumB = vec_sub(pp1B, pp2B);
-
-        vec_st(psumA, 0, tmp);
-        vec_st(psumB, 16, tmp);
-
-        src += srcStride;
-        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
-    }
-
-    tmpM2ssA = vec_ld(0, tmpbis);
-    tmpM2ssB = vec_ld(16, tmpbis);
-    tmpbis += tmpStride;
-    tmpM1ssA = vec_ld(0, tmpbis);
-    tmpM1ssB = vec_ld(16, tmpbis);
-    tmpbis += tmpStride;
-    tmpP0ssA = vec_ld(0, tmpbis);
-    tmpP0ssB = vec_ld(16, tmpbis);
-    tmpbis += tmpStride;
-    tmpP1ssA = vec_ld(0, tmpbis);
-    tmpP1ssB = vec_ld(16, tmpbis);
-    tmpbis += tmpStride;
-    tmpP2ssA = vec_ld(0, tmpbis);
-    tmpP2ssB = vec_ld(16, tmpbis);
-    tmpbis += tmpStride;
-
-    for (i = 0 ; i < 16 ; i++) {
-        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
-        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
-
-        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
-        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
-        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
-        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
-        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
-        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
-
-        tmpbis += tmpStride;
-
-        tmpM2ssA = tmpM1ssA;
-        tmpM2ssB = tmpM1ssB;
-        tmpM1ssA = tmpP0ssA;
-        tmpM1ssB = tmpP0ssB;
-        tmpP0ssA = tmpP1ssA;
-        tmpP0ssB = tmpP1ssB;
-        tmpP1ssA = tmpP2ssA;
-        tmpP1ssB = tmpP2ssB;
-        tmpP2ssA = tmpP3ssA;
-        tmpP2ssB = tmpP3ssB;
-
-        pp1Ae = vec_mule(sum1A, v20ss);
-        pp1Ao = vec_mulo(sum1A, v20ss);
-        pp1Be = vec_mule(sum1B, v20ss);
-        pp1Bo = vec_mulo(sum1B, v20ss);
-
-        pp2Ae = vec_mule(sum2A, v5ss);
-        pp2Ao = vec_mulo(sum2A, v5ss);
-        pp2Be = vec_mule(sum2B, v5ss);
-        pp2Bo = vec_mulo(sum2B, v5ss);
-
-        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
-        pp3Ao = vec_mulo(sum3A, v1ss);
-        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
-        pp3Bo = vec_mulo(sum3B, v1ss);
-
-        pp1cAe = vec_add(pp1Ae, v512si);
-        pp1cAo = vec_add(pp1Ao, v512si);
-        pp1cBe = vec_add(pp1Be, v512si);
-        pp1cBo = vec_add(pp1Bo, v512si);
-
-        pp32Ae = vec_sub(pp3Ae, pp2Ae);
-        pp32Ao = vec_sub(pp3Ao, pp2Ao);
-        pp32Be = vec_sub(pp3Be, pp2Be);
-        pp32Bo = vec_sub(pp3Bo, pp2Bo);
-
-        sumAe = vec_add(pp1cAe, pp32Ae);
-        sumAo = vec_add(pp1cAo, pp32Ao);
-        sumBe = vec_add(pp1cBe, pp32Be);
-        sumBo = vec_add(pp1cBo, pp32Bo);
-
-        ssumAe = vec_sra(sumAe, v10ui);
-        ssumAo = vec_sra(sumAo, v10ui);
-        ssumBe = vec_sra(sumBe, v10ui);
-        ssumBo = vec_sra(sumBo, v10ui);
-
-        ssume = vec_packs(ssumAe, ssumBe);
-        ssumo = vec_packs(ssumAo, ssumBo);
-
-        sumv = vec_packsu(ssume, ssumo);
-        sum = vec_perm(sumv, sumv, mperm);
-
-        ASSERT_ALIGNED(dst);
-        vdst = vec_ld(0, dst);
-
-        OP_U8_ALTIVEC(fsum, sum, vdst);
-
-        vec_st(fsum, 0, dst);
-
-        dst += dstStride;
-    }
-    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/idct_altivec.c
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/idct_altivec.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2001 Michel Lespinasse
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/*
- * NOTE: This code is based on GPL code from the libmpeg2 project.  The
- * author, Michel Lespinasses, has given explicit permission to release
- * under LGPL as part of FFmpeg.
- */
-
-/*
- * FFmpeg integration by Dieter Shirley
- *
- * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
- * project.  I've deleted all of the libmpeg2-specific code, renamed the
- * functions and reordered the function parameters.  The only change to the
- * IDCT function itself was to factor out the partial transposition, and to
- * perform a full transpose at the end of the function.
- */
-
-
-#include <stdlib.h>                                      /* malloc(), free() */
-#include <string.h>
-#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-#include "libavcodec/dsputil.h"
-#include "types_altivec.h"
-#include "dsputil_ppc.h"
-#include "dsputil_altivec.h"
-
-#define IDCT_HALF                                       \
-    /* 1st stage */                                     \
-    t1 = vec_mradds (a1, vx7, vx1 );                    \
-    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));    \
-    t7 = vec_mradds (a2, vx5, vx3);                     \
-    t3 = vec_mradds (ma2, vx3, vx5);                    \
-                                                        \
-    /* 2nd stage */                                     \
-    t5 = vec_adds (vx0, vx4);                           \
-    t0 = vec_subs (vx0, vx4);                           \
-    t2 = vec_mradds (a0, vx6, vx2);                     \
-    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));    \
-    t6 = vec_adds (t8, t3);                             \
-    t3 = vec_subs (t8, t3);                             \
-    t8 = vec_subs (t1, t7);                             \
-    t1 = vec_adds (t1, t7);                             \
-                                                        \
-    /* 3rd stage */                                     \
-    t7 = vec_adds (t5, t2);                             \
-    t2 = vec_subs (t5, t2);                             \
-    t5 = vec_adds (t0, t4);                             \
-    t0 = vec_subs (t0, t4);                             \
-    t4 = vec_subs (t8, t3);                             \
-    t3 = vec_adds (t8, t3);                             \
-                                                        \
-    /* 4th stage */                                     \
-    vy0 = vec_adds (t7, t1);                            \
-    vy7 = vec_subs (t7, t1);                            \
-    vy1 = vec_mradds (c4, t3, t5);                      \
-    vy6 = vec_mradds (mc4, t3, t5);                     \
-    vy2 = vec_mradds (c4, t4, t0);                      \
-    vy5 = vec_mradds (mc4, t4, t0);                     \
-    vy3 = vec_adds (t2, t6);                            \
-    vy4 = vec_subs (t2, t6);
-
-
-#define IDCT                                                            \
-    vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                \
-    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                \
-    vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias;                  \
-    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                    \
-    vec_u16 shift;                                                 \
-                                                                        \
-    c4 = vec_splat (constants[0], 0);                                   \
-    a0 = vec_splat (constants[0], 1);                                   \
-    a1 = vec_splat (constants[0], 2);                                   \
-    a2 = vec_splat (constants[0], 3);                                   \
-    mc4 = vec_splat (constants[0], 4);                                  \
-    ma2 = vec_splat (constants[0], 5);                                  \
-    bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3);     \
-                                                                        \
-    zero = vec_splat_s16 (0);                                           \
-    shift = vec_splat_u16 (4);                                          \
-                                                                        \
-    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);    \
-    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);    \
-    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);    \
-    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);    \
-    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);    \
-    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);    \
-    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);    \
-    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);    \
-                                                                        \
-    IDCT_HALF                                                           \
-                                                                        \
-    vx0 = vec_mergeh (vy0, vy4);                                        \
-    vx1 = vec_mergel (vy0, vy4);                                        \
-    vx2 = vec_mergeh (vy1, vy5);                                        \
-    vx3 = vec_mergel (vy1, vy5);                                        \
-    vx4 = vec_mergeh (vy2, vy6);                                        \
-    vx5 = vec_mergel (vy2, vy6);                                        \
-    vx6 = vec_mergeh (vy3, vy7);                                        \
-    vx7 = vec_mergel (vy3, vy7);                                        \
-                                                                        \
-    vy0 = vec_mergeh (vx0, vx4);                                        \
-    vy1 = vec_mergel (vx0, vx4);                                        \
-    vy2 = vec_mergeh (vx1, vx5);                                        \
-    vy3 = vec_mergel (vx1, vx5);                                        \
-    vy4 = vec_mergeh (vx2, vx6);                                        \
-    vy5 = vec_mergel (vx2, vx6);                                        \
-    vy6 = vec_mergeh (vx3, vx7);                                        \
-    vy7 = vec_mergel (vx3, vx7);                                        \
-                                                                        \
-    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                       \
-    vx1 = vec_mergel (vy0, vy4);                                        \
-    vx2 = vec_mergeh (vy1, vy5);                                        \
-    vx3 = vec_mergel (vy1, vy5);                                        \
-    vx4 = vec_mergeh (vy2, vy6);                                        \
-    vx5 = vec_mergel (vy2, vy6);                                        \
-    vx6 = vec_mergeh (vy3, vy7);                                        \
-    vx7 = vec_mergel (vy3, vy7);                                        \
-                                                                        \
-    IDCT_HALF                                                           \
-                                                                        \
-    shift = vec_splat_u16 (6);                                          \
-    vx0 = vec_sra (vy0, shift);                                         \
-    vx1 = vec_sra (vy1, shift);                                         \
-    vx2 = vec_sra (vy2, shift);                                         \
-    vx3 = vec_sra (vy3, shift);                                         \
-    vx4 = vec_sra (vy4, shift);                                         \
-    vx5 = vec_sra (vy5, shift);                                         \
-    vx6 = vec_sra (vy6, shift);                                         \
-    vx7 = vec_sra (vy7, shift);
-
-
-static const vec_s16 constants[5] = {
-    {23170, 13573,  6518, 21895, -23170, -21895,    32,    31},
-    {16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725},
-    {22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521},
-    {21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692},
-    {19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722}
-};
-
-void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk)
-{
-POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
-    vec_s16 *block = (vec_s16*)blk;
-    vec_u8 tmp;
-
-#if CONFIG_POWERPC_PERF
-POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
-#endif
-    IDCT
-
-#define COPY(dest,src)                                          \
-    tmp = vec_packsu (src, src);                                \
-    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);       \
-    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
-
-    COPY (dest, vx0)    dest += stride;
-    COPY (dest, vx1)    dest += stride;
-    COPY (dest, vx2)    dest += stride;
-    COPY (dest, vx3)    dest += stride;
-    COPY (dest, vx4)    dest += stride;
-    COPY (dest, vx5)    dest += stride;
-    COPY (dest, vx6)    dest += stride;
-    COPY (dest, vx7)
-
-POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
-}
-
-void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
-{
-POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
-    vec_s16 *block = (vec_s16*)blk;
-    vec_u8 tmp;
-    vec_s16 tmp2, tmp3;
-    vec_u8 perm0;
-    vec_u8 perm1;
-    vec_u8 p0, p1, p;
-
-#if CONFIG_POWERPC_PERF
-POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
-#endif
-
-    IDCT
-
-    p0 = vec_lvsl (0, dest);
-    p1 = vec_lvsl (stride, dest);
-    p = vec_splat_u8 (-1);
-    perm0 = vec_mergeh (p, p0);
-    perm1 = vec_mergeh (p, p1);
-
-#define ADD(dest,src,perm)                                              \
-    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
-    tmp = vec_ld (0, dest);                                             \
-    tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm);       \
-    tmp3 = vec_adds (tmp2, src);                                        \
-    tmp = vec_packsu (tmp3, tmp3);                                      \
-    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);               \
-    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
-
-    ADD (dest, vx0, perm0)      dest += stride;
-    ADD (dest, vx1, perm1)      dest += stride;
-    ADD (dest, vx2, perm0)      dest += stride;
-    ADD (dest, vx3, perm1)      dest += stride;
-    ADD (dest, vx4, perm0)      dest += stride;
-    ADD (dest, vx5, perm1)      dest += stride;
-    ADD (dest, vx6, perm0)      dest += stride;
-    ADD (dest, vx7, perm1)
-
-POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
-}
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/mathops.h
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/mathops.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,79 +0,0 @@
-/*
- * simple math operations
- * Copyright (c) 2001, 2002 Fabrice Bellard
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_PPC_MATHOPS_H
-#define AVCODEC_PPC_MATHOPS_H
-
-#include <stdint.h>
-#include "config.h"
-#include "libavutil/common.h"
-
-#if HAVE_PPC4XX
-/* signed 16x16 -> 32 multiply add accumulate */
-#define MAC16(rt, ra, rb) \
-    __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
-
-/* signed 16x16 -> 32 multiply */
-#define MUL16(ra, rb) \
-    ({ int __rt; \
-    __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
-    __rt; })
-#endif
-
-#define MULH MULH
-static inline av_const int MULH(int a, int b){
-    int r;
-    __asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
-    return r;
-}
-
-#if !ARCH_PPC64
-static inline av_const int64_t MAC64(int64_t d, int a, int b)
-{
-    union { uint64_t x; unsigned hl[2]; } x = { d };
-    int h, l;
-    __asm__ ("mullw %3, %4, %5   \n\t"
-             "mulhw %2, %4, %5   \n\t"
-             "addc  %1, %1, %3   \n\t"
-             "adde  %0, %0, %2   \n\t"
-             : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
-             : "r"(a), "r"(b));
-    return x.x;
-}
-#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
-
-static inline av_const int64_t MLS64(int64_t d, int a, int b)
-{
-    union { uint64_t x; unsigned hl[2]; } x = { d };
-    int h, l;
-    __asm__ ("mullw %3, %4, %5   \n\t"
-             "mulhw %2, %4, %5   \n\t"
-             "subfc %1, %3, %1   \n\t"
-             "subfe %0, %2, %0   \n\t"
-             : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
-             : "r"(a), "r"(b));
-    return x.x;
-}
-#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
-#endif
-
-#endif /* AVCODEC_PPC_MATHOPS_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/types_altivec.h
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/types_altivec.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_PPC_TYPES_ALTIVEC_H
-#define AVCODEC_PPC_TYPES_ALTIVEC_H
-
-/***********************************************************************
- * Vector types
- **********************************************************************/
-#define vec_u8  vector unsigned char
-#define vec_s8  vector signed char
-#define vec_u16 vector unsigned short
-#define vec_s16 vector signed short
-#define vec_u32 vector unsigned int
-#define vec_s32 vector signed int
-
-/***********************************************************************
- * Null vector
- **********************************************************************/
-#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 )
-
-#define zero_u8v  (vec_u8)  zerov
-#define zero_s8v  (vec_s8)  zerov
-#define zero_u16v (vec_u16) zerov
-#define zero_s16v (vec_s16) zerov
-#define zero_u32v (vec_u32) zerov
-#define zero_s32v (vec_s32) zerov
-
-#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/util_altivec.h
--- a/ffmpeg_smp/h264dec/libavcodec/ppc/util_altivec.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,105 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * Contains misc utility macros and inline functions
- */
-
-#ifndef AVCODEC_PPC_UTIL_ALTIVEC_H
-#define AVCODEC_PPC_UTIL_ALTIVEC_H
-
-#include <stdint.h>
-
-#include "config.h"
-
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
-// used to build registers permutation vectors (vcprm)
-// the 's' are for words in the _s_econd vector
-#define WORD_0 0x00,0x01,0x02,0x03
-#define WORD_1 0x04,0x05,0x06,0x07
-#define WORD_2 0x08,0x09,0x0a,0x0b
-#define WORD_3 0x0c,0x0d,0x0e,0x0f
-#define WORD_s0 0x10,0x11,0x12,0x13
-#define WORD_s1 0x14,0x15,0x16,0x17
-#define WORD_s2 0x18,0x19,0x1a,0x1b
-#define WORD_s3 0x1c,0x1d,0x1e,0x1f
-
-#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
-#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
-
-// vcprmle is used to keep the same index as in the SSE version.
-// it's the same as vcprm, with the index inversed
-// ('le' is Little Endian)
-#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
-
-// used to build inverse/identity vectors (vcii)
-// n is _n_egative, p is _p_ositive
-#define FLOAT_n -1.
-#define FLOAT_p 1.
-
-
-// Transpose 8x8 matrix of 16-bit elements (in-place)
-#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
-do { \
-    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
-    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
- \
-    A1 = vec_mergeh (a, e); \
-    B1 = vec_mergel (a, e); \
-    C1 = vec_mergeh (b, f); \
-    D1 = vec_mergel (b, f); \
-    E1 = vec_mergeh (c, g); \
-    F1 = vec_mergel (c, g); \
-    G1 = vec_mergeh (d, h); \
-    H1 = vec_mergel (d, h); \
- \
-    A2 = vec_mergeh (A1, E1); \
-    B2 = vec_mergel (A1, E1); \
-    C2 = vec_mergeh (B1, F1); \
-    D2 = vec_mergel (B1, F1); \
-    E2 = vec_mergeh (C1, G1); \
-    F2 = vec_mergel (C1, G1); \
-    G2 = vec_mergeh (D1, H1); \
-    H2 = vec_mergel (D1, H1); \
- \
-    a = vec_mergeh (A2, E2); \
-    b = vec_mergel (A2, E2); \
-    c = vec_mergeh (B2, F2); \
-    d = vec_mergel (B2, F2); \
-    e = vec_mergeh (C2, G2); \
-    f = vec_mergel (C2, G2); \
-    g = vec_mergeh (D2, H2); \
-    h = vec_mergel (D2, H2); \
-} while (0)
-
-
-/** \brief loads unaligned vector \a *src with offset \a offset
-    and returns it */
-static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
-{
-    register vector unsigned char first = vec_ld(offset, src);
-    register vector unsigned char second = vec_ld(offset+15, src);
-    register vector unsigned char mask = vec_lvsl(offset, src);
-    return vec_perm(first, second, mask);
-}
-
-#endif /* AVCODEC_PPC_UTIL_ALTIVEC_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/raw.h
--- a/ffmpeg_smp/h264dec/libavcodec/raw.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-/*
- * Raw Video Codec
- * Copyright (c) 2001 Fabrice Bellard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * Raw Video Codec
- */
-
-#ifndef AVCODEC_RAW_H
-#define AVCODEC_RAW_H
-
-#include "avcodec.h"
-
-typedef struct PixelFormatTag {
-    enum PixelFormat pix_fmt;
-    unsigned int fourcc;
-} PixelFormatTag;
-
-extern const PixelFormatTag ff_raw_pixelFormatTags[];
-int raw_init_encoder(AVCodecContext *avctx);
-#endif /* AVCODEC_RAW_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/rectangle.h
--- a/ffmpeg_smp/h264dec/libavcodec/rectangle.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,92 +0,0 @@
-/*
- * rectangle filling function
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * useful rectangle filling function
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#ifndef AVCODEC_RECTANGLE_H
-#define AVCODEC_RECTANGLE_H
-
-#include <assert.h>
-//#include "config.h"
-#include "libavutil/common.h"
-#include "dsputil.h"
-
-/**
- * fill a rectangle.
- * @param h height of the rectangle, should be a constant
- * @param w width of the rectangle, should be a constant
- * @param size the size of val (1, 2 or 4), should be a constant
- */
-static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
-    uint8_t *p= (uint8_t*)vp;
-    assert(size==1 || size==2 || size==4);
-    assert(w<=4);
-
-    w      *= size;
-    stride *= size;
-
-    assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
-    assert((stride&(w-1))==0);
-    if(w==2){
-        const uint16_t v= size==4 ? val : val*0x0101;
-        *(uint16_t*)(p + 0*stride)= v;
-        if(h==1) return;
-        *(uint16_t*)(p + 1*stride)= v;
-        if(h==2) return;
-        *(uint16_t*)(p + 2*stride)= v;
-        *(uint16_t*)(p + 3*stride)= v;
-    }else if(w==4){
-        const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101;
-        *(uint32_t*)(p + 0*stride)= v;
-        if(h==1) return;
-        *(uint32_t*)(p + 1*stride)= v;
-        if(h==2) return;
-        *(uint32_t*)(p + 2*stride)= v;
-        *(uint32_t*)(p + 3*stride)= v;
-    }else if(w==8){
-        const uint64_t v=  size==2 ? val*0x0001000100010001ULL : val*0x0100000001ULL;
-        *(uint64_t*)(p + 0*stride)= v;
-        if(h==1) return;
-        *(uint64_t*)(p + 1*stride)= v;
-        if(h==2) return;
-        *(uint64_t*)(p + 2*stride)= v;
-        *(uint64_t*)(p + 3*stride)= v;
-    }else if(w==16){
-        const uint64_t v= val*0x0100000001ULL;
-        *(uint64_t*)(p + 0+0*stride)= v;
-        *(uint64_t*)(p + 8+0*stride)= v;
-        *(uint64_t*)(p + 0+1*stride)= v;
-        *(uint64_t*)(p + 8+1*stride)= v;
-        if(h==2) return;
-        *(uint64_t*)(p + 0+2*stride)= v;
-        *(uint64_t*)(p + 8+2*stride)= v;
-        *(uint64_t*)(p + 0+3*stride)= v;
-        *(uint64_t*)(p + 8+3*stride)= v;
-    }else
-        assert(0);
-    assert(h==4);
-}
-
-#endif /* AVCODEC_RECTANGLE_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/scratch.c
--- a/ffmpeg_smp/h264dec/libavcodec/scratch.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,295 +0,0 @@
-static void *entropy_thread(void *arg){
-	H264Context *h = (H264Context *) arg;
-	EDSlice *s;
-	
-	H264Cabac hcabac;
-	CABACContext cabac;
-	
-	ff_init_cabac_states();
-	
-	if (init_cabac(h, &hcabac)<0)
-		return NULL;
-	
-	for(;;){
-		{
-			pthread_mutex_lock(&h->lock[ENTROPY]);
-			while (h->ed_cnt<=0)
-				pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]);
-			s= &h->ed_q[h->ed_fo];
-			pthread_mutex_unlock(&h->lock[ENTROPY]);
-			h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT;
-		}
-		if (s->state<0)
-			break;
-		
-		decode_slice_entropy(&hcabac, &cabac, s);
-		
-		{
-			pthread_mutex_lock(&h->lock[MBDEC]);
-			while (h->mbdec_cnt >= MAX_SLICE_COUNT)
-				pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
-			h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s);
-			h->mbdec_cnt++;
-			h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
-			pthread_cond_signal(&h->cond[MBDEC]);
-			pthread_mutex_unlock(&h->lock[MBDEC]);
-		}
-		{
-			pthread_mutex_lock(&h->lock[ENTROPY]);
-			h->ed_cnt--;
-			pthread_cond_signal(&h->cond[ENTROPY]);
-			pthread_mutex_unlock(&h->lock[ENTROPY]);
-		}
-	}
-	
-	{
-		pthread_mutex_lock(&h->lock[MBDEC]);
-		while (h->mbdec_cnt >= MAX_SLICE_COUNT)
-			pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
-		h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s);
-		h->mbdec_cnt++;
-		h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
-		pthread_cond_signal(&h->cond[MBDEC]);
-		pthread_mutex_unlock(&h->lock[MBDEC]);
-		
-	}
-	
-	free_cabac(&hcabac);
-	
-	pthread_exit(NULL);
-	return NULL;
-	
-}
-/*
-* The following code is the main loop of the file converter
-*/
-int av_transcode_1ed(int ifile, int ofile, int frame_width, int frame_height) {
-	H264Context *h;
-	pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr;
-	
-	h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height);
-	
-	timer_start = av_gettime();
-	
-	//    pthread_create(&read_thr, NULL, read_thread, h);
-	//    pthread_create(&parsenal_thr, NULL, parsenal_thread, h);
-	pthread_create(&entropy_thr, NULL, entropy_mbd_thread, h);
-	
-	// pthread_create(&mbdec_thr, NULL, mbdec_thread, h);
-	
-	//   pthread_create(&write_thr, NULL, write_thread, h);
-	
-	//   pthread_join(read_thr, NULL);
-	//    pthread_join(parsenal_thr, NULL);
-	pthread_join(entropy_thr, NULL);
-	//    pthread_join(mbdec_thr, NULL);
-	//	printf("before write_thr\n");
-	//    pthread_join(write_thr, NULL);
-	
-	/* finished ! */
-	ff_h264_decode_end(h);
-	
-	return 0;
-}
-
-static void reset_h264mb(EDSlice *s, int mb_width, int mb_height){
-	for (int i=0; i<mb_height; i++){
-		for (int j=0; j<mb_width; j++){
-			H264Mb *m = &s->mbs[i*mb_width + j];
-
-			m->left_mb_xy=0;
-			m->top_mb_xy = 0;
-		}
-	}
-}
-
-static void *entropy_mbd_thread(void *arg){
-	H264Context *h = (H264Context *) arg;
-
-	EDSlice slice, *s=&slice;
-	MBSlice mbslice, *s2=&mbslice;
-	H264Cabac hcabac;
-	CABACContext cabac;
-	int frames =0;
-	MBDecContext mbdec, *d=&mbdec;
-	int size=h->width*h->height;
-	WriteContext write, *w=&write;
-	AVCodecParserContext parser, *pc= &parser;
-	NalContext nal, *n=&nal;
-
-
-	memset(pc, 0, sizeof(AVCodecParserContext));
-	pc->buffer_size = 2048;
-	pc->final_frame = 0;
-	pc->cur_len= 0;
-	pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE);
-	pc->size = 2048;
-	pc->eof_reached =0;
-	pc->ifile = h->ifile;
-
-	//init parse
-	memset(n, 0, sizeof(NalContext));
-	n->width = h->width;
-	n->height = h->height;
-	n->mb_height = h->mb_height;
-	n->mb_width  = h->mb_width;
-	n->b4_stride = n->mb_width*4 + 1;
-	n->mb_stride = n->mb_width + 1;
-	n->outputed_poc = INT_MIN;
-// 	memset(s, 0, sizeof(EDSlice));
-// 	ff_init_slice(n, s);
-//
-
-	memset(w, 0, sizeof(WriteContext));
-	w->bit_buffer_size= FFMAX(1024*256, 6*size + 200);
-	w->bit_buffer=  av_mallocz(w->bit_buffer_size);
-
-
-
-	ff_h264dsp_init(&d->hdsp);
-	ff_h264_pred_init(&d->hpc);
-	dsputil_init(&d->dsp);
-	d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab;
-	d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab;
-	d->mb_height = (h->height + 15) / 16;
-	d->mb_width  = (h->width  + 15) / 16;
-	d->linesize = h->width + EDGE_WIDTH*2;
-	d->uvlinesize = d->linesize>>1;
-
-	for(int i=0; i<16; i++){
-		d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3);
-	}
-	for(int i=0; i<4; i++){
-		d->block_offset[16+i]=
-		d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3);
-	}
-
-	d->scratchpad= av_mallocz((h->width+64)*4*16*2*sizeof(uint8_t));
-
-	ff_init_cabac_states();
-
-	if (init_cabac(h, &hcabac)<0)
-		return NULL;
-
-	while(!pc->final_frame && frames_max++ < 1000){
-		Picture *out;
-
-		RawFrame *frm;
-		Picture *pic=NULL;
-
-		RawFrame frm_read;
-		frm_read.state =0;
-		av_read_frame_internal(pc, &frm_read);
-		frm = &frm_read;
-
-		if (frm->state < 0)
-			break;
-/*
-		{
-			pthread_mutex_lock(&h->lock[PARSE2]);
-			while (h->slice_cnt<=0)
-				pthread_cond_wait(&h->cond[PARSE2], &h->lock[PARSE2]);
-			h->slice_cnt--;
-			s= &h->slices[h->slice_next++];
-			h->slice_next %= MAX_SLICE_COUNT;
-			pthread_mutex_unlock(&h->lock[PARSE2]);
-		}*/
-		ff_init_slice(n, s);
-		reset_h264mb(s, n->mb_width, n->mb_height);
-		for(int i=0; i<MAX_PIC_COUNT; i++){
-			if(h->picture[i].reference==0){
-				pic= &h->picture[i];
-				break;
-			}
-		}
-// 		{
-// 			pthread_mutex_lock(&h->lock[PARSE3]);
-// 			while (h->free_pic_cnt<=0)
-// 				pthread_cond_wait(&h->cond[PARSE3], &h->lock[PARSE3]);
-// 			h->free_pic_cnt--;
-// 			/* use first free picture */
-// 			for(int i=0; i<MAX_PIC_COUNT; i++){
-// 				if(h->picture[i].reference==0){
-// 					pic= &h->picture[i];
-// 					break;
-// 				}
-// 			}
-// 			pthread_mutex_unlock(&h->lock[PARSE3]);
-// 		}
-		ff_alloc_picture(n, s, pic);
-
-		decode_nal_units(n, s, frm, pic);
-
-
-		decode_slice_entropy(&hcabac, &cabac, s);
-		memcpy( s2, s, sizeof(MBSlice)); //this only copys the COMMON_SLICE part
-		av_freep(&s->gb.raw);
-		decode_slice_mb_seq(d, s2);
-
-//         if (s2->release_cnt>0) {
-//             int i;
-//             for (i=0; i<s2->release_cnt; i++){
-//                 if ((s2->release_ref[i]->reference & ~2) == 0)
-//                     default_release_buffer(h, s2->release_ref[i]);
-//                 else
-//                     s2->release_ref[i]->reference &= ~2;
-//             }
-//             s->release_cnt=0;
-//         }
-
-if (s->release_cnt>0) {
-	int i;
-	for (i=0; i<s->release_cnt; i++){
-		s->release_ref[i]->reference &= ~2;
-	}
-	s->release_cnt=0;
-}
-
-
-        {
-			pthread_mutex_lock(&h->lock[PARSE2]);
-			h->slice_cnt++;
-			pthread_cond_signal(&h->cond[PARSE2]);
-			pthread_mutex_unlock(&h->lock[PARSE2]);
-		}
-
-		out =output_frame(w, s2->current_picture, h->ofile, h->width, h->height);
-		print_report(w->frame_number, w->video_size, 0);
-
-		if (out){
-// 			if ((out->reference & ~1) == 0)
-// 				default_release_buffer(h, out);
-// 			else
-				out->reference &= ~1;
-		}
-
-		{
-			pthread_mutex_lock(&h->lock[ENTROPY]);
-			h->ed_cnt--;
-			pthread_cond_signal(&h->cond[ENTROPY]);
-			pthread_mutex_unlock(&h->lock[ENTROPY]);
-		}
-	}
-	while (output_frame(w, NULL, h->ofile, h->width, h->height));
-	print_report(w->frame_number, w->video_size, 1);
-
-	av_free(w->bit_buffer);
-
-	{//propagate exit
-		pthread_mutex_lock(&h->lock[WRITE]);
-		while (h->write_cnt>= MAX_DELAYED_PIC_COUNT)
-			pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
-		last_pic.reference = -1;
-		h->write_q[h->write_fi] = &last_pic;
-		h->write_cnt++;
-		h->write_fi++; h->write_fi %= MAX_DELAYED_PIC_COUNT;
-		pthread_cond_signal(&h->cond[WRITE]);
-		pthread_mutex_unlock(&h->lock[WRITE]);
-
-	}
-	free_cabac(&hcabac);
-
-	pthread_exit(NULL);
-	return NULL;
-
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/simple_idct.c
--- a/ffmpeg_smp/h264dec/libavcodec/simple_idct.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,372 +0,0 @@
-/*
- * Simple IDCT
- *
- * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * simpleidct in C.
- */
-
-/*
-  based upon some outcommented c code from mpeg2dec (idct_mmx.c
-  written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
- */
-#include "avcodec.h"
-#include "dsputil.h"
-#include "mathops.h"
-#include "simple_idct.h"
-
-#if 0
-#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
-#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
-#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
-#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
-#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
-#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
-#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
-#define ROW_SHIFT 8
-#define COL_SHIFT 17
-#else
-#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define ROW_SHIFT 11
-#define COL_SHIFT 20 // 6
-#endif
-
-static inline void idctRowCondDC (DCTELEM * row)
-{
-        int a0, a1, a2, a3, b0, b1, b2, b3;
-        uint64_t temp;
-
-#if HAVE_BIGENDIAN
-#define ROW0_MASK 0xffff000000000000LL
-#else
-#define ROW0_MASK 0xffffLL
-#endif
-        if(sizeof(DCTELEM)==2){
-            if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) |
-                  ((uint64_t *)row)[1]) == 0) {
-                temp = (row[0] << 3) & 0xffff;
-                temp += temp << 16;
-                temp += temp << 32;
-                ((uint64_t *)row)[0] = temp;
-                ((uint64_t *)row)[1] = temp;
-                return;
-            }
-        }else{
-            if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
-                row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
-                return;
-            }
-        }
-
-        a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
-        a1 = a0;
-        a2 = a0;
-        a3 = a0;
-
-        /* no need to optimize : gcc does it */
-        a0 += W2 * row[2];
-        a1 += W6 * row[2];
-        a2 -= W6 * row[2];
-        a3 -= W2 * row[2];
-
-        b0 = MUL16(W1, row[1]);
-        MAC16(b0, W3, row[3]);
-        b1 = MUL16(W3, row[1]);
-        MAC16(b1, -W7, row[3]);
-        b2 = MUL16(W5, row[1]);
-        MAC16(b2, -W1, row[3]);
-        b3 = MUL16(W7, row[1]);
-        MAC16(b3, -W5, row[3]);
-
-        temp = ((uint64_t*)row)[1];
-
-        if (temp != 0) {
-            a0 += W4*row[4] + W6*row[6];
-            a1 += - W4*row[4] - W2*row[6];
-            a2 += - W4*row[4] + W2*row[6];
-            a3 += W4*row[4] - W6*row[6];
-
-            MAC16(b0, W5, row[5]);
-            MAC16(b0, W7, row[7]);
-
-            MAC16(b1, -W1, row[5]);
-            MAC16(b1, -W5, row[7]);
-
-            MAC16(b2, W7, row[5]);
-            MAC16(b2, W3, row[7]);
-
-            MAC16(b3, W3, row[5]);
-            MAC16(b3, -W1, row[7]);
-        }
-
-        row[0] = (a0 + b0) >> ROW_SHIFT;
-        row[7] = (a0 - b0) >> ROW_SHIFT;
-        row[1] = (a1 + b1) >> ROW_SHIFT;
-        row[6] = (a1 - b1) >> ROW_SHIFT;
-        row[2] = (a2 + b2) >> ROW_SHIFT;
-        row[5] = (a2 - b2) >> ROW_SHIFT;
-        row[3] = (a3 + b3) >> ROW_SHIFT;
-        row[4] = (a3 - b3) >> ROW_SHIFT;
-}
-
-static inline void idctSparseColPut (uint8_t *dest, int line_size,
-                                     DCTELEM * col)
-{
-        int a0, a1, a2, a3, b0, b1, b2, b3;
-        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-        /* XXX: I did that only to give same values as previous code */
-        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
-        a1 = a0;
-        a2 = a0;
-        a3 = a0;
-
-        a0 +=  + W2*col[8*2];
-        a1 +=  + W6*col[8*2];
-        a2 +=  - W6*col[8*2];
-        a3 +=  - W2*col[8*2];
-
-        b0 = MUL16(W1, col[8*1]);
-        b1 = MUL16(W3, col[8*1]);
-        b2 = MUL16(W5, col[8*1]);
-        b3 = MUL16(W7, col[8*1]);
-
-        MAC16(b0, + W3, col[8*3]);
-        MAC16(b1, - W7, col[8*3]);
-        MAC16(b2, - W1, col[8*3]);
-        MAC16(b3, - W5, col[8*3]);
-
-        if(col[8*4]){
-            a0 += + W4*col[8*4];
-            a1 += - W4*col[8*4];
-            a2 += - W4*col[8*4];
-            a3 += + W4*col[8*4];
-        }
-
-        if (col[8*5]) {
-            MAC16(b0, + W5, col[8*5]);
-            MAC16(b1, - W1, col[8*5]);
-            MAC16(b2, + W7, col[8*5]);
-            MAC16(b3, + W3, col[8*5]);
-        }
-
-        if(col[8*6]){
-            a0 += + W6*col[8*6];
-            a1 += - W2*col[8*6];
-            a2 += + W2*col[8*6];
-            a3 += - W6*col[8*6];
-        }
-
-        if (col[8*7]) {
-            MAC16(b0, + W7, col[8*7]);
-            MAC16(b1, - W5, col[8*7]);
-            MAC16(b2, + W3, col[8*7]);
-            MAC16(b3, - W1, col[8*7]);
-        }
-
-        dest[0] = cm[(a0 + b0) >> COL_SHIFT];
-        dest += line_size;
-        dest[0] = cm[(a1 + b1) >> COL_SHIFT];
-        dest += line_size;
-        dest[0] = cm[(a2 + b2) >> COL_SHIFT];
-        dest += line_size;
-        dest[0] = cm[(a3 + b3) >> COL_SHIFT];
-        dest += line_size;
-        dest[0] = cm[(a3 - b3) >> COL_SHIFT];
-        dest += line_size;
-        dest[0] = cm[(a2 - b2) >> COL_SHIFT];
-        dest += line_size;
-        dest[0] = cm[(a1 - b1) >> COL_SHIFT];
-        dest += line_size;
-        dest[0] = cm[(a0 - b0) >> COL_SHIFT];
-}
-
-static inline void idctSparseColAdd (uint8_t *dest, int line_size,
-                                     DCTELEM * col)
-{
-        int a0, a1, a2, a3, b0, b1, b2, b3;
-        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-        /* XXX: I did that only to give same values as previous code */
-        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
-        a1 = a0;
-        a2 = a0;
-        a3 = a0;
-
-        a0 +=  + W2*col[8*2];
-        a1 +=  + W6*col[8*2];
-        a2 +=  - W6*col[8*2];
-        a3 +=  - W2*col[8*2];
-
-        b0 = MUL16(W1, col[8*1]);
-        b1 = MUL16(W3, col[8*1]);
-        b2 = MUL16(W5, col[8*1]);
-        b3 = MUL16(W7, col[8*1]);
-
-        MAC16(b0, + W3, col[8*3]);
-        MAC16(b1, - W7, col[8*3]);
-        MAC16(b2, - W1, col[8*3]);
-        MAC16(b3, - W5, col[8*3]);
-
-        if(col[8*4]){
-            a0 += + W4*col[8*4];
-            a1 += - W4*col[8*4];
-            a2 += - W4*col[8*4];
-            a3 += + W4*col[8*4];
-        }
-
-        if (col[8*5]) {
-            MAC16(b0, + W5, col[8*5]);
-            MAC16(b1, - W1, col[8*5]);
-            MAC16(b2, + W7, col[8*5]);
-            MAC16(b3, + W3, col[8*5]);
-        }
-
-        if(col[8*6]){
-            a0 += + W6*col[8*6];
-            a1 += - W2*col[8*6];
-            a2 += + W2*col[8*6];
-            a3 += - W6*col[8*6];
-        }
-
-        if (col[8*7]) {
-            MAC16(b0, + W7, col[8*7]);
-            MAC16(b1, - W5, col[8*7]);
-            MAC16(b2, + W3, col[8*7]);
-            MAC16(b3, - W1, col[8*7]);
-        }
-
-        dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)];
-        dest += line_size;
-        dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
-        dest += line_size;
-        dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
-        dest += line_size;
-        dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
-        dest += line_size;
-        dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
-        dest += line_size;
-        dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
-        dest += line_size;
-        dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
-        dest += line_size;
-        dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
-}
-
-static inline void idctSparseCol (DCTELEM * col)
-{
-        int a0, a1, a2, a3, b0, b1, b2, b3;
-
-        /* XXX: I did that only to give same values as previous code */
-        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
-        a1 = a0;
-        a2 = a0;
-        a3 = a0;
-
-        a0 +=  + W2*col[8*2];
-        a1 +=  + W6*col[8*2];
-        a2 +=  - W6*col[8*2];
-        a3 +=  - W2*col[8*2];
-
-        b0 = MUL16(W1, col[8*1]);
-        b1 = MUL16(W3, col[8*1]);
-        b2 = MUL16(W5, col[8*1]);
-        b3 = MUL16(W7, col[8*1]);
-
-        MAC16(b0, + W3, col[8*3]);
-        MAC16(b1, - W7, col[8*3]);
-        MAC16(b2, - W1, col[8*3]);
-        MAC16(b3, - W5, col[8*3]);
-
-        if(col[8*4]){
-            a0 += + W4*col[8*4];
-            a1 += - W4*col[8*4];
-            a2 += - W4*col[8*4];
-            a3 += + W4*col[8*4];
-        }
-
-        if (col[8*5]) {
-            MAC16(b0, + W5, col[8*5]);
-            MAC16(b1, - W1, col[8*5]);
-            MAC16(b2, + W7, col[8*5]);
-            MAC16(b3, + W3, col[8*5]);
-        }
-
-        if(col[8*6]){
-            a0 += + W6*col[8*6];
-            a1 += - W2*col[8*6];
-            a2 += + W2*col[8*6];
-            a3 += - W6*col[8*6];
-        }
-
-        if (col[8*7]) {
-            MAC16(b0, + W7, col[8*7]);
-            MAC16(b1, - W5, col[8*7]);
-            MAC16(b2, + W3, col[8*7]);
-            MAC16(b3, - W1, col[8*7]);
-        }
-
-        col[0 ] = ((a0 + b0) >> COL_SHIFT);
-        col[8 ] = ((a1 + b1) >> COL_SHIFT);
-        col[16] = ((a2 + b2) >> COL_SHIFT);
-        col[24] = ((a3 + b3) >> COL_SHIFT);
-        col[32] = ((a3 - b3) >> COL_SHIFT);
-        col[40] = ((a2 - b2) >> COL_SHIFT);
-        col[48] = ((a1 - b1) >> COL_SHIFT);
-        col[56] = ((a0 - b0) >> COL_SHIFT);
-}
-
-void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    int i;
-    for(i=0; i<8; i++)
-        idctRowCondDC(block + i*8);
-
-    for(i=0; i<8; i++)
-        idctSparseColPut(dest + i, line_size, block + i);
-}
-
-void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    int i;
-    for(i=0; i<8; i++)
-        idctRowCondDC(block + i*8);
-
-    for(i=0; i<8; i++)
-        idctSparseColAdd(dest + i, line_size, block + i);
-}
-
-void ff_simple_idct(DCTELEM *block)
-{
-    int i;
-    for(i=0; i<8; i++)
-        idctRowCondDC(block + i*8);
-
-    for(i=0; i<8; i++)
-        idctSparseCol(block + i);
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/simple_idct.h
--- a/ffmpeg_smp/h264dec/libavcodec/simple_idct.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,47 +0,0 @@
-/*
- * Simple IDCT
- *
- * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * simple idct header.
- */
-
-#ifndef AVCODEC_SIMPLE_IDCT_H
-#define AVCODEC_SIMPLE_IDCT_H
-
-#include <stdint.h>
-#include "dsputil.h"
-
-void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_simple_idct_mmx(int16_t *block);
-void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct(DCTELEM *block);
-
-void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block);
-
-void ff_simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block);
-
-#endif /* AVCODEC_SIMPLE_IDCT_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/utils.c
--- a/ffmpeg_smp/h264dec/libavcodec/utils.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,68 +0,0 @@
-/*
- * utils for libavcodec
- * Copyright (c) 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * utils.
- */
-
-/* needed for mkstemp() */
-#define _XOPEN_SOURCE 600
-
-#include "avcodec.h"
-#include "dsputil.h"
-
-#include <stdlib.h>
-#include <stdarg.h>
-#include <limits.h>
-#include <float.h>
-//#undef NDEBUG
-#include <assert.h>
-
-#include <fcntl.h>
-
-void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size)
-{
-    if(min_size < *size)
-        return ptr;
-
-    *size= FFMAX(17*min_size/16 + 32, min_size);
-
-    ptr= av_realloc(ptr, *size);
-    if(!ptr) //we could set this to the unmodified min_size but this is safer if the user lost the ptr and uses NULL now
-        *size= 0;
-
-    return ptr;
-}
-
-void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size)
-{
-    void **p = ptr;
-    if (min_size < *size)
-        return;
-    *size= FFMAX(17*min_size/16 + 32, min_size);
-    av_free(*p);
-    *p = av_malloc(*size);
-    if (!*p) *size = 0;
-}
-
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/cpuid.c
--- a/ffmpeg_smp/h264dec/libavcodec/x86/cpuid.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,135 +0,0 @@
-/*
- * CPU detection code, extracted from mmx.h
- * (c)1997-99 by H. Dietz and R. Fisher
- * Converted to C and improved by Fabrice Bellard.
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdlib.h>
-#include "libavutil/x86_cpu.h"
-#include "libavcodec/dsputil.h"
-
-#undef printf
-
-/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
-#define cpuid(index,eax,ebx,ecx,edx)\
-    __asm__ volatile\
-        ("mov %%"REG_b", %%"REG_S"\n\t"\
-         "cpuid\n\t"\
-         "xchg %%"REG_b", %%"REG_S\
-         : "=a" (eax), "=S" (ebx),\
-           "=c" (ecx), "=d" (edx)\
-         : "0" (index));
-
-/* Function to test if multimedia instructions are supported...  */
-int mm_support()
-{
-    int rval = 0;
-    int eax, ebx, ecx, edx;
-    int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
-
-#if ARCH_X86_32
-    x86_reg a, c;
-    __asm__ volatile (
-        /* See if CPUID instruction is supported ... */
-        /* ... Get copies of EFLAGS into eax and ecx */
-        "pushfl\n\t"
-        "pop %0\n\t"
-        "mov %0, %1\n\t"
-
-        /* ... Toggle the ID bit in one copy and store */
-        /*     to the EFLAGS reg */
-        "xor $0x200000, %0\n\t"
-        "push %0\n\t"
-        "popfl\n\t"
-
-        /* ... Get the (hopefully modified) EFLAGS */
-        "pushfl\n\t"
-        "pop %0\n\t"
-        : "=a" (a), "=c" (c)
-        :
-        : "cc"
-        );
-
-    if (a == c)
-        return 0; /* CPUID not supported */
-#endif
-
-    cpuid(0, max_std_level, ebx, ecx, edx);
-
-    if(max_std_level >= 1){
-        cpuid(1, eax, ebx, ecx, std_caps);
-        if (std_caps & (1<<23))
-            rval |= FF_MM_MMX;
-        if (std_caps & (1<<25))
-            rval |= FF_MM_MMX2
-#if HAVE_SSE
-                  | FF_MM_SSE;
-        if (std_caps & (1<<26))
-            rval |= FF_MM_SSE2;
-        if (ecx & 1)
-            rval |= FF_MM_SSE3;
-        if (ecx & 0x00000200 )
-            rval |= FF_MM_SSSE3;
-        if (ecx & 0x00080000 )
-            rval |= FF_MM_SSE4;
-        if (ecx & 0x00100000 )
-            rval |= FF_MM_SSE42;
-#endif
-                  ;
-    }
-
-    cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
-
-    if(max_ext_level >= 0x80000001){
-        cpuid(0x80000001, eax, ebx, ecx, ext_caps);
-        if (ext_caps & (1<<31))
-            rval |= FF_MM_3DNOW;
-        if (ext_caps & (1<<30))
-            rval |= FF_MM_3DNOWEXT;
-        if (ext_caps & (1<<23))
-            rval |= FF_MM_MMX;
-        if (ext_caps & (1<<22))
-            rval |= FF_MM_MMX2;
-    }
-
-#if 0
-    av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n",
-        (rval&FF_MM_MMX) ? "MMX ":"",
-        (rval&FF_MM_MMX2) ? "MMX2 ":"",
-        (rval&FF_MM_SSE) ? "SSE ":"",
-        (rval&FF_MM_SSE2) ? "SSE2 ":"",
-        (rval&FF_MM_SSE3) ? "SSE3 ":"",
-        (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
-        (rval&FF_MM_SSE4) ? "SSE4.1 ":"",
-        (rval&FF_MM_SSE42) ? "SSE4.2 ":"",
-        (rval&FF_MM_3DNOW) ? "3DNow ":"",
-        (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":"");
-#endif
-    return rval;
-}
-
-#ifdef TEST
-int main ( void )
-{
-    int mm_flags;
-    mm_flags = mm_support();
-    printf("mm_support = 0x%08X\n",mm_flags);
-    return 0;
-}
-#endif
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_mmx.c
--- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_mmx.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,304 +0,0 @@
-/*
- * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
- *                    Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * MMX optimized version of (put|avg)_h264_chroma_mc8.
- * H264_CHROMA_MC8_TMPL must be defined to the desired function name
- * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
- * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
- */
-static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
-{
-    DECLARE_ALIGNED(8, uint64_t, AA);
-    DECLARE_ALIGNED(8, uint64_t, DD);
-    int i;
-
-    if(y==0 && x==0) {
-        /* no filter needed */
-        H264_CHROMA_MC8_MV0(dst, src, stride, h);
-        return;
-    }
-
-    assert(x<8 && y<8 && x>=0 && y>=0);
-
-    if(y==0 || x==0)
-    {
-        /* 1 dimensional filter only */
-        const int dxy = x ? 1 : stride;
-
-        __asm__ volatile(
-            "movd %0, %%mm5\n\t"
-            "movq %1, %%mm4\n\t"
-            "movq %2, %%mm6\n\t"         /* mm6 = rnd >> 3 */
-            "punpcklwd %%mm5, %%mm5\n\t"
-            "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
-            "pxor %%mm7, %%mm7\n\t"
-            "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
-            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
-
-        for(i=0; i<h; i++) {
-            __asm__ volatile(
-                /* mm0 = src[0..7], mm1 = src[1..8] */
-                "movq %0, %%mm0\n\t"
-                "movq %1, %%mm2\n\t"
-                :: "m"(src[0]), "m"(src[dxy]));
-
-            __asm__ volatile(
-                /* [mm0,mm1] = A * src[0..7] */
-                /* [mm2,mm3] = B * src[1..8] */
-                "movq %%mm0, %%mm1\n\t"
-                "movq %%mm2, %%mm3\n\t"
-                "punpcklbw %%mm7, %%mm0\n\t"
-                "punpckhbw %%mm7, %%mm1\n\t"
-                "punpcklbw %%mm7, %%mm2\n\t"
-                "punpckhbw %%mm7, %%mm3\n\t"
-                "pmullw %%mm4, %%mm0\n\t"
-                "pmullw %%mm4, %%mm1\n\t"
-                "pmullw %%mm5, %%mm2\n\t"
-                "pmullw %%mm5, %%mm3\n\t"
-
-                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */
-                "paddw %%mm6, %%mm0\n\t"
-                "paddw %%mm6, %%mm1\n\t"
-                "paddw %%mm2, %%mm0\n\t"
-                "paddw %%mm3, %%mm1\n\t"
-                "psrlw $3, %%mm0\n\t"
-                "psrlw $3, %%mm1\n\t"
-                "packuswb %%mm1, %%mm0\n\t"
-                H264_CHROMA_OP(%0, %%mm0)
-                "movq %%mm0, %0\n\t"
-                : "=m" (dst[0]));
-
-            src += stride;
-            dst += stride;
-        }
-        return;
-    }
-
-    /* general case, bilinear */
-    __asm__ volatile("movd %2, %%mm4\n\t"
-                 "movd %3, %%mm6\n\t"
-                 "punpcklwd %%mm4, %%mm4\n\t"
-                 "punpcklwd %%mm6, %%mm6\n\t"
-                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
-                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
-                 "movq %%mm4, %%mm5\n\t"
-                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
-                 "psllw $3, %%mm5\n\t"
-                 "psllw $3, %%mm6\n\t"
-                 "movq %%mm5, %%mm7\n\t"
-                 "paddw %%mm6, %%mm7\n\t"
-                 "movq %%mm4, %1\n\t"         /* DD = x * y */
-                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
-                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
-                 "paddw %4, %%mm4\n\t"
-                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
-                 "pxor %%mm7, %%mm7\n\t"
-                 "movq %%mm4, %0\n\t"
-                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
-
-    __asm__ volatile(
-        /* mm0 = src[0..7], mm1 = src[1..8] */
-        "movq %0, %%mm0\n\t"
-        "movq %1, %%mm1\n\t"
-        : : "m" (src[0]), "m" (src[1]));
-
-    for(i=0; i<h; i++) {
-        src += stride;
-
-        __asm__ volatile(
-            /* mm2 = A * src[0..3] + B * src[1..4] */
-            /* mm3 = A * src[4..7] + B * src[5..8] */
-            "movq %%mm0, %%mm2\n\t"
-            "movq %%mm1, %%mm3\n\t"
-            "punpckhbw %%mm7, %%mm0\n\t"
-            "punpcklbw %%mm7, %%mm1\n\t"
-            "punpcklbw %%mm7, %%mm2\n\t"
-            "punpckhbw %%mm7, %%mm3\n\t"
-            "pmullw %0, %%mm0\n\t"
-            "pmullw %0, %%mm2\n\t"
-            "pmullw %%mm5, %%mm1\n\t"
-            "pmullw %%mm5, %%mm3\n\t"
-            "paddw %%mm1, %%mm2\n\t"
-            "paddw %%mm0, %%mm3\n\t"
-            : : "m" (AA));
-
-        __asm__ volatile(
-            /* [mm2,mm3] += C * src[0..7] */
-            "movq %0, %%mm0\n\t"
-            "movq %%mm0, %%mm1\n\t"
-            "punpcklbw %%mm7, %%mm0\n\t"
-            "punpckhbw %%mm7, %%mm1\n\t"
-            "pmullw %%mm6, %%mm0\n\t"
-            "pmullw %%mm6, %%mm1\n\t"
-            "paddw %%mm0, %%mm2\n\t"
-            "paddw %%mm1, %%mm3\n\t"
-            : : "m" (src[0]));
-
-        __asm__ volatile(
-            /* [mm2,mm3] += D * src[1..8] */
-            "movq %1, %%mm1\n\t"
-            "movq %%mm1, %%mm0\n\t"
-            "movq %%mm1, %%mm4\n\t"
-            "punpcklbw %%mm7, %%mm0\n\t"
-            "punpckhbw %%mm7, %%mm4\n\t"
-            "pmullw %2, %%mm0\n\t"
-            "pmullw %2, %%mm4\n\t"
-            "paddw %%mm0, %%mm2\n\t"
-            "paddw %%mm4, %%mm3\n\t"
-            "movq %0, %%mm0\n\t"
-            : : "m" (src[0]), "m" (src[1]), "m" (DD));
-
-        __asm__ volatile(
-            /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
-            "paddw %1, %%mm2\n\t"
-            "paddw %1, %%mm3\n\t"
-            "psrlw $6, %%mm2\n\t"
-            "psrlw $6, %%mm3\n\t"
-            "packuswb %%mm3, %%mm2\n\t"
-            H264_CHROMA_OP(%0, %%mm2)
-            "movq %%mm2, %0\n\t"
-            : "=m" (dst[0]) : "m" (*rnd_reg));
-        dst+= stride;
-    }
-}
-
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
-{
-    __asm__ volatile(
-        "pxor   %%mm7, %%mm7        \n\t"
-        "movd %5, %%mm2             \n\t"
-        "movd %6, %%mm3             \n\t"
-        "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
-        "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
-        "punpcklwd %%mm2, %%mm2     \n\t"
-        "punpcklwd %%mm3, %%mm3     \n\t"
-        "punpcklwd %%mm2, %%mm2     \n\t"
-        "punpcklwd %%mm3, %%mm3     \n\t"
-        "psubw %%mm2, %%mm4         \n\t"
-        "psubw %%mm3, %%mm5         \n\t"
-
-        "movd  (%1), %%mm0          \n\t"
-        "movd 1(%1), %%mm6          \n\t"
-        "add %3, %1                 \n\t"
-        "punpcklbw %%mm7, %%mm0     \n\t"
-        "punpcklbw %%mm7, %%mm6     \n\t"
-        "pmullw %%mm4, %%mm0        \n\t"
-        "pmullw %%mm2, %%mm6        \n\t"
-        "paddw %%mm0, %%mm6         \n\t"
-
-        "1:                         \n\t"
-        "movd  (%1), %%mm0          \n\t"
-        "movd 1(%1), %%mm1          \n\t"
-        "add %3, %1                 \n\t"
-        "punpcklbw %%mm7, %%mm0     \n\t"
-        "punpcklbw %%mm7, %%mm1     \n\t"
-        "pmullw %%mm4, %%mm0        \n\t"
-        "pmullw %%mm2, %%mm1        \n\t"
-        "paddw %%mm0, %%mm1         \n\t"
-        "movq %%mm1, %%mm0          \n\t"
-        "pmullw %%mm5, %%mm6        \n\t"
-        "pmullw %%mm3, %%mm1        \n\t"
-        "paddw %4, %%mm6            \n\t"
-        "paddw %%mm6, %%mm1         \n\t"
-        "psrlw $6, %%mm1            \n\t"
-        "packuswb %%mm1, %%mm1      \n\t"
-        H264_CHROMA_OP4((%0), %%mm1, %%mm6)
-        "movd %%mm1, (%0)           \n\t"
-        "add %3, %0                 \n\t"
-        "movd  (%1), %%mm6          \n\t"
-        "movd 1(%1), %%mm1          \n\t"
-        "add %3, %1                 \n\t"
-        "punpcklbw %%mm7, %%mm6     \n\t"
-        "punpcklbw %%mm7, %%mm1     \n\t"
-        "pmullw %%mm4, %%mm6        \n\t"
-        "pmullw %%mm2, %%mm1        \n\t"
-        "paddw %%mm6, %%mm1         \n\t"
-        "movq %%mm1, %%mm6          \n\t"
-        "pmullw %%mm5, %%mm0        \n\t"
-        "pmullw %%mm3, %%mm1        \n\t"
-        "paddw %4, %%mm0            \n\t"
-        "paddw %%mm0, %%mm1         \n\t"
-        "psrlw $6, %%mm1            \n\t"
-        "packuswb %%mm1, %%mm1      \n\t"
-        H264_CHROMA_OP4((%0), %%mm1, %%mm0)
-        "movd %%mm1, (%0)           \n\t"
-        "add %3, %0                 \n\t"
-        "sub $2, %2                 \n\t"
-        "jnz 1b                     \n\t"
-        : "+r"(dst), "+r"(src), "+r"(h)
-        : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
-    );
-}
-
-#ifdef H264_CHROMA_MC2_TMPL
-static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
-    int tmp = ((1<<16)-1)*x + 8;
-    int CD= tmp*y;
-    int AB= (tmp<<3) - CD;
-    __asm__ volatile(
-        /* mm5 = {A,B,A,B} */
-        /* mm6 = {C,D,C,D} */
-        "movd %0, %%mm5\n\t"
-        "movd %1, %%mm6\n\t"
-        "punpckldq %%mm5, %%mm5\n\t"
-        "punpckldq %%mm6, %%mm6\n\t"
-        "pxor %%mm7, %%mm7\n\t"
-        /* mm0 = src[0,1,1,2] */
-        "movd %2, %%mm2\n\t"
-        "punpcklbw %%mm7, %%mm2\n\t"
-        "pshufw $0x94, %%mm2, %%mm2\n\t"
-        :: "r"(AB), "r"(CD), "m"(src[0]));
-
-
-    __asm__ volatile(
-        "1:\n\t"
-        "add %4, %1\n\t"
-        /* mm1 = A * src[0,1] + B * src[1,2] */
-        "movq    %%mm2, %%mm1\n\t"
-        "pmaddwd %%mm5, %%mm1\n\t"
-        /* mm0 = src[0,1,1,2] */
-        "movd (%1), %%mm0\n\t"
-        "punpcklbw %%mm7, %%mm0\n\t"
-        "pshufw $0x94, %%mm0, %%mm0\n\t"
-        /* mm1 += C * src[0,1] + D * src[1,2] */
-        "movq    %%mm0, %%mm2\n\t"
-        "pmaddwd %%mm6, %%mm0\n\t"
-        "paddw      %3, %%mm1\n\t"
-        "paddw   %%mm0, %%mm1\n\t"
-        /* dst[0,1] = pack((mm1 + 32) >> 6) */
-        "psrlw $6, %%mm1\n\t"
-        "packssdw %%mm7, %%mm1\n\t"
-        "packuswb %%mm7, %%mm1\n\t"
-        H264_CHROMA_OP4((%0), %%mm1, %%mm3)
-        "movd %%mm1, %%esi\n\t"
-        "movw %%si, (%0)\n\t"
-        "add %4, %0\n\t"
-        "sub $1, %2\n\t"
-        "jnz 1b\n\t"
-        : "+r" (dst), "+r"(src), "+r"(h)
-        : "m" (ff_pw_32), "r"((x86_reg)stride)
-        : "%esi");
-
-}
-#endif
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_ssse3.c
--- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_ssse3.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,208 +0,0 @@
-/*
- * Copyright (c) 2008 Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
- * H264_CHROMA_MC8_TMPL must be defined to the desired function name
- * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
- * AVG_OP must be defined to empty for put and the identify for avg
- */
-static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
-{
-    if(y==0 && x==0) {
-        /* no filter needed */
-        H264_CHROMA_MC8_MV0(dst, src, stride, h);
-        return;
-    }
-
-    assert(x<8 && y<8 && x>=0 && y>=0);
-
-    if(y==0 || x==0)
-    {
-        /* 1 dimensional filter only */
-        __asm__ volatile(
-            "movd %0, %%xmm7 \n\t"
-            "movq %1, %%xmm6 \n\t"
-            "pshuflw $0, %%xmm7, %%xmm7 \n\t"
-            "movlhps %%xmm6, %%xmm6 \n\t"
-            "movlhps %%xmm7, %%xmm7 \n\t"
-            :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
-        );
-
-        if(x) {
-            __asm__ volatile(
-                "1: \n\t"
-                "movq (%1), %%xmm0 \n\t"
-                "movq 1(%1), %%xmm1 \n\t"
-                "movq (%1,%3), %%xmm2 \n\t"
-                "movq 1(%1,%3), %%xmm3 \n\t"
-                "punpcklbw %%xmm1, %%xmm0 \n\t"
-                "punpcklbw %%xmm3, %%xmm2 \n\t"
-                "pmaddubsw %%xmm7, %%xmm0 \n\t"
-                "pmaddubsw %%xmm7, %%xmm2 \n\t"
-         AVG_OP("movq (%0), %%xmm4 \n\t")
-         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
-                "paddw %%xmm6, %%xmm0 \n\t"
-                "paddw %%xmm6, %%xmm2 \n\t"
-                "psrlw $3, %%xmm0 \n\t"
-                "psrlw $3, %%xmm2 \n\t"
-                "packuswb %%xmm2, %%xmm0 \n\t"
-         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
-                "movq %%xmm0, (%0) \n\t"
-                "movhps %%xmm0, (%0,%3) \n\t"
-                "sub $2, %2 \n\t"
-                "lea (%1,%3,2), %1 \n\t"
-                "lea (%0,%3,2), %0 \n\t"
-                "jg 1b \n\t"
-                :"+r"(dst), "+r"(src), "+r"(h)
-                :"r"((x86_reg)stride)
-            );
-        } else {
-            __asm__ volatile(
-                "1: \n\t"
-                "movq (%1), %%xmm0 \n\t"
-                "movq (%1,%3), %%xmm1 \n\t"
-                "movdqa %%xmm1, %%xmm2 \n\t"
-                "movq (%1,%3,2), %%xmm3 \n\t"
-                "punpcklbw %%xmm1, %%xmm0 \n\t"
-                "punpcklbw %%xmm3, %%xmm2 \n\t"
-                "pmaddubsw %%xmm7, %%xmm0 \n\t"
-                "pmaddubsw %%xmm7, %%xmm2 \n\t"
-         AVG_OP("movq (%0), %%xmm4 \n\t")
-         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
-                "paddw %%xmm6, %%xmm0 \n\t"
-                "paddw %%xmm6, %%xmm2 \n\t"
-                "psrlw $3, %%xmm0 \n\t"
-                "psrlw $3, %%xmm2 \n\t"
-                "packuswb %%xmm2, %%xmm0 \n\t"
-         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
-                "movq %%xmm0, (%0) \n\t"
-                "movhps %%xmm0, (%0,%3) \n\t"
-                "sub $2, %2 \n\t"
-                "lea (%1,%3,2), %1 \n\t"
-                "lea (%0,%3,2), %0 \n\t"
-                "jg 1b \n\t"
-                :"+r"(dst), "+r"(src), "+r"(h)
-                :"r"((x86_reg)stride)
-            );
-        }
-        return;
-    }
-
-    /* general case, bilinear */
-    __asm__ volatile(
-        "movd %0, %%xmm7 \n\t"
-        "movd %1, %%xmm6 \n\t"
-        "movdqa %2, %%xmm5 \n\t"
-        "pshuflw $0, %%xmm7, %%xmm7 \n\t"
-        "pshuflw $0, %%xmm6, %%xmm6 \n\t"
-        "movlhps %%xmm7, %%xmm7 \n\t"
-        "movlhps %%xmm6, %%xmm6 \n\t"
-        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
-    );
-
-    __asm__ volatile(
-        "movq (%1), %%xmm0 \n\t"
-        "movq 1(%1), %%xmm1 \n\t"
-        "punpcklbw %%xmm1, %%xmm0 \n\t"
-        "add %3, %1 \n\t"
-        "1: \n\t"
-        "movq (%1), %%xmm1 \n\t"
-        "movq 1(%1), %%xmm2 \n\t"
-        "movq (%1,%3), %%xmm3 \n\t"
-        "movq 1(%1,%3), %%xmm4 \n\t"
-        "lea (%1,%3,2), %1 \n\t"
-        "punpcklbw %%xmm2, %%xmm1 \n\t"
-        "punpcklbw %%xmm4, %%xmm3 \n\t"
-        "movdqa %%xmm1, %%xmm2 \n\t"
-        "movdqa %%xmm3, %%xmm4 \n\t"
-        "pmaddubsw %%xmm7, %%xmm0 \n\t"
-        "pmaddubsw %%xmm6, %%xmm1 \n\t"
-        "pmaddubsw %%xmm7, %%xmm2 \n\t"
-        "pmaddubsw %%xmm6, %%xmm3 \n\t"
-        "paddw %%xmm5, %%xmm0 \n\t"
-        "paddw %%xmm5, %%xmm2 \n\t"
-        "paddw %%xmm0, %%xmm1 \n\t"
-        "paddw %%xmm2, %%xmm3 \n\t"
-        "movdqa %%xmm4, %%xmm0 \n\t"
-        "psrlw $6, %%xmm1 \n\t"
-        "psrlw $6, %%xmm3 \n\t"
- AVG_OP("movq (%0), %%xmm2 \n\t")
- AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
-        "packuswb %%xmm3, %%xmm1 \n\t"
- AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
-        "movq %%xmm1, (%0)\n\t"
-        "movhps %%xmm1, (%0,%3)\n\t"
-        "sub $2, %2 \n\t"
-        "lea (%0,%3,2), %0 \n\t"
-        "jg 1b \n\t"
-        :"+r"(dst), "+r"(src), "+r"(h)
-        :"r"((x86_reg)stride)
-    );
-}
-
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
-    __asm__ volatile(
-        "movd %0, %%mm7 \n\t"
-        "movd %1, %%mm6 \n\t"
-        "movq %2, %%mm5 \n\t"
-        "pshufw $0, %%mm7, %%mm7 \n\t"
-        "pshufw $0, %%mm6, %%mm6 \n\t"
-        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
-    );
-
-    __asm__ volatile(
-        "movd (%1), %%mm0 \n\t"
-        "punpcklbw 1(%1), %%mm0 \n\t"
-        "add %3, %1 \n\t"
-        "1: \n\t"
-        "movd (%1), %%mm1 \n\t"
-        "movd (%1,%3), %%mm3 \n\t"
-        "punpcklbw 1(%1), %%mm1 \n\t"
-        "punpcklbw 1(%1,%3), %%mm3 \n\t"
-        "lea (%1,%3,2), %1 \n\t"
-        "movq %%mm1, %%mm2 \n\t"
-        "movq %%mm3, %%mm4 \n\t"
-        "pmaddubsw %%mm7, %%mm0 \n\t"
-        "pmaddubsw %%mm6, %%mm1 \n\t"
-        "pmaddubsw %%mm7, %%mm2 \n\t"
-        "pmaddubsw %%mm6, %%mm3 \n\t"
-        "paddw %%mm5, %%mm0 \n\t"
-        "paddw %%mm5, %%mm2 \n\t"
-        "paddw %%mm0, %%mm1 \n\t"
-        "paddw %%mm2, %%mm3 \n\t"
-        "movq %%mm4, %%mm0 \n\t"
-        "psrlw $6, %%mm1 \n\t"
-        "psrlw $6, %%mm3 \n\t"
-        "packuswb %%mm1, %%mm1 \n\t"
-        "packuswb %%mm3, %%mm3 \n\t"
- AVG_OP("pavgb (%0), %%mm1 \n\t")
- AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
-        "movd %%mm1, (%0)\n\t"
-        "movd %%mm3, (%0,%3)\n\t"
-        "sub $2, %2 \n\t"
-        "lea (%0,%3,2), %0 \n\t"
-        "jg 1b \n\t"
-        :"+r"(dst), "+r"(src), "+r"(h)
-        :"r"((x86_reg)stride)
-    );
-}
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.c
--- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,821 +0,0 @@
-/*
- * MMX optimized DSP utils
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- */
-
-#include "libavutil/x86_cpu.h"
-#include "libavutil/internal.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/h264_dsp.h"
-#include "dsputil_mmx.h"
-
-
-//#undef NDEBUG
-//#include <assert.h>
-
-int mm_flags; /* multimedia extension flags */
-
-/* pixel operations */
-DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
-DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
-{0x8000000080000000ULL, 0x8000000080000000ULL};
-
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8  ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
-
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
-
-DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
-DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
-
-#define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t"
-#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
-#define MOVQ_ZERO(regd)  __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
-
-#define MOVQ_BFE(regd) \
-    __asm__ volatile ( \
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
-    "paddb %%" #regd ", %%" #regd " \n\t" ::)
-
-#ifndef PIC
-#define MOVQ_BONE(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
-#define MOVQ_WTWO(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
-#else
-// for shared library it's better to use this way for accessing constants
-// pcmpeqd -> -1
-#define MOVQ_BONE(regd) \
-    __asm__ volatile ( \
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
-    "psrlw $15, %%" #regd " \n\t" \
-    "packuswb %%" #regd ", %%" #regd " \n\t" ::)
-
-#define MOVQ_WTWO(regd) \
-    __asm__ volatile ( \
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
-    "psrlw $15, %%" #regd " \n\t" \
-    "psllw $1, %%" #regd " \n\t"::)
-
-#endif
-
-// using regr as temporary and for the output result
-// first argument is unmodifed and second is trashed
-// regfe is supposed to contain 0xfefefefefefefefe
-#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
-    "movq " #rega ", " #regr "  \n\t"\
-    "pand " #regb ", " #regr "  \n\t"\
-    "pxor " #rega ", " #regb "  \n\t"\
-    "pand " #regfe "," #regb "  \n\t"\
-    "psrlq $1, " #regb "        \n\t"\
-    "paddb " #regb ", " #regr " \n\t"
-
-#define PAVGB_MMX(rega, regb, regr, regfe) \
-    "movq " #rega ", " #regr "  \n\t"\
-    "por  " #regb ", " #regr "  \n\t"\
-    "pxor " #rega ", " #regb "  \n\t"\
-    "pand " #regfe "," #regb "  \n\t"\
-    "psrlq $1, " #regb "        \n\t"\
-    "psubb " #regb ", " #regr " \n\t"
-
-// mm6 is supposed to contain 0xfefefefefefefefe
-#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
-    "movq " #rega ", " #regr "  \n\t"\
-    "movq " #regc ", " #regp "  \n\t"\
-    "pand " #regb ", " #regr "  \n\t"\
-    "pand " #regd ", " #regp "  \n\t"\
-    "pxor " #rega ", " #regb "  \n\t"\
-    "pxor " #regc ", " #regd "  \n\t"\
-    "pand %%mm6, " #regb "      \n\t"\
-    "pand %%mm6, " #regd "      \n\t"\
-    "psrlq $1, " #regb "        \n\t"\
-    "psrlq $1, " #regd "        \n\t"\
-    "paddb " #regb ", " #regr " \n\t"\
-    "paddb " #regd ", " #regp " \n\t"
-
-#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
-    "movq " #rega ", " #regr "  \n\t"\
-    "movq " #regc ", " #regp "  \n\t"\
-    "por  " #regb ", " #regr "  \n\t"\
-    "por  " #regd ", " #regp "  \n\t"\
-    "pxor " #rega ", " #regb "  \n\t"\
-    "pxor " #regc ", " #regd "  \n\t"\
-    "pand %%mm6, " #regb "      \n\t"\
-    "pand %%mm6, " #regd "      \n\t"\
-    "psrlq $1, " #regd "        \n\t"\
-    "psrlq $1, " #regb "        \n\t"\
-    "psubb " #regb ", " #regr " \n\t"\
-    "psubb " #regd ", " #regp " \n\t"
-
-/***********************************/
-/* MMX2 specific */
-
-#define DEF(x) x ## _mmx2
-
-/* Introduced only in MMX2 set */
-#define PAVGB "pavgb"
-#define OP_AVG PAVGB
-
-#include "dsputil_mmx_avg_template.c"
-
-#undef DEF
-#undef PAVGB
-#undef OP_AVG
-
-#define put_no_rnd_pixels16_mmx put_pixels16_mmx
-#define put_no_rnd_pixels8_mmx put_pixels8_mmx
-#define put_pixels16_mmx2 put_pixels16_mmx
-#define put_pixels8_mmx2 put_pixels8_mmx
-#define put_pixels4_mmx2 put_pixels4_mmx
-#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
-#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
-#define put_pixels16_3dnow put_pixels16_mmx
-#define put_pixels8_3dnow put_pixels8_mmx
-#define put_pixels4_3dnow put_pixels4_mmx
-#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
-#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
-
-/***********************************/
-/* standard MMX */
-
-void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
-{
-    const DCTELEM *p;
-    uint8_t *pix;
-
-    /* read the pixels */
-    p = block;
-    pix = pixels;
-    /* unrolled loop */
-        __asm__ volatile(
-                "movq   %3, %%mm0               \n\t"
-                "movq   8%3, %%mm1              \n\t"
-                "movq   16%3, %%mm2             \n\t"
-                "movq   24%3, %%mm3             \n\t"
-                "movq   32%3, %%mm4             \n\t"
-                "movq   40%3, %%mm5             \n\t"
-                "movq   48%3, %%mm6             \n\t"
-                "movq   56%3, %%mm7             \n\t"
-                "packuswb %%mm1, %%mm0          \n\t"
-                "packuswb %%mm3, %%mm2          \n\t"
-                "packuswb %%mm5, %%mm4          \n\t"
-                "packuswb %%mm7, %%mm6          \n\t"
-                "movq   %%mm0, (%0)             \n\t"
-                "movq   %%mm2, (%0, %1)         \n\t"
-                "movq   %%mm4, (%0, %1, 2)      \n\t"
-                "movq   %%mm6, (%0, %2)         \n\t"
-                ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
-                :"memory");
-        pix += line_size*4;
-        p += 32;
-
-    // if here would be an exact copy of the code above
-    // compiler would generate some very strange code
-    // thus using "r"
-    __asm__ volatile(
-            "movq       (%3), %%mm0             \n\t"
-            "movq       8(%3), %%mm1            \n\t"
-            "movq       16(%3), %%mm2           \n\t"
-            "movq       24(%3), %%mm3           \n\t"
-            "movq       32(%3), %%mm4           \n\t"
-            "movq       40(%3), %%mm5           \n\t"
-            "movq       48(%3), %%mm6           \n\t"
-            "movq       56(%3), %%mm7           \n\t"
-            "packuswb %%mm1, %%mm0              \n\t"
-            "packuswb %%mm3, %%mm2              \n\t"
-            "packuswb %%mm5, %%mm4              \n\t"
-            "packuswb %%mm7, %%mm6              \n\t"
-            "movq       %%mm0, (%0)             \n\t"
-            "movq       %%mm2, (%0, %1)         \n\t"
-            "movq       %%mm4, (%0, %1, 2)      \n\t"
-            "movq       %%mm6, (%0, %2)         \n\t"
-            ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
-            :"memory");
-}
-
-DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
-  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-
-#define put_signed_pixels_clamped_mmx_half(off) \
-            "movq    "#off"(%2), %%mm1          \n\t"\
-            "movq 16+"#off"(%2), %%mm2          \n\t"\
-            "movq 32+"#off"(%2), %%mm3          \n\t"\
-            "movq 48+"#off"(%2), %%mm4          \n\t"\
-            "packsswb  8+"#off"(%2), %%mm1      \n\t"\
-            "packsswb 24+"#off"(%2), %%mm2      \n\t"\
-            "packsswb 40+"#off"(%2), %%mm3      \n\t"\
-            "packsswb 56+"#off"(%2), %%mm4      \n\t"\
-            "paddb %%mm0, %%mm1                 \n\t"\
-            "paddb %%mm0, %%mm2                 \n\t"\
-            "paddb %%mm0, %%mm3                 \n\t"\
-            "paddb %%mm0, %%mm4                 \n\t"\
-            "movq %%mm1, (%0)                   \n\t"\
-            "movq %%mm2, (%0, %3)               \n\t"\
-            "movq %%mm3, (%0, %3, 2)            \n\t"\
-            "movq %%mm4, (%0, %1)               \n\t"
-
-void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
-{
-    x86_reg line_skip = line_size;
-    x86_reg line_skip3;
-
-    __asm__ volatile (
-            "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
-            "lea (%3, %3, 2), %1                \n\t"
-            put_signed_pixels_clamped_mmx_half(0)
-            "lea (%0, %3, 4), %0                \n\t"
-            put_signed_pixels_clamped_mmx_half(64)
-            :"+&r" (pixels), "=&r" (line_skip3)
-            :"r" (block), "r"(line_skip)
-            :"memory");
-}
-
-void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
-{
-    const DCTELEM *p;
-    uint8_t *pix;
-    int i;
-
-    /* read the pixels */
-    p = block;
-    pix = pixels;
-    MOVQ_ZERO(mm7);
-    i = 4;
-    do {
-        __asm__ volatile(
-                "movq   (%2), %%mm0     \n\t"
-                "movq   8(%2), %%mm1    \n\t"
-                "movq   16(%2), %%mm2   \n\t"
-                "movq   24(%2), %%mm3   \n\t"
-                "movq   %0, %%mm4       \n\t"
-                "movq   %1, %%mm6       \n\t"
-                "movq   %%mm4, %%mm5    \n\t"
-                "punpcklbw %%mm7, %%mm4 \n\t"
-                "punpckhbw %%mm7, %%mm5 \n\t"
-                "paddsw %%mm4, %%mm0    \n\t"
-                "paddsw %%mm5, %%mm1    \n\t"
-                "movq   %%mm6, %%mm5    \n\t"
-                "punpcklbw %%mm7, %%mm6 \n\t"
-                "punpckhbw %%mm7, %%mm5 \n\t"
-                "paddsw %%mm6, %%mm2    \n\t"
-                "paddsw %%mm5, %%mm3    \n\t"
-                "packuswb %%mm1, %%mm0  \n\t"
-                "packuswb %%mm3, %%mm2  \n\t"
-                "movq   %%mm0, %0       \n\t"
-                "movq   %%mm2, %1       \n\t"
-                :"+m"(*pix), "+m"(*(pix+line_size))
-                :"r"(p)
-                :"memory");
-        pix += line_size*2;
-        p += 16;
-    } while (--i);
-}
-
-static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-         "lea (%3, %3), %%"REG_a"       \n\t"
-         ASMALIGN(3)
-         "1:                            \n\t"
-         "movq (%1), %%mm0              \n\t"
-         "movq (%1, %3), %%mm1          \n\t"
-         "movq %%mm0, (%2)              \n\t"
-         "movq %%mm1, (%2, %3)          \n\t"
-         "add %%"REG_a", %1             \n\t"
-         "add %%"REG_a", %2             \n\t"
-         "movq (%1), %%mm0              \n\t"
-         "movq (%1, %3), %%mm1          \n\t"
-         "movq %%mm0, (%2)              \n\t"
-         "movq %%mm1, (%2, %3)          \n\t"
-         "add %%"REG_a", %1             \n\t"
-         "add %%"REG_a", %2             \n\t"
-         "subl $4, %0                   \n\t"
-         "jnz 1b                        \n\t"
-         : "+g"(h), "+r" (pixels),  "+r" (block)
-         : "r"((x86_reg)line_size)
-         : "%"REG_a, "memory"
-        );
-}
-
-static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-         "1:                            \n\t"
-         "movdqu (%1), %%xmm0           \n\t"
-         "movdqu (%1,%3), %%xmm1        \n\t"
-         "movdqu (%1,%3,2), %%xmm2      \n\t"
-         "movdqu (%1,%4), %%xmm3        \n\t"
-         "movdqa %%xmm0, (%2)           \n\t"
-         "movdqa %%xmm1, (%2,%3)        \n\t"
-         "movdqa %%xmm2, (%2,%3,2)      \n\t"
-         "movdqa %%xmm3, (%2,%4)        \n\t"
-         "subl $4, %0                   \n\t"
-         "lea (%1,%3,4), %1             \n\t"
-         "lea (%2,%3,4), %2             \n\t"
-         "jnz 1b                        \n\t"
-         : "+g"(h), "+r" (pixels),  "+r" (block)
-         : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
-         : "memory"
-        );
-}
-
-static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-         "1:                            \n\t"
-         "movdqu (%1), %%xmm0           \n\t"
-         "movdqu (%1,%3), %%xmm1        \n\t"
-         "movdqu (%1,%3,2), %%xmm2      \n\t"
-         "movdqu (%1,%4), %%xmm3        \n\t"
-         "pavgb  (%2), %%xmm0           \n\t"
-         "pavgb  (%2,%3), %%xmm1        \n\t"
-         "pavgb  (%2,%3,2), %%xmm2      \n\t"
-         "pavgb  (%2,%4), %%xmm3        \n\t"
-         "movdqa %%xmm0, (%2)           \n\t"
-         "movdqa %%xmm1, (%2,%3)        \n\t"
-         "movdqa %%xmm2, (%2,%3,2)      \n\t"
-         "movdqa %%xmm3, (%2,%4)        \n\t"
-         "subl $4, %0                   \n\t"
-         "lea (%1,%3,4), %1             \n\t"
-         "lea (%2,%3,4), %2             \n\t"
-         "jnz 1b                        \n\t"
-         : "+g"(h), "+r" (pixels),  "+r" (block)
-         : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
-         : "memory"
-        );
-}
-
-static void clear_block_sse(DCTELEM *block)
-{
-    __asm__ volatile(
-        "xorps  %%xmm0, %%xmm0  \n"
-        "movaps %%xmm0,    (%0) \n"
-        "movaps %%xmm0,  16(%0) \n"
-        "movaps %%xmm0,  32(%0) \n"
-        "movaps %%xmm0,  48(%0) \n"
-        "movaps %%xmm0,  64(%0) \n"
-        "movaps %%xmm0,  80(%0) \n"
-        "movaps %%xmm0,  96(%0) \n"
-        "movaps %%xmm0, 112(%0) \n"
-        :: "r"(block)
-        : "memory"
-    );
-}
-
-static void clear_blocks_sse(DCTELEM *blocks)
-{\
-    __asm__ volatile(
-        "xorps  %%xmm0, %%xmm0  \n"
-        "mov     %1, %%"REG_a"  \n"
-        "1:                     \n"
-        "movaps %%xmm0,    (%0, %%"REG_a") \n"
-        "movaps %%xmm0,  16(%0, %%"REG_a") \n"
-        "movaps %%xmm0,  32(%0, %%"REG_a") \n"
-        "movaps %%xmm0,  48(%0, %%"REG_a") \n"
-        "movaps %%xmm0,  64(%0, %%"REG_a") \n"
-        "movaps %%xmm0,  80(%0, %%"REG_a") \n"
-        "movaps %%xmm0,  96(%0, %%"REG_a") \n"
-        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
-        "add $128, %%"REG_a"    \n"
-        " js 1b                 \n"
-        : : "r" (((uint8_t *)blocks)+128*6),
-            "i" (-128*6)
-        : "%"REG_a
-    );
-}
-
-static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
-    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd  %4, %%mm0                \n\t"
-        "movd  %5, %%mm1                \n\t"
-        "movd  %6, %%mm2                \n\t"
-        "movd  %7, %%mm3                \n\t"
-        "punpcklbw %%mm1, %%mm0         \n\t"
-        "punpcklbw %%mm3, %%mm2         \n\t"
-        "movq %%mm0, %%mm1              \n\t"
-        "punpcklwd %%mm2, %%mm0         \n\t"
-        "punpckhwd %%mm2, %%mm1         \n\t"
-        "movd  %%mm0, %0                \n\t"
-        "punpckhdq %%mm0, %%mm0         \n\t"
-        "movd  %%mm0, %1                \n\t"
-        "movd  %%mm1, %2                \n\t"
-        "punpckhdq %%mm1, %%mm1         \n\t"
-        "movd  %%mm1, %3                \n\t"
-
-        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
-          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
-          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
-          "=m" (*(uint32_t*)(dst + 3*dst_stride))
-        :  "m" (*(uint32_t*)(src + 0*src_stride)),
-           "m" (*(uint32_t*)(src + 1*src_stride)),
-           "m" (*(uint32_t*)(src + 2*src_stride)),
-           "m" (*(uint32_t*)(src + 3*src_stride))
-    );
-}
-
-#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
-\
-static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[8];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[8];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[8];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[8];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
-}\
-static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[8 + 9];\
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[8 + 9];\
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[8 + 9];\
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[8 + 9];\
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[8 + 9];\
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[8 + 9];\
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[8 + 9];\
-    uint8_t * const halfH= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
-}\
-static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[8 + 9];\
-    uint8_t * const halfH= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
-}\
-static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[9];\
-    uint8_t * const halfH= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
-}\
-static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[32];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[32];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[32];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[32];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
-}\
-static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[16*2 + 17*2];\
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[16*2 + 17*2];\
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[16*2 + 17*2];\
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[16*2 + 17*2];\
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[16*2 + 17*2];\
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[16*2 + 17*2];\
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
-    uint8_t * const halfHV= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[17*2];\
-    uint8_t * const halfH= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
-}\
-static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[17*2];\
-    uint8_t * const halfH= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
-}\
-static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t half[17*2];\
-    uint8_t * const halfH= ((uint8_t*)half);\
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
-}
-
-#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
-#define AVG_3DNOW_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp "   \n\t"\
-"pavgusb " #temp ", " #a "        \n\t"\
-"mov" #size " " #a ", " #b "      \n\t"
-#define AVG_MMX2_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp "   \n\t"\
-"pavgb " #temp ", " #a "          \n\t"\
-"mov" #size " " #a ", " #b "      \n\t"
-
-#define PREFETCH(name, op) \
-static void name(void *mem, int stride, int h){\
-    const uint8_t *p= mem;\
-    do{\
-        __asm__ volatile(#op" %0" :: "m"(*p));\
-        p+= stride;\
-    }while(--h);\
-}
-PREFETCH(prefetch_mmx2,  prefetcht0)
-#undef PREFETCH 
-
-#include "h264dsp_mmx.c"
-
-void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
-void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
-void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
-void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
-void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
-
-void dsputil_init_mmx(DSPContext* c)
-{
-    mm_flags = mm_support();
-
-    if (mm_flags & FF_MM_MMX) {
-        c->clear_block  = clear_block_sse;
-        c->clear_blocks = clear_blocks_sse;
-        c->prefetch = prefetch_mmx2;
-
-
-#define H264_QPEL_FUNCS(x, y, CPU)\
-            c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
-            c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
-            c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
-            c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
-
-        if((mm_flags & FF_MM_SSE2)){
-            c->put_pixels_tab[0][0] = put_pixels16_sse2;
-            c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
-
-        }
-        if(mm_flags & FF_MM_SSE2){
-            H264_QPEL_FUNCS(0, 1, sse2);
-            H264_QPEL_FUNCS(0, 2, sse2);
-            H264_QPEL_FUNCS(0, 3, sse2);
-            H264_QPEL_FUNCS(1, 1, sse2);
-            H264_QPEL_FUNCS(1, 2, sse2);
-            H264_QPEL_FUNCS(1, 3, sse2);
-            H264_QPEL_FUNCS(2, 1, sse2);
-            H264_QPEL_FUNCS(2, 2, sse2);
-            H264_QPEL_FUNCS(2, 3, sse2);
-            H264_QPEL_FUNCS(3, 1, sse2);
-            H264_QPEL_FUNCS(3, 2, sse2);
-            H264_QPEL_FUNCS(3, 3, sse2);
-        }
-#if HAVE_SSSE3
-        if(mm_flags & FF_MM_SSSE3){
-            H264_QPEL_FUNCS(1, 0, ssse3);
-            H264_QPEL_FUNCS(1, 1, ssse3);
-            H264_QPEL_FUNCS(1, 2, ssse3);
-            H264_QPEL_FUNCS(1, 3, ssse3);
-            H264_QPEL_FUNCS(2, 0, ssse3);
-            H264_QPEL_FUNCS(2, 1, ssse3);
-            H264_QPEL_FUNCS(2, 2, ssse3);
-            H264_QPEL_FUNCS(2, 3, ssse3);
-            H264_QPEL_FUNCS(3, 0, ssse3);
-            H264_QPEL_FUNCS(3, 1, ssse3);
-            H264_QPEL_FUNCS(3, 2, ssse3);
-            H264_QPEL_FUNCS(3, 3, ssse3);
-
-            c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
-            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
-            c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
-            c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
-        }
-#endif
-
-
-    }
-}
-
-void ff_h264dsp_init_x86(H264DSPContext *c)
-{
-    mm_flags = mm_support();
-
-    if (mm_flags & FF_MM_MMX) {
-        c->h264_idct_dc_add=
-        c->h264_idct_add= ff_h264_idct_add_mmx;
-        c->h264_idct8_dc_add=
-        c->h264_idct8_add= ff_h264_idct8_add_mmx;
-
-        if (mm_flags & FF_MM_MMX2) {            
-            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
-            c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
-			c->h264_idct_add16     = ff_h264_idct_add16_mmx2;
-            c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
-
-			c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
-			c->h264_idct8_add4     = ff_h264_idct8_add4_mmx2;
-
-			c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
-            c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
-            c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
-            c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
-            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
-            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
-            c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
-
-            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
-            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
-            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
-            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
-            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
-            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
-            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
-            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
-
-            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
-            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
-            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
-            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
-            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
-            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
-            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
-            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
-        }
-        if(mm_flags & FF_MM_SSE2){
-            c->h264_idct8_add = ff_h264_idct8_add_sse2;
-            c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
-        }
-
-    }
-}
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.h
--- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,170 +0,0 @@
-/*
- * MMX optimized DSP utils
- * Copyright (c) 2007  Aurelien Jacobs <aurel@gnuage.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_DSPUTIL_MMX_H
-#define AVCODEC_X86_DSPUTIL_MMX_H
-
-#include <stdint.h>
-#include "libavcodec/dsputil.h"
-
-typedef struct { uint64_t a, b; } xmm_reg;
-
-extern const uint64_t ff_bone;
-extern const uint64_t ff_wtwo;
-
-extern const uint64_t ff_pdw_80000000[2];
-
-extern const uint64_t ff_pw_3;
-extern const uint64_t ff_pw_4;
-extern const xmm_reg  ff_pw_5;
-extern const xmm_reg  ff_pw_8;
-extern const uint64_t ff_pw_15;
-extern const xmm_reg  ff_pw_16;
-extern const uint64_t ff_pw_20;
-extern const xmm_reg  ff_pw_28;
-extern const xmm_reg  ff_pw_32;
-extern const uint64_t ff_pw_42;
-extern const xmm_reg  ff_pw_64;
-extern const uint64_t ff_pw_96;
-extern const uint64_t ff_pw_128;
-extern const uint64_t ff_pw_255;
-
-extern const uint64_t ff_pb_1;
-extern const uint64_t ff_pb_3;
-extern const uint64_t ff_pb_7;
-extern const uint64_t ff_pb_1F;
-extern const uint64_t ff_pb_3F;
-extern const uint64_t ff_pb_81;
-extern const uint64_t ff_pb_A1;
-extern const uint64_t ff_pb_FC;
-
-extern const double ff_pd_1[2];
-extern const double ff_pd_2[2];
-
-#define LOAD4(stride,in,a,b,c,d)\
-    "movq 0*"#stride"+"#in", "#a"\n\t"\
-    "movq 1*"#stride"+"#in", "#b"\n\t"\
-    "movq 2*"#stride"+"#in", "#c"\n\t"\
-    "movq 3*"#stride"+"#in", "#d"\n\t"
-
-#define STORE4(stride,out,a,b,c,d)\
-    "movq "#a", 0*"#stride"+"#out"\n\t"\
-    "movq "#b", 1*"#stride"+"#out"\n\t"\
-    "movq "#c", 2*"#stride"+"#out"\n\t"\
-    "movq "#d", 3*"#stride"+"#out"\n\t"
-
-/* in/out: mma=mma+mmb, mmb=mmb-mma */
-#define SUMSUB_BA( a, b ) \
-    "paddw "#b", "#a" \n\t"\
-    "paddw "#b", "#b" \n\t"\
-    "psubw "#a", "#b" \n\t"
-
-#define SBUTTERFLY(a,b,t,n,m)\
-    "mov" #m " " #a ", " #t "         \n\t" /* abcd */\
-    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
-    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\
-
-#define TRANSPOSE4(a,b,c,d,t)\
-    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
-    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
-    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
-    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
-
-// e,f,g,h can be memory
-// out: a,d,t,c
-#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
-    "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
-    "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
-    "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 d3 g3 */\
-    "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
-    SBUTTERFLY(a, b, t, bw, q)   /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
-                                 /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
-    SBUTTERFLY(c, d, b, bw, q)   /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
-                                 /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
-    SBUTTERFLY(a, c, d, wd, q)   /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
-                                 /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
-    SBUTTERFLY(t, b, c, wd, q)   /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
-                                 /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
-
-#if ARCH_X86_64
-// permutes 01234567 -> 05736421
-#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
-    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
-    SBUTTERFLY(c,d,b,wd,dqa)\
-    SBUTTERFLY(e,f,d,wd,dqa)\
-    SBUTTERFLY(g,h,f,wd,dqa)\
-    SBUTTERFLY(a,c,h,dq,dqa)\
-    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
-    SBUTTERFLY(e,g,b,dq,dqa)\
-    SBUTTERFLY(d,f,g,dq,dqa)\
-    SBUTTERFLY(a,e,f,qdq,dqa)\
-    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
-    SBUTTERFLY(h,b,d,qdq,dqa)\
-    SBUTTERFLY(c,g,b,qdq,dqa)\
-    "movdqa %%xmm8, "#g"              \n\t"
-#else
-#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
-    "movdqa "#h", "#t"                \n\t"\
-    SBUTTERFLY(a,b,h,wd,dqa)\
-    "movdqa "#h", 16"#t"              \n\t"\
-    "movdqa "#t", "#h"                \n\t"\
-    SBUTTERFLY(c,d,b,wd,dqa)\
-    SBUTTERFLY(e,f,d,wd,dqa)\
-    SBUTTERFLY(g,h,f,wd,dqa)\
-    SBUTTERFLY(a,c,h,dq,dqa)\
-    "movdqa "#h", "#t"                \n\t"\
-    "movdqa 16"#t", "#h"              \n\t"\
-    SBUTTERFLY(h,b,c,dq,dqa)\
-    SBUTTERFLY(e,g,b,dq,dqa)\
-    SBUTTERFLY(d,f,g,dq,dqa)\
-    SBUTTERFLY(a,e,f,qdq,dqa)\
-    SBUTTERFLY(h,d,e,qdq,dqa)\
-    "movdqa "#h", 16"#t"              \n\t"\
-    "movdqa "#t", "#h"                \n\t"\
-    SBUTTERFLY(h,b,d,qdq,dqa)\
-    SBUTTERFLY(c,g,b,qdq,dqa)\
-    "movdqa 16"#t", "#g"              \n\t"
-#endif
-
-#define MOVQ_WONE(regd) \
-    __asm__ volatile ( \
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
-    "psrlw $15, %%" #regd ::)
-
-void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
-void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
-void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
-
-void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
-void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
-
-void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-
-void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag,
-                                   double *autoc);
-
-void ff_mmx_idct(DCTELEM *block);
-void ff_mmxext_idct(DCTELEM *block);
-
-#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx_avg_template.c
--- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx_avg_template.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,250 +0,0 @@
-/*
- * DSP utils : average functions are compiled twice for 3dnow/mmx2
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
- * and improved by Zdenek Kabelac <kabi@users.sf.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $8, %2                  \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" 16(%2), %%mm0            \n\t"
-        PAVGB" 24(%2), %%mm1            \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-
-static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $8, %2                  \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        PAVGB" (%3), %%mm1              \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" 16(%2), %%mm0            \n\t"
-        PAVGB" 24(%2), %%mm1            \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        PAVGB" (%3), %%mm1              \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-
-
-static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $16, %2                 \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" 16(%2), %%mm0            \n\t"
-        PAVGB" 24(%2), %%mm1            \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-
-static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $16, %2                 \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        PAVGB" 8(%3), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        PAVGB" 8(%3), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" 16(%2), %%mm0            \n\t"
-        PAVGB" 24(%2), %%mm1            \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        PAVGB" 8(%3), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-
-static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "1:                             \n\t"
-        "movq (%2), %%mm0               \n\t"
-        "movq (%2, %3), %%mm1           \n\t"
-        PAVGB" (%1), %%mm0              \n\t"
-        PAVGB" (%1, %3), %%mm1          \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "movq (%2), %%mm0               \n\t"
-        "movq (%2, %3), %%mm1           \n\t"
-        PAVGB" (%1), %%mm0              \n\t"
-        PAVGB" (%1, %3), %%mm1          \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a, "memory");
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/h264dsp_mmx.c
--- a/ffmpeg_smp/h264dec/libavcodec/x86/h264dsp_mmx.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1741 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "dsputil_mmx.h"
-
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1  ) = 0x0103010301030103ULL;
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3  ) = 0x0307030703070307ULL;
-
-/***********************************/
-/* IDCT */
-
-#define SUMSUB_BADC( a, b, c, d ) \
-    "paddw "#b", "#a" \n\t"\
-    "paddw "#d", "#c" \n\t"\
-    "paddw "#b", "#b" \n\t"\
-    "paddw "#d", "#d" \n\t"\
-    "psubw "#a", "#b" \n\t"\
-    "psubw "#c", "#d" \n\t"
-
-#define SUMSUBD2_AB( a, b, t ) \
-    "movq  "#b", "#t" \n\t"\
-    "psraw  $1 , "#b" \n\t"\
-    "paddw "#a", "#b" \n\t"\
-    "psraw  $1 , "#a" \n\t"\
-    "psubw "#t", "#a" \n\t"
-
-#define IDCT4_1D( s02, s13, d02, d13, t ) \
-    SUMSUB_BA  ( s02, d02 )\
-    SUMSUBD2_AB( s13, d13, t )\
-    SUMSUB_BADC( d13, s02, s13, d02 )
-
-#define STORE_DIFF_4P( p, t, z ) \
-    "psraw      $6,     "#p" \n\t"\
-    "movd       (%0),   "#t" \n\t"\
-    "punpcklbw "#z",    "#t" \n\t"\
-    "paddsw    "#t",    "#p" \n\t"\
-    "packuswb  "#z",    "#p" \n\t"\
-    "movd      "#p",    (%0) \n\t"
-
-static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
-{
-    /* Load dct coeffs */
-    __asm__ volatile(
-        "movq   (%0), %%mm0 \n\t"
-        "movq  8(%0), %%mm1 \n\t"
-        "movq 16(%0), %%mm2 \n\t"
-        "movq 24(%0), %%mm3 \n\t"
-    :: "r"(block) );
-
-    __asm__ volatile(
-        /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
-        IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
-
-        "movq      %0,    %%mm6 \n\t"
-        /* in: 1,4,0,2  out: 1,2,3,0 */
-        TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
-
-        "paddw     %%mm6, %%mm3 \n\t"
-
-        /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
-        IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
-
-        "pxor %%mm7, %%mm7    \n\t"
-    :: "m"(ff_pw_32));
-
-    __asm__ volatile(
-    STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
-        "add %1, %0             \n\t"
-    STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
-        "add %1, %0             \n\t"
-    STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
-        "add %1, %0             \n\t"
-    STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
-        : "+r"(dst)
-        : "r" ((x86_reg)stride)
-    );
-}
-
-static inline void h264_idct8_1d(int16_t *block)
-{
-    __asm__ volatile(
-        "movq 112(%0), %%mm7  \n\t"
-        "movq  80(%0), %%mm0  \n\t"
-        "movq  48(%0), %%mm3  \n\t"
-        "movq  16(%0), %%mm5  \n\t"
-
-        "movq   %%mm0, %%mm4  \n\t"
-        "movq   %%mm5, %%mm1  \n\t"
-        "psraw  $1,    %%mm4  \n\t"
-        "psraw  $1,    %%mm1  \n\t"
-        "paddw  %%mm0, %%mm4  \n\t"
-        "paddw  %%mm5, %%mm1  \n\t"
-        "paddw  %%mm7, %%mm4  \n\t"
-        "paddw  %%mm0, %%mm1  \n\t"
-        "psubw  %%mm5, %%mm4  \n\t"
-        "paddw  %%mm3, %%mm1  \n\t"
-
-        "psubw  %%mm3, %%mm5  \n\t"
-        "psubw  %%mm3, %%mm0  \n\t"
-        "paddw  %%mm7, %%mm5  \n\t"
-        "psubw  %%mm7, %%mm0  \n\t"
-        "psraw  $1,    %%mm3  \n\t"
-        "psraw  $1,    %%mm7  \n\t"
-        "psubw  %%mm3, %%mm5  \n\t"
-        "psubw  %%mm7, %%mm0  \n\t"
-
-        "movq   %%mm4, %%mm3  \n\t"
-        "movq   %%mm1, %%mm7  \n\t"
-        "psraw  $2,    %%mm1  \n\t"
-        "psraw  $2,    %%mm3  \n\t"
-        "paddw  %%mm5, %%mm3  \n\t"
-        "psraw  $2,    %%mm5  \n\t"
-        "paddw  %%mm0, %%mm1  \n\t"
-        "psraw  $2,    %%mm0  \n\t"
-        "psubw  %%mm4, %%mm5  \n\t"
-        "psubw  %%mm0, %%mm7  \n\t"
-
-        "movq  32(%0), %%mm2  \n\t"
-        "movq  96(%0), %%mm6  \n\t"
-        "movq   %%mm2, %%mm4  \n\t"
-        "movq   %%mm6, %%mm0  \n\t"
-        "psraw  $1,    %%mm4  \n\t"
-        "psraw  $1,    %%mm6  \n\t"
-        "psubw  %%mm0, %%mm4  \n\t"
-        "paddw  %%mm2, %%mm6  \n\t"
-
-        "movq    (%0), %%mm2  \n\t"
-        "movq  64(%0), %%mm0  \n\t"
-        SUMSUB_BA( %%mm0, %%mm2 )
-        SUMSUB_BA( %%mm6, %%mm0 )
-        SUMSUB_BA( %%mm4, %%mm2 )
-        SUMSUB_BA( %%mm7, %%mm6 )
-        SUMSUB_BA( %%mm5, %%mm4 )
-        SUMSUB_BA( %%mm3, %%mm2 )
-        SUMSUB_BA( %%mm1, %%mm0 )
-        :: "r"(block)
-    );
-}
-
-static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
-{
-    int i;
-    DECLARE_ALIGNED(8, int16_t, b2)[64];
-
-    block[0] += 32;
-
-    for(i=0; i<2; i++){
-        DECLARE_ALIGNED(8, uint64_t, tmp);
-
-        h264_idct8_1d(block+4*i);
-
-        __asm__ volatile(
-            "movq   %%mm7,    %0   \n\t"
-            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
-            "movq   %%mm0,  8(%1)  \n\t"
-            "movq   %%mm6, 24(%1)  \n\t"
-            "movq   %%mm7, 40(%1)  \n\t"
-            "movq   %%mm4, 56(%1)  \n\t"
-            "movq    %0,    %%mm7  \n\t"
-            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
-            "movq   %%mm7,   (%1)  \n\t"
-            "movq   %%mm1, 16(%1)  \n\t"
-            "movq   %%mm0, 32(%1)  \n\t"
-            "movq   %%mm3, 48(%1)  \n\t"
-            : "=m"(tmp)
-            : "r"(b2+32*i)
-            : "memory"
-        );
-    }
-
-    for(i=0; i<2; i++){
-        h264_idct8_1d(b2+4*i);
-
-        __asm__ volatile(
-            "psraw     $6, %%mm7  \n\t"
-            "psraw     $6, %%mm6  \n\t"
-            "psraw     $6, %%mm5  \n\t"
-            "psraw     $6, %%mm4  \n\t"
-            "psraw     $6, %%mm3  \n\t"
-            "psraw     $6, %%mm2  \n\t"
-            "psraw     $6, %%mm1  \n\t"
-            "psraw     $6, %%mm0  \n\t"
-
-            "movq   %%mm7,    (%0)  \n\t"
-            "movq   %%mm5,  16(%0)  \n\t"
-            "movq   %%mm3,  32(%0)  \n\t"
-            "movq   %%mm1,  48(%0)  \n\t"
-            "movq   %%mm0,  64(%0)  \n\t"
-            "movq   %%mm2,  80(%0)  \n\t"
-            "movq   %%mm4,  96(%0)  \n\t"
-            "movq   %%mm6, 112(%0)  \n\t"
-            :: "r"(b2+4*i)
-            : "memory"
-        );
-    }
-
-    add_pixels_clamped_mmx(b2, dst, stride);
-}
-
-#define STORE_DIFF_8P( p, d, t, z )\
-        "movq       "#d", "#t" \n"\
-        "psraw       $6,  "#p" \n"\
-        "punpcklbw  "#z", "#t" \n"\
-        "paddsw     "#t", "#p" \n"\
-        "packuswb   "#p", "#p" \n"\
-        "movq       "#p", "#d" \n"
-
-#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
-        "movdqa     "#c", "#a" \n"\
-        "movdqa     "#g", "#e" \n"\
-        "psraw       $1,  "#c" \n"\
-        "psraw       $1,  "#g" \n"\
-        "psubw      "#e", "#c" \n"\
-        "paddw      "#a", "#g" \n"\
-        "movdqa     "#b", "#e" \n"\
-        "psraw       $1,  "#e" \n"\
-        "paddw      "#b", "#e" \n"\
-        "paddw      "#d", "#e" \n"\
-        "paddw      "#f", "#e" \n"\
-        "movdqa     "#f", "#a" \n"\
-        "psraw       $1,  "#a" \n"\
-        "paddw      "#f", "#a" \n"\
-        "paddw      "#h", "#a" \n"\
-        "psubw      "#b", "#a" \n"\
-        "psubw      "#d", "#b" \n"\
-        "psubw      "#d", "#f" \n"\
-        "paddw      "#h", "#b" \n"\
-        "psubw      "#h", "#f" \n"\
-        "psraw       $1,  "#d" \n"\
-        "psraw       $1,  "#h" \n"\
-        "psubw      "#d", "#b" \n"\
-        "psubw      "#h", "#f" \n"\
-        "movdqa     "#e", "#d" \n"\
-        "movdqa     "#a", "#h" \n"\
-        "psraw       $2,  "#d" \n"\
-        "psraw       $2,  "#h" \n"\
-        "paddw      "#f", "#d" \n"\
-        "paddw      "#b", "#h" \n"\
-        "psraw       $2,  "#f" \n"\
-        "psraw       $2,  "#b" \n"\
-        "psubw      "#f", "#e" \n"\
-        "psubw      "#a", "#b" \n"\
-        "movdqa 0x00(%1), "#a" \n"\
-        "movdqa 0x40(%1), "#f" \n"\
-        SUMSUB_BA(f, a)\
-        SUMSUB_BA(g, f)\
-        SUMSUB_BA(c, a)\
-        SUMSUB_BA(e, g)\
-        SUMSUB_BA(b, c)\
-        SUMSUB_BA(h, a)\
-        SUMSUB_BA(d, f)
-
-static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
-{
-    __asm__ volatile(
-        "movdqa   0x10(%1), %%xmm1 \n"
-        "movdqa   0x20(%1), %%xmm2 \n"
-        "movdqa   0x30(%1), %%xmm3 \n"
-        "movdqa   0x50(%1), %%xmm5 \n"
-        "movdqa   0x60(%1), %%xmm6 \n"
-        "movdqa   0x70(%1), %%xmm7 \n"
-        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
-        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
-        "paddw          %4, %%xmm4 \n"
-        "movdqa     %%xmm4, 0x00(%1) \n"
-        "movdqa     %%xmm2, 0x40(%1) \n"
-        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
-        "movdqa     %%xmm6, 0x60(%1) \n"
-        "movdqa     %%xmm7, 0x70(%1) \n"
-        "pxor       %%xmm7, %%xmm7 \n"
-        STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
-        STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
-        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
-        STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
-        "lea     (%0,%2,4), %0 \n"
-        STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
-        STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
-        "movdqa   0x60(%1), %%xmm0 \n"
-        "movdqa   0x70(%1), %%xmm1 \n"
-        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
-        STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
-        :"+r"(dst)
-        :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
-    );
-}
-
-static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
-{
-    int dc = (block[0] + 32) >> 6;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    __asm__ volatile(
-        "movd          %0, %%mm2 \n\t"
-        "movd          %1, %%mm3 \n\t"
-        "movd          %2, %%mm4 \n\t"
-        "movd          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movd       %%mm2, %0    \n\t"
-        "movd       %%mm3, %1    \n\t"
-        "movd       %%mm4, %2    \n\t"
-        "movd       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t*)(dst+0*stride)),
-         "+m"(*(uint32_t*)(dst+1*stride)),
-         "+m"(*(uint32_t*)(dst+2*stride)),
-         "+m"(*(uint32_t*)(dst+3*stride))
-    );
-}
-
-static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
-{
-    int dc = (block[0] + 32) >> 6;
-    int y;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    for(y=2; y--; dst += 4*stride){
-    __asm__ volatile(
-        "movq          %0, %%mm2 \n\t"
-        "movq          %1, %%mm3 \n\t"
-        "movq          %2, %%mm4 \n\t"
-        "movq          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movq       %%mm2, %0    \n\t"
-        "movq       %%mm3, %1    \n\t"
-        "movq       %%mm4, %2    \n\t"
-        "movq       %%mm5, %3    \n\t"
-        :"+m"(*(uint64_t*)(dst+0*stride)),
-         "+m"(*(uint64_t*)(dst+1*stride)),
-         "+m"(*(uint64_t*)(dst+2*stride)),
-         "+m"(*(uint64_t*)(dst+3*stride))
-    );
-    }
-}
-
-//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
-static const uint8_t scan8[16 + 2*4]={
- 4+1*8, 5+1*8, 4+2*8, 5+2*8,
- 6+1*8, 7+1*8, 6+2*8, 7+2*8,
- 4+3*8, 5+3*8, 4+4*8, 5+4*8,
- 6+3*8, 7+3*8, 6+4*8, 7+4*8,
- 1+1*8, 2+1*8,
- 1+2*8, 2+2*8,
- 1+4*8, 2+4*8,
- 1+5*8, 2+5*8,
-};
-
-static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i++){
-        int nnz = nnzc[ scan8[i] ];
-        if(nnz){
-            if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
-            else                      ff_h264_idct_add_mmx    (dst + block_offset[i], block + i*16, stride);
-        }
-    }
-}
-
-static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i++){
-        if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx    (dst + block_offset[i], block + i*16, stride);
-        else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
-    }
-}
-
-static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i+=4){
-        int nnz = nnzc[ scan8[i] ];
-        if(nnz){
-            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
-            else                      ff_h264_idct8_add_mmx    (dst + block_offset[i], block + i*16, stride);
-        }
-    }
-}
-
-static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=0; i<16; i+=4){
-        int nnz = nnzc[ scan8[i] ];
-        if(nnz){
-            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
-            else                      ff_h264_idct8_add_sse2   (dst + block_offset[i], block + i*16, stride);
-        }
-    }
-}
-
-static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=16; i<16+8; i++){
-        if(nnzc[ scan8[i] ])
-            ff_h264_idct_add_mmx    (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
-        else if(block[i*16])
-            ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
-    }
-}
-
-/***********************************/
-/* deblocking */
-
-// out: o = |x-y|>a
-// clobbers: t
-#define DIFF_GT_MMX(x,y,a,o,t)\
-    "movq     "#y", "#t"  \n\t"\
-    "movq     "#x", "#o"  \n\t"\
-    "psubusb  "#x", "#t"  \n\t"\
-    "psubusb  "#y", "#o"  \n\t"\
-    "por      "#t", "#o"  \n\t"\
-    "psubusb  "#a", "#o"  \n\t"
-
-// out: o = |x-y|>a
-// clobbers: t
-#define DIFF_GT2_MMX(x,y,a,o,t)\
-    "movq     "#y", "#t"  \n\t"\
-    "movq     "#x", "#o"  \n\t"\
-    "psubusb  "#x", "#t"  \n\t"\
-    "psubusb  "#y", "#o"  \n\t"\
-    "psubusb  "#a", "#t"  \n\t"\
-    "psubusb  "#a", "#o"  \n\t"\
-    "pcmpeqb  "#t", "#o"  \n\t"\
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
-// out: mm5=beta-1, mm7=mask
-// clobbers: mm4,mm6
-#define H264_DEBLOCK_MASK(alpha1, beta1) \
-    "pshufw $0, "#alpha1", %%mm4 \n\t"\
-    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
-    "packuswb  %%mm4, %%mm4      \n\t"\
-    "packuswb  %%mm5, %%mm5      \n\t"\
-    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
-    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
-    "por       %%mm4, %%mm7      \n\t"\
-    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
-    "por       %%mm4, %%mm7      \n\t"\
-    "pxor      %%mm6, %%mm6      \n\t"\
-    "pcmpeqb   %%mm6, %%mm7      \n\t"
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
-// out: mm1=p0' mm2=q0'
-// clobbers: mm0,3-6
-#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
-        "movq    %%mm1              , %%mm5 \n\t"\
-        "pxor    %%mm2              , %%mm5 \n\t" /* p0^q0*/\
-        "pand    "#pb_01"           , %%mm5 \n\t" /* (p0^q0)&1*/\
-        "pcmpeqb %%mm4              , %%mm4 \n\t"\
-        "pxor    %%mm4              , %%mm3 \n\t"\
-        "pavgb   %%mm0              , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
-        "pavgb   "MANGLE(ff_pb_3)"  , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
-        "pxor    %%mm1              , %%mm4 \n\t"\
-        "pavgb   %%mm2              , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
-        "pavgb   %%mm5              , %%mm3 \n\t"\
-        "paddusb %%mm4              , %%mm3 \n\t" /* d+128+33*/\
-        "movq    "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
-        "psubusb %%mm3              , %%mm6 \n\t"\
-        "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
-        "pminub  %%mm7              , %%mm6 \n\t"\
-        "pminub  %%mm7              , %%mm3 \n\t"\
-        "psubusb %%mm6              , %%mm1 \n\t"\
-        "psubusb %%mm3              , %%mm2 \n\t"\
-        "paddusb %%mm3              , %%mm1 \n\t"\
-        "paddusb %%mm6              , %%mm2 \n\t"
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
-// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
-// clobbers: q2, tmp, tc0
-#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
-        "movq     %%mm1,  "#tmp"   \n\t"\
-        "pavgb    %%mm2,  "#tmp"   \n\t"\
-        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
-        "pxor   "q2addr", "#tmp"   \n\t"\
-        "pand     %9,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
-        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
-        "movq     "#p1",  "#tmp"   \n\t"\
-        "psubusb  "#tc0", "#tmp"   \n\t"\
-        "paddusb  "#p1",  "#tc0"   \n\t"\
-        "pmaxub   "#tmp", "#q2"    \n\t"\
-        "pminub   "#tc0", "#q2"    \n\t"\
-        "movq     "#q2",  "q1addr" \n\t"
-
-static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
-{
-    DECLARE_ALIGNED(8, uint64_t, tmp0)[2];
-
-    __asm__ volatile(
-        "movq    (%2,%4), %%mm0    \n\t" //p1
-        "movq    (%2,%4,2), %%mm1  \n\t" //p0
-        "movq    (%3),    %%mm2    \n\t" //q0
-        "movq    (%3,%4), %%mm3    \n\t" //q1
-        H264_DEBLOCK_MASK(%7, %8)
-
-        "movd      %6,    %%mm4    \n\t"
-        "punpcklbw %%mm4, %%mm4    \n\t"
-        "punpcklwd %%mm4, %%mm4    \n\t"
-        "pcmpeqb   %%mm3, %%mm3    \n\t"
-        "movq      %%mm4, %%mm6    \n\t"
-        "pcmpgtb   %%mm3, %%mm4    \n\t"
-        "movq      %%mm6, %1       \n\t"
-        "pand      %%mm4, %%mm7    \n\t"
-        "movq      %%mm7, %0       \n\t"
-
-        /* filter p1 */
-        "movq     (%2),   %%mm3    \n\t" //p2
-        DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
-        "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
-        "pand     %1,     %%mm7    \n\t" // mask & tc0
-        "movq     %%mm7,  %%mm4    \n\t"
-        "psubb    %%mm6,  %%mm7    \n\t"
-        "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
-        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)
-
-        /* filter q1 */
-        "movq    (%3,%4,2), %%mm4  \n\t" //q2
-        DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
-        "pand     %0,     %%mm6    \n\t"
-        "movq     %1,     %%mm5    \n\t" // can be merged with the and below but is slower then
-        "pand     %%mm6,  %%mm5    \n\t"
-        "psubb    %%mm6,  %%mm7    \n\t"
-        "movq    (%3,%4), %%mm3    \n\t"
-        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
-
-        /* filter p0, q0 */
-        H264_DEBLOCK_P0_Q0(%9, unused)
-        "movq      %%mm1, (%2,%4,2) \n\t"
-        "movq      %%mm2, (%3)      \n\t"
-
-        : "=m"(tmp0[0]), "=m"(tmp0[1])
-        : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
-          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
-          "m"(ff_bone)
-    );
-}
-
-static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    if((tc0[0] & tc0[1]) >= 0)
-        h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
-    if((tc0[2] & tc0[3]) >= 0)
-        h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
-}
-static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    //FIXME: could cut some load/stores by merging transpose with filter
-    // also, it only needs to transpose 6x8
-    DECLARE_ALIGNED(8, uint8_t, trans)[8*8];
-    int i;
-    for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
-        if((tc0[0] & tc0[1]) < 0)
-            continue;
-        transpose4x4(trans,       pix-4,          8, stride);
-        transpose4x4(trans  +4*8, pix,            8, stride);
-        transpose4x4(trans+4,     pix-4+4*stride, 8, stride);
-        transpose4x4(trans+4+4*8, pix  +4*stride, 8, stride);
-        h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
-        transpose4x4(pix-2,          trans  +2*8, stride, 8);
-        transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
-    }
-}
-
-static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
-{
-    __asm__ volatile(
-        "movq    (%0),    %%mm0     \n\t" //p1
-        "movq    (%0,%2), %%mm1     \n\t" //p0
-        "movq    (%1),    %%mm2     \n\t" //q0
-        "movq    (%1,%2), %%mm3     \n\t" //q1
-        H264_DEBLOCK_MASK(%4, %5)
-        "movd      %3,    %%mm6     \n\t"
-        "punpcklbw %%mm6, %%mm6     \n\t"
-        "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
-        H264_DEBLOCK_P0_Q0(%6, %7)
-        "movq      %%mm1, (%0,%2)   \n\t"
-        "movq      %%mm2, (%1)      \n\t"
-
-        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
-           "r"(*(uint32_t*)tc0),
-           "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
-    );
-}
-
-static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
-}
-
-static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    //FIXME: could cut some load/stores by merging transpose with filter
-    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
-    transpose4x4(trans, pix-2, 8, stride);
-    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
-    h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
-    transpose4x4(pix-2, trans, stride, 8);
-    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
-}
-
-// p0 = (p0 + q1 + 2*p1 + 2) >> 2
-#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
-    "movq    "#p0", %%mm4  \n\t"\
-    "pxor    "#q1", %%mm4  \n\t"\
-    "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
-    "pavgb   "#q1", "#p0"  \n\t"\
-    "psubusb %%mm4, "#p0"  \n\t"\
-    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
-
-static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
-{
-    __asm__ volatile(
-        "movq    (%0),    %%mm0     \n\t"
-        "movq    (%0,%2), %%mm1     \n\t"
-        "movq    (%1),    %%mm2     \n\t"
-        "movq    (%1,%2), %%mm3     \n\t"
-        H264_DEBLOCK_MASK(%3, %4)
-        "movq    %%mm1,   %%mm5     \n\t"
-        "movq    %%mm2,   %%mm6     \n\t"
-        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
-        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
-        "psubb   %%mm5,   %%mm1     \n\t"
-        "psubb   %%mm6,   %%mm2     \n\t"
-        "pand    %%mm7,   %%mm1     \n\t"
-        "pand    %%mm7,   %%mm2     \n\t"
-        "paddb   %%mm5,   %%mm1     \n\t"
-        "paddb   %%mm6,   %%mm2     \n\t"
-        "movq    %%mm1,   (%0,%2)   \n\t"
-        "movq    %%mm2,   (%1)      \n\t"
-        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
-           "m"(alpha1), "m"(beta1), "m"(ff_bone)
-    );
-}
-
-static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
-}
-
-static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
-{
-    //FIXME: could cut some load/stores by merging transpose with filter
-    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
-    transpose4x4(trans, pix-2, 8, stride);
-    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
-    h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
-    transpose4x4(pix-2, trans, stride, 8);
-    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
-}
-
-static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
-                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
-    int dir;
-    __asm__ volatile(
-        "movq %0, %%mm7 \n"
-        "movq %1, %%mm6 \n"
-        ::"m"(ff_pb_1), "m"(ff_pb_3)
-    );
-    if(field)
-        __asm__ volatile(
-            "movq %0, %%mm6 \n"
-            ::"m"(ff_pb_3_1)
-        );
-    __asm__ volatile(
-        "movq  %%mm6, %%mm5 \n"
-        "paddb %%mm5, %%mm5 \n"
-    :);
-
-    // could do a special case for dir==0 && edges==1, but it only reduces the
-    // average filter time by 1.2%
-    for( dir=1; dir>=0; dir-- ) {
-        const x86_reg d_idx = dir ? -8 : -1;
-        const int mask_mv = dir ? mask_mv1 : mask_mv0;
-        DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
-        int b_idx, edge;
-        for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
-            __asm__ volatile(
-                "pand %0, %%mm0 \n\t"
-                ::"m"(mask_dir)
-            );
-            if(!(mask_mv & edge)) {
-                if(bidir) {
-                    __asm__ volatile(
-                        "movd         (%1,%0), %%mm2 \n"
-                        "punpckldq  40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] }
-                        "pshufw $0x44,   (%1), %%mm0 \n" // { ref0[b], ref0[b] }
-                        "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] }
-                        "pshufw $0x4E, %%mm2, %%mm3 \n"
-                        "psubb         %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
-                        "psubb         %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
-                        "1: \n"
-                        "por           %%mm1, %%mm0 \n"
-                        "movq      (%2,%0,4), %%mm1 \n"
-                        "movq     8(%2,%0,4), %%mm2 \n"
-                        "movq          %%mm1, %%mm3 \n"
-                        "movq          %%mm2, %%mm4 \n"
-                        "psubw          (%2), %%mm1 \n"
-                        "psubw         8(%2), %%mm2 \n"
-                        "psubw       160(%2), %%mm3 \n"
-                        "psubw       168(%2), %%mm4 \n"
-                        "packsswb      %%mm2, %%mm1 \n"
-                        "packsswb      %%mm4, %%mm3 \n"
-                        "paddb         %%mm6, %%mm1 \n"
-                        "paddb         %%mm6, %%mm3 \n"
-                        "psubusb       %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
-                        "psubusb       %%mm5, %%mm3 \n"
-                        "packsswb      %%mm3, %%mm1 \n"
-                        "add $40, %0 \n"
-                        "cmp $40, %0 \n"
-                        "jl 1b \n"
-                        "sub $80, %0 \n"
-                        "pshufw $0x4E, %%mm1, %%mm1 \n"
-                        "por           %%mm1, %%mm0 \n"
-                        "pshufw $0x4E, %%mm0, %%mm1 \n"
-                        "pminub        %%mm1, %%mm0 \n"
-                        ::"r"(d_idx),
-                          "r"(ref[0]+b_idx),
-                          "r"(mv[0]+b_idx)
-                    );
-                } else {
-                    __asm__ volatile(
-                        "movd        (%1), %%mm0 \n"
-                        "psubb    (%1,%0), %%mm0 \n" // ref[b] != ref[bn]
-                        "movq        (%2), %%mm1 \n"
-                        "movq       8(%2), %%mm2 \n"
-                        "psubw  (%2,%0,4), %%mm1 \n"
-                        "psubw 8(%2,%0,4), %%mm2 \n"
-                        "packsswb   %%mm2, %%mm1 \n"
-                        "paddb      %%mm6, %%mm1 \n"
-                        "psubusb    %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
-                        "packsswb   %%mm1, %%mm1 \n"
-                        "por        %%mm1, %%mm0 \n"
-                        ::"r"(d_idx),
-                          "r"(ref[0]+b_idx),
-                          "r"(mv[0]+b_idx)
-                    );
-                }
-            }
-            __asm__ volatile(
-                "movd %0, %%mm1 \n"
-                "por  %1, %%mm1 \n" // nnz[b] || nnz[bn]
-                ::"m"(nnz[b_idx]),
-                  "m"(nnz[b_idx+d_idx])
-            );
-            __asm__ volatile(
-                "pminub    %%mm7, %%mm1 \n"
-                "pminub    %%mm7, %%mm0 \n"
-                "psllw        $1, %%mm1 \n"
-                "pxor      %%mm2, %%mm2 \n"
-                "pmaxub    %%mm0, %%mm1 \n"
-                "punpcklbw %%mm2, %%mm1 \n"
-                "movq      %%mm1, %0    \n"
-                :"=m"(*bS[dir][edge])
-                ::"memory"
-            );
-        }
-        edges = 4;
-        step = 1;
-    }
-    __asm__ volatile(
-        "movq   (%0), %%mm0 \n\t"
-        "movq  8(%0), %%mm1 \n\t"
-        "movq 16(%0), %%mm2 \n\t"
-        "movq 24(%0), %%mm3 \n\t"
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
-        "movq %%mm0,   (%0) \n\t"
-        "movq %%mm3,  8(%0) \n\t"
-        "movq %%mm4, 16(%0) \n\t"
-        "movq %%mm2, 24(%0) \n\t"
-        ::"r"(bS[0])
-        :"memory"
-    );
-}
-
-/***********************************/
-/* motion compensation */
-
-#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
-        "mov"#q" "#C", "#T"         \n\t"\
-        "mov"#d" (%0), "#F"         \n\t"\
-        "paddw "#D", "#T"           \n\t"\
-        "psllw $2, "#T"             \n\t"\
-        "psubw "#B", "#T"           \n\t"\
-        "psubw "#E", "#T"           \n\t"\
-        "punpcklbw "#Z", "#F"       \n\t"\
-        "pmullw %4, "#T"            \n\t"\
-        "paddw %5, "#A"             \n\t"\
-        "add %2, %0                 \n\t"\
-        "paddw "#F", "#A"           \n\t"\
-        "paddw "#A", "#T"           \n\t"\
-        "psraw $5, "#T"             \n\t"\
-        "packuswb "#T", "#T"        \n\t"\
-        OP(T, (%1), A, d)\
-        "add %3, %1                 \n\t"
-
-#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
-        "mov"#q" "#C", "#T"         \n\t"\
-        "mov"#d" (%0), "#F"         \n\t"\
-        "paddw "#D", "#T"           \n\t"\
-        "psllw $2, "#T"             \n\t"\
-        "paddw %4, "#A"             \n\t"\
-        "psubw "#B", "#T"           \n\t"\
-        "psubw "#E", "#T"           \n\t"\
-        "punpcklbw "#Z", "#F"       \n\t"\
-        "pmullw %3, "#T"            \n\t"\
-        "paddw "#F", "#A"           \n\t"\
-        "add %2, %0                 \n\t"\
-        "paddw "#A", "#T"           \n\t"\
-        "mov"#q" "#T", "#OF"(%1)    \n\t"
-
-#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
-#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
-#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
-#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
-
-
-#define QPEL_H264(OPNAME, OP, MMX)\
-\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
-    int h=8;\
-    __asm__ volatile(\
-        "pxor %%mm7, %%mm7          \n\t"\
-        "movq %0, %%mm6             \n\t"\
-        :: "m"(ff_pw_5)\
-    );\
-    do{\
-    __asm__ volatile(\
-        "movq    (%0), %%mm0        \n\t"\
-        "movq   1(%0), %%mm2        \n\t"\
-        "movq %%mm0, %%mm1          \n\t"\
-        "movq %%mm2, %%mm3          \n\t"\
-        "punpcklbw %%mm7, %%mm0     \n\t"\
-        "punpckhbw %%mm7, %%mm1     \n\t"\
-        "punpcklbw %%mm7, %%mm2     \n\t"\
-        "punpckhbw %%mm7, %%mm3     \n\t"\
-        "paddw %%mm2, %%mm0         \n\t"\
-        "paddw %%mm3, %%mm1         \n\t"\
-        "psllw $2, %%mm0            \n\t"\
-        "psllw $2, %%mm1            \n\t"\
-        "movq   -1(%0), %%mm2       \n\t"\
-        "movq    2(%0), %%mm4       \n\t"\
-        "movq %%mm2, %%mm3          \n\t"\
-        "movq %%mm4, %%mm5          \n\t"\
-        "punpcklbw %%mm7, %%mm2     \n\t"\
-        "punpckhbw %%mm7, %%mm3     \n\t"\
-        "punpcklbw %%mm7, %%mm4     \n\t"\
-        "punpckhbw %%mm7, %%mm5     \n\t"\
-        "paddw %%mm4, %%mm2         \n\t"\
-        "paddw %%mm3, %%mm5         \n\t"\
-        "psubw %%mm2, %%mm0         \n\t"\
-        "psubw %%mm5, %%mm1         \n\t"\
-        "pmullw %%mm6, %%mm0        \n\t"\
-        "pmullw %%mm6, %%mm1        \n\t"\
-        "movd   -2(%0), %%mm2       \n\t"\
-        "movd    7(%0), %%mm5       \n\t"\
-        "punpcklbw %%mm7, %%mm2     \n\t"\
-        "punpcklbw %%mm7, %%mm5     \n\t"\
-        "paddw %%mm3, %%mm2         \n\t"\
-        "paddw %%mm5, %%mm4         \n\t"\
-        "movq %5, %%mm5             \n\t"\
-        "paddw %%mm5, %%mm2         \n\t"\
-        "paddw %%mm5, %%mm4         \n\t"\
-        "paddw %%mm2, %%mm0         \n\t"\
-        "paddw %%mm4, %%mm1         \n\t"\
-        "psraw $5, %%mm0            \n\t"\
-        "psraw $5, %%mm1            \n\t"\
-        "movq (%2), %%mm4           \n\t"\
-        "packuswb %%mm1, %%mm0      \n\t"\
-        PAVGB" %%mm4, %%mm0         \n\t"\
-        OP(%%mm0, (%1),%%mm5, q)\
-        "add %4, %0                 \n\t"\
-        "add %4, %1                 \n\t"\
-        "add %3, %2                 \n\t"\
-        : "+a"(src), "+c"(dst), "+d"(src2)\
-        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
-          "m"(ff_pw_16)\
-        : "memory"\
-    );\
-    }while(--h);\
-}\
-\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
-    int w = size>>4;\
-    do{\
-    int h = size;\
-    __asm__ volatile(\
-        "1:                         \n\t"\
-        "movq     (%0), %%mm0       \n\t"\
-        "movq    8(%0), %%mm3       \n\t"\
-        "movq    2(%0), %%mm1       \n\t"\
-        "movq   10(%0), %%mm4       \n\t"\
-        "paddw   %%mm4, %%mm0       \n\t"\
-        "paddw   %%mm3, %%mm1       \n\t"\
-        "paddw  18(%0), %%mm3       \n\t"\
-        "paddw  16(%0), %%mm4       \n\t"\
-        "movq    4(%0), %%mm2       \n\t"\
-        "movq   12(%0), %%mm5       \n\t"\
-        "paddw   6(%0), %%mm2       \n\t"\
-        "paddw  14(%0), %%mm5       \n\t"\
-        "psubw %%mm1, %%mm0         \n\t"\
-        "psubw %%mm4, %%mm3         \n\t"\
-        "psraw $2, %%mm0            \n\t"\
-        "psraw $2, %%mm3            \n\t"\
-        "psubw %%mm1, %%mm0         \n\t"\
-        "psubw %%mm4, %%mm3         \n\t"\
-        "paddsw %%mm2, %%mm0        \n\t"\
-        "paddsw %%mm5, %%mm3        \n\t"\
-        "psraw $2, %%mm0            \n\t"\
-        "psraw $2, %%mm3            \n\t"\
-        "paddw %%mm2, %%mm0         \n\t"\
-        "paddw %%mm5, %%mm3         \n\t"\
-        "psraw $6, %%mm0            \n\t"\
-        "psraw $6, %%mm3            \n\t"\
-        "packuswb %%mm3, %%mm0      \n\t"\
-        OP(%%mm0, (%1),%%mm7, q)\
-        "add $48, %0                \n\t"\
-        "add %3, %1                 \n\t"\
-        "decl %2                    \n\t"\
-        " jnz 1b                    \n\t"\
-        : "+a"(tmp), "+c"(dst), "+g"(h)\
-        : "S"((x86_reg)dstStride)\
-        : "memory"\
-    );\
-    tmp += 8 - size*24;\
-    dst += 8 - size*dstStride;\
-    }while(w--);\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
-    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
-    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-    src += 8*dstStride;\
-    dst += 8*dstStride;\
-    src2 += 8*src2Stride;\
-    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
-    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-}\
-static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
-{\
-    do{\
-    __asm__ volatile(\
-        "movq      (%1), %%mm0          \n\t"\
-        "movq     8(%1), %%mm1          \n\t"\
-        "movq    48(%1), %%mm2          \n\t"\
-        "movq  8+48(%1), %%mm3          \n\t"\
-        "psraw      $5,  %%mm0          \n\t"\
-        "psraw      $5,  %%mm1          \n\t"\
-        "psraw      $5,  %%mm2          \n\t"\
-        "psraw      $5,  %%mm3          \n\t"\
-        "packuswb %%mm1, %%mm0          \n\t"\
-        "packuswb %%mm3, %%mm2          \n\t"\
-        PAVGB"     (%0), %%mm0          \n\t"\
-        PAVGB"  (%0,%3), %%mm2          \n\t"\
-        OP(%%mm0, (%2), %%mm5, q)\
-        OP(%%mm2, (%2,%4), %%mm5, q)\
-        ::"a"(src8), "c"(src16), "d"(dst),\
-          "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
-        :"memory");\
-        src8 += 2L*src8Stride;\
-        src16 += 48;\
-        dst += 2L*dstStride;\
-    }while(h-=2);\
-}\
-static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
-{\
-    OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
-    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
-}\
-
-
-#if ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
-    int h=16;\
-    __asm__ volatile(\
-        "pxor %%xmm15, %%xmm15      \n\t"\
-        "movdqa %6, %%xmm14         \n\t"\
-        "movdqa %7, %%xmm13         \n\t"\
-        "1:                         \n\t"\
-        "lddqu    6(%0), %%xmm1     \n\t"\
-        "lddqu   -2(%0), %%xmm7     \n\t"\
-        "movdqa  %%xmm1, %%xmm0     \n\t"\
-        "punpckhbw %%xmm15, %%xmm1  \n\t"\
-        "punpcklbw %%xmm15, %%xmm0  \n\t"\
-        "punpcklbw %%xmm15, %%xmm7  \n\t"\
-        "movdqa  %%xmm1, %%xmm2     \n\t"\
-        "movdqa  %%xmm0, %%xmm6     \n\t"\
-        "movdqa  %%xmm1, %%xmm3     \n\t"\
-        "movdqa  %%xmm0, %%xmm8     \n\t"\
-        "movdqa  %%xmm1, %%xmm4     \n\t"\
-        "movdqa  %%xmm0, %%xmm9     \n\t"\
-        "movdqa  %%xmm0, %%xmm12    \n\t"\
-        "movdqa  %%xmm1, %%xmm11    \n\t"\
-        "palignr $10,%%xmm0, %%xmm11\n\t"\
-        "palignr $10,%%xmm7, %%xmm12\n\t"\
-        "palignr $2, %%xmm0, %%xmm4 \n\t"\
-        "palignr $2, %%xmm7, %%xmm9 \n\t"\
-        "palignr $4, %%xmm0, %%xmm3 \n\t"\
-        "palignr $4, %%xmm7, %%xmm8 \n\t"\
-        "palignr $6, %%xmm0, %%xmm2 \n\t"\
-        "palignr $6, %%xmm7, %%xmm6 \n\t"\
-        "paddw   %%xmm0 ,%%xmm11    \n\t"\
-        "palignr $8, %%xmm0, %%xmm1 \n\t"\
-        "palignr $8, %%xmm7, %%xmm0 \n\t"\
-        "paddw   %%xmm12,%%xmm7     \n\t"\
-        "paddw   %%xmm3, %%xmm2     \n\t"\
-        "paddw   %%xmm8, %%xmm6     \n\t"\
-        "paddw   %%xmm4, %%xmm1     \n\t"\
-        "paddw   %%xmm9, %%xmm0     \n\t"\
-        "psllw   $2,     %%xmm2     \n\t"\
-        "psllw   $2,     %%xmm6     \n\t"\
-        "psubw   %%xmm1, %%xmm2     \n\t"\
-        "psubw   %%xmm0, %%xmm6     \n\t"\
-        "paddw   %%xmm13,%%xmm11    \n\t"\
-        "paddw   %%xmm13,%%xmm7     \n\t"\
-        "pmullw  %%xmm14,%%xmm2     \n\t"\
-        "pmullw  %%xmm14,%%xmm6     \n\t"\
-        "lddqu   (%2),   %%xmm3     \n\t"\
-        "paddw   %%xmm11,%%xmm2     \n\t"\
-        "paddw   %%xmm7, %%xmm6     \n\t"\
-        "psraw   $5,     %%xmm2     \n\t"\
-        "psraw   $5,     %%xmm6     \n\t"\
-        "packuswb %%xmm2,%%xmm6     \n\t"\
-        "pavgb   %%xmm3, %%xmm6     \n\t"\
-        OP(%%xmm6, (%1), %%xmm4, dqa)\
-        "add %5, %0                 \n\t"\
-        "add %5, %1                 \n\t"\
-        "add %4, %2                 \n\t"\
-        "decl %3                    \n\t"\
-        "jg 1b                      \n\t"\
-        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
-        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
-          "m"(ff_pw_5), "m"(ff_pw_16)\
-        : "memory"\
-    );\
-}
-#else // ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
-    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
-    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-    src += 8*dstStride;\
-    dst += 8*dstStride;\
-    src2 += 8*src2Stride;\
-    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
-    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-}
-#endif // ARCH_X86_64
-
-#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
-    int h=8;\
-    __asm__ volatile(\
-        "pxor %%xmm7, %%xmm7        \n\t"\
-        "movdqa %0, %%xmm6          \n\t"\
-        :: "m"(ff_pw_5)\
-    );\
-    do{\
-    __asm__ volatile(\
-        "lddqu   -2(%0), %%xmm1     \n\t"\
-        "movdqa  %%xmm1, %%xmm0     \n\t"\
-        "punpckhbw %%xmm7, %%xmm1   \n\t"\
-        "punpcklbw %%xmm7, %%xmm0   \n\t"\
-        "movdqa  %%xmm1, %%xmm2     \n\t"\
-        "movdqa  %%xmm1, %%xmm3     \n\t"\
-        "movdqa  %%xmm1, %%xmm4     \n\t"\
-        "movdqa  %%xmm1, %%xmm5     \n\t"\
-        "palignr $2, %%xmm0, %%xmm4 \n\t"\
-        "palignr $4, %%xmm0, %%xmm3 \n\t"\
-        "palignr $6, %%xmm0, %%xmm2 \n\t"\
-        "palignr $8, %%xmm0, %%xmm1 \n\t"\
-        "palignr $10,%%xmm0, %%xmm5 \n\t"\
-        "paddw   %%xmm5, %%xmm0     \n\t"\
-        "paddw   %%xmm3, %%xmm2     \n\t"\
-        "paddw   %%xmm4, %%xmm1     \n\t"\
-        "psllw   $2,     %%xmm2     \n\t"\
-        "movq    (%2),   %%xmm3     \n\t"\
-        "psubw   %%xmm1, %%xmm2     \n\t"\
-        "paddw   %5,     %%xmm0     \n\t"\
-        "pmullw  %%xmm6, %%xmm2     \n\t"\
-        "paddw   %%xmm0, %%xmm2     \n\t"\
-        "psraw   $5,     %%xmm2     \n\t"\
-        "packuswb %%xmm2, %%xmm2    \n\t"\
-        "pavgb   %%xmm3, %%xmm2     \n\t"\
-        OP(%%xmm2, (%1), %%xmm4, q)\
-        "add %4, %0                 \n\t"\
-        "add %4, %1                 \n\t"\
-        "add %3, %2                 \n\t"\
-        : "+a"(src), "+c"(dst), "+d"(src2)\
-        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
-          "m"(ff_pw_16)\
-        : "memory"\
-    );\
-    }while(--h);\
-}\
-QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    int h=8;\
-    __asm__ volatile(\
-        "pxor %%xmm7, %%xmm7        \n\t"\
-        "movdqa %5, %%xmm6          \n\t"\
-        "1:                         \n\t"\
-        "lddqu   -2(%0), %%xmm1     \n\t"\
-        "movdqa  %%xmm1, %%xmm0     \n\t"\
-        "punpckhbw %%xmm7, %%xmm1   \n\t"\
-        "punpcklbw %%xmm7, %%xmm0   \n\t"\
-        "movdqa  %%xmm1, %%xmm2     \n\t"\
-        "movdqa  %%xmm1, %%xmm3     \n\t"\
-        "movdqa  %%xmm1, %%xmm4     \n\t"\
-        "movdqa  %%xmm1, %%xmm5     \n\t"\
-        "palignr $2, %%xmm0, %%xmm4 \n\t"\
-        "palignr $4, %%xmm0, %%xmm3 \n\t"\
-        "palignr $6, %%xmm0, %%xmm2 \n\t"\
-        "palignr $8, %%xmm0, %%xmm1 \n\t"\
-        "palignr $10,%%xmm0, %%xmm5 \n\t"\
-        "paddw   %%xmm5, %%xmm0     \n\t"\
-        "paddw   %%xmm3, %%xmm2     \n\t"\
-        "paddw   %%xmm4, %%xmm1     \n\t"\
-        "psllw   $2,     %%xmm2     \n\t"\
-        "psubw   %%xmm1, %%xmm2     \n\t"\
-        "paddw   %6,     %%xmm0     \n\t"\
-        "pmullw  %%xmm6, %%xmm2     \n\t"\
-        "paddw   %%xmm0, %%xmm2     \n\t"\
-        "psraw   $5,     %%xmm2     \n\t"\
-        "packuswb %%xmm2, %%xmm2    \n\t"\
-        OP(%%xmm2, (%1), %%xmm4, q)\
-        "add %3, %0                 \n\t"\
-        "add %4, %1                 \n\t"\
-        "decl %2                    \n\t"\
-        " jnz 1b                    \n\t"\
-        : "+a"(src), "+c"(dst), "+g"(h)\
-        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
-          "m"(ff_pw_5), "m"(ff_pw_16)\
-        : "memory"\
-    );\
-}\
-static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-    src += 8*srcStride;\
-    dst += 8*dstStride;\
-    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-
-#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
-    src -= 2*srcStride;\
-    \
-    __asm__ volatile(\
-        "pxor %%xmm7, %%xmm7        \n\t"\
-        "movq (%0), %%xmm0          \n\t"\
-        "add %2, %0                 \n\t"\
-        "movq (%0), %%xmm1          \n\t"\
-        "add %2, %0                 \n\t"\
-        "movq (%0), %%xmm2          \n\t"\
-        "add %2, %0                 \n\t"\
-        "movq (%0), %%xmm3          \n\t"\
-        "add %2, %0                 \n\t"\
-        "movq (%0), %%xmm4          \n\t"\
-        "add %2, %0                 \n\t"\
-        "punpcklbw %%xmm7, %%xmm0   \n\t"\
-        "punpcklbw %%xmm7, %%xmm1   \n\t"\
-        "punpcklbw %%xmm7, %%xmm2   \n\t"\
-        "punpcklbw %%xmm7, %%xmm3   \n\t"\
-        "punpcklbw %%xmm7, %%xmm4   \n\t"\
-        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
-        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
-        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
-        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
-        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
-        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
-        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
-        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
-         \
-        : "+a"(src), "+c"(dst)\
-        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
-        : "memory"\
-    );\
-    if(h==16){\
-        __asm__ volatile(\
-            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
-            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
-            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
-            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
-            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
-            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
-            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
-            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
-            \
-            : "+a"(src), "+c"(dst)\
-            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
-            : "memory"\
-        );\
-    }\
-}\
-static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
-}\
-static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
-    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}
-
-static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
-    int w = (size+8)>>3;
-    src -= 2*srcStride+2;
-    while(w--){
-        __asm__ volatile(
-            "pxor %%xmm7, %%xmm7        \n\t"
-            "movq (%0), %%xmm0          \n\t"
-            "add %2, %0                 \n\t"
-            "movq (%0), %%xmm1          \n\t"
-            "add %2, %0                 \n\t"
-            "movq (%0), %%xmm2          \n\t"
-            "add %2, %0                 \n\t"
-            "movq (%0), %%xmm3          \n\t"
-            "add %2, %0                 \n\t"
-            "movq (%0), %%xmm4          \n\t"
-            "add %2, %0                 \n\t"
-            "punpcklbw %%xmm7, %%xmm0   \n\t"
-            "punpcklbw %%xmm7, %%xmm1   \n\t"
-            "punpcklbw %%xmm7, %%xmm2   \n\t"
-            "punpcklbw %%xmm7, %%xmm3   \n\t"
-            "punpcklbw %%xmm7, %%xmm4   \n\t"
-            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
-            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
-            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
-            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
-            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
-            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
-            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
-            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
-            : "+a"(src)
-            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
-            : "memory"
-        );
-        if(size==16){
-            __asm__ volatile(
-                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
-                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
-                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
-                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
-                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
-                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
-                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
-                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
-                : "+a"(src)
-                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
-                : "memory"
-            );
-        }
-        tmp += 8;
-        src += 8 - (size+5)*srcStride;
-    }
-}
-
-#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
-    int h = size;\
-    if(size == 16){\
-        __asm__ volatile(\
-            "1:                         \n\t"\
-            "movdqa 32(%0), %%xmm4      \n\t"\
-            "movdqa 16(%0), %%xmm5      \n\t"\
-            "movdqa   (%0), %%xmm7      \n\t"\
-            "movdqa %%xmm4, %%xmm3      \n\t"\
-            "movdqa %%xmm4, %%xmm2      \n\t"\
-            "movdqa %%xmm4, %%xmm1      \n\t"\
-            "movdqa %%xmm4, %%xmm0      \n\t"\
-            "palignr $10, %%xmm5, %%xmm0 \n\t"\
-            "palignr  $8, %%xmm5, %%xmm1 \n\t"\
-            "palignr  $6, %%xmm5, %%xmm2 \n\t"\
-            "palignr  $4, %%xmm5, %%xmm3 \n\t"\
-            "palignr  $2, %%xmm5, %%xmm4 \n\t"\
-            "paddw  %%xmm5, %%xmm0      \n\t"\
-            "paddw  %%xmm4, %%xmm1      \n\t"\
-            "paddw  %%xmm3, %%xmm2      \n\t"\
-            "movdqa %%xmm5, %%xmm6      \n\t"\
-            "movdqa %%xmm5, %%xmm4      \n\t"\
-            "movdqa %%xmm5, %%xmm3      \n\t"\
-            "palignr  $8, %%xmm7, %%xmm4 \n\t"\
-            "palignr  $2, %%xmm7, %%xmm6 \n\t"\
-            "palignr $10, %%xmm7, %%xmm3 \n\t"\
-            "paddw  %%xmm6, %%xmm4      \n\t"\
-            "movdqa %%xmm5, %%xmm6      \n\t"\
-            "palignr  $6, %%xmm7, %%xmm5 \n\t"\
-            "palignr  $4, %%xmm7, %%xmm6 \n\t"\
-            "paddw  %%xmm7, %%xmm3      \n\t"\
-            "paddw  %%xmm6, %%xmm5      \n\t"\
-            \
-            "psubw  %%xmm1, %%xmm0      \n\t"\
-            "psubw  %%xmm4, %%xmm3      \n\t"\
-            "psraw      $2, %%xmm0      \n\t"\
-            "psraw      $2, %%xmm3      \n\t"\
-            "psubw  %%xmm1, %%xmm0      \n\t"\
-            "psubw  %%xmm4, %%xmm3      \n\t"\
-            "paddw  %%xmm2, %%xmm0      \n\t"\
-            "paddw  %%xmm5, %%xmm3      \n\t"\
-            "psraw      $2, %%xmm0      \n\t"\
-            "psraw      $2, %%xmm3      \n\t"\
-            "paddw  %%xmm2, %%xmm0      \n\t"\
-            "paddw  %%xmm5, %%xmm3      \n\t"\
-            "psraw      $6, %%xmm0      \n\t"\
-            "psraw      $6, %%xmm3      \n\t"\
-            "packuswb %%xmm0, %%xmm3    \n\t"\
-            OP(%%xmm3, (%1), %%xmm7, dqa)\
-            "add $48, %0                \n\t"\
-            "add %3, %1                 \n\t"\
-            "decl %2                    \n\t"\
-            " jnz 1b                    \n\t"\
-            : "+a"(tmp), "+c"(dst), "+g"(h)\
-            : "S"((x86_reg)dstStride)\
-            : "memory"\
-        );\
-    }else{\
-        __asm__ volatile(\
-            "1:                         \n\t"\
-            "movdqa 16(%0), %%xmm1      \n\t"\
-            "movdqa   (%0), %%xmm0      \n\t"\
-            "movdqa %%xmm1, %%xmm2      \n\t"\
-            "movdqa %%xmm1, %%xmm3      \n\t"\
-            "movdqa %%xmm1, %%xmm4      \n\t"\
-            "movdqa %%xmm1, %%xmm5      \n\t"\
-            "palignr $10, %%xmm0, %%xmm5 \n\t"\
-            "palignr  $8, %%xmm0, %%xmm4 \n\t"\
-            "palignr  $6, %%xmm0, %%xmm3 \n\t"\
-            "palignr  $4, %%xmm0, %%xmm2 \n\t"\
-            "palignr  $2, %%xmm0, %%xmm1 \n\t"\
-            "paddw  %%xmm5, %%xmm0      \n\t"\
-            "paddw  %%xmm4, %%xmm1      \n\t"\
-            "paddw  %%xmm3, %%xmm2      \n\t"\
-            "psubw  %%xmm1, %%xmm0      \n\t"\
-            "psraw      $2, %%xmm0      \n\t"\
-            "psubw  %%xmm1, %%xmm0      \n\t"\
-            "paddw  %%xmm2, %%xmm0      \n\t"\
-            "psraw      $2, %%xmm0      \n\t"\
-            "paddw  %%xmm2, %%xmm0      \n\t"\
-            "psraw      $6, %%xmm0      \n\t"\
-            "packuswb %%xmm0, %%xmm0    \n\t"\
-            OP(%%xmm0, (%1), %%xmm7, q)\
-            "add $48, %0                \n\t"\
-            "add %3, %1                 \n\t"\
-            "decl %2                    \n\t"\
-            " jnz 1b                    \n\t"\
-            : "+a"(tmp), "+c"(dst), "+g"(h)\
-            : "S"((x86_reg)dstStride)\
-            : "memory"\
-        );\
-    }\
-}
-
-#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
-          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
-    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
-}\
-static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
-}\
-static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
-}\
-
-#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
-#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
-#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
-#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
-#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
-#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
-#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
-#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
-
-#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
-#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
-#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
-#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
-#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
-#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
-#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
-#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
-
-#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
-#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
-#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
-#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
-
-#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
-#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
-#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
-#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
-
-#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
-#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
-
-#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
-H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
-
-// static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
-//     put_pixels16_sse2(dst, src, stride, 16);
-// }
-// static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
-//     avg_pixels16_sse2(dst, src, stride, 16);
-// }
-#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
-#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
-
-#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
-}\
-
-#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
-}\
-
-#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
-}\
-
-#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
-    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
-    uint8_t * const halfHV= temp;\
-    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
-    uint8_t * const halfHV= temp;\
-    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
-    uint8_t * const halfHV= temp;\
-    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
-    uint8_t * const halfHV= temp;\
-    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
-}\
-
-#define H264_MC_4816(MMX)\
-H264_MC(put_, 4, MMX, 8)\
-H264_MC(put_, 8, MMX, 8)\
-H264_MC(put_, 16,MMX, 8)\
-H264_MC(avg_, 4, MMX, 8)\
-H264_MC(avg_, 8, MMX, 8)\
-H264_MC(avg_, 16,MMX, 8)\
-
-#define H264_MC_816(QPEL, XMM)\
-QPEL(put_, 8, XMM, 16)\
-QPEL(put_, 16,XMM, 16)\
-QPEL(avg_, 8, XMM, 16)\
-QPEL(avg_, 16,XMM, 16)\
-
-
-#define AVG_3DNOW_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp "   \n\t"\
-"pavgusb " #temp ", " #a "        \n\t"\
-"mov" #size " " #a ", " #b "      \n\t"
-#define AVG_MMX2_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp "   \n\t"\
-"pavgb " #temp ", " #a "          \n\t"\
-"mov" #size " " #a ", " #b "      \n\t"
-
-///this does not get detected correctly, uncomment on AMD machine
-#ifdef HAVE_AMD3DNOW
-#define PAVGB "pavgusb"
-//QPEL_H264(put_,       PUT_OP, 3dnow)
-//QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
-#undef PAVGB
-#endif
-
-#define PAVGB "pavgb"
-QPEL_H264(put_,       PUT_OP, mmx2)
-QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
-QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
-QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
-QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
-QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
-#if HAVE_SSSE3
-QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
-QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
-QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
-QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
-QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
-QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
-#endif
-#undef PAVGB
-
-H264_MC_816(H264_MC_V, sse2)
-H264_MC_816(H264_MC_HV, sse2)
-#if HAVE_SSSE3
-H264_MC_816(H264_MC_H, ssse3)
-H264_MC_816(H264_MC_HV, ssse3)
-#endif
-
-/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */
-DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = {
-    0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
-};
-
-#if HAVE_SSSE3
-#define AVG_OP(X)
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
-#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
-#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
-#include "dsputil_h264_template_ssse3.c"
-static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
-    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
-}
-
-#undef AVG_OP
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC8_MV0
-#define AVG_OP(X) X
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
-#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
-#include "dsputil_h264_template_ssse3.c"
-static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
-    avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
-}
-#undef AVG_OP
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC8_MV0
-#endif
-
-/***********************************/
-/* weighted prediction */
-
-static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
-{
-    int x, y;
-    offset <<= log2_denom;
-    offset += (1 << log2_denom) >> 1;
-    __asm__ volatile(
-        "movd    %0, %%mm4        \n\t"
-        "movd    %1, %%mm5        \n\t"
-        "movd    %2, %%mm6        \n\t"
-        "pshufw  $0, %%mm4, %%mm4 \n\t"
-        "pshufw  $0, %%mm5, %%mm5 \n\t"
-        "pxor    %%mm7, %%mm7     \n\t"
-        :: "g"(weight), "g"(offset), "g"(log2_denom)
-    );
-    for(y=0; y<h; y+=2){
-        for(x=0; x<w; x+=4){
-            __asm__ volatile(
-                "movd      %0,    %%mm0 \n\t"
-                "movd      %1,    %%mm1 \n\t"
-                "punpcklbw %%mm7, %%mm0 \n\t"
-                "punpcklbw %%mm7, %%mm1 \n\t"
-                "pmullw    %%mm4, %%mm0 \n\t"
-                "pmullw    %%mm4, %%mm1 \n\t"
-                "paddsw    %%mm5, %%mm0 \n\t"
-                "paddsw    %%mm5, %%mm1 \n\t"
-                "psraw     %%mm6, %%mm0 \n\t"
-                "psraw     %%mm6, %%mm1 \n\t"
-                "packuswb  %%mm7, %%mm0 \n\t"
-                "packuswb  %%mm7, %%mm1 \n\t"
-                "movd      %%mm0, %0    \n\t"
-                "movd      %%mm1, %1    \n\t"
-                : "+m"(*(uint32_t*)(dst+x)),
-                  "+m"(*(uint32_t*)(dst+x+stride))
-            );
-        }
-        dst += 2*stride;
-    }
-}
-
-static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
-{
-    int x, y;
-    offset = ((offset + 1) | 1) << log2_denom;
-    __asm__ volatile(
-        "movd    %0, %%mm3        \n\t"
-        "movd    %1, %%mm4        \n\t"
-        "movd    %2, %%mm5        \n\t"
-        "movd    %3, %%mm6        \n\t"
-        "pshufw  $0, %%mm3, %%mm3 \n\t"
-        "pshufw  $0, %%mm4, %%mm4 \n\t"
-        "pshufw  $0, %%mm5, %%mm5 \n\t"
-        "pxor    %%mm7, %%mm7     \n\t"
-        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
-    );
-    for(y=0; y<h; y++){
-        for(x=0; x<w; x+=4){
-            __asm__ volatile(
-                "movd      %0,    %%mm0 \n\t"
-                "movd      %1,    %%mm1 \n\t"
-                "punpcklbw %%mm7, %%mm0 \n\t"
-                "punpcklbw %%mm7, %%mm1 \n\t"
-                "pmullw    %%mm3, %%mm0 \n\t"
-                "pmullw    %%mm4, %%mm1 \n\t"
-                "paddsw    %%mm1, %%mm0 \n\t"
-                "paddsw    %%mm5, %%mm0 \n\t"
-                "psraw     %%mm6, %%mm0 \n\t"
-                "packuswb  %%mm0, %%mm0 \n\t"
-                "movd      %%mm0, %0    \n\t"
-                : "+m"(*(uint32_t*)(dst+x))
-                :  "m"(*(uint32_t*)(src+x))
-            );
-        }
-        src += stride;
-        dst += stride;
-    }
-}
-
-#define H264_WEIGHT(W,H) \
-static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
-    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
-} \
-static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
-    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
-}
-
-H264_WEIGHT(16,16)
-H264_WEIGHT(16, 8)
-H264_WEIGHT( 8,16)
-H264_WEIGHT( 8, 8)
-H264_WEIGHT( 8, 4)
-H264_WEIGHT( 4, 8)
-H264_WEIGHT( 4, 4)
-H264_WEIGHT( 4, 2)
-
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/mathops.h
--- a/ffmpeg_smp/h264dec/libavcodec/x86/mathops.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,67 +0,0 @@
-/*
- * simple math operations
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_MATHOPS_H
-#define AVCODEC_X86_MATHOPS_H
-
-#include "config.h"
-#include "libavutil/common.h"
-
-#if ARCH_X86_32
-#define MULL(ra, rb, shift) \
-        ({ int rt, dummy; __asm__ (\
-            "imull %3               \n\t"\
-            "shrdl %4, %%edx, %%eax \n\t"\
-            : "=a"(rt), "=d"(dummy)\
-            : "a" ((int)ra), "rm" ((int)rb), "i"(shift));\
-         rt; })
-
-#define MULH(ra, rb) \
-    ({ int rt, dummy;\
-     __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)ra), "rm" ((int)rb));\
-     rt; })
-
-#define MUL64(ra, rb) \
-    ({ int64_t rt;\
-     __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)ra), "g" ((int)rb));\
-     rt; })
-#endif
-
-// avoid +32 for shift optimization (gcc should do that ...)
-#define NEG_SSR32 NEG_SSR32
-static inline  int32_t NEG_SSR32( int32_t a, int8_t s){
-    __asm__ ("sarl %1, %0\n\t"
-         : "+r" (a)
-         : "ic" ((uint8_t)(-s))
-    );
-    return a;
-}
-
-#define NEG_USR32 NEG_USR32
-static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
-    __asm__ ("shrl %1, %0\n\t"
-         : "+r" (a)
-         : "ic" ((uint8_t)(-s))
-    );
-    return a;
-}
-
-#endif /* AVCODEC_X86_MATHOPS_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/mmx.h
--- a/ffmpeg_smp/h264dec/libavcodec/x86/mmx.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,267 +0,0 @@
-/*
- * mmx.h
- * Copyright (C) 1997-2001 H. Dietz and R. Fisher
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef AVCODEC_X86_MMX_H
-#define AVCODEC_X86_MMX_H
-
-#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected.
-
-
-#define         mmx_i2r(op,imm,reg) \
-        __asm__ volatile (#op " %0, %%" #reg \
-                              : /* nothing */ \
-                              : "i" (imm) )
-
-#define         mmx_m2r(op,mem,reg) \
-        __asm__ volatile (#op " %0, %%" #reg \
-                              : /* nothing */ \
-                              : "m" (mem))
-
-#define         mmx_r2m(op,reg,mem) \
-        __asm__ volatile (#op " %%" #reg ", %0" \
-                              : "=m" (mem) \
-                              : /* nothing */ )
-
-#define         mmx_r2r(op,regs,regd) \
-        __asm__ volatile (#op " %" #regs ", %" #regd)
-
-
-#define         emms() __asm__ volatile ("emms")
-
-#define         movd_m2r(var,reg)           mmx_m2r (movd, var, reg)
-#define         movd_r2m(reg,var)           mmx_r2m (movd, reg, var)
-#define         movd_r2r(regs,regd)         mmx_r2r (movd, regs, regd)
-
-#define         movq_m2r(var,reg)           mmx_m2r (movq, var, reg)
-#define         movq_r2m(reg,var)           mmx_r2m (movq, reg, var)
-#define         movq_r2r(regs,regd)         mmx_r2r (movq, regs, regd)
-
-#define         packssdw_m2r(var,reg)       mmx_m2r (packssdw, var, reg)
-#define         packssdw_r2r(regs,regd)     mmx_r2r (packssdw, regs, regd)
-#define         packsswb_m2r(var,reg)       mmx_m2r (packsswb, var, reg)
-#define         packsswb_r2r(regs,regd)     mmx_r2r (packsswb, regs, regd)
-
-#define         packuswb_m2r(var,reg)       mmx_m2r (packuswb, var, reg)
-#define         packuswb_r2r(regs,regd)     mmx_r2r (packuswb, regs, regd)
-
-#define         paddb_m2r(var,reg)          mmx_m2r (paddb, var, reg)
-#define         paddb_r2r(regs,regd)        mmx_r2r (paddb, regs, regd)
-#define         paddd_m2r(var,reg)          mmx_m2r (paddd, var, reg)
-#define         paddd_r2r(regs,regd)        mmx_r2r (paddd, regs, regd)
-#define         paddw_m2r(var,reg)          mmx_m2r (paddw, var, reg)
-#define         paddw_r2r(regs,regd)        mmx_r2r (paddw, regs, regd)
-
-#define         paddsb_m2r(var,reg)         mmx_m2r (paddsb, var, reg)
-#define         paddsb_r2r(regs,regd)       mmx_r2r (paddsb, regs, regd)
-#define         paddsw_m2r(var,reg)         mmx_m2r (paddsw, var, reg)
-#define         paddsw_r2r(regs,regd)       mmx_r2r (paddsw, regs, regd)
-
-#define         paddusb_m2r(var,reg)        mmx_m2r (paddusb, var, reg)
-#define         paddusb_r2r(regs,regd)      mmx_r2r (paddusb, regs, regd)
-#define         paddusw_m2r(var,reg)        mmx_m2r (paddusw, var, reg)
-#define         paddusw_r2r(regs,regd)      mmx_r2r (paddusw, regs, regd)
-
-#define         pand_m2r(var,reg)           mmx_m2r (pand, var, reg)
-#define         pand_r2r(regs,regd)         mmx_r2r (pand, regs, regd)
-
-#define         pandn_m2r(var,reg)          mmx_m2r (pandn, var, reg)
-#define         pandn_r2r(regs,regd)        mmx_r2r (pandn, regs, regd)
-
-#define         pcmpeqb_m2r(var,reg)        mmx_m2r (pcmpeqb, var, reg)
-#define         pcmpeqb_r2r(regs,regd)      mmx_r2r (pcmpeqb, regs, regd)
-#define         pcmpeqd_m2r(var,reg)        mmx_m2r (pcmpeqd, var, reg)
-#define         pcmpeqd_r2r(regs,regd)      mmx_r2r (pcmpeqd, regs, regd)
-#define         pcmpeqw_m2r(var,reg)        mmx_m2r (pcmpeqw, var, reg)
-#define         pcmpeqw_r2r(regs,regd)      mmx_r2r (pcmpeqw, regs, regd)
-
-#define         pcmpgtb_m2r(var,reg)        mmx_m2r (pcmpgtb, var, reg)
-#define         pcmpgtb_r2r(regs,regd)      mmx_r2r (pcmpgtb, regs, regd)
-#define         pcmpgtd_m2r(var,reg)        mmx_m2r (pcmpgtd, var, reg)
-#define         pcmpgtd_r2r(regs,regd)      mmx_r2r (pcmpgtd, regs, regd)
-#define         pcmpgtw_m2r(var,reg)        mmx_m2r (pcmpgtw, var, reg)
-#define         pcmpgtw_r2r(regs,regd)      mmx_r2r (pcmpgtw, regs, regd)
-
-#define         pmaddwd_m2r(var,reg)        mmx_m2r (pmaddwd, var, reg)
-#define         pmaddwd_r2r(regs,regd)      mmx_r2r (pmaddwd, regs, regd)
-
-#define         pmulhw_m2r(var,reg)         mmx_m2r (pmulhw, var, reg)
-#define         pmulhw_r2r(regs,regd)       mmx_r2r (pmulhw, regs, regd)
-
-#define         pmullw_m2r(var,reg)         mmx_m2r (pmullw, var, reg)
-#define         pmullw_r2r(regs,regd)       mmx_r2r (pmullw, regs, regd)
-
-#define         por_m2r(var,reg)            mmx_m2r (por, var, reg)
-#define         por_r2r(regs,regd)          mmx_r2r (por, regs, regd)
-
-#define         pslld_i2r(imm,reg)          mmx_i2r (pslld, imm, reg)
-#define         pslld_m2r(var,reg)          mmx_m2r (pslld, var, reg)
-#define         pslld_r2r(regs,regd)        mmx_r2r (pslld, regs, regd)
-#define         psllq_i2r(imm,reg)          mmx_i2r (psllq, imm, reg)
-#define         psllq_m2r(var,reg)          mmx_m2r (psllq, var, reg)
-#define         psllq_r2r(regs,regd)        mmx_r2r (psllq, regs, regd)
-#define         psllw_i2r(imm,reg)          mmx_i2r (psllw, imm, reg)
-#define         psllw_m2r(var,reg)          mmx_m2r (psllw, var, reg)
-#define         psllw_r2r(regs,regd)        mmx_r2r (psllw, regs, regd)
-
-#define         psrad_i2r(imm,reg)          mmx_i2r (psrad, imm, reg)
-#define         psrad_m2r(var,reg)          mmx_m2r (psrad, var, reg)
-#define         psrad_r2r(regs,regd)        mmx_r2r (psrad, regs, regd)
-#define         psraw_i2r(imm,reg)          mmx_i2r (psraw, imm, reg)
-#define         psraw_m2r(var,reg)          mmx_m2r (psraw, var, reg)
-#define         psraw_r2r(regs,regd)        mmx_r2r (psraw, regs, regd)
-
-#define         psrld_i2r(imm,reg)          mmx_i2r (psrld, imm, reg)
-#define         psrld_m2r(var,reg)          mmx_m2r (psrld, var, reg)
-#define         psrld_r2r(regs,regd)        mmx_r2r (psrld, regs, regd)
-#define         psrlq_i2r(imm,reg)          mmx_i2r (psrlq, imm, reg)
-#define         psrlq_m2r(var,reg)          mmx_m2r (psrlq, var, reg)
-#define         psrlq_r2r(regs,regd)        mmx_r2r (psrlq, regs, regd)
-#define         psrlw_i2r(imm,reg)          mmx_i2r (psrlw, imm, reg)
-#define         psrlw_m2r(var,reg)          mmx_m2r (psrlw, var, reg)
-#define         psrlw_r2r(regs,regd)        mmx_r2r (psrlw, regs, regd)
-
-#define         psubb_m2r(var,reg)          mmx_m2r (psubb, var, reg)
-#define         psubb_r2r(regs,regd)        mmx_r2r (psubb, regs, regd)
-#define         psubd_m2r(var,reg)          mmx_m2r (psubd, var, reg)
-#define         psubd_r2r(regs,regd)        mmx_r2r (psubd, regs, regd)
-#define         psubw_m2r(var,reg)          mmx_m2r (psubw, var, reg)
-#define         psubw_r2r(regs,regd)        mmx_r2r (psubw, regs, regd)
-
-#define         psubsb_m2r(var,reg)         mmx_m2r (psubsb, var, reg)
-#define         psubsb_r2r(regs,regd)       mmx_r2r (psubsb, regs, regd)
-#define         psubsw_m2r(var,reg)         mmx_m2r (psubsw, var, reg)
-#define         psubsw_r2r(regs,regd)       mmx_r2r (psubsw, regs, regd)
-
-#define         psubusb_m2r(var,reg)        mmx_m2r (psubusb, var, reg)
-#define         psubusb_r2r(regs,regd)      mmx_r2r (psubusb, regs, regd)
-#define         psubusw_m2r(var,reg)        mmx_m2r (psubusw, var, reg)
-#define         psubusw_r2r(regs,regd)      mmx_r2r (psubusw, regs, regd)
-
-#define         punpckhbw_m2r(var,reg)      mmx_m2r (punpckhbw, var, reg)
-#define         punpckhbw_r2r(regs,regd)    mmx_r2r (punpckhbw, regs, regd)
-#define         punpckhdq_m2r(var,reg)      mmx_m2r (punpckhdq, var, reg)
-#define         punpckhdq_r2r(regs,regd)    mmx_r2r (punpckhdq, regs, regd)
-#define         punpckhwd_m2r(var,reg)      mmx_m2r (punpckhwd, var, reg)
-#define         punpckhwd_r2r(regs,regd)    mmx_r2r (punpckhwd, regs, regd)
-
-#define         punpcklbw_m2r(var,reg)      mmx_m2r (punpcklbw, var, reg)
-#define         punpcklbw_r2r(regs,regd)    mmx_r2r (punpcklbw, regs, regd)
-#define         punpckldq_m2r(var,reg)      mmx_m2r (punpckldq, var, reg)
-#define         punpckldq_r2r(regs,regd)    mmx_r2r (punpckldq, regs, regd)
-#define         punpcklwd_m2r(var,reg)      mmx_m2r (punpcklwd, var, reg)
-#define         punpcklwd_r2r(regs,regd)    mmx_r2r (punpcklwd, regs, regd)
-
-#define         pxor_m2r(var,reg)           mmx_m2r (pxor, var, reg)
-#define         pxor_r2r(regs,regd)         mmx_r2r (pxor, regs, regd)
-
-
-/* 3DNOW extensions */
-
-#define         pavgusb_m2r(var,reg)        mmx_m2r (pavgusb, var, reg)
-#define         pavgusb_r2r(regs,regd)      mmx_r2r (pavgusb, regs, regd)
-
-
-/* AMD MMX extensions - also available in intel SSE */
-
-
-#define         mmx_m2ri(op,mem,reg,imm) \
-        __asm__ volatile (#op " %1, %0, %%" #reg \
-                              : /* nothing */ \
-                              : "m" (mem), "i" (imm))
-#define         mmx_r2ri(op,regs,regd,imm) \
-        __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \
-                              : /* nothing */ \
-                              : "i" (imm) )
-
-#define         mmx_fetch(mem,hint) \
-        __asm__ volatile ("prefetch" #hint " %0" \
-                              : /* nothing */ \
-                              : "m" (mem))
-
-
-#define         maskmovq(regs,maskreg)      mmx_r2ri (maskmovq, regs, maskreg)
-
-#define         movntq_r2m(mmreg,var)       mmx_r2m (movntq, mmreg, var)
-
-#define         pavgb_m2r(var,reg)          mmx_m2r (pavgb, var, reg)
-#define         pavgb_r2r(regs,regd)        mmx_r2r (pavgb, regs, regd)
-#define         pavgw_m2r(var,reg)          mmx_m2r (pavgw, var, reg)
-#define         pavgw_r2r(regs,regd)        mmx_r2r (pavgw, regs, regd)
-
-#define         pextrw_r2r(mmreg,reg,imm)   mmx_r2ri (pextrw, mmreg, reg, imm)
-
-#define         pinsrw_r2r(reg,mmreg,imm)   mmx_r2ri (pinsrw, reg, mmreg, imm)
-
-#define         pmaxsw_m2r(var,reg)         mmx_m2r (pmaxsw, var, reg)
-#define         pmaxsw_r2r(regs,regd)       mmx_r2r (pmaxsw, regs, regd)
-
-#define         pmaxub_m2r(var,reg)         mmx_m2r (pmaxub, var, reg)
-#define         pmaxub_r2r(regs,regd)       mmx_r2r (pmaxub, regs, regd)
-
-#define         pminsw_m2r(var,reg)         mmx_m2r (pminsw, var, reg)
-#define         pminsw_r2r(regs,regd)       mmx_r2r (pminsw, regs, regd)
-
-#define         pminub_m2r(var,reg)         mmx_m2r (pminub, var, reg)
-#define         pminub_r2r(regs,regd)       mmx_r2r (pminub, regs, regd)
-
-#define         pmovmskb(mmreg,reg) \
-        __asm__ volatile ("movmskps %" #mmreg ", %" #reg)
-
-#define         pmulhuw_m2r(var,reg)        mmx_m2r (pmulhuw, var, reg)
-#define         pmulhuw_r2r(regs,regd)      mmx_r2r (pmulhuw, regs, regd)
-
-#define         prefetcht0(mem)             mmx_fetch (mem, t0)
-#define         prefetcht1(mem)             mmx_fetch (mem, t1)
-#define         prefetcht2(mem)             mmx_fetch (mem, t2)
-#define         prefetchnta(mem)            mmx_fetch (mem, nta)
-
-#define         psadbw_m2r(var,reg)         mmx_m2r (psadbw, var, reg)
-#define         psadbw_r2r(regs,regd)       mmx_r2r (psadbw, regs, regd)
-
-#define         pshufw_m2r(var,reg,imm)     mmx_m2ri(pshufw, var, reg, imm)
-#define         pshufw_r2r(regs,regd,imm)   mmx_r2ri(pshufw, regs, regd, imm)
-
-#define         sfence() __asm__ volatile ("sfence\n\t")
-
-/* SSE2 */
-#define         pshufhw_m2r(var,reg,imm)    mmx_m2ri(pshufhw, var, reg, imm)
-#define         pshufhw_r2r(regs,regd,imm)  mmx_r2ri(pshufhw, regs, regd, imm)
-#define         pshuflw_m2r(var,reg,imm)    mmx_m2ri(pshuflw, var, reg, imm)
-#define         pshuflw_r2r(regs,regd,imm)  mmx_r2ri(pshuflw, regs, regd, imm)
-
-#define         pshufd_r2r(regs,regd,imm)   mmx_r2ri(pshufd, regs, regd, imm)
-
-#define         movdqa_m2r(var,reg)         mmx_m2r (movdqa, var, reg)
-#define         movdqa_r2m(reg,var)         mmx_r2m (movdqa, reg, var)
-#define         movdqa_r2r(regs,regd)       mmx_r2r (movdqa, regs, regd)
-#define         movdqu_m2r(var,reg)         mmx_m2r (movdqu, var, reg)
-#define         movdqu_r2m(reg,var)         mmx_r2m (movdqu, reg, var)
-#define         movdqu_r2r(regs,regd)       mmx_r2r (movdqu, regs, regd)
-
-#define         pmullw_r2m(reg,var)         mmx_r2m (pmullw, reg, var)
-
-#define         pslldq_i2r(imm,reg)         mmx_i2r (pslldq, imm, reg)
-#define         psrldq_i2r(imm,reg)         mmx_i2r (psrldq, imm, reg)
-
-#define         punpcklqdq_r2r(regs,regd)   mmx_r2r (punpcklqdq, regs, regd)
-#define         punpckhqdq_r2r(regs,regd)   mmx_r2r (punpckhqdq, regs, regd)
-
-
-#endif /* AVCODEC_X86_MMX_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/arm/bswap.h
--- a/ffmpeg_smp/h264dec/libavutil/arm/bswap.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,72 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_ARM_BSWAP_H
-#define AVUTIL_ARM_BSWAP_H
-
-#include <stdint.h>
-#include "config.h"
-#include "libavutil/attributes.h"
-
-#ifdef __ARMCC_VERSION
-
-#if HAVE_ARMV6
-#define bswap_16 bswap_16
-static av_always_inline av_const unsigned bswap_16(unsigned x)
-{
-    __asm { rev16 x, x }
-    return x;
-}
-
-#define bswap_32 bswap_32
-static av_always_inline av_const uint32_t bswap_32(uint32_t x)
-{
-    return __rev(x);
-}
-#endif /* HAVE_ARMV6 */
-
-#elif HAVE_INLINE_ASM
-
-#if HAVE_ARMV6
-#define bswap_16 bswap_16
-static av_always_inline av_const unsigned bswap_16(unsigned x)
-{
-    __asm__("rev16 %0, %0" : "+r"(x));
-    return x;
-}
-#endif
-
-#define bswap_32 bswap_32
-static av_always_inline av_const uint32_t bswap_32(uint32_t x)
-{
-#if HAVE_ARMV6
-    __asm__("rev %0, %0" : "+r"(x));
-#else
-    uint32_t t;
-    __asm__ ("eor %1, %0, %0, ror #16 \n\t"
-             "bic %1, %1, #0xFF0000   \n\t"
-             "mov %0, %0, ror #8      \n\t"
-             "eor %0, %0, %1, lsr #8  \n\t"
-             : "+r"(x), "=&r"(t));
-#endif /* HAVE_ARMV6 */
-    return x;
-}
-
-#endif /* __ARMCC_VERSION */
-
-#endif /* AVUTIL_ARM_BSWAP_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/arm/intreadwrite.h
--- a/ffmpeg_smp/h264dec/libavutil/arm/intreadwrite.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,78 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_ARM_INTREADWRITE_H
-#define AVUTIL_ARM_INTREADWRITE_H
-
-#include <stdint.h>
-#include "config.h"
-
-#if HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM
-
-#define AV_RN16 AV_RN16
-static av_always_inline uint16_t AV_RN16(const void *p)
-{
-    uint16_t v;
-    __asm__ ("ldrh %0, %1" : "=r"(v) : "m"(*(const uint16_t *)p));
-    return v;
-}
-
-#define AV_WN16 AV_WN16
-static av_always_inline void AV_WN16(void *p, uint16_t v)
-{
-    __asm__ ("strh %1, %0" : "=m"(*(uint16_t *)p) : "r"(v));
-}
-
-#define AV_RN32 AV_RN32
-static av_always_inline uint32_t AV_RN32(const void *p)
-{
-    uint32_t v;
-    __asm__ ("ldr  %0, %1" : "=r"(v) : "m"(*(const uint32_t *)p));
-    return v;
-}
-
-#define AV_WN32 AV_WN32
-static av_always_inline void AV_WN32(void *p, uint32_t v)
-{
-    __asm__ ("str  %1, %0" : "=m"(*(uint32_t *)p) : "r"(v));
-}
-
-#define AV_RN64 AV_RN64
-static av_always_inline uint64_t AV_RN64(const void *p)
-{
-    union { uint64_t v; uint32_t hl[2]; } v;
-    __asm__ ("ldr   %0, %2  \n\t"
-             "ldr   %1, %3  \n\t"
-             : "=&r"(v.hl[0]), "=r"(v.hl[1])
-             : "m"(*(const uint32_t*)p), "m"(*((const uint32_t*)p+1)));
-    return v.v;
-}
-
-#define AV_WN64 AV_WN64
-static av_always_inline void AV_WN64(void *p, uint64_t v)
-{
-    union { uint64_t v; uint32_t hl[2]; } vv = { v };
-    __asm__ ("str  %2, %0  \n\t"
-             "str  %3, %1  \n\t"
-             : "=m"(*(uint32_t*)p), "=m"(*((uint32_t*)p+1))
-             : "r"(vv.hl[0]), "r"(vv.hl[1]));
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-#endif /* AVUTIL_ARM_INTREADWRITE_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/arm/timer.h
--- a/ffmpeg_smp/h264dec/libavutil/arm/timer.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_ARM_TIMER_H
-#define AVUTIL_ARM_TIMER_H
-
-#include <stdint.h>
-#include "config.h"
-
-#if HAVE_INLINE_ASM && defined(__ARM_ARCH_7A__)
-
-#define AV_READ_TIME read_time
-
-static inline uint64_t read_time(void)
-{
-    unsigned cc;
-    __asm__ volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc));
-    return cc;
-}
-
-#endif /* HAVE_INLINE_ASM && __ARM_ARCH_7A__ */
-
-#endif /* AVUTIL_ARM_TIMER_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/attributes.h
--- a/ffmpeg_smp/h264dec/libavutil/attributes.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,113 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * Macro definitions for various function/variable attributes
- */
-
-#ifndef AVUTIL_ATTRIBUTES_H
-#define AVUTIL_ATTRIBUTES_H
-
-#ifdef __GNUC__
-#    define AV_GCC_VERSION_AT_LEAST(x,y) (__GNUC__ > x || __GNUC__ == x && __GNUC_MINOR__ >= y)
-#else
-#    define AV_GCC_VERSION_AT_LEAST(x,y) 0
-#endif
-
-#ifndef av_always_inline
-#if AV_GCC_VERSION_AT_LEAST(3,1)
-#    define av_always_inline __attribute__((always_inline)) inline
-#else
-#    define av_always_inline inline
-#endif
-#endif
-
-#ifndef av_noinline
-#if AV_GCC_VERSION_AT_LEAST(3,1)
-#    define av_noinline __attribute__((noinline))
-#else
-#    define av_noinline
-#endif
-#endif
-
-#ifndef av_pure
-#if AV_GCC_VERSION_AT_LEAST(3,1)
-#    define av_pure __attribute__((pure))
-#else
-#    define av_pure
-#endif
-#endif
-
-#ifndef av_const
-#if AV_GCC_VERSION_AT_LEAST(2,6)
-#    define av_const __attribute__((const))
-#else
-#    define av_const
-#endif
-#endif
-
-#ifndef av_cold
-#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,3)
-#    define av_cold __attribute__((cold))
-#else
-#    define av_cold
-#endif
-#endif
-
-#ifndef av_flatten
-#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,1)
-#    define av_flatten __attribute__((flatten))
-#else
-#    define av_flatten
-#endif
-#endif
-
-#ifndef attribute_deprecated
-#if AV_GCC_VERSION_AT_LEAST(3,1)
-#    define attribute_deprecated __attribute__((deprecated))
-#else
-#    define attribute_deprecated
-#endif
-#endif
-
-#ifndef av_unused
-#if defined(__GNUC__)
-#    define av_unused __attribute__((unused))
-#else
-#    define av_unused
-#endif
-#endif
-
-#ifndef av_uninit
-#if defined(__GNUC__) && !defined(__ICC)
-#    define av_uninit(x) x=x
-#else
-#    define av_uninit(x) x
-#endif
-#endif
-
-#ifdef __GNUC__
-#    define av_builtin_constant_p __builtin_constant_p
-#else
-#    define av_builtin_constant_p(x) 0
-#endif
-
-#endif /* AVUTIL_ATTRIBUTES_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/bswap.h
--- a/ffmpeg_smp/h264dec/libavutil/bswap.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,95 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * byte swapping routines
- */
-
-#ifndef AVUTIL_BSWAP_H
-#define AVUTIL_BSWAP_H
-
-#include <stdint.h>
-#include "config.h"
-#include "attributes.h"
-
-#if   ARCH_ARM
-#   include "arm/bswap.h"
-#elif ARCH_X86
-#   include "x86/bswap.h"
-#endif
-
-#ifndef bswap_16
-static av_always_inline av_const uint16_t bswap_16(uint16_t x)
-{
-    x= (x>>8) | (x<<8);
-    return x;
-}
-#endif
-
-#ifndef bswap_32
-static av_always_inline av_const uint32_t bswap_32(uint32_t x)
-{
-    x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF);
-    x= (x>>16) | (x<<16);
-    return x;
-}
-#endif
-
-#ifndef bswap_64
-static inline uint64_t av_const bswap_64(uint64_t x)
-{
-#if 0
-    x= ((x<< 8)&0xFF00FF00FF00FF00ULL) | ((x>> 8)&0x00FF00FF00FF00FFULL);
-    x= ((x<<16)&0xFFFF0000FFFF0000ULL) | ((x>>16)&0x0000FFFF0000FFFFULL);
-    return (x>>32) | (x<<32);
-#else
-    union {
-        uint64_t ll;
-        uint32_t l[2];
-    } w, r;
-    w.ll = x;
-    r.l[0] = bswap_32 (w.l[1]);
-    r.l[1] = bswap_32 (w.l[0]);
-    return r.ll;
-#endif
-}
-#endif
-
-// be2me ... big-endian to machine-endian
-// le2me ... little-endian to machine-endian
-
-#if HAVE_BIGENDIAN
-#define be2me_16(x) (x)
-#define be2me_32(x) (x)
-#define be2me_64(x) (x)
-#define le2me_16(x) bswap_16(x)
-#define le2me_32(x) bswap_32(x)
-#define le2me_64(x) bswap_64(x)
-#else
-#define be2me_16(x) bswap_16(x)
-#define be2me_32(x) bswap_32(x)
-#define be2me_64(x) bswap_64(x)
-#define le2me_16(x) (x)
-#define le2me_32(x) (x)
-#define le2me_64(x) (x)
-#endif
-
-#endif /* AVUTIL_BSWAP_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/common.h
--- a/ffmpeg_smp/h264dec/libavutil/common.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,298 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * common internal and external API header
- */
-
-#ifndef AVUTIL_COMMON_H
-#define AVUTIL_COMMON_H
-
-#include <ctype.h>
-#include <errno.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "attributes.h"
-
-//rounded division & shift
-#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b))
-/* assume b>0 */
-#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))
-#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
-#define FFSIGN(a) ((a) > 0 ? 1 : -1)
-
-#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
-#define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c)
-#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
-#define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c)
-
-#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0)
-#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0]))
-#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1))
-
-/* misc math functions */
-extern const uint8_t ff_log2_tab[256];
-
-static inline av_const int av_log2_c(unsigned int v)
-{
-    int n = 0;
-    if (v & 0xffff0000) {
-        v >>= 16;
-        n += 16;
-    }
-    if (v & 0xff00) {
-        v >>= 8;
-        n += 8;
-    }
-    n += ff_log2_tab[v];
-
-    return n;
-}
-
-static inline av_const int av_log2_16bit_c(unsigned int v)
-{
-    int n = 0;
-    if (v & 0xff00) {
-        v >>= 8;
-        n += 8;
-    }
-    n += ff_log2_tab[v];
-
-    return n;
-}
-
-#ifdef HAVE_AV_CONFIG_H
-#   include "config.h"
-#endif
-
-/**
- * Clips a signed integer value into the amin-amax range.
- * @param a value to clip
- * @param amin minimum value of the clip range
- * @param amax maximum value of the clip range
- * @return clipped value
- */
-static inline av_const int av_clip(int a, int amin, int amax)
-{
-    if      (a < amin) return amin;
-    else if (a > amax) return amax;
-    else               return a;
-}
-
-/**
- * Clips a signed integer value into the 0-255 range.
- * @param a value to clip
- * @return clipped value
- */
-static inline av_const uint8_t av_clip_uint8(int a)
-{
-    if (a&(~0xFF)) return (-a)>>31;
-    else           return a;
-}
-
-/**
- * Clips a signed integer value into the 0-65535 range.
- * @param a value to clip
- * @return clipped value
- */
-static inline av_const uint16_t av_clip_uint16(int a)
-{
-    if (a&(~0xFFFF)) return (-a)>>31;
-    else             return a;
-}
-
-/**
- * Clips a signed integer value into the -32768,32767 range.
- * @param a value to clip
- * @return clipped value
- */
-static inline av_const int16_t av_clip_int16(int a)
-{
-    if ((a+0x8000) & ~0xFFFF) return (a>>31) ^ 0x7FFF;
-    else                      return a;
-}
-
-/**
- * Clips a signed 64-bit integer value into the -2147483648,2147483647 range.
- * @param a value to clip
- * @return clipped value
- */
-static inline av_const int32_t av_clipl_int32(int64_t a)
-{
-    if ((a+0x80000000u) & ~UINT64_C(0xFFFFFFFF)) return (a>>63) ^ 0x7FFFFFFF;
-    else                                         return a;
-}
-
-/**
- * Clips a float value into the amin-amax range.
- * @param a value to clip
- * @param amin minimum value of the clip range
- * @param amax maximum value of the clip range
- * @return clipped value
- */
-static inline av_const float av_clipf(float a, float amin, float amax)
-{
-    if      (a < amin) return amin;
-    else if (a > amax) return amax;
-    else               return a;
-}
-
-/** Computes ceil(log2(x)).
- * @param x value used to compute ceil(log2(x))
- * @return computed ceiling of log2(x)
- */
-static inline av_const int av_ceil_log2(int x)
-{
-    return av_log2_c((x - 1) << 1);
-}
-
-#define MKTAG(a,b,c,d) (a | (b << 8) | (c << 16) | (d << 24))
-#define MKBETAG(a,b,c,d) (d | (c << 8) | (b << 16) | (a << 24))
-
-/*!
- * \def GET_UTF8(val, GET_BYTE, ERROR)
- * Converts a UTF-8 character (up to 4 bytes long) to its 32-bit UCS-4 encoded form
- * \param val is the output and should be of type uint32_t. It holds the converted
- * UCS-4 character and should be a left value.
- * \param GET_BYTE gets UTF-8 encoded bytes from any proper source. It can be
- * a function or a statement whose return value or evaluated value is of type
- * uint8_t. It will be executed up to 4 times for values in the valid UTF-8 range,
- * and up to 7 times in the general case.
- * \param ERROR action that should be taken when an invalid UTF-8 byte is returned
- * from GET_BYTE. It should be a statement that jumps out of the macro,
- * like exit(), goto, return, break, or continue.
- */
-#define GET_UTF8(val, GET_BYTE, ERROR)\
-    val= GET_BYTE;\
-    {\
-        int ones= 7 - av_log2(val ^ 255);\
-        if(ones==1)\
-            ERROR\
-        val&= 127>>ones;\
-        while(--ones > 0){\
-            int tmp= GET_BYTE - 128;\
-            if(tmp>>6)\
-                ERROR\
-            val= (val<<6) + tmp;\
-        }\
-    }
-
-/*!
- * \def GET_UTF16(val, GET_16BIT, ERROR)
- * Converts a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form
- * \param val is the output and should be of type uint32_t. It holds the converted
- * UCS-4 character and should be a left value.
- * \param GET_16BIT gets two bytes of UTF-16 encoded data converted to native endianness.
- * It can be a function or a statement whose return value or evaluated value is of type
- * uint16_t. It will be executed up to 2 times.
- * \param ERROR action that should be taken when an invalid UTF-16 surrogate is
- * returned from GET_BYTE. It should be a statement that jumps out of the macro,
- * like exit(), goto, return, break, or continue.
- */
-#define GET_UTF16(val, GET_16BIT, ERROR)\
-    val = GET_16BIT;\
-    {\
-        unsigned int hi = val - 0xD800;\
-        if (hi < 0x800) {\
-            val = GET_16BIT - 0xDC00;\
-            if (val > 0x3FFU || hi > 0x3FFU)\
-                ERROR\
-            val += (hi<<10) + 0x10000;\
-        }\
-    }\
-
-/*!
- * \def PUT_UTF8(val, tmp, PUT_BYTE)
- * Converts a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long).
- * \param val is an input-only argument and should be of type uint32_t. It holds
- * a UCS-4 encoded Unicode character that is to be converted to UTF-8. If
- * val is given as a function it is executed only once.
- * \param tmp is a temporary variable and should be of type uint8_t. It
- * represents an intermediate value during conversion that is to be
- * output by PUT_BYTE.
- * \param PUT_BYTE writes the converted UTF-8 bytes to any proper destination.
- * It could be a function or a statement, and uses tmp as the input byte.
- * For example, PUT_BYTE could be "*output++ = tmp;" PUT_BYTE will be
- * executed up to 4 times for values in the valid UTF-8 range and up to
- * 7 times in the general case, depending on the length of the converted
- * Unicode character.
- */
-#define PUT_UTF8(val, tmp, PUT_BYTE)\
-    {\
-        int bytes, shift;\
-        uint32_t in = val;\
-        if (in < 0x80) {\
-            tmp = in;\
-            PUT_BYTE\
-        } else {\
-            bytes = (av_log2(in) + 4) / 5;\
-            shift = (bytes - 1) * 6;\
-            tmp = (256 - (256 >> bytes)) | (in >> shift);\
-            PUT_BYTE\
-            while (shift >= 6) {\
-                shift -= 6;\
-                tmp = 0x80 | ((in >> shift) & 0x3f);\
-                PUT_BYTE\
-            }\
-        }\
-    }
-
-/*!
- * \def PUT_UTF16(val, tmp, PUT_16BIT)
- * Converts a 32-bit Unicode character to its UTF-16 encoded form (2 or 4 bytes).
- * \param val is an input-only argument and should be of type uint32_t. It holds
- * a UCS-4 encoded Unicode character that is to be converted to UTF-16. If
- * val is given as a function it is executed only once.
- * \param tmp is a temporary variable and should be of type uint16_t. It
- * represents an intermediate value during conversion that is to be
- * output by PUT_16BIT.
- * \param PUT_16BIT writes the converted UTF-16 data to any proper destination
- * in desired endianness. It could be a function or a statement, and uses tmp
- * as the input byte.  For example, PUT_BYTE could be "*output++ = tmp;"
- * PUT_BYTE will be executed 1 or 2 times depending on input character.
- */
-#define PUT_UTF16(val, tmp, PUT_16BIT)\
-    {\
-        uint32_t in = val;\
-        if (in < 0x10000) {\
-            tmp = in;\
-            PUT_16BIT\
-        } else {\
-            tmp = 0xD800 | ((in - 0x10000) >> 10);\
-            PUT_16BIT\
-            tmp = 0xDC00 | ((in - 0x10000) & 0x3FF);\
-            PUT_16BIT\
-        }\
-    }\
-
-
-
-#include "mem.h"
-
-#ifdef HAVE_AV_CONFIG_H
-#    include "internal.h"
-#endif /* HAVE_AV_CONFIG_H */
-
-#endif /* AVUTIL_COMMON_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/error.h
--- a/ffmpeg_smp/h264dec/libavutil/error.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,53 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * error code definitions
- */
-
-#ifndef AVUTIL_ERROR_H
-#define AVUTIL_ERROR_H
-
-#include <errno.h>
-#include "common.h"
-
-/* error handling */
-#if EDOM > 0
-#define AVERROR(e) (-(e))   ///< Returns a negative error code from a POSIX error code, to return from library functions.
-#define AVUNERROR(e) (-(e)) ///< Returns a POSIX error code from a library function error return value.
-#else
-/* Some platforms have E* and errno already negated. */
-#define AVERROR(e) (e)
-#define AVUNERROR(e) (e)
-#endif
-
-#define AVERROR_EOF         AVERROR(EPIPE)   ///< End of file
-
-
-/**
- * Puts a description of the AVERROR code errnum in errbuf.
- * In case of failure the global variable errno is set to indicate the
- * error.
- *
- * @param errbuf_size the size in bytes of errbuf
- * @return 0 on success, a negative value otherwise
- */
-int av_strerror(int errnum, char *errbuf, size_t errbuf_size);
-
-#endif /* AVUTIL_ERROR_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/internal.h
--- a/ffmpeg_smp/h264dec/libavutil/internal.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,168 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * common internal API header
- */
-
-#ifndef AVUTIL_INTERNAL_H
-#define AVUTIL_INTERNAL_H
-
-#if !defined(DEBUG) && !defined(NDEBUG)
-#    define NDEBUG
-#endif
-
-#include <limits.h>
-#include <stdint.h>
-#include <stddef.h>
-#include <assert.h>
-#include "config.h"
-#include "attributes.h"
-#include "timer.h"
-
-
-
-#ifndef INT16_MIN
-#define INT16_MIN       (-0x7fff - 1)
-#endif
-
-#ifndef INT16_MAX
-#define INT16_MAX       0x7fff
-#endif
-
-#ifndef INT32_MIN
-#define INT32_MIN       (-0x7fffffff - 1)
-#endif
-
-#ifndef INT32_MAX
-#define INT32_MAX       0x7fffffff
-#endif
-
-#ifndef UINT32_MAX
-#define UINT32_MAX      0xffffffff
-#endif
-
-#ifndef INT64_MIN
-#define INT64_MIN       (-0x7fffffffffffffffLL - 1)
-#endif
-
-#ifndef INT64_MAX
-#define INT64_MAX INT64_C(9223372036854775807)
-#endif
-
-#ifndef UINT64_MAX
-#define UINT64_MAX UINT64_C(0xFFFFFFFFFFFFFFFF)
-#endif
-
-#ifndef INT_BIT
-#    define INT_BIT (CHAR_BIT * sizeof(int))
-#endif
-
-#ifndef offsetof
-#    define offsetof(T, F) ((unsigned int)((char *)&((T *)0)->F))
-#endif
-
-/* Use to export labels from asm. */
-#define LABEL_MANGLE(a) #a
-#define LOCAL_MANGLE(a) #a
-#define MANGLE(a) #a
-
-// Use rip-relative addressing if compiling PIC code on x86-64.
-// #if ARCH_X86_64 && defined(PIC)
-// #    define LOCAL_MANGLE(a) #a "(%%rip)"
-// #else
-// #    define LOCAL_MANGLE(a) #a
-// #endif
-// 
-// #define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
-
-/* debug stuff */
-
-/* dprintf macros */
-#ifdef DEBUG
-#    define dprintf(pctx, ...) av_log(pctx, AV_LOG_DEBUG, __VA_ARGS__)
-#else
-#    define dprintf(pctx, ...)
-#endif
-
-#define av_abort()      do { av_log(NULL, AV_LOG_ERROR, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0)
-
-/* math */
-
-
-/* avoid usage of dangerous/inappropriate system functions */
-// #undef  malloc
-// #define malloc please_use_av_malloc
-// #undef  free
-// #define free please_use_av_free
-#undef  realloc
-#define realloc please_use_av_realloc
-#undef  time
-#define time time_is_forbidden_due_to_security_issues
-#undef  rand
-#define rand rand_is_forbidden_due_to_state_trashing_use_av_lfg_get
-#undef  srand
-#define srand srand_is_forbidden_due_to_state_trashing_use_av_lfg_init
-#undef  random
-#define random random_is_forbidden_due_to_state_trashing_use_av_lfg_get
-#undef  sprintf
-#define sprintf sprintf_is_forbidden_due_to_security_issues_use_snprintf
-//#undef  exit
-//#define exit exit_is_forbidden
-#ifndef LIBAVFORMAT_BUILD
-
-#undef  puts
-#define puts please_use_av_log_instead_of_puts
-#undef  perror
-#define perror please_use_av_log_instead_of_perror
-#endif
-
-#define FF_ALLOC_OR_GOTO(p, size, label)\
-{\
-    p = av_malloc(size);\
-    if (p == NULL && (size) != 0) {\
-        av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\
-        goto label;\
-    }\
-}
-
-#define FF_ALLOCZ_OR_GOTO(p, size, label)\
-{\
-    p = av_mallocz(size);\
-    if (p == NULL && (size) != 0) {\
-        av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\
-        goto label;\
-    }\
-}
-
-
-/**
- * Returns NULL if CONFIG_SMALL is true, otherwise the argument
- * without modification. Used to disable the definition of strings
- * (for example AVCodec long_names).
- */
-#if CONFIG_SMALL
-#   define NULL_IF_CONFIG_SMALL(x) NULL
-#else
-#   define NULL_IF_CONFIG_SMALL(x) x
-#endif
-
-#endif /* AVUTIL_INTERNAL_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/intreadwrite.h
--- a/ffmpeg_smp/h264dec/libavutil/intreadwrite.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,498 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_INTREADWRITE_H
-#define AVUTIL_INTREADWRITE_H
-
-#include <stdint.h>
-#include "config.h"
-#include "bswap.h"
-#include "common.h"
-
-typedef union {
-    uint64_t u64;
-    uint32_t u32[2];
-    uint16_t u16[4];
-    uint8_t  u8 [8];
-    double   f64;
-    float    f32[2];
-} __attribute__((__may_alias__)) av_alias64;
-
-typedef union {
-    uint32_t u32;
-    uint16_t u16[2];
-    uint8_t  u8 [4];
-    float    f32;
-} __attribute__((__may_alias__)) av_alias32;
-
-typedef union {
-    uint16_t u16;
-    uint8_t  u8 [2];
-} __attribute__((__may_alias__)) av_alias16  ;
-
-/*
- * Arch-specific headers can provide any combination of
- * AV_[RW][BLN](16|24|32|64) and AV_(COPY|SWAP|ZERO)(64|128) macros.
- * Preprocessor symbols must be defined, even if these are implemented
- * as inline functions.
- */
-
-#if   ARCH_ARM
-#   include "arm/intreadwrite.h"
-#elif ARCH_PPC
-#   include "ppc/intreadwrite.h"
-#elif ARCH_X86
-#   include "x86/intreadwrite.h"
-#endif
-
-/*
- * Map AV_RNXX <-> AV_R[BL]XX for all variants provided by per-arch headers.
- */
-
-#if HAVE_BIGENDIAN
-
-#   if    defined(AV_RN16) && !defined(AV_RB16)
-#       define AV_RB16(p) AV_RN16(p)
-#   elif !defined(AV_RN16) &&  defined(AV_RB16)
-#       define AV_RN16(p) AV_RB16(p)
-#   endif
-
-#   if    defined(AV_WN16) && !defined(AV_WB16)
-#       define AV_WB16(p, v) AV_WN16(p, v)
-#   elif !defined(AV_WN16) &&  defined(AV_WB16)
-#       define AV_WN16(p, v) AV_WB16(p, v)
-#   endif
-
-#   if    defined(AV_RN24) && !defined(AV_RB24)
-#       define AV_RB24(p) AV_RN24(p)
-#   elif !defined(AV_RN24) &&  defined(AV_RB24)
-#       define AV_RN24(p) AV_RB24(p)
-#   endif
-
-#   if    defined(AV_WN24) && !defined(AV_WB24)
-#       define AV_WB24(p, v) AV_WN24(p, v)
-#   elif !defined(AV_WN24) &&  defined(AV_WB24)
-#       define AV_WN24(p, v) AV_WB24(p, v)
-#   endif
-
-#   if    defined(AV_RN32) && !defined(AV_RB32)
-#       define AV_RB32(p) AV_RN32(p)
-#   elif !defined(AV_RN32) &&  defined(AV_RB32)
-#       define AV_RN32(p) AV_RB32(p)
-#   endif
-
-#   if    defined(AV_WN32) && !defined(AV_WB32)
-#       define AV_WB32(p, v) AV_WN32(p, v)
-#   elif !defined(AV_WN32) &&  defined(AV_WB32)
-#       define AV_WN32(p, v) AV_WB32(p, v)
-#   endif
-
-#   if    defined(AV_RN64) && !defined(AV_RB64)
-#       define AV_RB64(p) AV_RN64(p)
-#   elif !defined(AV_RN64) &&  defined(AV_RB64)
-#       define AV_RN64(p) AV_RB64(p)
-#   endif
-
-#   if    defined(AV_WN64) && !defined(AV_WB64)
-#       define AV_WB64(p, v) AV_WN64(p, v)
-#   elif !defined(AV_WN64) &&  defined(AV_WB64)
-#       define AV_WN64(p, v) AV_WB64(p, v)
-#   endif
-
-#else /* HAVE_BIGENDIAN */
-
-#   if    defined(AV_RN16) && !defined(AV_RL16)
-#       define AV_RL16(p) AV_RN16(p)
-#   elif !defined(AV_RN16) &&  defined(AV_RL16)
-#       define AV_RN16(p) AV_RL16(p)
-#   endif
-
-#   if    defined(AV_WN16) && !defined(AV_WL16)
-#       define AV_WL16(p, v) AV_WN16(p, v)
-#   elif !defined(AV_WN16) &&  defined(AV_WL16)
-#       define AV_WN16(p, v) AV_WL16(p, v)
-#   endif
-
-#   if    defined(AV_RN24) && !defined(AV_RL24)
-#       define AV_RL24(p) AV_RN24(p)
-#   elif !defined(AV_RN24) &&  defined(AV_RL24)
-#       define AV_RN24(p) AV_RL24(p)
-#   endif
-
-#   if    defined(AV_WN24) && !defined(AV_WL24)
-#       define AV_WL24(p, v) AV_WN24(p, v)
-#   elif !defined(AV_WN24) &&  defined(AV_WL24)
-#       define AV_WN24(p, v) AV_WL24(p, v)
-#   endif
-
-#   if    defined(AV_RN32) && !defined(AV_RL32)
-#       define AV_RL32(p) AV_RN32(p)
-#   elif !defined(AV_RN32) &&  defined(AV_RL32)
-#       define AV_RN32(p) AV_RL32(p)
-#   endif
-
-#   if    defined(AV_WN32) && !defined(AV_WL32)
-#       define AV_WL32(p, v) AV_WN32(p, v)
-#   elif !defined(AV_WN32) &&  defined(AV_WL32)
-#       define AV_WN32(p, v) AV_WL32(p, v)
-#   endif
-
-#   if    defined(AV_RN64) && !defined(AV_RL64)
-#       define AV_RL64(p) AV_RN64(p)
-#   elif !defined(AV_RN64) &&  defined(AV_RL64)
-#       define AV_RN64(p) AV_RL64(p)
-#   endif
-
-#   if    defined(AV_WN64) && !defined(AV_WL64)
-#       define AV_WL64(p, v) AV_WN64(p, v)
-#   elif !defined(AV_WN64) &&  defined(AV_WL64)
-#       define AV_WN64(p, v) AV_WL64(p, v)
-#   endif
-
-#endif /* !HAVE_BIGENDIAN */
-
-/*
- * Define AV_[RW]N helper macros to simplify definitions not provided
- * by per-arch headers.
- */
-
-
-
-#if defined(__DECC)
-
-#   define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p)))
-#   define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v))
-
-#else
-
-#ifndef AV_RB16
-#   define AV_RB16(x)                           \
-    ((((const uint8_t*)(x))[0] << 8) |          \
-      ((const uint8_t*)(x))[1])
-#endif
-#ifndef AV_WB16
-#   define AV_WB16(p, d) do {                   \
-        ((uint8_t*)(p))[1] = (d);               \
-        ((uint8_t*)(p))[0] = (d)>>8;            \
-    } while(0)
-#endif
-
-#ifndef AV_RL16
-#   define AV_RL16(x)                           \
-    ((((const uint8_t*)(x))[1] << 8) |          \
-      ((const uint8_t*)(x))[0])
-#endif
-#ifndef AV_WL16
-#   define AV_WL16(p, d) do {                   \
-        ((uint8_t*)(p))[0] = (d);               \
-        ((uint8_t*)(p))[1] = (d)>>8;            \
-    } while(0)
-#endif
-
-#ifndef AV_RB32
-#   define AV_RB32(x)                           \
-    ((((const uint8_t*)(x))[0] << 24) |         \
-     (((const uint8_t*)(x))[1] << 16) |         \
-     (((const uint8_t*)(x))[2] <<  8) |         \
-      ((const uint8_t*)(x))[3])
-#endif
-#ifndef AV_WB32
-#   define AV_WB32(p, d) do {                   \
-        ((uint8_t*)(p))[3] = (d);               \
-        ((uint8_t*)(p))[2] = (d)>>8;            \
-        ((uint8_t*)(p))[1] = (d)>>16;           \
-        ((uint8_t*)(p))[0] = (d)>>24;           \
-    } while(0)
-#endif
-
-#ifndef AV_RL32
-#   define AV_RL32(x)                           \
-    ((((const uint8_t*)(x))[3] << 24) |         \
-     (((const uint8_t*)(x))[2] << 16) |         \
-     (((const uint8_t*)(x))[1] <<  8) |         \
-      ((const uint8_t*)(x))[0])
-#endif
-#ifndef AV_WL32
-#   define AV_WL32(p, d) do {                   \
-        ((uint8_t*)(p))[0] = (d);               \
-        ((uint8_t*)(p))[1] = (d)>>8;            \
-        ((uint8_t*)(p))[2] = (d)>>16;           \
-        ((uint8_t*)(p))[3] = (d)>>24;           \
-    } while(0)
-#endif
-
-#ifndef AV_RB64
-#   define AV_RB64(x)                                   \
-    (((uint64_t)((const uint8_t*)(x))[0] << 56) |       \
-     ((uint64_t)((const uint8_t*)(x))[1] << 48) |       \
-     ((uint64_t)((const uint8_t*)(x))[2] << 40) |       \
-     ((uint64_t)((const uint8_t*)(x))[3] << 32) |       \
-     ((uint64_t)((const uint8_t*)(x))[4] << 24) |       \
-     ((uint64_t)((const uint8_t*)(x))[5] << 16) |       \
-     ((uint64_t)((const uint8_t*)(x))[6] <<  8) |       \
-      (uint64_t)((const uint8_t*)(x))[7])
-#endif
-#ifndef AV_WB64
-#   define AV_WB64(p, d) do {                   \
-        ((uint8_t*)(p))[7] = (d);               \
-        ((uint8_t*)(p))[6] = (d)>>8;            \
-        ((uint8_t*)(p))[5] = (d)>>16;           \
-        ((uint8_t*)(p))[4] = (d)>>24;           \
-        ((uint8_t*)(p))[3] = (d)>>32;           \
-        ((uint8_t*)(p))[2] = (d)>>40;           \
-        ((uint8_t*)(p))[1] = (d)>>48;           \
-        ((uint8_t*)(p))[0] = (d)>>56;           \
-    } while(0)
-#endif
-
-#ifndef AV_RL64
-#   define AV_RL64(x)                                   \
-    (((uint64_t)((const uint8_t*)(x))[7] << 56) |       \
-     ((uint64_t)((const uint8_t*)(x))[6] << 48) |       \
-     ((uint64_t)((const uint8_t*)(x))[5] << 40) |       \
-     ((uint64_t)((const uint8_t*)(x))[4] << 32) |       \
-     ((uint64_t)((const uint8_t*)(x))[3] << 24) |       \
-     ((uint64_t)((const uint8_t*)(x))[2] << 16) |       \
-     ((uint64_t)((const uint8_t*)(x))[1] <<  8) |       \
-      (uint64_t)((const uint8_t*)(x))[0])
-#endif
-#ifndef AV_WL64
-#   define AV_WL64(p, d) do {                   \
-        ((uint8_t*)(p))[0] = (d);               \
-        ((uint8_t*)(p))[1] = (d)>>8;            \
-        ((uint8_t*)(p))[2] = (d)>>16;           \
-        ((uint8_t*)(p))[3] = (d)>>24;           \
-        ((uint8_t*)(p))[4] = (d)>>32;           \
-        ((uint8_t*)(p))[5] = (d)>>40;           \
-        ((uint8_t*)(p))[6] = (d)>>48;           \
-        ((uint8_t*)(p))[7] = (d)>>56;           \
-    } while(0)
-#endif
-
-#if HAVE_BIGENDIAN
-#   define AV_RN(s, p)    AV_RB##s(p)
-#   define AV_WN(s, p, v) AV_WB##s(p, v)
-#else
-#   define AV_RN(s, p)    AV_RL##s(p)
-#   define AV_WN(s, p, v) AV_WL##s(p, v)
-#endif
-
-#endif /* HAVE_FAST_UNALIGNED */
-
-#ifndef AV_RN16
-#   define AV_RN16(p) AV_RN(16, p)
-#endif
-
-#ifndef AV_RN32
-#   define AV_RN32(p) AV_RN(32, p)
-#endif
-
-#ifndef AV_RN64
-#   define AV_RN64(p) AV_RN(64, p)
-#endif
-
-#ifndef AV_WN16
-#   define AV_WN16(p, v) AV_WN(16, p, v)
-#endif
-
-#ifndef AV_WN32
-#   define AV_WN32(p, v) AV_WN(32, p, v)
-#endif
-
-#ifndef AV_WN64
-#   define AV_WN64(p, v) AV_WN(64, p, v)
-#endif
-
-#if HAVE_BIGENDIAN
-#   define AV_RB(s, p)    AV_RN##s(p)
-#   define AV_WB(s, p, v) AV_WN##s(p, v)
-#   define AV_RL(s, p)    bswap_##s(AV_RN##s(p))
-#   define AV_WL(s, p, v) AV_WN##s(p, bswap_##s(v))
-#else
-#   define AV_RB(s, p)    bswap_##s(AV_RN##s(p))
-#   define AV_WB(s, p, v) AV_WN##s(p, bswap_##s(v))
-#   define AV_RL(s, p)    AV_RN##s(p)
-#   define AV_WL(s, p, v) AV_WN##s(p, v)
-#endif
-
-#define AV_RB8(x)     (((const uint8_t*)(x))[0])
-#define AV_WB8(p, d)  do { ((uint8_t*)(p))[0] = (d); } while(0)
-
-#define AV_RL8(x)     AV_RB8(x)
-#define AV_WL8(p, d)  AV_WB8(p, d)
-
-#ifndef AV_RB16
-#   define AV_RB16(p)    AV_RB(16, p)
-#endif
-#ifndef AV_WB16
-#   define AV_WB16(p, v) AV_WB(16, p, v)
-#endif
-
-#ifndef AV_RL16
-#   define AV_RL16(p)    AV_RL(16, p)
-#endif
-#ifndef AV_WL16
-#   define AV_WL16(p, v) AV_WL(16, p, v)
-#endif
-
-#ifndef AV_RB32
-#   define AV_RB32(p)    AV_RB(32, p)
-#endif
-#ifndef AV_WB32
-#   define AV_WB32(p, v) AV_WB(32, p, v)
-#endif
-
-#ifndef AV_RL32
-#   define AV_RL32(p)    AV_RL(32, p)
-#endif
-#ifndef AV_WL32
-#   define AV_WL32(p, v) AV_WL(32, p, v)
-#endif
-
-#ifndef AV_RB64
-#   define AV_RB64(p)    AV_RB(64, p)
-#endif
-#ifndef AV_WB64
-#   define AV_WB64(p, v) AV_WB(64, p, v)
-#endif
-
-#ifndef AV_RL64
-#   define AV_RL64(p)    AV_RL(64, p)
-#endif
-#ifndef AV_WL64
-#   define AV_WL64(p, v) AV_WL(64, p, v)
-#endif
-
-#ifndef AV_RB24
-#   define AV_RB24(x)                           \
-    ((((const uint8_t*)(x))[0] << 16) |         \
-     (((const uint8_t*)(x))[1] <<  8) |         \
-      ((const uint8_t*)(x))[2])
-#endif
-#ifndef AV_WB24
-#   define AV_WB24(p, d) do {                   \
-        ((uint8_t*)(p))[2] = (d);               \
-        ((uint8_t*)(p))[1] = (d)>>8;            \
-        ((uint8_t*)(p))[0] = (d)>>16;           \
-    } while(0)
-#endif
-
-#ifndef AV_RL24
-#   define AV_RL24(x)                           \
-    ((((const uint8_t*)(x))[2] << 16) |         \
-     (((const uint8_t*)(x))[1] <<  8) |         \
-      ((const uint8_t*)(x))[0])
-#endif
-#ifndef AV_WL24
-#   define AV_WL24(p, d) do {                   \
-        ((uint8_t*)(p))[0] = (d);               \
-        ((uint8_t*)(p))[1] = (d)>>8;            \
-        ((uint8_t*)(p))[2] = (d)>>16;           \
-    } while(0)
-#endif
-
-/*
- * The AV_[RW]NA macros access naturally aligned data
- * in a type-safe way.
- */
-
-#define AV_RNA(s, p)    (((const av_alias##s*)(p))->u##s)
-#define AV_WNA(s, p, v) (((av_alias##s*)(p))->u##s = (v))
-
-#ifndef AV_RN16A
-#   define AV_RN16A(p) AV_RNA(16, p)
-#endif
-
-#ifndef AV_RN32A
-#   define AV_RN32A(p) AV_RNA(32, p)
-#endif
-
-#ifndef AV_RN64A
-#   define AV_RN64A(p) AV_RNA(64, p)
-#endif
-
-#ifndef AV_WN16A
-#   define AV_WN16A(p, v) AV_WNA(16, p, v)
-#endif
-
-#ifndef AV_WN32A
-#   define AV_WN32A(p, v) AV_WNA(32, p, v)
-#endif
-
-#ifndef AV_WN64A
-#   define AV_WN64A(p, v) AV_WNA(64, p, v)
-#endif
-
-/* Parameters for AV_COPY*, AV_SWAP*, AV_ZERO* must be
- * naturally aligned. They may be implemented using MMX,
- * so emms_c() must be called before using any float code
- * afterwards.
- */
-
-#define AV_COPY(n, d, s) \
-    (((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n)
-
-#ifndef AV_COPY16
-#   define AV_COPY16(d, s) AV_COPY(16, d, s)
-#endif
-
-#ifndef AV_COPY32
-#   define AV_COPY32(d, s) AV_COPY(32, d, s)
-#endif
-
-#ifndef AV_COPY64
-#   define AV_COPY64(d, s) AV_COPY(64, d, s)
-#endif
-
-#ifndef AV_COPY128
-#   define AV_COPY128(d, s)                    \
-    do {                                       \
-        AV_COPY64(d, s);                       \
-        AV_COPY64((char*)(d)+8, (char*)(s)+8); \
-    } while(0)
-#endif
-
-#define AV_SWAP(n, a, b) FFSWAP(av_alias##n, *(av_alias##n*)(a), *(av_alias##n*)(b))
-
-#ifndef AV_SWAP64
-#   define AV_SWAP64(a, b) AV_SWAP(64, a, b)
-#endif
-
-#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0)
-
-#ifndef AV_ZERO16
-#   define AV_ZERO16(d) AV_ZERO(16, d)
-#endif
-
-#ifndef AV_ZERO32
-#   define AV_ZERO32(d) AV_ZERO(32, d)
-#endif
-
-#ifndef AV_ZERO64
-#   define AV_ZERO64(d) AV_ZERO(64, d)
-#endif
-
-#ifndef AV_ZERO128
-#   define AV_ZERO128(d)         \
-    do {                         \
-        AV_ZERO64(d);            \
-        AV_ZERO64((char*)(d)+8); \
-    } while(0)
-#endif
-
-#endif /* AVUTIL_INTREADWRITE_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/log.c
--- a/ffmpeg_smp/h264dec/libavutil/log.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,111 +0,0 @@
-/*
- * log functions
- * Copyright (c) 2003 Michel Bardiaux
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * logging functions
- */
-#include "error.h"
-#include <unistd.h>
-#include <stdlib.h>
-#include "log.h"
-
-
-static int av_log_level = AV_LOG_INFO;
-
-static int use_ansi_color=-1;
-
-#undef fprintf
-static void colored_fputs(int color, const char *str){
-    if(use_ansi_color<0){
-#if HAVE_ISATTY && !defined(_WIN32)
-        use_ansi_color= getenv("TERM") && !getenv("NO_COLOR") && isatty(2);
-#else
-        use_ansi_color= 0;
-#endif
-    }
-
-    if(use_ansi_color){
-        fprintf(stderr, "\033[%d;3%dm", color>>4, color&15);
-    }
-    fputs(str, stderr);
-    if(use_ansi_color){
-        fprintf(stderr, "\033[0m");
-    }
-}
-
-void av_log_default_callback(int level, const char* fmt, va_list vl)
-{
-    static int print_prefix=1;
-    static int count;
-    static char line[1024], prev[1024];
-    static const uint8_t color[]={0x41,0x41,0x11,0x03,9,9,9};
-
-    if(level>av_log_level)
-        return;
-#undef fprintf
-
-    line[0]=0;
-
-    vsnprintf(line + strlen(line), sizeof(line) - strlen(line), fmt, vl);
-
-    print_prefix= line[strlen(line)-1] == '\n';
-    if(print_prefix && !strcmp(line, prev)){
-        count++;
-        return;
-    }
-    if(count>0){
-        fprintf(stderr, "    Last message repeated %d times\n", count);
-        count=0;
-    }
-    colored_fputs(color[av_clip(level>>3, 0, 6)], line);
-    strcpy(prev, line);
-}
-
-static void (*av_log_callback)(int, const char*, va_list) = av_log_default_callback;
-
-void av_log(int level, const char *fmt, ...)
-{
-    va_list vl;
-    va_start(vl, fmt);
-    av_vlog(level, fmt, vl);
-    va_end(vl);
-}
-
-void av_vlog(int level, const char *fmt, va_list vl)
-{
-    av_log_callback(level, fmt, vl);
-}
-
-int av_log_get_level(void)
-{
-    return av_log_level;
-}
-
-void av_log_set_level(int level)
-{
-    av_log_level = level;
-}
-
-void av_log_set_callback(void (*callback)(int, const char*, va_list))
-{
-    av_log_callback = callback;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/log.h
--- a/ffmpeg_smp/h264dec/libavutil/log.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,120 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_LOG_H
-#define AVUTIL_LOG_H
-
-#include <stdarg.h>
-//#include "avutil.h"
-
-/**
- * Describes the class of an AVClass context structure. That is an
- * arbitrary struct of which the first field is a pointer to an
- * AVClass struct (e.g. AVCodecContext, AVFormatContext etc.).
- */
-typedef struct {
-    /**
-     * The name of the class; usually it is the same name as the
-     * context structure type to which the AVClass is associated.
-     */
-    const char* class_name;
-
-    /**
-     * A pointer to a function which returns the name of a context
-     * instance ctx associated with the class.
-     */
-    const char* (*item_name)(void* ctx);
-
-    /**
-     * a pointer to the first option specified in the class if any or NULL
-     *
-     * @see av_set_default_options()
-     */
-    const struct AVOption *option;
-
-    /**
-     * LIBAVUTIL_VERSION with which this structure was created.
-     * This is used to allow fields to be added without requiring major
-     * version bumps everywhere.
-     */
-
-    int version;
-} AVClass;
-
-/* av_log API */
-
-#define AV_LOG_QUIET    -8
-
-/**
- * Something went really wrong and we will crash now.
- */
-#define AV_LOG_PANIC     0
-
-/**
- * Something went wrong and recovery is not possible.
- * For example, no header was found for a format which depends
- * on headers or an illegal combination of parameters is used.
- */
-#define AV_LOG_FATAL     8
-
-/**
- * Something went wrong and cannot losslessly be recovered.
- * However, not all future data is affected.
- */
-#define AV_LOG_ERROR    16
-
-/**
- * Something somehow does not look correct. This may or may not
- * lead to problems. An example would be the use of '-vstrict -2'.
- */
-#define AV_LOG_WARNING  24
-
-#define AV_LOG_INFO     32
-#define AV_LOG_VERBOSE  40
-
-/**
- * Stuff which is only useful for libav* developers.
- */
-#define AV_LOG_DEBUG    48
-
-/**
- * Sends the specified message to the log if the level is less than or equal
- * to the current av_log_level. By default, all logging messages are sent to
- * stderr. This behavior can be altered by setting a different av_vlog callback
- * function.
- *
- * @param avcl A pointer to an arbitrary struct of which the first field is a
- * pointer to an AVClass struct.
- * @param level The importance level of the message, lower values signifying
- * higher importance.
- * @param fmt The format string (printf-compatible) that specifies how
- * subsequent arguments are converted to output.
- * @see av_vlog
- */
-
-void av_log(int level, const char *fmt, ...);
-
-void av_vlog(int level, const char *fmt, va_list);
-int av_log_get_level(void);
-void av_log_set_level(int);
-void av_log_set_callback(void (*)(int, const char*, va_list));
-void av_log_default_callback(int level, const char* fmt, va_list vl);
-
-#endif /* AVUTIL_LOG_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/mem.c
--- a/ffmpeg_smp/h264dec/libavutil/mem.c	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,127 +0,0 @@
-/*
- * default memory allocator for libavutil
- * Copyright (c) 2002 Fabrice Bellard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * default memory allocator for libavutil
- */
-
-#include "config.h"
-
-#include <limits.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#if HAVE_MALLOC_H
-#include <malloc.h>
-#endif
-
-#include "mem.h"
-
-/* here we can use OS-dependent allocation functions */
-#undef free
-#undef malloc
-#undef realloc
-
-#ifdef MALLOC_PREFIX
-
-#define malloc         AV_JOIN(MALLOC_PREFIX, malloc)
-#define memalign       AV_JOIN(MALLOC_PREFIX, memalign)
-#define posix_memalign AV_JOIN(MALLOC_PREFIX, posix_memalign)
-#define realloc        AV_JOIN(MALLOC_PREFIX, realloc)
-#define free           AV_JOIN(MALLOC_PREFIX, free)
-
-void *malloc(size_t size);
-void *memalign(size_t align, size_t size);
-int   posix_memalign(void **ptr, size_t align, size_t size);
-void *realloc(void *ptr, size_t size);
-void  free(void *ptr);
-
-#endif /* MALLOC_PREFIX */
-
-
-/* You can redefine av_malloc and av_free in your project to use your
-   memory allocator. You do not need to suppress this file because the
-   linker will do it automatically. */
-
-void *av_malloc(unsigned int size)
-{
-    void *ptr = NULL;
-    /* let's disallow possible ambiguous cases */
-    if(size > (INT_MAX-16) )
-        return NULL;
-
-//FIXME: when no aligned mallocs vector code should be disabled.
-#if HAVE_POSIX_MEMALIGN
-    if (posix_memalign(&ptr,16,size))
-        ptr = NULL;
-#elif HAVE_MEMALIGN
-    ptr = memalign(16,size);
-#else
-    ptr = malloc(size);
-#endif
-    return ptr;
-}
-
-void *av_realloc(void *ptr, unsigned int size)
-{
-    /* let's disallow possible ambiguous cases */
-    if(size > (INT_MAX-16) )
-        return NULL;
-
-    return realloc(ptr, size);
-
-}
-
-void av_free(void *ptr)
-{
-    /* XXX: this test should not be needed on most libcs */
-    if (ptr)
-        free(ptr);
-
-}
-
-void av_freep(void *arg)
-{
-    void **ptr= (void**)arg;
-    av_free(*ptr);
-    *ptr = NULL;
-}
-
-void *av_mallocz(unsigned int size)
-{
-    void *ptr = av_malloc(size);
-    if (ptr)
-        memset(ptr, 0, size);
-    return ptr;
-}
-
-char *av_strdup(const char *s)
-{
-    char *ptr= NULL;
-    if(s){
-        int len = strlen(s) + 1;
-        ptr = av_malloc(len);
-        if (ptr)
-            memcpy(ptr, s, len);
-    }
-    return ptr;
-}
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/mem.h
--- a/ffmpeg_smp/h264dec/libavutil/mem.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,143 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * memory handling functions
- */
-
-#ifndef AVUTIL_MEM_H
-#define AVUTIL_MEM_H
-
-#include "attributes.h"
-#include "config.h"
-
-#define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v
-#define DECLARE_ALIGNED_16(t,v)      t __attribute__ ((aligned (16))) v
-#define DECLARE_ASM_CONST(n,t,v)    static const t __attribute__((used)) __attribute__ ((aligned (n))) v
-
-#if AV_GCC_VERSION_AT_LEAST(3,1)
-    #define av_malloc_attrib __attribute__((__malloc__))
-#else
-    #define av_malloc_attrib
-#endif
-
-/**
- * Allocates a block of size bytes with alignment suitable for all
- * memory accesses (including vectors if available on the CPU).
- * @param size Size in bytes for the memory block to be allocated.
- * @return Pointer to the allocated block, NULL if the block cannot
- * be allocated.
- * @see av_mallocz()
- */
-void *av_malloc(unsigned int size) av_malloc_attrib;
-
-/**
- * Allocates or reallocates a block of memory.
- * If ptr is NULL and size > 0, allocates a new block. If
- * size is zero, frees the memory block pointed to by ptr.
- * @param size Size in bytes for the memory block to be allocated or
- * reallocated.
- * @param ptr Pointer to a memory block already allocated with
- * av_malloc(z)() or av_realloc() or NULL.
- * @return Pointer to a newly reallocated block or NULL if the block
- * cannot be reallocated or the function is used to free the memory block.
- * @see av_fast_realloc()
- */
-void *av_realloc(void *ptr, unsigned int size);
-
-/**
- * Reallocates the given block if it is not large enough, otherwise it
- * does nothing.
- *
- * @see av_realloc
- */
-void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size);
-
-/**
- * Allocates a buffer, reusing the given one if large enough.
- *
- * Contrary to av_fast_realloc the current buffer contents might not be
- * preserved and on error the old buffer is freed, thus no special
- * handling to avoid memleaks is necessary.
- *
- * @param ptr pointer to pointer to already allocated buffer, overwritten with pointer to new buffer
- * @param size size of the buffer *ptr points to
- * @param min_size minimum size of *ptr buffer after returning, *ptr will be NULL and
- *                 *size 0 if an error occurred.
- */
-void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size);
-
-/**
- * Frees a memory block which has been allocated with av_malloc(z)() or
- * av_realloc().
- * @param ptr Pointer to the memory block which should be freed.
- * @note ptr = NULL is explicitly allowed.
- * @note It is recommended that you use av_freep() instead.
- * @see av_freep()
- */
-
-void av_free(void *ptr);
-
-/**
- * Allocates a block of size bytes with alignment suitable for all
- * memory accesses (including vectors if available on the CPU) and
- * zeroes all the bytes of the block.
- * @param size Size in bytes for the memory block to be allocated.
- * @return Pointer to the allocated block, NULL if it cannot be allocated.
- * @see av_malloc()
- */
-void *av_mallocz(unsigned int size) av_malloc_attrib;
-
-/**
- * Duplicates the string s.
- * @param s string to be duplicated
- * @return Pointer to a newly allocated string containing a
- * copy of s or NULL if the string cannot be allocated.
- */
-char *av_strdup(const char *s) av_malloc_attrib;
-
-/**
- * Frees a memory block which has been allocated with av_malloc(z)() or
- * av_realloc() and set the pointer pointing to it to NULL.
- * @param ptr Pointer to the pointer to the memory block which should
- * be freed.
- * @see av_free()
- */
-void av_freep(void *ptr);
-
-
-static av_always_inline uint32_t pack16to32(int a, int b){
-#if HAVE_BIGENDIAN
-   return (b&0xFFFF) + (a<<16);
-#else
-   return (a&0xFFFF) + (b<<16);
-#endif
-}
-
-static av_always_inline uint16_t pack8to16(int a, int b){
-#if HAVE_BIGENDIAN
-   return (b&0xFF) + (a<<8);
-#else
-   return (a&0xFF) + (b<<8);
-#endif
-}
-
-#endif /* AVUTIL_MEM_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/pixfmt.h
--- a/ffmpeg_smp/h264dec/libavutil/pixfmt.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,161 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_PIXFMT_H
-#define AVUTIL_PIXFMT_H
-
-/**
- * @file
- * pixel format definitions
- *
- * @warning This file has to be considered an internal but installed
- * header, so it should not be directly included in your projects.
- */
-
-/**
- * Pixel format. Notes:
- *
- * PIX_FMT_RGB32 is handled in an endian-specific manner. An RGBA
- * color is put together as:
- *  (A << 24) | (R << 16) | (G << 8) | B
- * This is stored as BGRA on little-endian CPU architectures and ARGB on
- * big-endian CPUs.
- *
- * When the pixel format is palettized RGB (PIX_FMT_PAL8), the palettized
- * image data is stored in AVFrame.data[0]. The palette is transported in
- * AVFrame.data[1], is 1024 bytes long (256 4-byte entries) and is
- * formatted the same as in PIX_FMT_RGB32 described above (i.e., it is
- * also endian-specific). Note also that the individual RGB palette
- * components stored in AVFrame.data[1] should be in the range 0..255.
- * This is important as many custom PAL8 video codecs that were designed
- * to run on the IBM VGA graphics adapter use 6-bit palette components.
- *
- * For all the 8bit per pixel formats, an RGB32 palette is in data[1] like
- * for pal8. This palette is filled in automatically by the function
- * allocating the picture.
- *
- * Note, make sure that all newly added big endian formats have pix_fmt&1==1
- *       and that all newly added little endian formats have pix_fmt&1==0
- *       this allows simpler detection of big vs little endian.
- */
-enum PixelFormat {
-    PIX_FMT_NONE= -1,
-    PIX_FMT_YUV420P,   ///< planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
-    PIX_FMT_YUYV422,   ///< packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
-    PIX_FMT_RGB24,     ///< packed RGB 8:8:8, 24bpp, RGBRGB...
-    PIX_FMT_BGR24,     ///< packed RGB 8:8:8, 24bpp, BGRBGR...
-    PIX_FMT_YUV422P,   ///< planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
-    PIX_FMT_YUV444P,   ///< planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples)
-    PIX_FMT_YUV410P,   ///< planar YUV 4:1:0,  9bpp, (1 Cr & Cb sample per 4x4 Y samples)
-    PIX_FMT_YUV411P,   ///< planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples)
-    PIX_FMT_GRAY8,     ///<        Y        ,  8bpp
-    PIX_FMT_MONOWHITE, ///<        Y        ,  1bpp, 0 is white, 1 is black
-    PIX_FMT_MONOBLACK, ///<        Y        ,  1bpp, 0 is black, 1 is white
-    PIX_FMT_PAL8,      ///< 8 bit with PIX_FMT_RGB32 palette
-    PIX_FMT_YUVJ420P,  ///< planar YUV 4:2:0, 12bpp, full scale (JPEG)
-    PIX_FMT_YUVJ422P,  ///< planar YUV 4:2:2, 16bpp, full scale (JPEG)
-    PIX_FMT_YUVJ444P,  ///< planar YUV 4:4:4, 24bpp, full scale (JPEG)
-    PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing
-    PIX_FMT_XVMC_MPEG2_IDCT,
-    PIX_FMT_UYVY422,   ///< packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
-    PIX_FMT_UYYVYY411, ///< packed YUV 4:1:1, 12bpp, Cb Y0 Y1 Cr Y2 Y3
-    PIX_FMT_BGR8,      ///< packed RGB 3:3:2,  8bpp, (msb)2B 3G 3R(lsb)
-    PIX_FMT_BGR4,      ///< packed RGB 1:2:1,  4bpp, (msb)1B 2G 1R(lsb)
-    PIX_FMT_BGR4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1B 2G 1R(lsb)
-    PIX_FMT_RGB8,      ///< packed RGB 3:3:2,  8bpp, (msb)2R 3G 3B(lsb)
-    PIX_FMT_RGB4,      ///< packed RGB 1:2:1,  4bpp, (msb)1R 2G 1B(lsb)
-    PIX_FMT_RGB4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1R 2G 1B(lsb)
-    PIX_FMT_NV12,      ///< planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 for UV
-    PIX_FMT_NV21,      ///< as above, but U and V bytes are swapped
-
-    PIX_FMT_ARGB,      ///< packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
-    PIX_FMT_RGBA,      ///< packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
-    PIX_FMT_ABGR,      ///< packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
-    PIX_FMT_BGRA,      ///< packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
-
-    PIX_FMT_GRAY16BE,  ///<        Y        , 16bpp, big-endian
-    PIX_FMT_GRAY16LE,  ///<        Y        , 16bpp, little-endian
-    PIX_FMT_YUV440P,   ///< planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples)
-    PIX_FMT_YUVJ440P,  ///< planar YUV 4:4:0 full scale (JPEG)
-    PIX_FMT_YUVA420P,  ///< planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples)
-    PIX_FMT_VDPAU_H264,///< H.264 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
-    PIX_FMT_VDPAU_MPEG1,///< MPEG-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
-    PIX_FMT_VDPAU_MPEG2,///< MPEG-2 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
-    PIX_FMT_VDPAU_WMV3,///< WMV3 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
-    PIX_FMT_VDPAU_VC1, ///< VC-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
-    PIX_FMT_RGB48BE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, big-endian
-    PIX_FMT_RGB48LE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, little-endian
-
-    PIX_FMT_RGB565BE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), big-endian
-    PIX_FMT_RGB565LE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), little-endian
-    PIX_FMT_RGB555BE,  ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), big-endian, most significant bit to 0
-    PIX_FMT_RGB555LE,  ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), little-endian, most significant bit to 0
-
-    PIX_FMT_BGR565BE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), big-endian
-    PIX_FMT_BGR565LE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), little-endian
-    PIX_FMT_BGR555BE,  ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), big-endian, most significant bit to 1
-    PIX_FMT_BGR555LE,  ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), little-endian, most significant bit to 1
-
-    PIX_FMT_VAAPI_MOCO, ///< HW acceleration through VA API at motion compensation entry-point, Picture.data[3] contains a vaapi_render_state struct which contains macroblocks as well as various fields extracted from headers
-    PIX_FMT_VAAPI_IDCT, ///< HW acceleration through VA API at IDCT entry-point, Picture.data[3] contains a vaapi_render_state struct which contains fields extracted from headers
-    PIX_FMT_VAAPI_VLD,  ///< HW decoding through VA API, Picture.data[3] contains a vaapi_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
-
-    PIX_FMT_YUV420P16LE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
-    PIX_FMT_YUV420P16BE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
-    PIX_FMT_YUV422P16LE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
-    PIX_FMT_YUV422P16BE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
-    PIX_FMT_YUV444P16LE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
-    PIX_FMT_YUV444P16BE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
-    PIX_FMT_VDPAU_MPEG4,  ///< MPEG4 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
-    PIX_FMT_DXVA2_VLD,    ///< HW decoding through DXVA2, Picture.data[3] contains a LPDIRECT3DSURFACE9 pointer
-
-    PIX_FMT_RGB444BE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), big-endian, most significant bits to 0
-    PIX_FMT_RGB444LE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), little-endian, most significant bits to 0
-    PIX_FMT_BGR444BE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), big-endian, most significant bits to 1
-    PIX_FMT_BGR444LE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), little-endian, most significant bits to 1
-    PIX_FMT_Y400A,     ///< 8bit gray, 8bit alpha
-    PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
-};
-
-#if HAVE_BIGENDIAN
-#   define PIX_FMT_NE(be, le) PIX_FMT_##be
-#else
-#   define PIX_FMT_NE(be, le) PIX_FMT_##le
-#endif
-
-#define PIX_FMT_RGB32   PIX_FMT_NE(ARGB, BGRA)
-#define PIX_FMT_RGB32_1 PIX_FMT_NE(RGBA, ABGR)
-#define PIX_FMT_BGR32   PIX_FMT_NE(ABGR, RGBA)
-#define PIX_FMT_BGR32_1 PIX_FMT_NE(BGRA, ARGB)
-
-#define PIX_FMT_GRAY16 PIX_FMT_NE(GRAY16BE, GRAY16LE)
-#define PIX_FMT_RGB48  PIX_FMT_NE(RGB48BE,  RGB48LE)
-#define PIX_FMT_RGB565 PIX_FMT_NE(RGB565BE, RGB565LE)
-#define PIX_FMT_RGB555 PIX_FMT_NE(RGB555BE, RGB555LE)
-#define PIX_FMT_RGB444 PIX_FMT_NE(RGB444BE, RGB444LE)
-#define PIX_FMT_BGR565 PIX_FMT_NE(BGR565BE, BGR565LE)
-#define PIX_FMT_BGR555 PIX_FMT_NE(BGR555BE, BGR555LE)
-#define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE)
-
-#define PIX_FMT_YUV420P16 PIX_FMT_NE(YUV420P16BE, YUV420P16LE)
-#define PIX_FMT_YUV422P16 PIX_FMT_NE(YUV422P16BE, YUV422P16LE)
-#define PIX_FMT_YUV444P16 PIX_FMT_NE(YUV444P16BE, YUV444P16LE)
-
-#endif /* AVUTIL_PIXFMT_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/ppc/intreadwrite.h
--- a/ffmpeg_smp/h264dec/libavutil/ppc/intreadwrite.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_PPC_INTREADWRITE_H
-#define AVUTIL_PPC_INTREADWRITE_H
-
-#include <stdint.h>
-#include "config.h"
-
-#if HAVE_XFORM_ASM
-
-#define AV_RL16 AV_RL16
-static av_always_inline uint16_t AV_RL16(const void *p)
-{
-    uint16_t v;
-    __asm__ ("lhbrx   %0, %y1" : "=r"(v) : "Z"(*(const uint16_t*)p));
-    return v;
-}
-
-#define AV_WL16 AV_WL16
-static av_always_inline void AV_WL16(void *p, uint16_t v)
-{
-    __asm__ ("sthbrx  %1, %y0" : "=Z"(*(uint16_t*)p) : "r"(v));
-}
-
-#define AV_RL32 AV_RL32
-static av_always_inline uint32_t AV_RL32(const void *p)
-{
-    uint32_t v;
-    __asm__ ("lwbrx   %0, %y1" : "=r"(v) : "Z"(*(const uint32_t*)p));
-    return v;
-}
-
-#define AV_WL32 AV_WL32
-static av_always_inline void AV_WL32(void *p, uint32_t v)
-{
-    __asm__ ("stwbrx  %1, %y0" : "=Z"(*(uint32_t*)p) : "r"(v));
-}
-
-#if HAVE_LDBRX
-
-#define AV_RL64 AV_RL64
-static av_always_inline uint64_t AV_RL64(const void *p)
-{
-    uint64_t v;
-    __asm__ ("ldbrx   %0, %y1" : "=r"(v) : "Z"(*(const uint64_t*)p));
-    return v;
-}
-
-#define AV_WL64 AV_WL64
-static av_always_inline void AV_WL64(void *p, uint64_t v)
-{
-    __asm__ ("stdbrx  %1, %y0" : "=Z"(*(uint64_t*)p) : "r"(v));
-}
-
-#else
-
-#define AV_RL64 AV_RL64
-static av_always_inline uint64_t AV_RL64(const void *p)
-{
-    union { uint64_t v; uint32_t hl[2]; } v;
-    __asm__ ("lwbrx   %0, %y2  \n\t"
-             "lwbrx   %1, %y3  \n\t"
-             : "=&r"(v.hl[1]), "=r"(v.hl[0])
-             : "Z"(*(const uint32_t*)p), "Z"(*((const uint32_t*)p+1)));
-    return v.v;
-}
-
-#define AV_WL64 AV_WL64
-static av_always_inline void AV_WL64(void *p, uint64_t v)
-{
-    union { uint64_t v; uint32_t hl[2]; } vv = { v };
-    __asm__ ("stwbrx  %2, %y0  \n\t"
-             "stwbrx  %3, %y1  \n\t"
-             : "=Z"(*(uint32_t*)p), "=Z"(*((uint32_t*)p+1))
-             : "r"(vv.hl[1]), "r"(vv.hl[0]));
-}
-
-#endif /* HAVE_LDBRX */
-
-#endif /* HAVE_XFORM_ASM */
-
-/*
- * GCC fails miserably on the packed struct version which is used by
- * default, so we override it here.
- */
-
-#define AV_RB64(p) (*(const uint64_t *)(p))
-#define AV_WB64(p, v) (*(uint64_t *)(p) = (v))
-
-#endif /* AVUTIL_PPC_INTREADWRITE_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/ppc/timer.h
--- a/ffmpeg_smp/h264dec/libavutil/ppc/timer.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2005 Luca Barbato <lu_zero@gentoo.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_PPC_TIMER_H
-#define AVUTIL_PPC_TIMER_H
-
-#include <stdint.h>
-
-#define AV_READ_TIME read_time
-
-static inline uint64_t read_time(void)
-{
-    uint32_t tbu, tbl, temp;
-
-     /* from section 2.2.1 of the 32-bit PowerPC PEM */
-     __asm__ volatile(
-         "1:\n"
-         "mftbu  %2\n"
-         "mftb   %0\n"
-         "mftbu  %1\n"
-         "cmpw   %2,%1\n"
-         "bne    1b\n"
-     : "=r"(tbl), "=r"(tbu), "=r"(temp)
-     :
-     : "cc");
-
-     return (((uint64_t)tbu)<<32) | (uint64_t)tbl;
-}
-
-#endif /* AVUTIL_PPC_TIMER_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/timer.h
--- a/ffmpeg_smp/h264dec/libavutil/timer.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-/**
- * @file
- * high precision timer, useful to profile code
- *
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_TIMER_H
-#define AVUTIL_TIMER_H
-
-#include <stdlib.h>
-#include <stdint.h>
-#include "config.h"
-
-#if   ARCH_ARM
-#   include "arm/timer.h"
-#elif ARCH_PPC
-#   include "ppc/timer.h"
-#elif ARCH_X86
-#   include "x86/timer.h"
-#endif
-
-#if !defined(AV_READ_TIME) && HAVE_GETHRTIME
-#   define AV_READ_TIME gethrtime
-#endif
-
-#ifdef AV_READ_TIME
-#define START_TIMER \
-uint64_t tend;\
-uint64_t tstart= AV_READ_TIME();\
-
-#define STOP_TIMER(id) \
-tend= AV_READ_TIME();\
-{\
-    static uint64_t tsum=0;\
-    static int tcount=0;\
-    static int tskip_count=0;\
-    if(tcount<2 || tend - tstart < 8*tsum/tcount || tend - tstart < 2000){\
-        tsum+= tend - tstart;\
-        tcount++;\
-    }else\
-        tskip_count++;\
-    if(((tcount+tskip_count)&(tcount+tskip_count-1))==0){\
-        av_log(NULL, AV_LOG_ERROR, "%"PRIu64" dezicycles in %s, %d runs, %d skips\n",\
-               tsum*10/tcount, id, tcount, tskip_count);\
-    }\
-}
-#else
-#define START_TIMER
-#define STOP_TIMER(id) {}
-#endif
-
-#endif /* AVUTIL_TIMER_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/x86/bswap.h
--- a/ffmpeg_smp/h264dec/libavutil/x86/bswap.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * byte swapping routines
- */
-
-#ifndef AVUTIL_X86_BSWAP_H
-#define AVUTIL_X86_BSWAP_H
-
-#include <stdint.h>
-#include "config.h"
-#include "libavutil/attributes.h"
-
-#define bswap_16 bswap_16
-static av_always_inline av_const uint16_t bswap_16(uint16_t x)
-{
-    __asm__("rorw $8, %0" : "+r"(x));
-    return x;
-}
-
-#define bswap_32 bswap_32
-static av_always_inline av_const uint32_t bswap_32(uint32_t x)
-{
-// #if HAVE_BSWAP
-    __asm__("bswap   %0" : "+r" (x));
-// #else
-//     __asm__("rorw    $8,  %w0 \n\t"
-//             "rorl    $16, %0  \n\t"
-//             "rorw    $8,  %w0"
-//             : "+r"(x));
-// #endif
-    return x;
-}
-
-#if ARCH_X86_64
-#define bswap_64 bswap_64
-static inline uint64_t av_const bswap_64(uint64_t x)
-{
-    __asm__("bswap  %0": "=r" (x) : "0" (x));
-    return x;
-}
-#endif
-
-#endif /* AVUTIL_X86_BSWAP_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/x86/intreadwrite.h
--- a/ffmpeg_smp/h264dec/libavutil/x86/intreadwrite.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_X86_INTREADWRITE_H
-#define AVUTIL_X86_INTREADWRITE_H
-
-#include <stdint.h>
-#include "config.h"
-#include "libavutil/attributes.h"
-
-#if HAVE_MMX
-
-#if defined(__MMX__)
-
-#define AV_COPY64 AV_COPY64
-static av_always_inline void AV_COPY64(void *d, const void *s)
-{
-    __asm__("movq   %1, %%mm0  \n\t"
-            "movq   %%mm0, %0  \n\t"
-            : "=m"(*(uint64_t*)d)
-            : "m" (*(const uint64_t*)s)
-            : "mm0");
-}
-
-#define AV_SWAP64 AV_SWAP64
-static av_always_inline void AV_SWAP64(void *a, void *b)
-{
-    __asm__("movq   %1, %%mm0  \n\t"
-            "movq   %0, %%mm1  \n\t"
-            "movq   %%mm0, %0  \n\t"
-            "movq   %%mm1, %1  \n\t"
-            : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b)
-            ::"mm0", "mm1");
-}
-
-#define AV_ZERO64 AV_ZERO64
-static av_always_inline void AV_ZERO64(void *d)
-{
-    __asm__("pxor %%mm0, %%mm0  \n\t"
-            "movq %%mm0, %0     \n\t"
-            : "=m"(*(uint64_t*)d)
-            :: "mm0");
-}
-
-#endif /* !HAVE_FAST_64BIT && defined(__MMX__) */
-
-#ifdef __SSE__
-
-#define AV_COPY128 AV_COPY128
-static av_always_inline void AV_COPY128(void *d, const void *s)
-{
-    struct v {uint64_t v[2];};
-
-    __asm__("movaps   %1, %%xmm0  \n\t"
-            "movaps   %%xmm0, %0  \n\t"
-            : "=m"(*(struct v*)d)
-            : "m" (*(const struct v*)s)
-            : "xmm0");
-}
-
-#endif /* __SSE__ */
-
-#ifdef __SSE2__
-
-#define AV_ZERO128 AV_ZERO128
-static av_always_inline void AV_ZERO128(void *d)
-{
-    struct v {uint64_t v[2];};
-
-    __asm__("pxor %%xmm0, %%xmm0  \n\t"
-            "movdqa   %%xmm0, %0  \n\t"
-            : "=m"(*(struct v*)d)
-            :: "xmm0");
-}
-
-#endif /* __SSE2__ */
-
-#endif /* HAVE_MMX */
-
-#endif /* AVUTIL_X86_INTREADWRITE_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/x86/timer.h
--- a/ffmpeg_smp/h264dec/libavutil/x86/timer.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,35 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_X86_TIMER_H
-#define AVUTIL_X86_TIMER_H
-
-#include <stdint.h>
-
-#define AV_READ_TIME read_time
-
-static inline uint64_t read_time(void)
-{
-    uint32_t a, d;
-    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
-    return ((uint64_t)d << 32) + a;
-}
-
-#endif /* AVUTIL_X86_TIMER_H */
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/x86_cpu.h
--- a/ffmpeg_smp/h264dec/libavutil/x86_cpu.h	Mon Aug 27 12:09:56 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_X86_CPU_H
-#define AVUTIL_X86_CPU_H
-
-#include <stdint.h>
-#include "config.h"
-
-#if ARCH_X86_64
-#    define REG_a "rax"
-#    define REG_b "rbx"
-#    define REG_c "rcx"
-#    define REG_d "rdx"
-#    define REG_D "rdi"
-#    define REG_S "rsi"
-#    define PTR_SIZE "8"
-typedef int64_t x86_reg;
-
-#    define REG_SP "rsp"
-#    define REG_BP "rbp"
-#    define REGBP   rbp
-#    define REGa    rax
-#    define REGb    rbx
-#    define REGc    rcx
-#    define REGd    rdx
-#    define REGSP   rsp
-
-#elif ARCH_X86_32
-
-#    define REG_a "eax"
-#    define REG_b "ebx"
-#    define REG_c "ecx"
-#    define REG_d "edx"
-#    define REG_D "edi"
-#    define REG_S "esi"
-#    define PTR_SIZE "4"
-typedef int32_t x86_reg;
-
-#    define REG_SP "esp"
-#    define REG_BP "ebp"
-#    define REGBP   ebp
-#    define REGa    eax
-#    define REGb    ebx
-#    define REGc    ecx
-#    define REGd    edx
-#    define REGSP   esp
-#else
-typedef int x86_reg;
-#endif
-
-// #if ARCH_X86_64 && defined(PIC)
-// #    define BROKEN_RELOCATIONS 1
-// #endif
-
-#endif /* AVUTIL_X86_CPU_H */
diff -r 11d15c47beaf -r 897f711a7157 h264dec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/h264dec.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,288 @@
+/*
+* H264 decoder main
+*/
+
+#include "config.h"
+#include "libavcodec/h264.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <time.h>
+
+#include <assert.h>
+
+
+static const char program_name[] = "h264dec";
+static const int program_birth_year = 2010;
+
+static const char *file_name;   /* input path given with -i (the optarg pointer is kept as is) */
+static int ifile, ofile;        /* input / output file descriptors; ifile == 0 is read as "no input" by main() */
+static int no_arch =0;          /* set by -a/--noarch: main() then skips the ARCH_CELL decode paths */
+static int parallel = 1;        /* cleared by -s/--sequential */
+static int frame_width  = 0;    /* width/height are filled in by opt_input_file() from the probed input */
+static int frame_height = 0;
+
+static void av_exit(int ret)
+{
+    // Terminate the process with status ret; placeholder for cleanup, nothing is freed yet.
+#undef exit  /* NOTE(review): presumably an included header redefines exit(); confirm */
+    exit(ret);
+}
+
+static void opt_input_file(const char *filename)
+{
+    /* Open filename read-only into ifile, probe its first frame for the size, then rewind. */
+    ifile = open(filename, O_RDONLY, 0666); /* the mode argument is ignored without O_CREAT */
+    if (ifile < 0){
+        fprintf(stderr, "Failed to open %s\n", filename);
+        av_exit(-1);
+    }
+
+    // Parse the first frame only to learn the resolution (other stream info is not used).
+    H264Slice slice;
+    PictureInfo pi;
+    GetBitContext gb = {0,};
+    ParserContext *pc;
+    NalContext *nc;
+
+    pc = get_parse_context(ifile);
+    nc = get_nal_context(0, 0);
+
+    memset(&slice, 0, sizeof(H264Slice));
+    slice.current_picture_info=&pi; /* pi itself is left uninitialized here */
+
+    av_read_frame_internal(pc, &gb);
+    decode_nal_units(nc, &slice, &gb);
+
+    frame_width = nc->width;
+    frame_height= nc->height;
+
+    // Release the temporary probe state.
+    av_freep(&gb.raw);
+    if (gb.rbsp)
+        av_freep(&gb.rbsp);
+    free_parse_context(pc);
+    free_nal_context(nc);
+
+    // Rewind to byte 0 for the real decode; lseek() yields 0 on success, so nonzero means failure.
+    int offset;
+    if ( (offset=lseek(ifile, 0, SEEK_SET)) ){
+        fprintf(stderr, "Rewind input file %s failed at offset %d\n", filename, offset);
+    }
+
+}
+
+/* Open (create/truncate) the output file into ofile; NULL means no output (ofile = 0). */
+static void opt_output_file(const char *filename)
+{
+    if (filename && !strcmp(filename, "-"))
+        filename = "pipe:"; /* NOTE(review): open() treats this as a literal file name */
+    ofile = filename ? open(filename, O_CREAT | O_TRUNC | O_WRONLY, 0666) : 0;
+    if (ofile < 0){ /* same failure handling as opt_input_file() */
+        fprintf(stderr, "Failed to open %s\n", filename);
+        av_exit(-1);
+    }
+}
+
+/* Print a one-line usage summary followed by an empty line to stdout. */
+static void show_usage(void)
+{
+    printf("usage: h264dec [options] -i infile [-o outfile]\n\n");
+}
+
+static struct option long_options[] = {
+    {"static-sched", 0, 0, 0}, /* entries with val 0 are told apart by option_index in parse_cmd(): */
+    {"static-mbd", 0, 0, 0},   /* keep these first seven entries in this order */
+    {"numamap", 0, 0, 0},
+    {"no-mbd", 0, 0, 0},
+    {"static-3d", 0, 0, 0},
+    {"slice-bufs", 1, 0, 0},
+    {"smt", 0, 0, 0},
+    {"noarch", 0, 0, 'a'},
+    {"display", 0, 0, 'd'},
+    {"fullscreen", 0, 0, 'f'},
+    {"numframes", 1, 0, 'n'},
+    {"use-ppe-ed", 1, 0, 'p'}, /* NOTE(review): 'p' is stored in cli_opts.profile, not ppe_ed */
+    {"sequential", 0, 0, 's'},
+    {"threads", 1, 0, 't'},
+    {"verbose", 1, 0, 'v'},    /* NOTE(review): takes an argument here, but -v takes none in the optstring */
+    {"wave-order", 1, 0, 'w'}, /* NOTE(review): same mismatch for -w */
+    {"smb-size", 1, 0, 'z'},
+    {"pipe-bufs", 1, 0, 'e'},
+    {0, 0, 0, 0}               /* all-zero terminator required by getopt_long() */
+};
+
+static h264_options cli_opts;
+/* Parse argv with getopt_long() into cli_opts and the file-scope settings.
+ * -i and -o open the input/output files as a side effect; invalid or
+ * unrecognized options terminate the program through av_exit(). */
+static void parse_cmd(int argc, char **argv)
+{
+    int c;
+    int digit_optind = 0;
+    int option_index = 0;
+
+    cli_opts.statsched =0;
+    cli_opts.numamap =0;
+    cli_opts.statmbd =0;
+    cli_opts.no_mbd= 0;
+    cli_opts.numframes = INT_MAX;
+    cli_opts.display=0;
+    cli_opts.fullscreen=0;
+    cli_opts.verbose=0;
+    cli_opts.ppe_ed=0;
+    cli_opts.profile=0;
+    cli_opts.threads = 1;
+    cli_opts.smb_size[0] = cli_opts.smb_size[1] = 1;
+    cli_opts.wave_order=0;
+    cli_opts.static_3d=0;
+    cli_opts.pipe_bufs=8;
+    cli_opts.slice_bufs=1;
+    cli_opts.smt= 0;
+    while ((c = getopt_long(argc, argv, "ade:fi:n:o:p:st:vwz:", long_options, &option_index)) != -1 ){
+        int this_option_optind = optind ? optind : 1;
+
+        switch (c){
+            case 0: /* long-only option: option_index is its position in long_options[] */
+                if (option_index==0){
+                    cli_opts.statsched=1;
+                }else if (option_index==1){
+                    cli_opts.statmbd= 1;
+                }else if (option_index==2){
+                    cli_opts.numamap= 1;
+                }else if (option_index==3){
+                    cli_opts.no_mbd= 1;
+                }else if (option_index==4){
+                    cli_opts.static_3d= 1;
+                }else if (option_index==5){
+                    cli_opts.slice_bufs= (unsigned) atoi(optarg);
+                }else if (option_index==6){
+                    cli_opts.smt= 1;
+                }
+                break;
+            case '0': /* NOTE(review): digits are not in the optstring, so these cases look unreachable */
+            case '1':
+            case '2':
+                if (digit_optind != 0 && digit_optind != this_option_optind)
+                    printf("digits occur in two different argv-elements.\n");
+                digit_optind = this_option_optind;
+                printf("option %c\n", c);
+                break;
+            case 'a':
+                no_arch=1;
+                break;
+            case 'd':
+                cli_opts.display=1;
+                break;
+            case 'f':
+                cli_opts.fullscreen=1;
+                break;
+            case 'i':
+                file_name = (const char *)optarg;
+                opt_input_file(file_name);
+                break;
+            case 'n':
+                cli_opts.numframes = (unsigned) atoi(optarg);
+                break;
+            case 'o':
+                /* hand optarg over directly; copying it into a fixed-size buffer could overflow */
+                opt_output_file(optarg);
+                break;
+            case 'p':
+                cli_opts.profile = (unsigned) atoi(optarg);
+                break;
+            case 's':
+                cli_opts.threads = 0;
+                parallel = 0;
+                break;
+            case 't':
+                cli_opts.threads = atoi(optarg);
+                if (cli_opts.threads<=0){
+                    fprintf(stderr, "Option -%c requires thread numbers > 0\n", c);
+                    av_exit(-1);
+                }
+                break;
+            case 'v':
+                cli_opts.verbose = 1;
+                break;
+            case 'w':
+                cli_opts.wave_order = 1;
+                break;
+            case 'z': // only useful in ompss
+                if (argc < optind +1){
+                    fprintf(stderr, "Option -%c (--smb-size) requires 2 arguments\n", c);
+                    av_exit(-1);
+                }
+                optind--; /* step back to the first dimension; the loop consumes two argv entries */
+                for (int i=0; i<2; i++){
+                    cli_opts.smb_size[i] = atoi(argv[optind++]);
+                    if (!(cli_opts.smb_size[i] > 0)){ /* test the parsed element, not the array itself */
+                        fprintf(stderr, "Option -%c (--smb-size) requires dimensions > 0\n", c);
+                        av_exit(-1);
+                    }
+                }
+                break;
+            case 'e':
+                cli_opts.pipe_bufs = atoi(optarg);
+                break;
+            case ':': /* only returned by getopt when the optstring starts with ':' */
+                fprintf(stderr, "Option -%c requires an operand\n", optopt);
+                av_exit(-1);
+                break;
+            case '?':
+                fprintf(stderr, "Unrecognized option: -%c\n", optopt);
+                av_exit(-1);
+                break;
+        }
+    }
+
+}
+
+int main(int argc, char **argv)
+{
+    /* Parse the command line; -i opens the input file and probes the frame size. */
+    parse_cmd(argc, argv);
+
+    if(!ifile ) { /* ifile is still 0 when no -i option was given */
+        show_usage();
+        av_exit(1);
+    }
+
+    H264Context *h = get_h264dec_context(file_name, ifile, ofile, frame_width, frame_height, &cli_opts);
+#if OMPSS
+    if (h264_decode_ompss( h ) < 0)
+        av_exit(-1);
+#else
+    if (parallel){ /* default; -s/--sequential clears it */
+        if (ARCH_CELL && !no_arch){ /* -a/--noarch forces the generic path */
+            if (h264_decode_cell( h ) < 0)
+                av_exit(-1);
+        }else{
+            if (h264_decode_pthread( h ) < 0)
+                av_exit(1);
+        }
+    }else{
+        if (ARCH_CELL && !no_arch){
+            if (h264_decode_cell_seq( h ) < 0)
+                av_exit(1);
+        }else{
+            if (h264_decode_seq( h ) < 0)
+                av_exit(1);
+        }
+    }
+#endif
+    free_h264dec_context(h);
+    close(ifile);
+    close(ofile); /* NOTE(review): ofile is 0 when no -o was given, so this closes descriptor 0 */
+
+    return 0;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/aac.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/aac.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_AAC_H
+#define AVCODEC_ARM_AAC_H
+
+#include "config.h"
+
+#if HAVE_NEON && HAVE_INLINE_ASM
+
+#define VMUL2 VMUL2 /* NOTE(review): presumably lets generic code detect this override via #ifdef; confirm */
+static inline float *VMUL2(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{   /* Store v[idx[3:0]] * *scale and v[idx[7:4]] * *scale at dst; return dst advanced by 2 floats. */
+    unsigned v0, v1;
+    __asm__ volatile ("ubfx     %0,  %4,  #0, #4      \n\t" /* v0 = idx bits 3:0 */
+                      "ubfx     %1,  %4,  #4, #4      \n\t" /* v1 = idx bits 7:4 */
+                      "ldr      %0,  [%3, %0, lsl #2] \n\t" /* v0 = raw bits of v[v0] */
+                      "ldr      %1,  [%3, %1, lsl #2] \n\t" /* v1 = raw bits of v[v1] */
+                      "vld1.32  {d1[]},   [%5,:32]    \n\t" /* d1 = *scale in both lanes */
+                      "vmov     d0,  %0,  %1          \n\t"
+                      "vmul.f32 d0,  d0,  d1          \n\t"
+                      "vst1.32  {d0},     [%2,:64]!   \n\t" /* store 2 floats, dst post-incremented */
+                      : "=&r"(v0), "=&r"(v1), "+r"(dst)
+                      : "r"(v), "r"(idx), "r"(scale)
+                      : "d0", "d1");
+    return dst;
+}
+
+#define VMUL4 VMUL4 /* NOTE(review): presumably lets generic code detect this override via #ifdef; confirm */
+static inline float *VMUL4(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{   /* Store four values v[2-bit field k of idx] * *scale (k = 0..3) at dst; return dst advanced by 4 floats. */
+    unsigned v0, v1, v2, v3;
+    __asm__ volatile ("ubfx     %0,  %6,  #0, #2      \n\t" /* v0 = idx bits 1:0 */
+                      "ubfx     %1,  %6,  #2, #2      \n\t" /* v1 = idx bits 3:2 */
+                      "ldr      %0,  [%5, %0, lsl #2] \n\t"
+                      "ubfx     %2,  %6,  #4, #2      \n\t" /* v2 = idx bits 5:4 */
+                      "ldr      %1,  [%5, %1, lsl #2] \n\t"
+                      "ubfx     %3,  %6,  #6, #2      \n\t" /* v3 = idx bits 7:6 */
+                      "ldr      %2,  [%5, %2, lsl #2] \n\t"
+                      "vmov     d0,  %0,  %1          \n\t"
+                      "ldr      %3,  [%5, %3, lsl #2] \n\t"
+                      "vld1.32  {d2[],d3[]},[%7,:32]  \n\t" /* q1 = *scale in all four lanes */
+                      "vmov     d1,  %2,  %3          \n\t"
+                      "vmul.f32 q0,  q0,  q1          \n\t"
+                      "vst1.32  {q0},     [%4,:128]!  \n\t" /* store 4 floats, dst post-incremented */
+                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst)
+                      : "r"(v), "r"(idx), "r"(scale)
+                      : "d0", "d1", "d2", "d3");
+    return dst;
+}
+
+#define VMUL2S VMUL2S /* NOTE(review): presumably lets generic code detect this override via #ifdef; confirm */
+static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{   /* Like VMUL2, but the two looked-up values first get their sign bit XORed with bits 1 and 0 of sign. */
+    unsigned v0, v1, v2, v3;
+    __asm__ volatile ("ubfx     %0,  %6,  #0, #4      \n\t" /* v0 = idx bits 3:0 */
+                      "ubfx     %1,  %6,  #4, #4      \n\t" /* v1 = idx bits 7:4 */
+                      "ldr      %0,  [%5, %0, lsl #2] \n\t"
+                      "lsl      %2,  %8,  #30         \n\t" /* sign bit 1 -> bit 31 (bit 0 lands in bit 30) */
+                      "ldr      %1,  [%5, %1, lsl #2] \n\t"
+                      "lsl      %3,  %8,  #31         \n\t" /* sign bit 0 -> bit 31 */
+                      "vmov     d0,  %0,  %1          \n\t"
+                      "bic      %2,  %2,  #1<<30      \n\t" /* drop the stray bit 30, leaving only the sign mask */
+                      "vld1.32  {d1[]},   [%7,:32]    \n\t" /* d1 = *scale in both lanes */
+                      "vmov     d2,  %2,  %3          \n\t"
+                      "veor     d0,  d0,  d2          \n\t" /* flip the float sign bits selected above */
+                      "vmul.f32 d0,  d0,  d1          \n\t"
+                      "vst1.32  {d0},     [%4,:64]!   \n\t" /* store 2 floats, dst post-incremented */
+                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst)
+                      : "r"(v), "r"(idx), "r"(scale), "r"(sign)
+                      : "d0", "d1", "d2");
+    return dst;
+}
+
+#define VMUL4S VMUL4S /* NOTE(review): presumably lets generic code detect this override via #ifdef; confirm */
+static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{   /* Like VMUL4, with sign flips taken from the top bit of sign; sign advances only where idx>>12 has a bit set. */
+    unsigned v0, v1, v2, v3, nz;
+    __asm__ volatile ("vld1.32  {d2[],d3[]},[%9,:32]  \n\t" /* q1 = *scale in all four lanes */
+                      "ubfx     %0,  %8,  #0, #2      \n\t"
+                      "ubfx     %1,  %8,  #2, #2      \n\t"
+                      "ldr      %0,  [%7, %0, lsl #2] \n\t"
+                      "ubfx     %2,  %8,  #4, #2      \n\t"
+                      "ldr      %1,  [%7, %1, lsl #2] \n\t"
+                      "ubfx     %3,  %8,  #6, #2      \n\t"
+                      "ldr      %2,  [%7, %2, lsl #2] \n\t"
+                      "vmov     d0,  %0,  %1          \n\t"
+                      "ldr      %3,  [%7, %3, lsl #2] \n\t"
+                      "lsr      %6,  %8,  #12         \n\t" /* nz = idx >> 12 */
+                      "rbit     %6,  %6               \n\t" /* reverse so nz bit 0 sits in bit 31 */
+                      "vmov     d1,  %2,  %3          \n\t"
+                      "lsls     %6,  %6,  #1          \n\t" /* carry = next nz bit */
+                      "and      %0,  %5,  #1<<31      \n\t" /* v0 = top bit of sign (flags untouched) */
+                      "lslcs    %5,  %5,  #1          \n\t" /* consume that sign bit only if carry was set */
+                      "lsls     %6,  %6,  #1          \n\t"
+                      "and      %1,  %5,  #1<<31      \n\t"
+                      "lslcs    %5,  %5,  #1          \n\t"
+                      "lsls     %6,  %6,  #1          \n\t"
+                      "and      %2,  %5,  #1<<31      \n\t"
+                      "lslcs    %5,  %5,  #1          \n\t"
+                      "vmov     d4,  %0,  %1          \n\t"
+                      "and      %3,  %5,  #1<<31      \n\t"
+                      "vmov     d5,  %2,  %3          \n\t"
+                      "veor     q0,  q0,  q2          \n\t" /* apply the four sign masks */
+                      "vmul.f32 q0,  q0,  q1          \n\t"
+                      "vst1.32  {q0},     [%4,:128]!  \n\t" /* store 4 floats, dst post-incremented */
+                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
+                        "+r"(sign), "=r"(nz)
+                      : "r"(v), "r"(idx), "r"(scale)
+                      : "d0", "d1", "d2", "d3", "d4", "d5");
+    return dst;
+}
+
+#endif /* HAVE_NEON && HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_ARM_AAC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/asm.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/asm.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#ifdef __ELF__  /* lines prefixed with ELF are kept only when targeting ELF */
+#   define ELF
+#else           /* elsewhere ELF expands to the comment character, which drops the rest of the line */
+#   define ELF @
+#endif
+
+        .macro require8, val=1
+ELF     .eabi_attribute 24, \val        @ EABI build attribute 24 (Tag_ABI_align_needed)
+        .endm
+
+        .macro preserve8, val=1
+ELF     .eabi_attribute 25, \val        @ EABI build attribute 25 (Tag_ABI_align_preserved)
+        .endm
+
+        .macro function name, export=0
+        .macro endfunc
+ELF     .size   \name, . - \name        @ symbol size = bytes emitted since the entry label
+        .endfunc
+        .purgem endfunc                 @ endfunc removes itself so the next function can define it again
+        .endm
+.if \export
+        .global EXTERN_ASM\name
+EXTERN_ASM\name:
+.endif
+ELF     .type   \name, %function
+        .func   \name
+\name:
+        .endm
+
+        .macro movrel rd, val
+#if HAVE_ARMV6T2 && !CONFIG_PIC /* build the 32-bit value from two 16-bit immediates */
+        movw            \rd, #:lower16:\val
+        movt            \rd, #:upper16:\val
+#else
+        ldr             \rd, =\val      @ fallback: ldr= pseudo-instruction
+#endif
+        .endm
+
+#if HAVE_VFP_ARGS /* VFP-prefixed lines are assembled only in this case, NOVFP-prefixed ones only otherwise */
+        .eabi_attribute 28, 1           @ EABI build attribute 28 (Tag_ABI_VFP_args)
+#   define VFP
+#   define NOVFP @
+#else
+#   define VFP   @
+#   define NOVFP
+#endif
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b) /* extra level so the arguments are macro-expanded before pasting */
+#define X(s) JOIN(EXTERN_ASM, s) /* s with the expansion of EXTERN_ASM pasted in front */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dcadsp_init_arm.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dcadsp_init_arm.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/dcadsp.h"
+
+void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
+                         int decifactor, float scale, float bias);
+
+void av_cold ff_dcadsp_init_arm(DCADSPContext *s)
+{
+    if (HAVE_NEON) /* build-time constant; s->lfe_fir is left untouched without NEON */
+        s->lfe_fir = ff_dca_lfe_fir_neon;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dcadsp_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dcadsp_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+function ff_dca_lfe_fir_neon, export=1
+        push            {r4-r6,lr}
+
+        add             r4,  r0,  r3,  lsl #2   @ out2 = out + 4*r3 bytes
+        add             r5,  r2,  #256*4-16     @ cf1 = coefs + 1008 bytes
+        sub             r1,  r1,  #12           @ in -= 12 bytes
+        cmp             r3,  #32
+        moveq           r6,  #256/32            @ r6 = 8 when r3 == 32 ...
+        movne           r6,  #256/64            @ ... otherwise 4
+NOVFP   vldr            d0,  [sp, #16]          @ scale, bias (first stack word above the 4 pushed regs)
+        mov             lr,  #-16               @ post-index step of -16 bytes for cf1 and in
+1:
+        vmov.f32        q2,  #0.0               @ v0
+        vmov.f32        q3,  #0.0               @ v1
+        mov             r12, r6                 @ inner loop counter, decremented by 4 per pass
+2:
+        vld1.32         {q8},     [r2,:128]!    @ cf0
+        vld1.32         {q9},     [r5,:128], lr @ cf1
+        vld1.32         {q1},     [r1], lr      @ in
+        subs            r12, r12, #4
+        vrev64.32       q10, q8
+        vmla.f32        q3,  q1,  q9
+        vmla.f32        d4,  d2,  d21
+        vmla.f32        d5,  d3,  d20
+        bne             2b
+
+        add             r1,  r1,  r6,  lsl #2
+        subs            r3,  r3,  #1
+        vadd.f32        d4,  d4,  d5
+        vadd.f32        d6,  d6,  d7
+        vpadd.f32       d4,  d4,  d6            @ d4 = {sum of v0 lanes, sum of v1 lanes}
+        vdup.32         d5,  d0[1]              @ d5 = bias in both lanes
+        vmla.f32        d5,  d4,  d0[0]         @ d5 += d4 * scale
+        vst1.32         {d5[0]},  [r0,:32]!
+        vst1.32         {d5[1]},  [r4,:32]!
+        bne             1b
+
+        pop             {r4-r6,pc}
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_arm.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_arm.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,712 @@
+@
+@ ARMv4 optimized DSP utils
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
+@
+@ This file is part of FFmpeg.
+@
+@ FFmpeg is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Lesser General Public
+@ License as published by the Free Software Foundation; either
+@ version 2.1 of the License, or (at your option) any later version.
+@
+@ FFmpeg is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+@ Lesser General Public License for more details.
+@
+@ You should have received a copy of the GNU Lesser General Public
+@ License along with FFmpeg; if not, write to the Free Software
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+@
+
+#include "config.h"
+#include "asm.S"
+
+        preserve8
+
+#if !HAVE_PLD /* no PLD available: make pld an empty macro so its uses assemble to nothing */
+.macro pld reg
+.endm
+#endif
+
+#if HAVE_ARMV5TE
+function ff_prefetch_arm, export=1
+        subs            r2,  r2,  #1            @ count down r2; Z is set on the final pass
+        pld             [r0]                    @ prefetch at the current address
+        add             r0,  r0,  r1            @ advance the address by r1
+        bne             ff_prefetch_arm         @ loop by branching back to the entry point
+        bx              lr
+endfunc
+#endif
+
+.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
+        mov             \Rd0, \Rn0, lsr #(\shift * 8)           @ Rd[i] = Rn[i] >> 8*shift ...
+        mov             \Rd1, \Rn1, lsr #(\shift * 8)
+        mov             \Rd2, \Rn2, lsr #(\shift * 8)
+        mov             \Rd3, \Rn3, lsr #(\shift * 8)
+        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)        @ ... | Rn[i+1] << (32 - 8*shift)
+        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
+        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
+        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
+.endm
+.macro  ALIGN_DWORD shift, R0, R1, R2
+        mov             \R0, \R0, lsr #(\shift * 8)             @ in-place variant: R0 and R1 are overwritten
+        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
+        mov             \R1, \R1, lsr #(\shift * 8)
+        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)   @ R2 only supplies the trailing bytes
+.endm
+.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
+        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)       @ separate destinations: the sources are preserved
+        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
+        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
+        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
+.endm
+
+.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1), applied to both word pairs
+        @ Rmask must hold 0xFEFEFEFE (== ~0x01010101)
+        @ Rn0 and Rn1 are clobbered
+        eor             \Rd0, \Rn0, \Rm0
+        eor             \Rd1, \Rn1, \Rm1
+        orr             \Rn0, \Rn0, \Rm0
+        orr             \Rn1, \Rn1, \Rm1
+        and             \Rd0, \Rd0, \Rmask
+        and             \Rd1, \Rd1, \Rmask
+        sub             \Rd0, \Rn0, \Rd0, lsr #1
+        sub             \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+        @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1), applied to both word pairs
+        @ Rmask must hold 0xFEFEFEFE (== ~0x01010101)
+        @ Rn0 and Rn1 are clobbered
+        eor             \Rd0, \Rn0, \Rm0
+        eor             \Rd1, \Rn1, \Rm1
+        and             \Rn0, \Rn0, \Rm0
+        and             \Rn1, \Rn1, \Rm1
+        and             \Rd0, \Rd0, \Rmask
+        and             \Rd1, \Rd1, \Rmask
+        add             \Rd0, \Rn0, \Rd0, lsr #1
+        add             \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro  JMP_ALIGN tmp, reg
+        ands            \tmp, \reg, #3          @ tmp = byte offset of reg within its word
+        bic             \reg, \reg, #3          @ round reg down to a word boundary
+        beq             1f                      @ offset 0
+        subs            \tmp, \tmp, #1
+        beq             2f                      @ offset 1
+        subs            \tmp, \tmp, #1
+        beq             3f                      @ offset 2
+        b    4f                                 @ offset 3
+.endm
+
+@ ----------------------------------------------------------------
+        .align 5
+function ff_put_pixels16_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld             [r1]
+        push            {r4-r11, lr}
+        JMP_ALIGN       r5,  r1
+1:                                              @ source offset 0: plain 16-byte copy per row
+        ldm             r1,  {r4-r7}
+        add             r1,  r1,  r2
+        stm             r0,  {r4-r7}
+        pld             [r1]
+        subs            r3,  r3,  #1
+        add             r0,  r0,  r2
+        bne             1b
+        pop             {r4-r11, pc}
+        .align 5
+2:                                              @ source offset 1: load 5 words, realign by 1 byte
+        ldm             r1,  {r4-r8}
+        add             r1,  r1,  r2
+        ALIGN_QWORD_D   1,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
+        pld             [r1]
+        subs            r3,  r3,  #1
+        stm             r0,  {r9-r12}
+        add             r0,  r0,  r2
+        bne             2b
+        pop             {r4-r11, pc}
+        .align 5
+3:                                              @ source offset 2
+        ldm             r1,  {r4-r8}
+        add             r1,  r1,  r2
+        ALIGN_QWORD_D   2,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
+        pld             [r1]
+        subs            r3,  r3,  #1
+        stm             r0,  {r9-r12}
+        add             r0,  r0,  r2
+        bne             3b
+        pop             {r4-r11, pc}
+        .align 5
+4:                                              @ source offset 3
+        ldm             r1,  {r4-r8}
+        add             r1,  r1,  r2
+        ALIGN_QWORD_D   3,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
+        pld             [r1]
+        subs            r3,  r3,  #1
+        stm             r0,  {r9-r12}
+        add             r0,  r0,  r2
+        bne             4b
+        pop             {r4-r11,pc}
+endfunc
+
+@ ----------------------------------------------------------------
+        .align 5
+function ff_put_pixels8_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld             [r1]
+        push            {r4-r5,lr}
+        JMP_ALIGN       r5,  r1
+1:                                              @ source offset 0: plain 8-byte copy per row
+        ldm             r1,  {r4-r5}
+        add             r1,  r1,  r2
+        subs            r3,  r3,  #1
+        pld             [r1]
+        stm             r0,  {r4-r5}
+        add             r0,  r0,  r2
+        bne             1b
+        pop             {r4-r5,pc}
+        .align 5
+2:                                              @ source offset 1: load 3 words, realign in place
+        ldm             r1,  {r4-r5, r12}
+        add             r1,  r1,  r2
+        ALIGN_DWORD     1,   r4,  r5,  r12
+        pld             [r1]
+        subs            r3,  r3,  #1
+        stm             r0,  {r4-r5}
+        add             r0,  r0,  r2
+        bne             2b
+        pop             {r4-r5,pc}
+        .align 5
+3:                                              @ source offset 2
+        ldm             r1,  {r4-r5, r12}
+        add             r1,  r1,  r2
+        ALIGN_DWORD     2,   r4,  r5,  r12
+        pld             [r1]
+        subs            r3,  r3,  #1
+        stm             r0,  {r4-r5}
+        add             r0,  r0,  r2
+        bne             3b
+        pop             {r4-r5,pc}
+        .align 5
+4:                                              @ source offset 3
+        ldm             r1,  {r4-r5, r12}
+        add             r1,  r1,  r2
+        ALIGN_DWORD     3,   r4,  r5,  r12
+        pld             [r1]
+        subs            r3,  r3,  #1
+        stm             r0,  {r4-r5}
+        add             r0,  r0,  r2
+        bne             4b
+        pop             {r4-r5,pc}
+endfunc
+
+@ ----------------------------------------------------------------
+        .align 5
+function ff_put_pixels8_x2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld             [r1]
+        push            {r4-r10,lr}
+        ldr             r12, =0xfefefefe        @ byte mask expected by RND_AVG32
+        JMP_ALIGN       r5,  r1
+1:                                              @ source offset 0: average the row with itself shifted by 1 byte
+        ldm             r1,  {r4-r5, r10}
+        add             r1,  r1,  r2
+        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
+        pld             [r1]
+        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
+        subs            r3,  r3,  #1
+        stm             r0,  {r8-r9}
+        add             r0,  r0,  r2
+        bne             1b
+        pop             {r4-r10,pc}
+        .align 5
+2:                                              @ source offset 1: average the 1-byte and 2-byte shifted rows
+        ldm             r1,  {r4-r5, r10}
+        add             r1,  r1,  r2
+        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
+        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
+        pld             [r1]
+        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
+        subs            r3,  r3,  #1
+        stm             r0,  {r4-r5}
+        add             r0,  r0,  r2
+        bne             2b
+        pop             {r4-r10,pc}
+        .align 5
+3:                                              @ source offset 2: average the 2-byte and 3-byte shifted rows
+        ldm             r1,  {r4-r5, r10}
+        add             r1,  r1,  r2
+        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
+        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
+        pld             [r1]
+        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
+        subs            r3,  r3,  #1
+        stm             r0,  {r4-r5}
+        add             r0,  r0,  r2
+        bne             3b
+        pop             {r4-r10,pc}
+        .align 5
+4:                                              @ source offset 3: average the 3-byte shifted row with words r5, r10
+        ldm             r1,  {r4-r5, r10}
+        add             r1,  r1,  r2
+        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
+        pld             [r1]
+        RND_AVG32       r8,  r9,  r6,  r7,  r5,  r10, r12
+        subs            r3,  r3,  #1
+        stm             r0,  {r8-r9}
+        add             r0,  r0,  r2
+        bne             4b
+        pop             {r4-r10,pc}
+endfunc
+
+        .align 5
+function ff_put_no_rnd_pixels8_x2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld             [r1]
+        push            {r4-r10,lr}
+        ldr             r12, =0xfefefefe        @ byte mask expected by NO_RND_AVG32
+        JMP_ALIGN       r5,  r1
+1:                                              @ source offset 0: average the row with itself shifted by 1 byte
+        ldm             r1,  {r4-r5, r10}
+        add             r1,  r1,  r2
+        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
+        pld             [r1]
+        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
+        subs            r3,  r3,  #1
+        stm             r0,  {r8-r9}
+        add             r0,  r0,  r2
+        bne             1b
+        pop             {r4-r10,pc}
+        .align 5
+2:                                              @ source offset 1: average the 1-byte and 2-byte shifted rows
+        ldm             r1,  {r4-r5, r10}
+        add             r1,  r1,  r2
+        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
+        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
+        pld             [r1]
+        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
+        subs            r3,  r3,  #1
+        stm             r0,  {r4-r5}
+        add             r0,  r0,  r2
+        bne             2b
+        pop             {r4-r10,pc}
+        .align 5
+3:                                              @ source offset 2: average the 2-byte and 3-byte shifted rows
+        ldm             r1,  {r4-r5, r10}
+        add             r1,  r1,  r2
+        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
+        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
+        pld             [r1]
+        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
+        subs            r3,  r3,  #1
+        stm             r0,  {r4-r5}
+        add             r0,  r0,  r2
+        bne             3b
+        pop             {r4-r10,pc}
+        .align 5
+4:                                              @ source offset 3: average the 3-byte shifted row with words r5, r10
+        ldm             r1,  {r4-r5, r10}
+        add             r1,  r1,  r2
+        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
+        pld             [r1]
+        NO_RND_AVG32    r8,  r9,  r6,  r7,  r5,  r10, r12
+        subs            r3,  r3,  #1
+        stm             r0,  {r8-r9}
+        add             r0,  r0,  r2
+        bne             4b
+        pop             {r4-r10,pc}
+endfunc
+
+
+@ ----------------------------------------------------------------
+        .align 5
+function ff_put_pixels8_y2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned; r0 = block, r1 = pixels, r2 = line_size, r3 = h
+        pld             [r1]                    @ each output row = RND_AVG32 of a source row and the row below it
+        push            {r4-r11,lr}
+        mov             r3,  r3,  lsr #1        @ every loop pass below writes two rows, so count h / 2 passes
+        ldr             r12, =0xfefefefe        @ constant passed as the last operand of RND_AVG32
+        JMP_ALIGN       r5,  r1                 @ defined outside this chunk; presumably branches to 1-4 on (pixels & 3) -- confirm
+1:                                              @ path 1: rows are used exactly as loaded
+        ldm             r1,  {r4-r5}            @ first source row
+        add             r1,  r1,  r2
+6:      ldm             r1,  {r6-r7}            @ next source row
+        add             r1,  r1,  r2
+        pld             [r1]
+        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
+        ldm             r1,  {r4-r5}            @ row after that; it stays in r4:r5 as the top row of the next pass
+        add             r1,  r1,  r2
+        stm             r0,  {r8-r9}
+        add             r0,  r0,  r2
+        pld             [r1]
+        RND_AVG32       r8,  r9,  r6,  r7,  r4,  r5,  r12
+        subs            r3,  r3,  #1            @ pass counter; stm and add leave the flags alone for the bne
+        stm             r0,  {r8-r9}
+        add             r0,  r0,  r2
+        bne             6b
+        pop             {r4-r11,pc}
+        .align 5
+2:                                              @ path 2: three words per row, fixed up by ALIGN_DWORD 1 (defined outside this chunk)
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     1,   r4,  r5,  r6
+6:      ldm             r1,  {r7-r9}            @ next source row
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     1,   r7,  r8,  r9
+        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        ldm             r1,  {r4-r6}            @ row after that, reused as the top row of the next pass
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     1,   r4,  r5,  r6
+        subs            r3,  r3,  #1            @ NOTE(review): relies on RND_AVG32 not changing the flags before the bne -- confirm
+        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        bne             6b
+        pop             {r4-r11,pc}
+        .align 5
+3:                                              @ path 3: identical to path 2 with ALIGN_DWORD 2
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     2,   r4,  r5,  r6
+6:      ldm             r1,  {r7-r9}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     2,   r7,  r8,  r9
+        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     2,   r4,  r5,  r6
+        subs            r3,  r3,  #1
+        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        bne             6b
+        pop             {r4-r11,pc}
+        .align 5
+4:                                              @ path 4: identical to path 2 with ALIGN_DWORD 3
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     3,   r4,  r5,  r6
+6:      ldm             r1,  {r7-r9}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     3,   r7,  r8,  r9
+        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     3,   r4,  r5,  r6
+        subs            r3,  r3,  #1
+        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        bne             6b
+        pop             {r4-r11,pc}
+endfunc
+
+        .align 5
+function ff_put_no_rnd_pixels8_y2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned; r0 = block, r1 = pixels, r2 = line_size, r3 = h
+        pld             [r1]                    @ each output row = NO_RND_AVG32 of a source row and the row below it
+        push            {r4-r11,lr}
+        mov             r3,  r3,  lsr #1        @ every loop pass below writes two rows, so count h / 2 passes
+        ldr             r12, =0xfefefefe        @ constant passed as the last operand of NO_RND_AVG32
+        JMP_ALIGN       r5,  r1                 @ defined outside this chunk; presumably branches to 1-4 on (pixels & 3) -- confirm
+1:                                              @ path 1: rows are used exactly as loaded
+        ldm             r1,  {r4-r5}            @ first source row
+        add             r1,  r1,  r2
+6:      ldm             r1,  {r6-r7}            @ next source row
+        add             r1,  r1,  r2
+        pld             [r1]
+        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
+        ldm             r1,  {r4-r5}            @ row after that; it stays in r4:r5 as the top row of the next pass
+        add             r1,  r1,  r2
+        stm             r0,  {r8-r9}
+        add             r0,  r0,  r2
+        pld             [r1]
+        NO_RND_AVG32    r8,  r9,  r6,  r7,  r4,  r5,  r12
+        subs            r3,  r3,  #1            @ pass counter; stm and add leave the flags alone for the bne
+        stm             r0,  {r8-r9}
+        add             r0,  r0,  r2
+        bne             6b
+        pop             {r4-r11,pc}
+        .align 5
+2:                                              @ path 2: three words per row, fixed up by ALIGN_DWORD 1 (defined outside this chunk)
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     1,   r4,  r5,  r6
+6:      ldm             r1,  {r7-r9}            @ next source row
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     1,   r7,  r8,  r9
+        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        ldm             r1,  {r4-r6}            @ row after that, reused as the top row of the next pass
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     1,   r4,  r5,  r6
+        subs            r3,  r3,  #1            @ NOTE(review): relies on NO_RND_AVG32 not changing the flags before the bne -- confirm
+        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        bne             6b
+        pop             {r4-r11,pc}
+        .align 5
+3:                                              @ path 3: identical to path 2 with ALIGN_DWORD 2
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     2,   r4,  r5,  r6
+6:      ldm             r1,  {r7-r9}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     2,   r7,  r8,  r9
+        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     2,   r4,  r5,  r6
+        subs            r3,  r3,  #1
+        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        bne             6b
+        pop             {r4-r11,pc}
+        .align 5
+4:                                              @ path 4: identical to path 2 with ALIGN_DWORD 3
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     3,   r4,  r5,  r6
+6:      ldm             r1,  {r7-r9}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     3,   r7,  r8,  r9
+        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        ldm             r1,  {r4-r6}
+        add             r1,  r1,  r2
+        pld             [r1]
+        ALIGN_DWORD     3,   r4,  r5,  r6
+        subs            r3,  r3,  #1
+        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
+        stm             r0,  {r10-r11}
+        add             r0,  r0,  r2
+        bne             6b
+        pop             {r4-r11,pc}
+endfunc
+
+        .ltorg
+
+@ ----------------------------------------------------------------
+.macro  RND_XY2_IT align, rnd
+        @ One source row. Out r8:r9 = l1 = (a & 0x03030303) + (b & 0x03030303), plus a rounding constant when r3 is even
+        @ Out r10:r11 = h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); r3 is decremented and the flags are left set
+.if \align == 0
+        ldm             r1,  {r6-r8}
+.elseif \align == 3
+        ldm             r1,  {r5-r7}
+.else
+        ldm             r1,  {r8-r10}
+.endif
+        add             r1,  r1,  r2            @ advance the source pointer by one line
+        pld             [r1]
+.if \align == 0
+        ALIGN_DWORD_D   1,   r4,  r5,  r6,  r7,  r8
+.elseif \align == 1
+        ALIGN_DWORD_D   1,   r4,  r5,  r8,  r9,  r10
+        ALIGN_DWORD_D   2,   r6,  r7,  r8,  r9,  r10
+.elseif \align == 2
+        ALIGN_DWORD_D   2,   r4,  r5,  r8,  r9,  r10
+        ALIGN_DWORD_D   3,   r6,  r7,  r8,  r9,  r10
+.elseif \align == 3
+        ALIGN_DWORD_D   3,   r4,  r5,  r5,  r6,  r7
+.endif
+        ldr             r14, =0x03030303        @ from here r4:r5 and r6:r7 hold the two 8-byte windows (a and b above)
+        tst             r3,  #1                 @ Z set when r3 is even; the and/add/ldr that follow leave the flags alone
+        and             r8,  r4,  r14
+        and             r9,  r5,  r14
+        and             r10, r6,  r14
+        and             r11, r7,  r14
+        andeq           r14, r14, r14, \rnd #1  @ r3 even: r14 becomes 0x02020202 for an lsl shift, 0x01010101 for an lsr shift
+        add             r8,  r8,  r10
+        add             r9,  r9,  r11
+        ldr             r12, =0xfcfcfcfc >> 2   @ 0x3f3f3f3f, applied after the shift right by two
+        addeq           r8,  r8,  r14           @ rounding constant goes in on even r3 only
+        addeq           r9,  r9,  r14
+        and             r4,  r12, r4,  lsr #2
+        and             r5,  r12, r5,  lsr #2
+        and             r6,  r12, r6,  lsr #2
+        and             r7,  r12, r7,  lsr #2
+        add             r10, r4,  r6
+        add             r11, r5,  r7
+        subs            r3,  r3,  #1            @ flags are left for whoever expands this macro
+.endm
+
+.macro RND_XY2_EXPAND align, rnd
+        RND_XY2_IT      \align, \rnd
+6:      push            {r8-r11}                @ park the previous row sums (l in r8:r9, h in r10:r11) while the next row is processed
+        RND_XY2_IT      \align, \rnd
+        pop             {r4-r7}                 @ previous row again: l in r4:r5, h in r6:r7
+        add             r4,  r4,  r8            @ l of both rows; r3 parity alternates per row, so the rounding constant is included once
+        add             r5,  r5,  r9
+        ldr             r14, =0x0f0f0f0f
+        add             r6,  r6,  r10           @ h of both rows
+        add             r7,  r7,  r11
+        and             r4,  r14, r4,  lsr #2   @ (l sum >> 2) with bits from the neighbouring byte masked off
+        and             r5,  r14, r5,  lsr #2
+        add             r4,  r4,  r6            @ per byte: (sum of the four contributing pixels + rounding) >> 2
+        add             r5,  r5,  r7
+        stm             r0,  {r4-r5}
+        add             r0,  r0,  r2
+        bge             6b                      @ flags come from the subs that ends RND_XY2_IT; nothing after it sets them
+        pop             {r4-r11,pc}             @ pairs with the push {r4-r11,lr} done by the function that expands this macro
+.endm
+
+        .align 5
+function ff_put_pixels8_xy2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned; 2x2 average with the lsl form of the rounding constant (0x02 per byte)
+        pld             [r1]
+        push            {r4-r11,lr} @ R14 is also called LR
+        JMP_ALIGN       r5,  r1                 @ defined outside this chunk; presumably branches to 1-4 on (pixels & 3) -- confirm
+1:      RND_XY2_EXPAND  0, lsl
+        .align 5
+2:      RND_XY2_EXPAND  1, lsl
+        .align 5
+3:      RND_XY2_EXPAND  2, lsl
+        .align 5
+4:      RND_XY2_EXPAND  3, lsl
+endfunc
+
+        .align 5
+function ff_put_no_rnd_pixels8_xy2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned; 2x2 average with the lsr form of the rounding constant (0x01 per byte)
+        pld             [r1]
+        push            {r4-r11,lr}             @ every RND_XY2_EXPAND expansion ends with the matching pop {r4-r11,pc}
+        JMP_ALIGN       r5,  r1                 @ defined outside this chunk; presumably branches to 1-4 on (pixels & 3) -- confirm
+1:      RND_XY2_EXPAND  0, lsr
+        .align 5
+2:      RND_XY2_EXPAND  1, lsr
+        .align 5
+3:      RND_XY2_EXPAND  2, lsr
+        .align 5
+4:      RND_XY2_EXPAND  3, lsr
+endfunc
+
+        .align 5
+@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
+function ff_add_pixels_clamped_arm, export=1
+        push            {r4-r10}
+        mov             r10, #8                 /* row counter: 8 rows, 8 pixels each */
+1:
+        ldr             r4,  [r1]               /* load dest */
+        /* block[0] and block[1] */
+        ldrsh           r5,  [r0]
+        ldrsh           r7,  [r0, #2]
+        and             r6,  r4,  #0xFF
+        and             r8,  r4,  #0xFF00
+        add             r6,  r5,  r6            /* block[0] + dest[0] */
+        add             r8,  r7,  r8,  lsr #8   /* block[1] + dest[1] */
+        mvn             r5,  r5                 /* ~block: top byte is 0x00 when block was negative, 0xFF otherwise */
+        mvn             r7,  r7
+        tst             r6,  #0x100             /* bit 8 is the only overflow test: set for sums in 256..511 and -256..-1 */
+        movne           r6,  r5,  lsr #24       /* clamp to 0 (negative block) or 255 */
+        tst             r8,  #0x100
+        movne           r8,  r7,  lsr #24
+        mov             r9,  r6                 /* r9 collects the four result bytes */
+        ldrsh           r5,  [r0, #4]           /* moved from [A] */
+        orr             r9,  r9,  r8,  lsl #8
+        /* block[2] and block[3] */
+        /* [A] */
+        ldrsh           r7,  [r0, #6]
+        and             r6,  r4,  #0xFF0000
+        and             r8,  r4,  #0xFF000000
+        add             r6,  r5,  r6,  lsr #16
+        add             r8,  r7,  r8,  lsr #24
+        mvn             r5,  r5
+        mvn             r7,  r7
+        tst             r6,  #0x100
+        movne           r6,  r5,  lsr #24
+        tst             r8,  #0x100
+        movne           r8,  r7,  lsr #24
+        orr             r9,  r9,  r6,  lsl #16
+        ldr             r4,  [r1, #4]           /* moved from [B] */
+        orr             r9,  r9,  r8,  lsl #24
+        /* store dest */
+        ldrsh           r5,  [r0, #8]           /* moved from [C] */
+        str             r9,  [r1]
+
+        /* load dest */
+        /* [B] */
+        /* block[4] and block[5] */
+        /* [C] */
+        ldrsh           r7,  [r0, #10]
+        and             r6,  r4,  #0xFF
+        and             r8,  r4,  #0xFF00
+        add             r6,  r5,  r6
+        add             r8,  r7,  r8,  lsr #8
+        mvn             r5,  r5
+        mvn             r7,  r7
+        tst             r6,  #0x100
+        movne           r6,  r5,  lsr #24
+        tst             r8,  #0x100
+        movne           r8,  r7,  lsr #24
+        mov             r9,  r6
+        ldrsh           r5,  [r0, #12]          /* moved from [D] */
+        orr             r9,  r9,  r8,  lsl #8
+        /* block[6] and block[7] */
+        /* [D] */
+        ldrsh           r7,  [r0, #14]
+        and             r6,  r4,  #0xFF0000
+        and             r8,  r4,  #0xFF000000
+        add             r6,  r5,  r6,  lsr #16
+        add             r8,  r7,  r8,  lsr #24
+        mvn             r5,  r5
+        mvn             r7,  r7
+        tst             r6,  #0x100
+        movne           r6,  r5,  lsr #24
+        tst             r8,  #0x100
+        movne           r8,  r7,  lsr #24
+        orr             r9,  r9,  r6,  lsl #16
+        add             r0,  r0,  #16           /* moved from [E] */
+        orr             r9,  r9,  r8,  lsl #24
+        subs            r10, r10, #1            /* moved from [F] */
+        /* store dest */
+        str             r9,  [r1, #4]
+
+        /* [E] */
+        /* [F] */
+        add             r1,  r1,  r2            /* next dest row; add does not touch the flags set by subs */
+        bne             1b
+
+        pop             {r4-r10}
+        bx              lr
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_arm.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_arm.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_DSPUTIL_H
+#define AVCODEC_ARM_DSPUTIL_H
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+/* Per-extension DSPContext init entry points; presumably each installs its optimised routines into c -- confirm against the definitions. */
+void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx);
+void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx);
+void ff_dsputil_init_vfp(DSPContext *c, AVCodecContext *avctx);
+void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
+void ff_dsputil_init_iwmmxt(DSPContext *c, AVCodecContext *avctx);
+
+#endif /* AVCODEC_ARM_DSPUTIL_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_armv6.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_armv6.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,623 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        preserve8
+
+        .text
+
+.macro  call_2x_pixels  type, subp              /* emits ff_<type>_pixels16<subp>_armv6 as two runs of the 8-wide routine */
+function ff_\type\()_pixels16\subp\()_armv6, export=1
+        push            {r0-r3, lr}             /* keep the arguments and return address; NOTE(review): five words, so sp is not 8-byte aligned across the bl */
+        bl              ff_\type\()_pixels8\subp\()_armv6
+        pop             {r0-r3, lr}             /* original r0-r3 and lr back */
+        add             r0,  r0,  #8            /* move both pointers 8 bytes to the right */
+        add             r1,  r1,  #8
+        b               ff_\type\()_pixels8\subp\()_armv6
+endfunc
+.endm
+
+call_2x_pixels          avg                     /* defines ff_avg_pixels16_armv6 */
+call_2x_pixels          put, _x2                /* defines ff_put_pixels16_x2_armv6 */
+call_2x_pixels          put, _y2                /* defines ff_put_pixels16_y2_armv6 */
+call_2x_pixels          put, _x2_no_rnd         /* defines ff_put_pixels16_x2_no_rnd_armv6 */
+call_2x_pixels          put, _y2_no_rnd         /* defines ff_put_pixels16_y2_no_rnd_armv6 */
+
+function ff_put_pixels16_armv6, export=1        /* copies 16-byte rows from r1 to r0; r2 = row stride, r3 = row count */
+        push            {r4-r11}
+1:                                              /* two rows per pass */
+        ldr             r5,  [r1, #4]
+        ldr             r6,  [r1, #8]
+        ldr             r7,  [r1, #12]
+        ldr             r4,  [r1], r2           /* first word is loaded last: the post-index moves r1 to the next row */
+        strd            r6,  r7,  [r0, #8]
+        ldr             r9,  [r1, #4]
+        strd            r4,  r5,  [r0],  r2     /* low half is stored last: the post-index moves r0 to the next row */
+        ldr             r10, [r1, #8]
+        ldr             r11, [r1, #12]
+        ldr             r8,  [r1], r2
+        strd            r10, r11, [r0, #8]
+        subs            r3,  r3,  #2            /* two rows done; strd leaves the flags alone for the bne */
+        strd            r8,  r9,  [r0],  r2
+        bne             1b                      /* leaves the loop only when r3 reaches exactly 0 */
+
+        pop             {r4-r11}
+        bx              lr
+endfunc
+
+function ff_put_pixels8_armv6, export=1         /* copies 8-byte rows from r1 to r0; r2 = row stride, r3 = row count */
+        push            {r4-r7}
+1:                                              /* two rows per pass */
+        ldr             r5,  [r1, #4]
+        ldr             r4,  [r1], r2           /* post-index: r1 moves to the next row */
+        ldr             r7,  [r1, #4]
+        strd            r4,  r5,  [r0],  r2     /* post-index: r0 moves to the next row */
+        ldr             r6,  [r1], r2
+        subs            r3,  r3,  #2            /* strd leaves the flags alone for the bne */
+        strd            r6,  r7,  [r0],  r2
+        bne             1b                      /* leaves the loop only when r3 reaches exactly 0 */
+
+        pop             {r4-r7}
+        bx              lr
+endfunc
+
+function ff_put_pixels8_x2_armv6, export=1      /* r0[x] = (r1[x] + r1[x+1] + 1) >> 1 for 8 bytes per row; r2 = stride, r3 = rows */
+        push            {r4-r11, lr}            /* lr is saved because r14 serves as scratch below */
+        mov             r12, #1
+        orr             r12, r12, r12, lsl #8
+        orr             r12, r12, r12, lsl #16  /* r12 = 0x01010101: bit 0 of every byte */
+1:                                              /* two rows per pass */
+        ldr             r4,  [r1]               /* row A, bytes 0-3 */
+        subs            r3,  r3,  #2            /* nothing below changes N/Z/C/V before the bne */
+        ldr             r5,  [r1, #4]           /* row A, bytes 4-7 */
+        ldr             r7,  [r1, #5]           /* row A, bytes 5-8: a word load at byte offset 5 */
+        lsr             r6,  r4,  #8
+        ldr             r8,  [r1, r2]!          /* row B, bytes 0-3; writeback moves r1 to row B */
+        orr             r6,  r6,  r5,  lsl #24  /* r6 = row A bytes 1-4 (the shift pattern assumes little-endian data) */
+        ldr             r9,  [r1, #4]
+        ldr             r11, [r1, #5]
+        lsr             r10, r8,  #8
+        add             r1,  r1,  r2            /* r1 now points at the row for the next pass */
+        orr             r10, r10, r9,  lsl #24  /* r10 = row B bytes 1-4 */
+        eor             r14, r4,  r6
+        uhadd8          r4,  r4,  r6            /* per byte (a + b) >> 1 */
+        eor             r6,  r5,  r7
+        uhadd8          r5,  r5,  r7
+        and             r14, r14, r12           /* (a ^ b) & 1: the bit the halving add dropped */
+        and             r6,  r6,  r12
+        uadd8           r4,  r4,  r14           /* add it back: (a + b + 1) >> 1 */
+        eor             r14, r8,  r10
+        uadd8           r5,  r5,  r6
+        eor             r6,  r9,  r11
+        uhadd8          r8,  r8,  r10
+        and             r14, r14, r12
+        uhadd8          r9,  r9,  r11
+        and             r6,  r6,  r12
+        uadd8           r8,  r8,  r14
+        strd            r4,  r5,  [r0],  r2     /* row A result */
+        uadd8           r9,  r9,  r6
+        strd            r8,  r9,  [r0],  r2     /* row B result */
+        bne             1b
+
+        pop             {r4-r11, pc}
+endfunc
+
+function ff_put_pixels8_y2_armv6, export=1      /* r0 row y = per byte (r1 row y + r1 row y+1 + 1) >> 1, 8 bytes wide; r2 = stride, r3 = rows */
+        push            {r4-r11}
+        mov             r12, #1
+        orr             r12, r12, r12, lsl #8
+        orr             r12, r12, r12, lsl #16  /* r12 = 0x01010101: bit 0 of every byte */
+        ldr             r4,  [r1]               /* row 0 in r4:r5 */
+        ldr             r5,  [r1, #4]
+        ldr             r6,  [r1, r2]!          /* row 1 in r6:r7; writeback moves r1 down one row */
+        ldr             r7,  [r1, #4]
+1:                                              /* two output rows per pass */
+        subs            r3,  r3,  #2            /* nothing below changes N/Z/C/V before the bne */
+        uhadd8          r8,  r4,  r6            /* per byte (a + b) >> 1 */
+        eor             r10, r4,  r6
+        uhadd8          r9,  r5,  r7
+        eor             r11, r5,  r7
+        and             r10, r10, r12           /* (a ^ b) & 1: the bit the halving add dropped */
+        ldr             r4,  [r1, r2]!          /* next source row replaces the oldest one */
+        uadd8           r8,  r8,  r10           /* add it back: (a + b + 1) >> 1 */
+        and             r11, r11, r12
+        uadd8           r9,  r9,  r11
+        ldr             r5,  [r1, #4]
+        uhadd8          r10, r4,  r6
+        eor             r6,  r4,  r6
+        uhadd8          r11, r5,  r7
+        and             r6,  r6,  r12
+        eor             r7,  r5,  r7
+        uadd8           r10, r10, r6
+        and             r7,  r7,  r12
+        ldr             r6,  [r1, r2]!          /* NOTE(review): loaded for the next pass even when this is the final pass */
+        uadd8           r11, r11, r7
+        strd            r8,  r9,  [r0],  r2
+        ldr             r7,  [r1, #4]
+        strd            r10, r11, [r0],  r2
+        bne             1b
+
+        pop             {r4-r11}
+        bx              lr
+endfunc
+
+function ff_put_pixels8_x2_no_rnd_armv6, export=1 /* r0[x] = (r1[x] + r1[x+1]) >> 1 for 8 bytes per row; r2 = stride, r3 = rows */
+        push            {r4-r9, lr}             /* lr is saved because r14 serves as scratch below */
+1:                                              /* two rows per pass */
+        subs            r3,  r3,  #2            /* nothing below changes N/Z/C/V before the bne */
+        ldr             r4,  [r1]               /* row A, bytes 0-3 */
+        ldr             r5,  [r1, #4]           /* row A, bytes 4-7 */
+        ldr             r7,  [r1, #5]           /* row A, bytes 5-8: a word load at byte offset 5 */
+        ldr             r8,  [r1, r2]!          /* row B; writeback moves r1 to row B */
+        ldr             r9,  [r1, #4]
+        ldr             r14, [r1, #5]
+        add             r1,  r1,  r2
+        lsr             r6,  r4,  #8
+        orr             r6,  r6,  r5,  lsl #24  /* r6 = row A bytes 1-4 (the shift pattern assumes little-endian data) */
+        lsr             r12, r8,  #8
+        orr             r12, r12, r9,  lsl #24  /* r12 = row B bytes 1-4 */
+        uhadd8          r4,  r4,  r6            /* per byte (a + b) >> 1, no rounding term */
+        uhadd8          r5,  r5,  r7
+        uhadd8          r8,  r8,  r12
+        uhadd8          r9,  r9,  r14
+        stm             r0,  {r4,r5}
+        add             r0,  r0,  r2
+        stm             r0,  {r8,r9}
+        add             r0,  r0,  r2
+        bne             1b
+
+        pop             {r4-r9, pc}
+endfunc
+
+function ff_put_pixels8_y2_no_rnd_armv6, export=1 /* r0 row y = per byte (r1 row y + r1 row y+1) >> 1, 8 bytes wide; r2 = stride, r3 = rows */
+        push            {r4-r9, lr}             /* lr is saved because r14 serves as scratch below */
+        ldr             r4,  [r1]               /* row 0 in r4:r5 */
+        ldr             r5,  [r1, #4]
+        ldr             r6,  [r1, r2]!          /* row 1 in r6:r7; writeback moves r1 down one row */
+        ldr             r7,  [r1, #4]
+1:                                              /* two output rows per pass */
+        subs            r3,  r3,  #2            /* nothing below changes N/Z/C/V before the bne */
+        uhadd8          r8,  r4,  r6            /* per byte (a + b) >> 1, no rounding term */
+        ldr             r4,  [r1, r2]!          /* next source row replaces the oldest one */
+        uhadd8          r9,  r5,  r7
+        ldr             r5,  [r1, #4]
+        uhadd8          r12, r4,  r6
+        ldr             r6,  [r1, r2]!          /* NOTE(review): loaded for the next pass even when this is the final pass */
+        uhadd8          r14, r5,  r7
+        ldr             r7,  [r1, #4]
+        stm             r0,  {r8,r9}
+        add             r0,  r0,  r2
+        stm             r0,  {r12,r14}
+        add             r0,  r0,  r2
+        bne             1b
+
+        pop             {r4-r9, pc}
+endfunc
+
+function ff_avg_pixels8_armv6, export=1         /* r0 row = per byte (r0 row + r1 row + 1) >> 1, 8 bytes wide; r2 = stride, r3 = rows */
+        pld             [r1, r2]
+        push            {r4-r10, lr}
+        mov             lr,  #1
+        orr             lr,  lr,  lr,  lsl #8
+        orr             lr,  lr,  lr,  lsl #16  /* lr = 0x01010101: bit 0 of every byte */
+        ldrd            r4,  r5,  [r0]          /* first destination row */
+        ldr             r10, [r1, #4]
+        ldr             r9,  [r1], r2           /* first source row in r9:r10; post-index moves r1 on */
+        subs            r3,  r3,  #2            /* flags are read by the beq inside the loop */
+1:                                              /* two rows per pass; the final pair is finished at label 2 */
+        pld             [r1, r2]
+        eor             r8,  r4,  r9
+        uhadd8          r4,  r4,  r9            /* per byte (a + b) >> 1 */
+        eor             r12, r5,  r10
+        ldrd            r6,  r7,  [r0, r2]      /* second destination row of this pair */
+        uhadd8          r5,  r5,  r10
+        and             r8,  r8,  lr            /* (a ^ b) & 1: the bit the halving add dropped */
+        ldr             r10, [r1, #4]
+        and             r12, r12, lr
+        uadd8           r4,  r4,  r8            /* add it back: (a + b + 1) >> 1 */
+        ldr             r9,  [r1], r2           /* second source row of this pair */
+        eor             r8,  r6,  r9
+        uadd8           r5,  r5,  r12
+        pld             [r1, r2,  lsl #1]
+        eor             r12, r7,  r10
+        uhadd8          r6,  r6,  r9
+        strd            r4,  r5,  [r0], r2
+        uhadd8          r7,  r7,  r10
+        beq             2f                      /* counter hit 0: finish the second row without loading further rows */
+        and             r8,  r8,  lr
+        ldrd            r4,  r5,  [r0, r2]      /* destination row that opens the next pair */
+        uadd8           r6,  r6,  r8
+        ldr             r10, [r1, #4]
+        and             r12, r12, lr
+        subs            r3,  r3,  #2            /* sets the flags for the beq of the next pass */
+        uadd8           r7,  r7,  r12
+        ldr             r9,  [r1], r2
+        strd            r6,  r7,  [r0], r2
+        b               1b
+2:
+        and             r8,  r8,  lr
+        and             r12, r12, lr
+        uadd8           r6,  r6,  r8
+        uadd8           r7,  r7,  r12
+        strd            r6,  r7,  [r0], r2
+
+        pop             {r4-r10, pc}
+endfunc
+
+function ff_add_pixels_clamped_armv6, export=1  /* adds 16-bit values read from r0 to the bytes at r1 with saturation to 0..255; r2 = stride of r1 */
+        push            {r4-r8,lr}
+        mov             r3,  #8                 /* fixed 8 rows, 8 pixels each */
+1:
+        ldm             r0!, {r4,r5,r12,lr}     /* eight 16-bit inputs; r0 advances by 16 bytes */
+        ldrd            r6,  r7,  [r1]          /* eight destination bytes */
+        pkhbt           r8,  r4,  r5,  lsl #16  /* r8 = inputs 0 and 2 */
+        pkhtb           r5,  r5,  r4,  asr #16  /* r5 = inputs 1 and 3 */
+        pkhbt           r4,  r12, lr,  lsl #16  /* r4 = inputs 4 and 6 */
+        pkhtb           lr,  lr,  r12, asr #16  /* lr = inputs 5 and 7 */
+        pld             [r1, r2]
+        uxtab16         r8,  r8,  r6            /* + destination bytes 0 and 2 */
+        uxtab16         r5,  r5,  r6,  ror #8   /* + destination bytes 1 and 3 */
+        uxtab16         r4,  r4,  r7
+        uxtab16         lr,  lr,  r7,  ror #8
+        usat16          r8,  #8,  r8            /* saturate each halfword to 0..255 */
+        usat16          r5,  #8,  r5
+        usat16          r4,  #8,  r4
+        usat16          lr,  #8,  lr
+        orr             r6,  r8,  r5,  lsl #8   /* interleave even and odd bytes back into a word */
+        orr             r7,  r4,  lr,  lsl #8
+        subs            r3,  r3,  #1            /* strd leaves the flags alone for the bgt */
+        strd            r6,  r7,  [r1],  r2
+        bgt             1b
+        pop             {r4-r8,pc}
+endfunc
+
+function ff_get_pixels_armv6, export=1          /* widens 8 rows of 8 bytes at r1 (stride r2) to 16-bit values stored contiguously at r0 */
+        pld             [r1, r2]
+        push            {r4-r8, lr}
+        mov             lr,  #8                 /* fixed 8 rows; lr doubles as the counter */
+1:
+        ldrd            r4,  r5,  [r1],  r2     /* eight source bytes; post-index moves r1 to the next row */
+        subs            lr,  lr,  #1            /* nothing below changes N/Z/C/V before the bgt */
+        uxtb16          r6,  r4                 /* bytes 0 and 2 as halfwords */
+        uxtb16          r4,  r4,  ror #8        /* bytes 1 and 3 */
+        uxtb16          r12, r5                 /* bytes 4 and 6 */
+        uxtb16          r8,  r5,  ror #8        /* bytes 5 and 7 */
+        pld             [r1, r2]
+        pkhbt           r5,  r6,  r4,  lsl #16  /* r5 = bytes 0,1 */
+        pkhtb           r6,  r4,  r6,  asr #16  /* r6 = bytes 2,3 */
+        pkhbt           r7,  r12, r8,  lsl #16  /* r7 = bytes 4,5 */
+        pkhtb           r12, r8,  r12, asr #16  /* r12 = bytes 6,7 */
+        stm             r0!, {r5,r6,r7,r12}     /* 16 bytes out; r0 advances */
+        bgt             1b
+
+        pop             {r4-r8, pc}
+endfunc
+
+function ff_diff_pixels_armv6, export=1         /* stores the 16-bit differences (bytes at r1) - (bytes at r2) at r0: 8 rows of 8, stride r3 */
+        pld             [r1, r3]
+        pld             [r2, r3]
+        push            {r4-r9, lr}
+        mov             lr,  #8                 /* fixed 8 rows; lr doubles as the counter */
+1:
+        ldrd            r4,  r5,  [r1],  r3     /* eight bytes of the first source */
+        ldrd            r6,  r7,  [r2],  r3     /* eight bytes of the second source */
+        uxtb16          r8,  r4
+        uxtb16          r4,  r4,  ror #8
+        uxtb16          r9,  r6
+        uxtb16          r6,  r6,  ror #8
+        pld             [r1, r3]
+        ssub16          r9,  r8,  r9            /* differences for bytes 0 and 2 */
+        ssub16          r6,  r4,  r6            /* differences for bytes 1 and 3 */
+        uxtb16          r8,  r5
+        uxtb16          r5,  r5,  ror #8
+        pld             [r2, r3]
+        pkhbt           r4,  r9,  r6,  lsl #16  /* r4 = differences 0,1 */
+        pkhtb           r6,  r6,  r9,  asr #16  /* r6 = differences 2,3 */
+        uxtb16          r9,  r7
+        uxtb16          r7,  r7,  ror #8
+        ssub16          r9,  r8,  r9            /* differences for bytes 4 and 6 */
+        ssub16          r5,  r5,  r7            /* differences for bytes 5 and 7 */
+        subs            lr,  lr,  #1            /* nothing below changes N/Z/C/V before the bgt */
+        pkhbt           r8,  r9,  r5,  lsl #16  /* r8 = differences 4,5 */
+        pkhtb           r9,  r5,  r9,  asr #16  /* r9 = differences 6,7 */
+        stm             r0!, {r4,r6,r8,r9}      /* 16 bytes out; r0 advances */
+        bgt             1b
+
+        pop             {r4-r9, pc}
+endfunc
+
+function ff_pix_abs16_armv6, export=1           /* returns in r0 the sum of absolute byte differences between 16-byte rows at r1 and r2; r3 = stride */
+        ldr             r0,  [sp]               /* row count = first stack argument (read before the push); incoming r0 is never read */
+        push            {r4-r9, lr}
+        mov             r12, #0                 /* two accumulators, r12 and lr, summed at the end */
+        mov             lr,  #0
+        ldm             r1,  {r4-r7}            /* 16 bytes of the r1 row */
+        ldr             r8,  [r2]
+1:
+        ldr             r9,  [r2, #4]
+        pld             [r1, r3]
+        usada8          r12, r4,  r8,  r12      /* accumulate |a - b| over four bytes */
+        ldr             r8,  [r2, #8]
+        pld             [r2, r3]
+        usada8          lr,  r5,  r9,  lr
+        ldr             r9,  [r2, #12]
+        usada8          r12, r6,  r8,  r12
+        subs            r0,  r0,  #1            /* usada8 leaves N/Z/C/V alone for the beq */
+        usada8          lr,  r7,  r9,  lr
+        beq             2f                      /* last row done: skip loading another row */
+        add             r1,  r1,  r3
+        ldm             r1,  {r4-r7}
+        add             r2,  r2,  r3
+        ldr             r8,  [r2]
+        b               1b
+2:
+        add             r0,  r12, lr            /* result */
+        pop             {r4-r9, pc}
+endfunc
+
+function ff_pix_abs16_x2_armv6, export=1        /* SAD of the r1 rows against per byte (r2[x] + r2[x+1] + 1) >> 1, 16 bytes wide; r3 = stride; result in r0 */
+        ldr             r12, [sp]               /* row count = first stack argument (read before the push) */
+        push            {r4-r11, lr}
+        mov             r0,  #0                 /* accumulator; incoming r0 is never read */
+        mov             lr,  #1
+        orr             lr,  lr,  lr,  lsl #8
+        orr             lr,  lr,  lr,  lsl #16  /* lr = 0x01010101: bit 0 of every byte */
+1:                                              /* one row per pass */
+        ldr             r8,  [r2]               /* r2 bytes 0-3 */
+        ldr             r9,  [r2, #4]           /* r2 bytes 4-7 */
+        lsr             r10, r8,  #8
+        ldr             r4,  [r1]
+        lsr             r6,  r9,  #8
+        orr             r10, r10, r9,  lsl #24  /* r2 bytes 1-4 (the shift pattern assumes little-endian data) */
+        ldr             r5,  [r2, #8]           /* r2 bytes 8-11 */
+        eor             r11, r8,  r10
+        uhadd8          r7,  r8,  r10           /* per byte (a + b) >> 1 */
+        orr             r6,  r6,  r5,  lsl #24  /* r2 bytes 5-8 */
+        and             r11, r11, lr            /* (a ^ b) & 1: the bit the halving add dropped */
+        uadd8           r7,  r7,  r11           /* add it back: (a + b + 1) >> 1 */
+        ldr             r8,  [r1, #4]
+        usada8          r0,  r4,  r7,  r0       /* accumulate |r1 - average| for bytes 0-3 */
+        eor             r7,  r9,  r6
+        lsr             r10, r5,  #8
+        and             r7,  r7,  lr
+        uhadd8          r4,  r9,  r6
+        ldr             r6,  [r2, #12]          /* r2 bytes 12-15 */
+        uadd8           r4,  r4,  r7
+        pld             [r1, r3]
+        orr             r10, r10, r6,  lsl #24  /* r2 bytes 9-12 */
+        usada8          r0,  r8,  r4,  r0       /* bytes 4-7 */
+        ldr             r4,  [r1, #8]
+        eor             r11, r5,  r10
+        ldrb            r7,  [r2, #16]          /* 17th byte of the r2 row, needed as x+1 for byte 15 */
+        and             r11, r11, lr
+        uhadd8          r8,  r5,  r10
+        ldr             r5,  [r1, #12]
+        uadd8           r8,  r8,  r11
+        pld             [r2, r3]
+        lsr             r10, r6,  #8
+        usada8          r0,  r4,  r8,  r0       /* bytes 8-11 */
+        orr             r10, r10, r7,  lsl #24  /* r2 bytes 13-16 */
+        subs            r12,  r12,  #1          /* nothing below changes N/Z/C/V before the bgt */
+        eor             r11, r6,  r10
+        add             r1,  r1,  r3
+        uhadd8          r9,  r6,  r10
+        and             r11, r11, lr
+        uadd8           r9,  r9,  r11
+        add             r2,  r2,  r3
+        usada8          r0,  r5,  r9,  r0       /* bytes 12-15 */
+        bgt             1b
+
+        pop             {r4-r11, pc}
+endfunc
+
+.macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3 /* one 16-byte row: p0-p3 = previous r2 row (clobbered), n0-n3 = current r2 row on exit */
+        ldr             \n0, [r2]               /* current r2 row, bytes 0-3 */
+        eor             \n1, \p0, \n0
+        uhadd8          \p0, \p0, \n0           /* per byte (prev + cur) >> 1 */
+        and             \n1, \n1, lr            /* lr must hold 0x01010101: keeps the bit the halving add dropped */
+        ldr             \n2, [r1]               /* n2 is scratch here: r1 bytes 0-3 */
+        uadd8           \p0, \p0, \n1           /* (prev + cur + 1) >> 1 */
+        ldr             \n1, [r2, #4]
+        usada8          r0,  \p0, \n2, r0       /* r0 accumulates |average - r1| */
+        pld             [r1,  r3]
+        eor             \n3, \p1, \n1
+        uhadd8          \p1, \p1, \n1
+        and             \n3, \n3, lr
+        ldr             \p0, [r1, #4]           /* p0 is free again and reused for r1 bytes 4-7 */
+        uadd8           \p1, \p1, \n3
+        ldr             \n2, [r2, #8]
+        usada8          r0,  \p1, \p0, r0
+        pld             [r2,  r3]
+        eor             \p0, \p2, \n2
+        uhadd8          \p2, \p2, \n2
+        and             \p0, \p0, lr
+        ldr             \p1, [r1, #8]
+        uadd8           \p2, \p2, \p0
+        ldr             \n3, [r2, #12]
+        usada8          r0,  \p2, \p1, r0
+        eor             \p1, \p3, \n3
+        uhadd8          \p3, \p3, \n3
+        and             \p1, \p1, lr
+        ldr             \p0,  [r1, #12]
+        uadd8           \p3, \p3, \p1
+        add             r1,  r1,  r3            /* both pointers move down one row */
+        usada8          r0,  \p3, \p0,  r0
+        add             r2,  r2,  r3
+.endm
+
+function ff_pix_abs16_y2_armv6, export=1  /* 16-byte-wide SAD of the rows at r1 against the rounded average of vertically adjacent rows at r2; result in r0 */
+        pld             [r1]
+        pld             [r2]
+        ldr             r12, [sp]  /* r12 = row count: the first stack argument, read before the push moves sp (h in the C prototype, assuming the standard ARM calling convention) */
+        push            {r4-r11, lr}
+        mov             r0,  #0  /* r0 is the SAD accumulator and the return value */
+        mov             lr,  #1
+        orr             lr,  lr,  lr,  lsl #8
+        orr             lr,  lr,  lr,  lsl #16  /* lr = 0x01010101, the per-byte rounding mask used by usad_y2 */
+        ldr             r4,  [r2]  /* r4-r7 = first 16-byte row at r2 */
+        ldr             r5,  [r2, #4]
+        ldr             r6,  [r2, #8]
+        ldr             r7,  [r2, #12]
+        add             r2,  r2,  r3  /* r2 now points at the second row */
+1:
+        usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11  /* previous row in r4-r7, new row ends up in r8-r11 */
+        subs            r12, r12, #2  /* two rows are consumed per loop pass; usad_y2 uses no flag-setting instructions, so bgt still sees these flags */
+        usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7  /* register sets swapped, so no moves are needed between rows */
+        bgt             1b  /* NOTE(review): an odd row count would process one extra row */
+
+        pop             {r4-r11, pc}  /* restore callee-saved registers and return */
+endfunc
+
+function ff_pix_abs8_armv6, export=1  /* 8-byte-wide SAD between the rows at r1 and the rows at r2; result in r0 */
+        pld             [r2, r3]
+        ldr             r12, [sp]  /* r12 = row count: the first stack argument, read before the push moves sp (h in the C prototype, assuming the standard ARM calling convention) */
+        push            {r4-r9, lr}
+        mov             r0,  #0  /* r0 accumulates the SAD of bytes 0-3 of every row */
+        mov             lr,  #0  /* lr accumulates the SAD of bytes 4-7 of every row */
+        ldrd            r4,  r5,  [r1], r3  /* r4,r5 = 8 bytes at r1, then r1 += r3 */
+1:
+        subs            r12, r12, #2  /* two rows per loop pass; the flags are consumed by the beq below */
+        ldr             r7,  [r2, #4]
+        ldr             r6,  [r2], r3  /* r6,r7 = 8 bytes at r2, then r2 += r3 */
+        ldrd            r8,  r9,  [r1], r3  /* fetch the next r1 row early */
+        usada8          r0,  r4,  r6,  r0  /* r0 += sum of four absolute byte differences */
+        pld             [r2, r3]
+        usada8          lr,  r5,  r7,  lr
+        ldr             r7,  [r2, #4]
+        ldr             r6,  [r2], r3
+        beq             2f  /* last pair of rows: skip the load of a further r1 row */
+        ldrd            r4,  r5,  [r1], r3
+        usada8          r0,  r8,  r6,  r0
+        pld             [r2, r3]
+        usada8          lr,  r9,  r7,  lr
+        b               1b  /* NOTE(review): the only exit is the beq above, so the row count must be a positive even number */
+2:
+        usada8          r0,  r8,  r6,  r0  /* accumulate the final row */
+        usada8          lr,  r9,  r7,  lr
+        add             r0,  r0,  lr  /* combine both partial sums into the return value */
+        pop             {r4-r9, pc}
+endfunc
+
+function ff_sse16_armv6, export=1  /* sum of squared byte differences between 16-byte-wide rows at r1 and r2; result in r0 */
+        ldr             r12, [sp]  /* r12 = row count: the first stack argument, read before the push moves sp (h in the C prototype, assuming the standard ARM calling convention) */
+        push            {r4-r9, lr}
+        mov             r0,  #0  /* r0 is the accumulator and the return value */
+1:
+        ldrd            r4,  r5,  [r1]  /* r4,r5 = bytes 0-7 of the row at r1 */
+        ldr             r8,  [r2]  /* r8 = bytes 0-3 of the row at r2 */
+        uxtb16          lr,  r4  /* bytes 0 and 2 zero-extended into two halfwords */
+        uxtb16          r4,  r4,  ror #8  /* bytes 1 and 3 zero-extended into two halfwords */
+        uxtb16          r9,  r8
+        uxtb16          r8,  r8,  ror #8
+        ldr             r7,  [r2, #4]
+        usub16          lr,  lr,  r9  /* halfword-wise difference of the even bytes */
+        usub16          r4,  r4,  r8  /* halfword-wise difference of the odd bytes */
+        smlad           r0,  lr,  lr,  r0  /* r0 += lo*lo + hi*hi, i.e. two squared differences */
+        uxtb16          r6,  r5  /* bytes 4-7: same unpack / subtract / square sequence */
+        uxtb16          lr,  r5,  ror #8
+        uxtb16          r8,  r7
+        uxtb16          r9,  r7,  ror #8
+        smlad           r0,  r4,  r4,  r0
+        ldrd            r4,  r5,  [r1, #8]  /* r4,r5 = bytes 8-15 of the row at r1 */
+        usub16          r6,  r6,  r8
+        usub16          r8,  lr,  r9
+        ldr             r7,  [r2, #8]
+        smlad           r0,  r6,  r6,  r0
+        uxtb16          lr,  r4  /* bytes 8-11 */
+        uxtb16          r4,  r4,  ror #8
+        uxtb16          r9,  r7
+        uxtb16          r7,  r7, ror #8
+        smlad           r0,  r8,  r8,  r0
+        ldr             r8,  [r2, #12]
+        usub16          lr,  lr,  r9
+        usub16          r4,  r4,  r7
+        smlad           r0,  lr,  lr,  r0
+        uxtb16          r6,  r5  /* bytes 12-15 */
+        uxtb16          r5,  r5,  ror #8
+        uxtb16          r9,  r8
+        uxtb16          r8,  r8,  ror #8
+        smlad           r0,  r4,  r4,  r0
+        usub16          r6,  r6,  r9
+        usub16          r5,  r5,  r8
+        smlad           r0,  r6,  r6,  r0
+        add             r1,  r1,  r3  /* advance both row pointers by the stride in r3 */
+        add             r2,  r2,  r3
+        subs            r12, r12, #1  /* one row per loop pass */
+        smlad           r0,  r5,  r5,  r0  /* smlad leaves the N and Z flags alone, so bgt still sees the subs result */
+        bgt             1b
+
+        pop             {r4-r9, pc}
+endfunc
+
+function ff_pix_norm1_armv6, export=1  /* sum of the squares of all bytes in 16 rows of 16 bytes starting at r0, rows r1 bytes apart; result in r0 */
+        push            {r4-r6, lr}
+        mov             r12, #16  /* fixed row count */
+        mov             lr,  #0  /* lr is the accumulator */
+1:
+        ldm             r0,  {r2-r5}  /* load one 16-byte row; r0 is not written back */
+        uxtb16          r6,  r2  /* bytes 0 and 2 zero-extended into two halfwords */
+        uxtb16          r2,  r2,  ror #8  /* bytes 1 and 3 zero-extended into two halfwords */
+        smlad           lr,  r6,  r6,  lr  /* lr += lo*lo + hi*hi, i.e. two squared bytes */
+        uxtb16          r6,  r3
+        smlad           lr,  r2,  r2,  lr
+        uxtb16          r3,  r3,  ror #8
+        smlad           lr,  r6,  r6,  lr
+        uxtb16          r6,  r4
+        smlad           lr,  r3,  r3,  lr
+        uxtb16          r4,  r4,  ror #8
+        smlad           lr,  r6,  r6,  lr
+        uxtb16          r6,  r5
+        smlad           lr,  r4,  r4,  lr
+        uxtb16          r5,  r5,  ror #8
+        smlad           lr,  r6,  r6,  lr
+        subs            r12, r12, #1
+        add             r0,  r0,  r1  /* advance to the next row */
+        smlad           lr,  r5,  r5,  lr  /* smlad leaves the N and Z flags alone, so bgt still sees the subs result */
+        bgt             1b
+
+        mov             r0,  lr  /* move the total into the return register */
+        pop             {r4-r6, pc}
+endfunc
+
+function ff_pix_sum_armv6, export=1  /* sum of all bytes in 16 rows of 16 bytes starting at r0, rows r1 bytes apart; result in r0 */
+        push            {r4-r7, lr}
+        mov             r12, #16  /* fixed row count */
+        mov             r2,  #0  /* r2 and r3 are two independent partial sums */
+        mov             r3,  #0
+        mov             lr,  #0  /* lr = 0, so usada8 against lr adds the plain byte values */
+        ldr             r4,  [r0]  /* first word of the first row is loaded ahead of the loop */
+1:
+        subs            r12, r12, #1
+        ldr             r5,  [r0, #4]
+        usada8          r2,  r4,  lr,  r2  /* r2 += the four bytes of r4 */
+        ldr             r6,  [r0, #8]
+        usada8          r3,  r5,  lr,  r3
+        ldr             r7,  [r0, #12]
+        usada8          r2,  r6,  lr,  r2
+        beq             2f  /* last row: do not load past it */
+        ldr             r4,  [r0, r1]!  /* r0 += r1, then load the first word of the next row */
+        usada8          r3,  r7,  lr,  r3
+        bgt             1b  /* flags still come from the subs above; always taken when reached */
+2:
+        usada8          r3,  r7,  lr,  r3  /* last word of the final row */
+        add             r0,  r2,  r3  /* combine the partial sums into the return value */
+        pop             {r4-r7, pc}
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_arm.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_init_arm.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,112 @@
+/*
+ * ARM optimized DSP utils
+ * Copyright (c) 2001 Lionel Ulmer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#include "dsputil_arm.h"
+
+void ff_j_rev_dct_arm(DCTELEM *data);
+void ff_simple_idct_arm(DCTELEM *data);
+
+/* XXX: local hack: file-local copies of the put/add_pixels_clamped pointers, filled in by dsputil_init_arm() and called by the IDCT put/add wrappers below */
+static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+
+void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+CALL_2X_PIXELS(ff_put_pixels16_x2_arm,         ff_put_pixels8_x2_arm,        8) /* defines the 16-pixel names used in dsputil_init_arm(); presumably each calls the 8-pixel routine twice, 8 bytes apart - confirm against CALL_2X_PIXELS in dsputil.h */
+CALL_2X_PIXELS(ff_put_pixels16_y2_arm,         ff_put_pixels8_y2_arm,        8)
+CALL_2X_PIXELS(ff_put_pixels16_xy2_arm,        ff_put_pixels8_xy2_arm,       8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm,  ff_put_no_rnd_pixels8_x2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm,  ff_put_no_rnd_pixels8_y2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
+
+void ff_add_pixels_clamped_arm(const DCTELEM *block, uint8_t *dest,
+                               int line_size);
+
+/* XXX: these wrapper functions should be removed as soon as all IDCTs are
+   converted */
+static void j_rev_dct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) /* runs ff_j_rev_dct_arm on block, then stores it to dest through the saved put_pixels_clamped pointer */
+{
+    ff_j_rev_dct_arm (block); /* presumably transforms block in place; the routine is defined outside this file */
+    ff_put_pixels_clamped(block, dest, line_size); /* pointer is set in dsputil_init_arm(), which must have run first */
+}
+static void j_rev_dct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) /* runs ff_j_rev_dct_arm on block, then passes it with dest to the saved add_pixels_clamped pointer */
+{
+    ff_j_rev_dct_arm (block); /* presumably transforms block in place; the routine is defined outside this file */
+    ff_add_pixels_clamped(block, dest, line_size); /* pointer is set in dsputil_init_arm(), which must have run first */
+}
+static void simple_idct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) /* runs ff_simple_idct_arm on block, then stores it to dest through the saved put_pixels_clamped pointer; installed as c->idct_put by dsputil_init_arm() */
+{
+    ff_simple_idct_arm (block); /* presumably transforms block in place; the routine is defined outside this file */
+    ff_put_pixels_clamped(block, dest, line_size); /* pointer is set in dsputil_init_arm(), which must have run first */
+}
+static void simple_idct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) /* runs ff_simple_idct_arm on block, then passes it with dest to the saved add_pixels_clamped pointer; installed as c->idct_add by dsputil_init_arm() */
+{
+    ff_simple_idct_arm (block); /* presumably transforms block in place; the routine is defined outside this file */
+    ff_add_pixels_clamped(block, dest, line_size); /* this is the implementation saved on entry to dsputil_init_arm(), not ff_add_pixels_clamped_arm */
+}
+
+int mm_support(void) /* returns the product of the build-time HAVE_IWMMXT value and the FF_MM_IWMMXT flag */
+{
+    return HAVE_IWMMXT * FF_MM_IWMMXT; /* FF_MM_IWMMXT when HAVE_IWMMXT is 1, 0 when it is 0 */
+}
+
+void dsputil_init_arm(DSPContext* c) /* installs the plain-ARM IDCT and pixel routines into *c, then lets the NEON init override them when HAVE_NEON is set */
+{
+    ff_put_pixels_clamped = c->put_pixels_clamped; /* remember the implementations present on entry; the IDCT put/add wrappers call through these */
+    ff_add_pixels_clamped = c->add_pixels_clamped; /* saved before c->add_pixels_clamped is replaced below */
+  
+    c->idct_put              = simple_idct_arm_put; /* the simple IDCT is installed unconditionally; the j_rev_dct wrappers are not referenced here */
+    c->idct_add              = simple_idct_arm_add;
+    c->idct                  = ff_simple_idct_arm;
+    c->idct_permutation_type = FF_NO_IDCT_PERM;
+
+    c->add_pixels_clamped = ff_add_pixels_clamped_arm;
+
+    c->put_pixels_tab[0][0] = ff_put_pixels16_arm; /* first index: 0 = 16-pixel routines, 1 = 8-pixel routines; second index: 0 = plain, 1 = x2, 2 = y2, 3 = xy2 (as the names assigned here show) */
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
+    c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm; /* slot 0 reuses the ordinary put routine; presumably a plain copy involves no rounding - confirm */
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
+
+    if (HAVE_NEON)    ff_dsputil_init_neon(c); /* runs last, so every entry it sets replaces the one chosen above */
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_armv5te.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_init_armv5te.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#include "dsputil_arm.h"
+
+void ff_simple_idct_armv5te(DCTELEM *data);
+void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
+
+void ff_prefetch_arm(void *mem, int stride, int h);
+
+void av_cold ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx) /* installs the ARMv5TE IDCT (conditionally) and the ARM prefetch routine into *c */
+{
+    if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO || /* IDCT is replaced only when lowres is 0 and the requested algorithm is AUTO or SIMPLEARMV5TE */
+                           avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
+        c->idct_put              = ff_simple_idct_put_armv5te;
+        c->idct_add              = ff_simple_idct_add_armv5te;
+        c->idct                  = ff_simple_idct_armv5te;
+        c->idct_permutation_type = FF_NO_IDCT_PERM;
+    }
+
+    c->prefetch = ff_prefetch_arm; /* set regardless of the IDCT selection above */
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_armv6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_init_armv6.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "dsputil_arm.h"
+
+void ff_simple_idct_armv6(DCTELEM *data);
+void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
+
+void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_add_pixels_clamped_armv6(const DCTELEM *block,
+                                 uint8_t *restrict pixels,
+                                 int line_size);
+
+void ff_get_pixels_armv6(DCTELEM *block, const uint8_t *pixels, int stride);
+void ff_diff_pixels_armv6(DCTELEM *block, const uint8_t *s1,
+                          const uint8_t *s2, int stride);
+
+int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+                       int line_size, int h);
+int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+                          int line_size, int h);
+int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+                          int line_size, int h);
+
+int ff_pix_abs8_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+                       int line_size, int h);
+
+int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+                   int line_size, int h);
+
+int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
+int ff_pix_sum_armv6(uint8_t *pix, int line_size);
+
+void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx) /* installs the ARMv6 IDCT (conditionally) plus the ARMv6 pixel, SAD, SSE and sum routines into *c */
+{
+    if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO || /* IDCT is replaced only when lowres is 0 and the requested algorithm is AUTO or SIMPLEARMV6 */
+                           avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) {
+        c->idct_put              = ff_simple_idct_put_armv6;
+        c->idct_add              = ff_simple_idct_add_armv6;
+        c->idct                  = ff_simple_idct_armv6;
+        c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; /* unlike the ARM and ARMv5TE inits in this patch, which use FF_NO_IDCT_PERM */
+    }
+
+    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; /* first index: 0 = 16-pixel routines, 1 = 8-pixel routines; second index: 0 = plain, 1 = x2, 2 = y2 */
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
+/*     c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */ /* no xy2 ARMv6 routine is declared in this file, so every [..][3] slot keeps whatever value it had on entry */
+    c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
+/*     c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6; /* slot 0 reuses the ordinary put routine */
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
+/*     c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
+/*     c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6; /* only the plain (index 0) avg entries are replaced */
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
+
+    c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
+    c->get_pixels = ff_get_pixels_armv6;
+    c->diff_pixels = ff_diff_pixels_armv6;
+
+    c->pix_abs[0][0] = ff_pix_abs16_armv6;
+    c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
+    c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
+
+    c->pix_abs[1][0] = ff_pix_abs8_armv6; /* the 8-pixel variant is provided only for index 0 */
+
+    c->sad[0] = ff_pix_abs16_armv6; /* sad[] shares the pix_abs routines */
+    c->sad[1] = ff_pix_abs8_armv6;
+
+    c->sse[0] = ff_sse16_armv6;
+
+    c->pix_norm1 = ff_pix_norm1_armv6;
+    c->pix_sum   = ff_pix_sum_armv6;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_neon.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_init_neon.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,308 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "dsputil_arm.h"
+
+void ff_simple_idct_neon(DCTELEM *data);
+void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+
+void ff_vp3_idct_neon(DCTELEM *data);
+void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
+
+void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+
+void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, int, int);
+
+void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
+void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
+void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
+
+void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int);
+
+void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int);
+
+void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int);
+
+void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int);
+void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int);
+
+void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
+void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
+
+void ff_vector_fmul_neon(float *dst, const float *src, int len);
+void ff_vector_fmul_window_neon(float *dst, const float *src0,
+                                const float *src1, const float *win,
+                                float add_bias, int len);
+void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
+                                int len);
+void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src,
+                                     const float **vp, float mul, int len);
+void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src,
+                                     const float **vp, float mul, int len);
+void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul,
+                              int len);
+void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
+                              int len);
+void ff_butterflies_float_neon(float *v1, float *v2, int len);
+float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
+                                        float mul, int len);
+void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
+                                 const float *src1, int len);
+void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
+                             const float *src2, int len);
+
+void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
+                          int len);
+void ff_float_to_int16_neon(int16_t *, const float *, long);
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
+void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
+
+int32_t ff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len,
+                                    int shift);
+int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, int16_t *v2,
+                                             int16_t *v3, int len, int mul);
+
+void ff_dsputil_init_neon(DSPContext *c) /* installs the NEON routines into *c; every assignment is unconditional */
+{
+
+    { /* bare block: the IDCT is replaced without any idct_algo or lowres check, unlike the ARMv5TE and ARMv6 inits in this patch */
+        c->idct_put              = ff_simple_idct_put_neon;
+        c->idct_add              = ff_simple_idct_add_neon;
+        c->idct                  = ff_simple_idct_neon;
+        c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
+
+    }
+
+    c->put_pixels_tab[0][0] = ff_put_pixels16_neon; /* first index: 0 = 16-pixel routines, 1 = 8-pixel routines; second index: 0 = plain, 1 = x2, 2 = y2, 3 = xy2 */
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+    c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; /* slot 0 reuses the ordinary put routine */
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; /* only the plain (index 0) avg entries are replaced */
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
+
+    c->add_pixels_clamped = ff_add_pixels_clamped_neon;
+    c->put_pixels_clamped = ff_put_pixels_clamped_neon;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+
+
+	c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; /* chroma tables: index 0 = mc8, 1 = mc4, 2 = mc2 */
+	c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+	c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
+
+	c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
+	c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
+	c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
+
+	c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; /* qpel tables: first index 0 = qpel16, 1 = qpel8; routine mcXY sits at second index X + 4*Y */
+	c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
+	c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
+	c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
+	c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
+	c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
+	c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
+	c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
+	c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
+	c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
+	c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
+	c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
+	c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
+	c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
+	c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
+	c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
+
+	c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
+	c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
+	c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
+	c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
+	c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
+	c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
+	c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
+	c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
+	c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
+	c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
+	c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
+	c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
+	c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
+	c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
+	c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
+	c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
+
+	c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; /* avg tables use the same layout as the put tables */
+	c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
+	c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
+	c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
+	c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
+	c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
+	c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
+	c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
+	c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
+	c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
+	c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
+	c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
+	c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
+	c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
+	c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
+	c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
+
+	c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
+	c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
+	c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
+	c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
+	c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
+	c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
+	c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
+	c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
+	c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
+	c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
+	c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
+	c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
+	c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
+	c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
+	c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
+	c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;    
+
+    c->vector_fmul                = ff_vector_fmul_neon; /* float vector helpers */
+    c->vector_fmul_window         = ff_vector_fmul_window_neon;
+    c->vector_fmul_scalar         = ff_vector_fmul_scalar_neon;
+    c->butterflies_float          = ff_butterflies_float_neon;
+    c->scalarproduct_float        = ff_scalarproduct_float_neon;
+    c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
+    c->vector_fmul_reverse        = ff_vector_fmul_reverse_neon;
+    c->vector_fmul_add            = ff_vector_fmul_add_neon;
+    c->vector_clipf               = ff_vector_clipf_neon;
+
+    c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon; /* index 0 = the _2 variant, index 1 = the _4 variant */
+    c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon;
+
+    c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
+    c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
+
+
+    c->float_to_int16            = ff_float_to_int16_neon;
+    c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+
+    c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
+    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
+} /* the ff_vp3_* and ff_vorbis_inverse_coupling_neon prototypes declared earlier in this file are not installed by this function */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_vfp.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_init_vfp.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#include "dsputil_arm.h"
+
+void ff_vector_fmul_vfp(float *dst, const float *src, int len);
+void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+                                const float *src1, int len);
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+/* Install the VFP float routines into c; avctx is not consulted here. */
+void ff_dsputil_init_vfp(DSPContext *c, AVCodecContext *avctx)
+{
+#if HAVE_ARMV6
+    c->float_to_int16      = ff_float_to_int16_vfp;   /* only installed when HAVE_ARMV6 is set */
+#endif
+    c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
+    c->vector_fmul         = ff_vector_fmul_vfp;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_iwmmxt.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_iwmmxt.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,205 @@
+/*
+ * iWMMXt optimized DSP utils
+ * Copyright (c) 2004 AGAWA Koji
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+
+#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt   /* first template pass: <x>_no_rnd_<y>_iwmmxt names */
+#define SET_RND(regd)  __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
+#define WAVG2B "wavg2b"                            /* averaging mnemonic pasted into the template asm */
+#include "dsputil_iwmmxt_rnd_template.c"
+#undef DEF
+#undef SET_RND
+#undef WAVG2B
+/* Second pass over the same template: <x>_<y>_iwmmxt names, SET_RND loads 2, WAVG2B is "wavg2br". */
+#define DEF(x, y) x ## _ ## y ##_iwmmxt
+#define SET_RND(regd)  __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
+#define WAVG2B "wavg2br"
+#include "dsputil_iwmmxt_rnd_template.c"
+#undef DEF
+#undef SET_RND
+#undef WAVG2B   /* must name the macro defined above so it does not leak past this point */
+
+/* OP(AVG): each 8-byte output row is AVG of a source row and the row below it; h must be even and non-zero. Still needs scheduling. */
+#define OP(AVG)                                         \
+    __asm__ volatile (                                      \
+        /* alignment: wcgr1 = pixels & 7, pixels rounded down to 8 bytes */ \
+        "and r12, %[pixels], #7 \n\t"                   \
+        "bic %[pixels], %[pixels], #7 \n\t"             \
+        "tmcr wcgr1, r12 \n\t"                          \
+        /* wr4 = first source row, realigned */         \
+        "wldrd wr0, [%[pixels]] \n\t"                   \
+        "wldrd wr1, [%[pixels], #8] \n\t"               \
+        "add %[pixels], %[pixels], %[line_size] \n\t"   \
+        "walignr1 wr4, wr0, wr1 \n\t"                   \
+        /* two output rows per pass */                  \
+        "1: \n\t"                                       \
+        /* wr5 = next row; store AVG(wr4, wr5) */       \
+        "wldrd wr2, [%[pixels]] \n\t"                   \
+        "wldrd wr3, [%[pixels], #8] \n\t"               \
+        "add %[pixels], %[pixels], %[line_size] \n\t"   \
+        "pld [%[pixels]] \n\t"                          \
+        "walignr1 wr5, wr2, wr3 \n\t"                   \
+        AVG " wr6, wr4, wr5 \n\t"                       \
+        "wstrd wr6, [%[block]] \n\t"                    \
+        "add %[block], %[block], %[line_size] \n\t"     \
+        /* wr4 = row after that; store AVG(wr4, wr5) */ \
+        "wldrd wr0, [%[pixels]] \n\t"                   \
+        "wldrd wr1, [%[pixels], #8] \n\t"               \
+        "add %[pixels], %[pixels], %[line_size] \n\t"   \
+        "walignr1 wr4, wr0, wr1 \n\t"                   \
+        "pld [%[pixels]] \n\t"                          \
+        AVG " wr6, wr4, wr5 \n\t"                       \
+        "wstrd wr6, [%[block]] \n\t"                    \
+        "add %[block], %[block], %[line_size] \n\t"     \
+        /* loop ends only when h reaches exactly 0 */   \
+        "subs %[h], %[h], #2 \n\t"                      \
+        "bne 1b \n\t"                                   \
+        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)  \
+        : [line_size]"r"(line_size) \
+        : "cc", "memory", "r12");   /* "cc": subs rewrites the condition flags */
+void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    OP("wavg2br");   /* same mnemonic the second template pass uses for WAVG2B */
+}
+void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    OP("wavg2b");    /* same mnemonic the _no_rnd_ template pass uses for WAVG2B */
+}
+#undef OP
+/* Add the 16-bit values in block to 8 rows of 8 pixels, saturating each result back to a byte. */
+void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+    uint8_t *pixels2 = pixels + line_size;
+    /* pixels walks rows 0,2,4,6 and pixels2 rows 1,3,5,7; both step by 2 * line_size. */
+    __asm__ volatile (
+        "mov            r12, #4                 \n\t"   /* 4 passes, two rows each */
+        "1:                                     \n\t"
+        "pld            [%[pixels], %[line_size2]]              \n\t"
+        "pld            [%[pixels2], %[line_size2]]             \n\t"
+        "wldrd          wr4, [%[pixels]]        \n\t"   /* 8 pixels of the first row */
+        "wldrd          wr5, [%[pixels2]]       \n\t"   /* 8 pixels of the second row */
+        "pld            [%[block], #32]         \n\t"
+        "wunpckelub     wr6, wr4                \n\t"   /* widen bytes to halfwords, low half */
+        "wldrd          wr0, [%[block]]         \n\t"
+        "wunpckehub     wr7, wr4                \n\t"   /* widen bytes to halfwords, high half */
+        "wldrd          wr1, [%[block], #8]     \n\t"
+        "wunpckelub     wr8, wr5                \n\t"
+        "wldrd          wr2, [%[block], #16]    \n\t"
+        "wunpckehub     wr9, wr5                \n\t"
+        "wldrd          wr3, [%[block], #24]    \n\t"
+        "add            %[block], %[block], #32 \n\t"   /* 32 bytes of block consumed per pass */
+        "waddhss        wr10, wr0, wr6          \n\t"   /* halfword add with signed saturation */
+        "waddhss        wr11, wr1, wr7          \n\t"
+        "waddhss        wr12, wr2, wr8          \n\t"
+        "waddhss        wr13, wr3, wr9          \n\t"
+        "wpackhus       wr14, wr10, wr11        \n\t"   /* pack halfwords to bytes, unsigned saturation */
+        "wpackhus       wr15, wr12, wr13        \n\t"
+        "wstrd          wr14, [%[pixels]]       \n\t"
+        "add            %[pixels], %[pixels], %[line_size2]     \n\t"
+        "subs           r12, r12, #1            \n\t"
+        "wstrd          wr15, [%[pixels2]]      \n\t"
+        "add            %[pixels2], %[pixels2], %[line_size2]   \n\t"
+        "bne            1b                      \n\t"
+        : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
+        : [line_size2]"r"(line_size << 1)
+        : "cc", "memory", "r12");
+}
+/* Zero 128 * 6 bytes starting at blocks, 32 bytes per pass (presumably six 64-entry blocks of 16-bit DCTELEM; confirm). */
+static void clear_blocks_iwmmxt(DCTELEM *blocks)
+{
+    __asm__ volatile(
+                "wzero wr0                      \n\t"
+                "mov r1, #(128 * 6 / 32)        \n\t"   /* number of 32-byte passes */
+                "1:                             \n\t"
+                "wstrd wr0, [%0]                \n\t"
+                "wstrd wr0, [%0, #8]            \n\t"
+                "wstrd wr0, [%0, #16]           \n\t"
+                "wstrd wr0, [%0, #24]           \n\t"
+                "subs r1, r1, #1                \n\t"
+                "add %0, %0, #32                \n\t"
+                "bne 1b                         \n\t"
+                : "+r"(blocks)
+                :
+                : "cc", "memory", "r1"   /* subs rewrites the flags; wstrd writes through blocks */
+        );
+}
+
+static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    /* Intentionally empty: does nothing with any of its arguments. */
+}
+
+/* Run-time detection is not simple, so the flag starts out set: when this file
+ * is compiled in, its functions are installed unless dsp_mask clears the bit.
+ */
+int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */
+/* Apply avctx->dsp_mask to mm_flags, then install the iWMMXt routines into c. */
+void ff_dsputil_init_iwmmxt(DSPContext *c, AVCodecContext *avctx)
+{
+    if (avctx->dsp_mask & FF_MM_FORCE) {
+        mm_flags |= (avctx->dsp_mask & 0xffff);      /* FF_MM_FORCE: switch the low 16 mask bits on */
+    } else if (avctx->dsp_mask) {
+        mm_flags &= ~(avctx->dsp_mask & 0xffff);     /* otherwise switch them off */
+    }
+    if (!(mm_flags & FF_MM_IWMMXT))
+        return;
+
+    c->clear_blocks       = clear_blocks_iwmmxt;
+    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
+
+    /* put tables, row [0]: the pixels16 functions (entries 0..3: plain, _x2, _y2, _xy2) */
+    c->put_pixels_tab[0][0]        = put_pixels16_iwmmxt;
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
+    c->put_pixels_tab[0][1]        = put_pixels16_x2_iwmmxt;
+    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
+    c->put_pixels_tab[0][2]        = put_pixels16_y2_iwmmxt;
+    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
+    c->put_pixels_tab[0][3]        = put_pixels16_xy2_iwmmxt;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
+    /* put tables, row [1]: the pixels8 functions */
+    c->put_pixels_tab[1][0]        = put_pixels8_iwmmxt;
+    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
+    c->put_pixels_tab[1][1]        = put_pixels8_x2_iwmmxt;
+    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
+    c->put_pixels_tab[1][2]        = put_pixels8_y2_iwmmxt;
+    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
+    c->put_pixels_tab[1][3]        = put_pixels8_xy2_iwmmxt;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
+    /* avg tables, row [0]: the pixels16 functions */
+    c->avg_pixels_tab[0][0]        = avg_pixels16_iwmmxt;
+    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
+    c->avg_pixels_tab[0][1]        = avg_pixels16_x2_iwmmxt;
+    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
+    c->avg_pixels_tab[0][2]        = avg_pixels16_y2_iwmmxt;
+    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
+    c->avg_pixels_tab[0][3]        = avg_pixels16_xy2_iwmmxt;
+    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
+    /* avg tables, row [1]: the pixels8 functions */
+    /* NOTE(review): no_rnd [0][0] uses avg_pixels16 but no_rnd [1][0] uses avg_no_rnd_pixels8; confirm which is intended */
+    c->avg_pixels_tab[1][0]        = avg_pixels8_iwmmxt;
+    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
+    c->avg_pixels_tab[1][1]        = avg_pixels8_x2_iwmmxt;
+    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
+    c->avg_pixels_tab[1][2]        = avg_pixels8_y2_iwmmxt;
+    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
+    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_iwmmxt;
+    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_iwmmxt_rnd_template.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_iwmmxt_rnd_template.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1114 @@
+/*
+ * iWMMXt optimized DSP utils
+ * copyright (c) 2004 AGAWA Koji
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Copy h rows of 8 bytes from a possibly unaligned pixels to block, two rows per pass (h must be even and non-zero). */
+void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    __asm__ volatile (
+        "and r12, %[pixels], #7 \n\t"                   /* r12 = byte offset inside the 8-byte word */
+        "bic %[pixels], %[pixels], #7 \n\t"             /* round pixels down to 8 bytes */
+        "tmcr wcgr1, r12 \n\t"                          /* wcgr1 = offset used by walignr1 */
+        "add r4, %[pixels], %[line_size] \n\t"          /* r4 = second source row */
+        "add r5, %[block], %[line_size] \n\t"           /* r5 = second destination row */
+        "mov %[line_size], %[line_size], lsl #1 \n\t"   /* all four pointers step two rows */
+        "1: \n\t"
+        "wldrd wr0, [%[pixels]] \n\t"
+        "subs %[h], %[h], #2 \n\t"                      /* flags stay live until the bne below */
+        "wldrd wr1, [%[pixels], #8] \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "wldrd wr3, [r4] \n\t"
+        "pld [%[pixels]] \n\t"
+        "pld [%[pixels], #32] \n\t"
+        "wldrd wr4, [r4, #8] \n\t"
+        "add r4, r4, %[line_size] \n\t"
+        "walignr1 wr8, wr0, wr1 \n\t"                   /* wr8 = first row, realigned */
+        "pld [r4] \n\t"
+        "pld [r4, #32] \n\t"
+        "walignr1 wr10, wr3, wr4 \n\t"                  /* wr10 = second row, realigned */
+        "wstrd wr8, [%[block]] \n\t"
+        "add %[block], %[block], %[line_size] \n\t"
+        "wstrd wr10, [r5] \n\t"
+        "add r5, r5, %[line_size] \n\t"
+        "bne 1b \n\t"
+        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+        :
+        : "cc", "memory", "r4", "r5", "r12");           /* "cc": subs rewrites the condition flags */
+}
+/* Like the 8-wide put, but each row is combined with what block already holds via WAVG2B (h must be even and non-zero). */
+void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    __asm__ volatile (
+        "and r12, %[pixels], #7 \n\t"                   /* r12 = byte offset inside the 8-byte word */
+        "bic %[pixels], %[pixels], #7 \n\t"
+        "tmcr wcgr1, r12 \n\t"                          /* wcgr1 = offset used by walignr1 */
+        "add r4, %[pixels], %[line_size] \n\t"          /* r4 = second source row */
+        "add r5, %[block], %[line_size] \n\t"           /* r5 = second destination row */
+        "mov %[line_size], %[line_size], lsl #1 \n\t"   /* all four pointers step two rows */
+        "1: \n\t"
+        "wldrd wr0, [%[pixels]] \n\t"
+        "subs %[h], %[h], #2 \n\t"                      /* flags stay live until the bne below */
+        "wldrd wr1, [%[pixels], #8] \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "wldrd wr3, [r4] \n\t"
+        "pld [%[pixels]] \n\t"
+        "pld [%[pixels], #32] \n\t"
+        "wldrd wr4, [r4, #8] \n\t"
+        "add r4, r4, %[line_size] \n\t"
+        "walignr1 wr8, wr0, wr1 \n\t"                   /* wr8 = first source row, realigned */
+        "wldrd wr0, [%[block]] \n\t"                    /* wr0 reused: current destination row 1 */
+        "wldrd wr2, [r5] \n\t"                          /* current destination row 2 */
+        "pld [r4] \n\t"
+        "pld [r4, #32] \n\t"
+        "walignr1 wr10, wr3, wr4 \n\t"                  /* wr10 = second source row, realigned */
+        WAVG2B" wr8, wr8, wr0 \n\t"
+        WAVG2B" wr10, wr10, wr2 \n\t"
+        "wstrd wr8, [%[block]] \n\t"
+        "add %[block], %[block], %[line_size] \n\t"
+        "wstrd wr10, [r5] \n\t"
+        "pld [%[block]] \n\t"
+        "pld [%[block], #32] \n\t"
+        "add r5, r5, %[line_size] \n\t"
+        "pld [r5] \n\t"
+        "pld [r5, #32] \n\t"
+        "bne 1b \n\t"
+        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+        :
+        : "cc", "memory", "r4", "r5", "r12");           /* "cc": subs rewrites the condition flags */
+}
+/* Copy h rows of 16 bytes from a possibly unaligned pixels to block, two rows per pass (h must be even and non-zero). */
+void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    __asm__ volatile (
+        "and r12, %[pixels], #7 \n\t"                   /* r12 = byte offset inside the 8-byte word */
+        "bic %[pixels], %[pixels], #7 \n\t"
+        "tmcr wcgr1, r12 \n\t"                          /* wcgr1 = offset used by walignr1 */
+        "add r4, %[pixels], %[line_size] \n\t"          /* r4 = second source row */
+        "add r5, %[block], %[line_size] \n\t"           /* r5 = second destination row */
+        "mov %[line_size], %[line_size], lsl #1 \n\t"   /* all four pointers step two rows */
+        "1: \n\t"
+        "wldrd wr0, [%[pixels]] \n\t"
+        "wldrd wr1, [%[pixels], #8] \n\t"
+        "subs %[h], %[h], #2 \n\t"                      /* flags stay live until the bne below */
+        "wldrd wr2, [%[pixels], #16] \n\t"              /* third doubleword covers the misaligned tail */
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "wldrd wr3, [r4] \n\t"
+        "pld [%[pixels]] \n\t"
+        "pld [%[pixels], #32] \n\t"
+        "walignr1 wr8, wr0, wr1 \n\t"                   /* first row, bytes 0-7 */
+        "wldrd wr4, [r4, #8] \n\t"
+        "walignr1 wr9, wr1, wr2 \n\t"                   /* first row, bytes 8-15 */
+        "wldrd wr5, [r4, #16] \n\t"
+        "add r4, r4, %[line_size] \n\t"
+        "pld [r4] \n\t"
+        "pld [r4, #32] \n\t"
+        "walignr1 wr10, wr3, wr4 \n\t"                  /* second row, bytes 0-7 */
+        "wstrd wr8, [%[block]] \n\t"
+        "walignr1 wr11, wr4, wr5 \n\t"                  /* second row, bytes 8-15 */
+        "wstrd wr9, [%[block], #8] \n\t"
+        "add %[block], %[block], %[line_size] \n\t"
+        "wstrd wr10, [r5] \n\t"
+        "wstrd wr11, [r5, #8] \n\t"
+        "add r5, r5, %[line_size] \n\t"
+        "bne 1b \n\t"
+        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+        :
+        : "cc", "memory", "r4", "r5", "r12");           /* "cc": subs rewrites the condition flags */
+}
+/* Like the 16-wide put, but each row is combined with what block already holds via WAVG2B (h must be even and non-zero). */
+void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    __asm__ volatile (
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+        "and r12, %[pixels], #7         \n\t"   /* r12 = byte offset inside the 8-byte word */
+        "bic %[pixels], %[pixels], #7   \n\t"
+        "tmcr wcgr1, r12                \n\t"   /* wcgr1 = offset used by walignr1 */
+        "add r4, %[pixels], %[line_size]\n\t"   /* r4 = second source row */
+        "add r5, %[block], %[line_size] \n\t"   /* r5 = second destination row */
+        "mov %[line_size], %[line_size], lsl #1 \n\t"
+        "1:                             \n\t"
+        "wldrd wr0, [%[pixels]]         \n\t"
+        "wldrd wr1, [%[pixels], #8]     \n\t"
+        "subs %[h], %[h], #2            \n\t"   /* flags stay live until the bne below */
+        "wldrd wr2, [%[pixels], #16]    \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "wldrd wr3, [r4]                \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr8, wr0, wr1         \n\t"   /* first row, bytes 0-7 */
+        "wldrd wr4, [r4, #8]            \n\t"
+        "walignr1 wr9, wr1, wr2         \n\t"   /* first row, bytes 8-15 */
+        "wldrd wr5, [r4, #16]           \n\t"
+        "add r4, r4, %[line_size]       \n\t"
+        "wldrd wr0, [%[block]]          \n\t"   /* wr0-wr3 reused for the current destination rows */
+        "pld [r4]                       \n\t"
+        "wldrd wr1, [%[block], #8]      \n\t"
+        "pld [r4, #32]                  \n\t"
+        "wldrd wr2, [r5]                \n\t"
+        "walignr1 wr10, wr3, wr4        \n\t"   /* second row, bytes 0-7; must precede the wr3 reload */
+        "wldrd wr3, [r5, #8]            \n\t"
+        WAVG2B" wr8, wr8, wr0           \n\t"
+        WAVG2B" wr9, wr9, wr1           \n\t"
+        WAVG2B" wr10, wr10, wr2         \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "walignr1 wr11, wr4, wr5        \n\t"   /* second row, bytes 8-15 */
+        WAVG2B" wr11, wr11, wr3         \n\t"
+        "wstrd wr9, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size] \n\t"
+        "wstrd wr10, [r5]               \n\t"
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+        "wstrd wr11, [r5, #8]           \n\t"
+        "add r5, r5, %[line_size]       \n\t"
+        "pld [r5]                       \n\t"
+        "pld [r5, #32]                  \n\t"
+        "bne 1b \n\t"
+        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+        :
+        : "cc", "memory", "r4", "r5", "r12");   /* "cc": subs rewrites the condition flags */
+}
+/* 8-wide put where each output byte is WAVG2B of a source byte and its right-hand neighbour (h must be even and non-zero). */
+void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    // wr0/wr2: the two source rows at the original byte offset (walignr1, wcgr1)
+    // wr4/wr6: the same rows one byte further on (walignr2, wcgr2 = offset + 1)
+    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; NOTE(review): wr15 is not read by the asm below
+    __asm__ volatile(
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "and r12, %[pixels], #7         \n\t"   /* r12 = byte offset inside the 8-byte word */
+        "bic %[pixels], %[pixels], #7   \n\t"
+        "tmcr wcgr1, r12                \n\t"
+        "add r12, r12, #1               \n\t"   /* r12 = offset + 1, kept for the cmp in the loop */
+        "add r4, %[pixels], %[line_size]\n\t"   /* r4 = second source row */
+        "tmcr wcgr2, r12                \n\t"
+        "add r5, %[block], %[line_size] \n\t"   /* r5 = second destination row */
+        "mov %[line_size], %[line_size], lsl #1 \n\t"
+        /* two output rows per pass */
+        "1:                             \n\t"
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t"   /* eq: shifted row is exactly the second doubleword */
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "wldrd wr13, [r4]               \n\t"
+        "pld [%[pixels]]                \n\t"
+        "wldrd wr14, [r4, #8]           \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "add r4, r4, %[line_size]       \n\t"
+        "walignr1 wr0, wr10, wr11       \n\t"
+        "pld [r4]                       \n\t"
+        "pld [r4, #32]                  \n\t"
+        "walignr1 wr2, wr13, wr14       \n\t"
+        "wmoveq wr4, wr11               \n\t"
+        "wmoveq wr6, wr14               \n\t"
+        "walignr2ne wr4, wr10, wr11     \n\t"
+        "walignr2ne wr6, wr13, wr14     \n\t"
+        WAVG2B" wr0, wr0, wr4           \n\t"
+        WAVG2B" wr2, wr2, wr6           \n\t"
+        "wstrd wr0, [%[block]]          \n\t"
+        "subs %[h], %[h], #2            \n\t"
+        "wstrd wr2, [r5]                \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+        "add r5, r5, %[line_size]       \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+        :
+        : "cc", "r4", "r5", "r12", "memory");   /* "cc": cmp and subs rewrite the condition flags */
+}
+/* 16-wide put where each output byte is WAVG2B of a source byte and its right-hand neighbour (h must be even and non-zero). */
+void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    // wr0..wr3: the two 16-byte source rows at the original byte offset (walignr1, wcgr1)
+    // wr4..wr7: the same rows one byte further on (walignr2, wcgr2 = offset + 1)
+    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; NOTE(review): the asm below reloads wr15 before reading it
+    __asm__ volatile(
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "and r12, %[pixels], #7         \n\t"   /* r12 = byte offset inside the 8-byte word */
+        "bic %[pixels], %[pixels], #7   \n\t"
+        "tmcr wcgr1, r12                \n\t"
+        "add r12, r12, #1               \n\t"   /* r12 = offset + 1, kept for the cmp in the loop */
+        "add r4, %[pixels], %[line_size]\n\t"   /* r4 = second source row */
+        "tmcr wcgr2, r12                \n\t"
+        "add r5, %[block], %[line_size] \n\t"   /* r5 = second destination row */
+        "mov %[line_size], %[line_size], lsl #1 \n\t"
+        /* two output rows per pass */
+        "1:                             \n\t"
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t"   /* eq: shifted data is exactly the next doubleword */
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "wldrd wr12, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "wldrd wr13, [r4]               \n\t"
+        "pld [%[pixels]]                \n\t"
+        "wldrd wr14, [r4, #8]           \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "wldrd wr15, [r4, #16]          \n\t"
+        "add r4, r4, %[line_size]       \n\t"
+        "walignr1 wr0, wr10, wr11       \n\t"
+        "pld [r4]                       \n\t"
+        "pld [r4, #32]                  \n\t"
+        "walignr1 wr1, wr11, wr12       \n\t"
+        "walignr1 wr2, wr13, wr14       \n\t"
+        "walignr1 wr3, wr14, wr15       \n\t"
+        "wmoveq wr4, wr11               \n\t"
+        "wmoveq wr5, wr12               \n\t"
+        "wmoveq wr6, wr14               \n\t"
+        "wmoveq wr7, wr15               \n\t"
+        "walignr2ne wr4, wr10, wr11     \n\t"
+        "walignr2ne wr5, wr11, wr12     \n\t"
+        "walignr2ne wr6, wr13, wr14     \n\t"
+        "walignr2ne wr7, wr14, wr15     \n\t"
+        WAVG2B" wr0, wr0, wr4           \n\t"
+        WAVG2B" wr1, wr1, wr5           \n\t"
+        "wstrd wr0, [%[block]]          \n\t"
+        WAVG2B" wr2, wr2, wr6           \n\t"
+        "wstrd wr1, [%[block], #8]      \n\t"
+        WAVG2B" wr3, wr3, wr7           \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+        "wstrd wr2, [r5]                \n\t"
+        "subs %[h], %[h], #2            \n\t"
+        "wstrd wr3, [r5, #8]            \n\t"
+        "add r5, r5, %[line_size]       \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+        :
+        : "cc", "r4", "r5", "r12", "memory");   /* "cc": cmp and subs rewrite the condition flags */
+}
+/* 8-wide: WAVG2B of each source byte and its right-hand neighbour, then WAVG2B of that with block (h must be even and non-zero). */
+void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    // wr0/wr2: the two source rows at the original byte offset (walignr1, wcgr1)
+    // wr4/wr6: the same rows one byte further on (walignr2, wcgr2 = offset + 1)
+    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; NOTE(review): wr15 is not read by the asm below
+    __asm__ volatile(
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+        "and r12, %[pixels], #7         \n\t"   /* r12 = byte offset inside the 8-byte word */
+        "bic %[pixels], %[pixels], #7   \n\t"
+        "tmcr wcgr1, r12                \n\t"
+        "add r12, r12, #1               \n\t"   /* r12 = offset + 1, kept for the cmp in the loop */
+        "add r4, %[pixels], %[line_size]\n\t"   /* r4 = second source row */
+        "tmcr wcgr2, r12                \n\t"
+        "add r5, %[block], %[line_size] \n\t"   /* r5 = second destination row */
+        "mov %[line_size], %[line_size], lsl #1 \n\t"
+        "pld [r5]                       \n\t"
+        "pld [r5, #32]                  \n\t"
+        /* two output rows per pass */
+        "1:                             \n\t"
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t"   /* eq: shifted row is exactly the second doubleword */
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "wldrd wr13, [r4]               \n\t"
+        "pld [%[pixels]]                \n\t"
+        "wldrd wr14, [r4, #8]           \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "add r4, r4, %[line_size]       \n\t"
+        "walignr1 wr0, wr10, wr11       \n\t"
+        "pld [r4]                       \n\t"
+        "pld [r4, #32]                  \n\t"
+        "walignr1 wr2, wr13, wr14       \n\t"
+        "wmoveq wr4, wr11               \n\t"
+        "wmoveq wr6, wr14               \n\t"
+        "walignr2ne wr4, wr10, wr11     \n\t"
+        "wldrd wr10, [%[block]]         \n\t"   /* wr10 reused: current destination row 1 */
+        "walignr2ne wr6, wr13, wr14     \n\t"
+        "wldrd wr12, [r5]               \n\t"   /* current destination row 2 */
+        WAVG2B" wr0, wr0, wr4           \n\t"
+        WAVG2B" wr2, wr2, wr6           \n\t"
+        WAVG2B" wr0, wr0, wr10          \n\t"
+        WAVG2B" wr2, wr2, wr12          \n\t"
+        "wstrd wr0, [%[block]]          \n\t"
+        "subs %[h], %[h], #2            \n\t"
+        "wstrd wr2, [r5]                \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+        "add r5, r5, %[line_size]       \n\t"
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+        "pld [r5]                       \n\t"
+        "pld [r5, #32]                  \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+        :
+        : "cc", "r4", "r5", "r12", "memory");   /* "cc": cmp and subs rewrite the condition flags */
+}
+/* 16-wide: WAVG2B of each source byte and its right-hand neighbour, then WAVG2B of that with block (h must be even and non-zero). */
+void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    // wr0..wr3: the two 16-byte source rows at the original byte offset (walignr1, wcgr1)
+    // wr4..wr7: the same rows one byte further on (walignr2, wcgr2 = offset + 1)
+    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; NOTE(review): the asm below reloads wr15 before reading it
+    __asm__ volatile(
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+        "and r12, %[pixels], #7         \n\t"   /* r12 = byte offset inside the 8-byte word */
+        "bic %[pixels], %[pixels], #7   \n\t"
+        "tmcr wcgr1, r12                \n\t"
+        "add r12, r12, #1               \n\t"   /* r12 = offset + 1, kept for the cmp in the loop */
+        "add r4, %[pixels], %[line_size]\n\t"   /* r4 = second source row */
+        "tmcr wcgr2, r12                \n\t"
+        "add r5, %[block], %[line_size] \n\t"   /* r5 = second destination row */
+        "mov %[line_size], %[line_size], lsl #1 \n\t"
+        "pld [r5]                       \n\t"
+        "pld [r5, #32]                  \n\t"
+        /* two output rows per pass */
+        "1:                             \n\t"
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t"   /* eq: shifted data is exactly the next doubleword */
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "wldrd wr12, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "wldrd wr13, [r4]               \n\t"
+        "pld [%[pixels]]                \n\t"
+        "wldrd wr14, [r4, #8]           \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "wldrd wr15, [r4, #16]          \n\t"
+        "add r4, r4, %[line_size]       \n\t"
+        "walignr1 wr0, wr10, wr11       \n\t"
+        "pld [r4]                       \n\t"
+        "pld [r4, #32]                  \n\t"
+        "walignr1 wr1, wr11, wr12       \n\t"
+        "walignr1 wr2, wr13, wr14       \n\t"
+        "walignr1 wr3, wr14, wr15       \n\t"
+        "wmoveq wr4, wr11               \n\t"
+        "wmoveq wr5, wr12               \n\t"
+        "wmoveq wr6, wr14               \n\t"
+        "wmoveq wr7, wr15               \n\t"
+        "walignr2ne wr4, wr10, wr11     \n\t"
+        "walignr2ne wr5, wr11, wr12     \n\t"
+        "walignr2ne wr6, wr13, wr14     \n\t"
+        "walignr2ne wr7, wr14, wr15     \n\t"
+        "wldrd wr10, [%[block]]         \n\t"   /* wr10-wr13 reused for the current destination rows */
+        WAVG2B" wr0, wr0, wr4           \n\t"
+        "wldrd wr11, [%[block], #8]     \n\t"
+        WAVG2B" wr1, wr1, wr5           \n\t"
+        "wldrd wr12, [r5]               \n\t"
+        WAVG2B" wr2, wr2, wr6           \n\t"
+        "wldrd wr13, [r5, #8]           \n\t"
+        WAVG2B" wr3, wr3, wr7           \n\t"
+        WAVG2B" wr0, wr0, wr10          \n\t"
+        WAVG2B" wr1, wr1, wr11          \n\t"
+        WAVG2B" wr2, wr2, wr12          \n\t"
+        WAVG2B" wr3, wr3, wr13          \n\t"
+        "wstrd wr0, [%[block]]          \n\t"
+        "subs %[h], %[h], #2            \n\t"
+        "wstrd wr1, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+        "wstrd wr2, [r5]                \n\t"
+        "pld [%[block]]                 \n\t"
+        "wstrd wr3, [r5, #8]            \n\t"
+        "add r5, r5, %[line_size]       \n\t"
+        "pld [%[block], #32]            \n\t"
+        "pld [r5]                       \n\t"
+        "pld [r5, #32]                  \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+        :
+        : "cc", "r4", "r5", "r12", "memory");   /* "cc": cmp and subs rewrite the condition flags */
+}
+/* 8-wide: WAVG2B of each source row and the row below it, then WAVG2B of that with block; the loop ends only when h hits 0 in steps of 2. */
+void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    // wr0 and wr4 alternate as the older and newer of the two source rows being combined
+    // wr10/wr11 are scratch: raw source doublewords first, then the current destination row
+    __asm__ volatile(
+        "pld            [%[pixels]]                             \n\t"
+        "pld            [%[pixels], #32]                        \n\t"
+        "and            r12, %[pixels], #7                      \n\t"   /* byte offset inside the 8-byte word */
+        "tmcr           wcgr1, r12                              \n\t"   /* wcgr1 = offset used by walignr1 */
+        "bic            %[pixels], %[pixels], #7                \n\t"
+        /* wr0 = first source row, realigned */
+        "wldrd          wr10, [%[pixels]]                       \n\t"
+        "wldrd          wr11, [%[pixels], #8]                   \n\t"
+        "pld            [%[block]]                              \n\t"
+        "add            %[pixels], %[pixels], %[line_size]      \n\t"
+        "walignr1       wr0, wr10, wr11                         \n\t"
+        "pld            [%[pixels]]                             \n\t"
+        "pld            [%[pixels], #32]                        \n\t"
+        /* two output rows per pass */
+      "1:                                                       \n\t"
+        "wldrd          wr10, [%[pixels]]                       \n\t"
+        "wldrd          wr11, [%[pixels], #8]                   \n\t"
+        "add            %[pixels], %[pixels], %[line_size]      \n\t"
+        "pld            [%[pixels]]                             \n\t"
+        "pld            [%[pixels], #32]                        \n\t"
+        "walignr1       wr4, wr10, wr11                         \n\t"   /* wr4 = next source row */
+        "wldrd          wr10, [%[block]]                        \n\t"   /* current destination row */
+         WAVG2B"        wr8, wr0, wr4                           \n\t"
+         WAVG2B"        wr8, wr8, wr10                          \n\t"
+        "wstrd          wr8, [%[block]]                         \n\t"
+        "add            %[block], %[block], %[line_size]        \n\t"
+        /* second half: wr0 becomes the newer row, wr4 the older one */
+        "wldrd          wr10, [%[pixels]]                       \n\t"
+        "wldrd          wr11, [%[pixels], #8]                   \n\t"
+        "pld            [%[block]]                              \n\t"
+        "add            %[pixels], %[pixels], %[line_size]      \n\t"
+        "pld            [%[pixels]]                             \n\t"
+        "pld            [%[pixels], #32]                        \n\t"
+        "walignr1       wr0, wr10, wr11                         \n\t"
+        "wldrd          wr10, [%[block]]                        \n\t"
+         WAVG2B"        wr8, wr0, wr4                           \n\t"
+         WAVG2B"        wr8, wr8, wr10                          \n\t"
+        "wstrd          wr8, [%[block]]                         \n\t"
+        "add            %[block], %[block], %[line_size]        \n\t"
+        /* h counts output rows, two per pass */
+        "subs           %[h], %[h], #2                          \n\t"
+        "pld            [%[block]]                              \n\t"
+        "bne            1b                                      \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+        :
+        : "cc", "memory", "r12");
+}
+
+void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    // 16-byte-wide vertical half-pel put: each block row = WAVG2B(source row, following source row).
+    // wr0/wr1 and wr4/wr5 alternate as older/newer source row; h must be even and > 0 (h -= 2 until it hits 0).
+    __asm__ volatile(
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "and r12, %[pixels], #7         \n\t" /* byte offset of pixels inside its 8-byte word */
+        "tmcr wcgr1, r12                \n\t" /* offset consumed by walignr1 */
+        "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8-byte alignment */
+
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "wldrd wr12, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr0, wr10, wr11       \n\t"
+        "walignr1 wr1, wr11, wr12       \n\t"
+
+        "1:                             \n\t"
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "wldrd wr12, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr4, wr10, wr11       \n\t"
+        "walignr1 wr5, wr11, wr12       \n\t"
+        WAVG2B" wr8, wr0, wr4           \n\t"
+        WAVG2B" wr9, wr1, wr5           \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "wstrd wr9, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "wldrd wr12, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr0, wr10, wr11       \n\t"
+        "walignr1 wr1, wr11, wr12       \n\t"
+        WAVG2B" wr8, wr0, wr4           \n\t"
+        WAVG2B" wr9, wr1, wr5           \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "wstrd wr9, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+
+        "subs %[h], %[h], #2            \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+        :
+        : "r4", "r5", "r12", "cc", "memory"); /* "cc": subs rewrites the condition flags */
+}
+
+void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    // 16-byte-wide vertical half-pel avg: block row = WAVG2B(WAVG2B(source row, following source row), old block row).
+    // wr0/wr1 and wr4/wr5 alternate as older/newer source row; h must be even and > 0 (h -= 2 until it hits 0).
+    __asm__ volatile(
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "and r12, %[pixels], #7         \n\t" /* byte offset of pixels inside its 8-byte word */
+        "tmcr wcgr1, r12                \n\t" /* offset consumed by walignr1 */
+        "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8-byte alignment */
+
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "pld [%[block]]                 \n\t"
+        "wldrd wr12, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr0, wr10, wr11       \n\t"
+        "walignr1 wr1, wr11, wr12       \n\t"
+
+        "1:                             \n\t"
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "wldrd wr12, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr4, wr10, wr11       \n\t"
+        "walignr1 wr5, wr11, wr12       \n\t"
+        "wldrd wr10, [%[block]]         \n\t" /* wr10/wr11 are reused for the existing block row */
+        "wldrd wr11, [%[block], #8]     \n\t"
+        WAVG2B" wr8, wr0, wr4           \n\t"
+        WAVG2B" wr9, wr1, wr5           \n\t"
+        WAVG2B" wr8, wr8, wr10          \n\t"
+        WAVG2B" wr9, wr9, wr11          \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "wstrd wr9, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+
+        "wldrd wr10, [%[pixels]]        \n\t"
+        "wldrd wr11, [%[pixels], #8]    \n\t"
+        "pld [%[block]]                 \n\t"
+        "wldrd wr12, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr0, wr10, wr11       \n\t"
+        "walignr1 wr1, wr11, wr12       \n\t"
+        "wldrd wr10, [%[block]]         \n\t"
+        "wldrd wr11, [%[block], #8]     \n\t"
+        WAVG2B" wr8, wr0, wr4           \n\t"
+        WAVG2B" wr9, wr1, wr5           \n\t"
+        WAVG2B" wr8, wr8, wr10          \n\t"
+        WAVG2B" wr9, wr9, wr11          \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "wstrd wr9, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+
+        "subs %[h], %[h], #2            \n\t"
+        "pld [%[block]]                 \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+        :
+        : "r4", "r5", "r12", "cc", "memory"); /* "cc": subs rewrites the condition flags */
+}
+
+void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    // 8-byte-wide 2D half-pel put: out[x] = (s[y][x] + s[y][x+1] + s[y+1][x] + s[y+1][x+1] + wr15 term) >> 2, packed with unsigned saturation.
+    // 16-bit horizontal pair sums of the older/newer row alternate between wr0-wr1 and wr4-wr5; h must be even and > 0.
+    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
+    __asm__ volatile(
+        "pld [%[pixels]]                \n\t"
+        "mov r12, #2                    \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "tmcr wcgr0, r12                \n\t" /* for shift value */
+        "and r12, %[pixels], #7         \n\t" /* byte offset of pixels inside its 8-byte word */
+        "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8-byte alignment */
+        "tmcr wcgr1, r12                \n\t" /* offset consumed by walignr1 (pixel x) */
+
+        // [wr0 wr1 wr2 wr3] <= *
+        // [wr4 wr5 wr6 wr7]
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "add r12, r12, #1               \n\t"
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "tmcr wcgr2, r12                \n\t" /* offset + 1, consumed by walignr2 (pixel x+1) */
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "cmp r12, #8                    \n\t" /* offset + 1 == 8: x+1 data is exactly the second doubleword */
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr2, wr12, wr13       \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "wunpckelub wr0, wr2            \n\t"
+        "wunpckehub wr1, wr2            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "waddhus wr0, wr0, wr8          \n\t"
+        "waddhus wr1, wr1, wr9          \n\t"
+
+        "1:                             \n\t"
+        // [wr0 wr1 wr2 wr3]
+        // [wr4 wr5 wr6 wr7] <= *
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t" /* flags stay valid for both halves of the loop body */
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "walignr1 wr6, wr12, wr13       \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "wunpckelub wr4, wr6            \n\t"
+        "wunpckehub wr5, wr6            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "waddhus wr4, wr4, wr8          \n\t"
+        "waddhus wr5, wr5, wr9          \n\t"
+        "waddhus wr8, wr0, wr4          \n\t"
+        "waddhus wr9, wr1, wr5          \n\t"
+        "waddhus wr8, wr8, wr15         \n\t"
+        "waddhus wr9, wr9, wr15         \n\t"
+        "wsrlhg wr8, wr8, wcgr0         \n\t"
+        "wsrlhg wr9, wr9, wcgr0         \n\t"
+        "wpackhus wr8, wr8, wr9         \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+
+        // [wr0 wr1 wr2 wr3] <= *
+        // [wr4 wr5 wr6 wr7]
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "walignr1 wr2, wr12, wr13       \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "wunpckelub wr0, wr2            \n\t"
+        "wunpckehub wr1, wr2            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "waddhus wr0, wr0, wr8          \n\t"
+        "waddhus wr1, wr1, wr9          \n\t"
+        "waddhus wr8, wr0, wr4          \n\t"
+        "waddhus wr9, wr1, wr5          \n\t"
+        "waddhus wr8, wr8, wr15         \n\t"
+        "waddhus wr9, wr9, wr15         \n\t"
+        "wsrlhg wr8, wr8, wcgr0         \n\t"
+        "wsrlhg wr9, wr9, wcgr0         \n\t"
+        "wpackhus wr8, wr8, wr9         \n\t"
+        "subs %[h], %[h], #2            \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+        : [line_size]"r"(line_size)
+        : "r12", "cc", "memory"); /* "cc": cmp and subs rewrite the condition flags */
+}
+
+void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    // 16-byte-wide 2D half-pel put: out[x] = (s[y][x] + s[y][x+1] + s[y+1][x] + s[y+1][x+1] + wr15 term) >> 2, packed with unsigned saturation.
+    // 16-bit horizontal pair sums of the older/newer row alternate between wr0-wr3 and wr4-wr7; h must be even and > 0.
+    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
+    __asm__ volatile(
+        "pld [%[pixels]]                \n\t"
+        "mov r12, #2                    \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "tmcr wcgr0, r12                \n\t" /* for shift value */
+        /* alignment */
+        "and r12, %[pixels], #7         \n\t" /* byte offset of pixels inside its 8-byte word */
+        "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8-byte alignment */
+        "tmcr wcgr1, r12                \n\t" /* offset consumed by walignr1 (pixel x) */
+        "add r12, r12, #1               \n\t"
+        "tmcr wcgr2, r12                \n\t" /* offset + 1, consumed by walignr2 (pixel x+1) */
+
+        // [wr0 wr1 wr2 wr3] <= *
+        // [wr4 wr5 wr6 wr7]
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t" /* offset + 1 == 8: x+1 data is exactly the following doubleword */
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "wldrd wr14, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "pld [%[pixels]]                \n\t"
+        "walignr1 wr2, wr12, wr13       \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr3, wr13, wr14       \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "wmoveq wr11, wr14              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "walignr2ne wr11, wr13, wr14    \n\t"
+        "wunpckelub wr0, wr2            \n\t"
+        "wunpckehub wr1, wr2            \n\t"
+        "wunpckelub wr2, wr3            \n\t"
+        "wunpckehub wr3, wr3            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "wunpckelub wr10, wr11          \n\t"
+        "wunpckehub wr11, wr11          \n\t"
+        "waddhus wr0, wr0, wr8          \n\t"
+        "waddhus wr1, wr1, wr9          \n\t"
+        "waddhus wr2, wr2, wr10         \n\t"
+        "waddhus wr3, wr3, wr11         \n\t"
+
+        "1:                             \n\t"
+        // [wr0 wr1 wr2 wr3]
+        // [wr4 wr5 wr6 wr7] <= *
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t" /* flags stay valid for both halves of the loop body */
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "wldrd wr14, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "walignr1 wr6, wr12, wr13       \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr7, wr13, wr14       \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "wmoveq wr11, wr14              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "walignr2ne wr11, wr13, wr14    \n\t"
+        "wunpckelub wr4, wr6            \n\t"
+        "wunpckehub wr5, wr6            \n\t"
+        "wunpckelub wr6, wr7            \n\t"
+        "wunpckehub wr7, wr7            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "wunpckelub wr10, wr11          \n\t"
+        "wunpckehub wr11, wr11          \n\t"
+        "waddhus wr4, wr4, wr8          \n\t"
+        "waddhus wr5, wr5, wr9          \n\t"
+        "waddhus wr6, wr6, wr10         \n\t"
+        "waddhus wr7, wr7, wr11         \n\t"
+        "waddhus wr8, wr0, wr4          \n\t"
+        "waddhus wr9, wr1, wr5          \n\t"
+        "waddhus wr10, wr2, wr6         \n\t"
+        "waddhus wr11, wr3, wr7         \n\t"
+        "waddhus wr8, wr8, wr15         \n\t"
+        "waddhus wr9, wr9, wr15         \n\t"
+        "waddhus wr10, wr10, wr15       \n\t"
+        "waddhus wr11, wr11, wr15       \n\t"
+        "wsrlhg wr8, wr8, wcgr0         \n\t"
+        "wsrlhg wr9, wr9, wcgr0         \n\t"
+        "wsrlhg wr10, wr10, wcgr0       \n\t"
+        "wsrlhg wr11, wr11, wcgr0       \n\t"
+        "wpackhus wr8, wr8, wr9         \n\t"
+        "wpackhus wr9, wr10, wr11       \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "wstrd wr9, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+
+        // [wr0 wr1 wr2 wr3] <= *
+        // [wr4 wr5 wr6 wr7]
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "wldrd wr14, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "walignr1 wr2, wr12, wr13       \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr3, wr13, wr14       \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "wmoveq wr11, wr14              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "walignr2ne wr11, wr13, wr14    \n\t"
+        "wunpckelub wr0, wr2            \n\t"
+        "wunpckehub wr1, wr2            \n\t"
+        "wunpckelub wr2, wr3            \n\t"
+        "wunpckehub wr3, wr3            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "wunpckelub wr10, wr11          \n\t"
+        "wunpckehub wr11, wr11          \n\t"
+        "waddhus wr0, wr0, wr8          \n\t"
+        "waddhus wr1, wr1, wr9          \n\t"
+        "waddhus wr2, wr2, wr10         \n\t"
+        "waddhus wr3, wr3, wr11         \n\t"
+        "waddhus wr8, wr0, wr4          \n\t"
+        "waddhus wr9, wr1, wr5          \n\t"
+        "waddhus wr10, wr2, wr6         \n\t"
+        "waddhus wr11, wr3, wr7         \n\t"
+        "waddhus wr8, wr8, wr15         \n\t"
+        "waddhus wr9, wr9, wr15         \n\t"
+        "waddhus wr10, wr10, wr15       \n\t"
+        "waddhus wr11, wr11, wr15       \n\t"
+        "wsrlhg wr8, wr8, wcgr0         \n\t"
+        "wsrlhg wr9, wr9, wcgr0         \n\t"
+        "wsrlhg wr10, wr10, wcgr0       \n\t"
+        "wsrlhg wr11, wr11, wcgr0       \n\t"
+        "wpackhus wr8, wr8, wr9         \n\t"
+        "wpackhus wr9, wr10, wr11       \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "wstrd wr9, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+
+        "subs %[h], %[h], #2            \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+        : [line_size]"r"(line_size)
+        : "r12", "cc", "memory"); /* "cc": cmp and subs rewrite the condition flags */
+}
+
+void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    // 8-byte-wide 2D half-pel avg: t[x] = (s[y][x] + s[y][x+1] + s[y+1][x] + s[y+1][x+1] + wr15 term) >> 2, then block row = WAVG2B(t, old block row).
+    // 16-bit horizontal pair sums of the older/newer row alternate between wr0-wr1 and wr4-wr5; h must be even and > 0.
+    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
+    __asm__ volatile(
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+        "pld [%[pixels]]                \n\t"
+        "mov r12, #2                    \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "tmcr wcgr0, r12                \n\t" /* for shift value */
+        "and r12, %[pixels], #7         \n\t" /* byte offset of pixels inside its 8-byte word */
+        "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8-byte alignment */
+        "tmcr wcgr1, r12                \n\t" /* offset consumed by walignr1 (pixel x) */
+
+        // [wr0 wr1 wr2 wr3] <= *
+        // [wr4 wr5 wr6 wr7]
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "add r12, r12, #1               \n\t"
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "tmcr wcgr2, r12                \n\t" /* offset + 1, consumed by walignr2 (pixel x+1) */
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "cmp r12, #8                    \n\t" /* offset + 1 == 8: x+1 data is exactly the second doubleword */
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr2, wr12, wr13       \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "wunpckelub wr0, wr2            \n\t"
+        "wunpckehub wr1, wr2            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "waddhus wr0, wr0, wr8          \n\t"
+        "waddhus wr1, wr1, wr9          \n\t"
+
+        "1:                             \n\t"
+        // [wr0 wr1 wr2 wr3]
+        // [wr4 wr5 wr6 wr7] <= *
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t" /* flags stay valid for both halves of the loop body */
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "walignr1 wr6, wr12, wr13       \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "wunpckelub wr4, wr6            \n\t"
+        "wunpckehub wr5, wr6            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "waddhus wr4, wr4, wr8          \n\t"
+        "waddhus wr5, wr5, wr9          \n\t"
+        "waddhus wr8, wr0, wr4          \n\t"
+        "waddhus wr9, wr1, wr5          \n\t"
+        "waddhus wr8, wr8, wr15         \n\t"
+        "waddhus wr9, wr9, wr15         \n\t"
+        "wldrd wr12, [%[block]]         \n\t" /* wr12 is reused for the existing block row */
+        "wsrlhg wr8, wr8, wcgr0         \n\t"
+        "wsrlhg wr9, wr9, wcgr0         \n\t"
+        "wpackhus wr8, wr8, wr9         \n\t"
+        WAVG2B" wr8, wr8, wr12          \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+        "wldrd wr12, [%[pixels]]        \n\t" /* first source doubleword for the next row */
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+
+        // [wr0 wr1 wr2 wr3] <= *
+        // [wr4 wr5 wr6 wr7]
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "walignr1 wr2, wr12, wr13       \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "wunpckelub wr0, wr2            \n\t"
+        "wunpckehub wr1, wr2            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "waddhus wr0, wr0, wr8          \n\t"
+        "waddhus wr1, wr1, wr9          \n\t"
+        "waddhus wr8, wr0, wr4          \n\t"
+        "waddhus wr9, wr1, wr5          \n\t"
+        "waddhus wr8, wr8, wr15         \n\t"
+        "waddhus wr9, wr9, wr15         \n\t"
+        "wldrd wr12, [%[block]]         \n\t"
+        "wsrlhg wr8, wr8, wcgr0         \n\t"
+        "wsrlhg wr9, wr9, wcgr0         \n\t"
+        "wpackhus wr8, wr8, wr9         \n\t"
+        "subs %[h], %[h], #2            \n\t"
+        WAVG2B" wr8, wr8, wr12          \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+        : [line_size]"r"(line_size)
+        : "r12", "cc", "memory"); /* "cc": cmp and subs rewrite the condition flags */
+}
+
+void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    // 16-byte-wide 2D half-pel avg: t[x] = (s[y][x] + s[y][x+1] + s[y+1][x] + s[y+1][x+1] + wr15 term) >> 2, then block row = WAVG2B(t, old block row).
+    // 16-bit horizontal pair sums of the older/newer row alternate between wr0-wr3 and wr4-wr7; h must be even and > 0.
+    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
+    __asm__ volatile(
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+        "pld [%[pixels]]                \n\t"
+        "mov r12, #2                    \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "tmcr wcgr0, r12                \n\t" /* for shift value */
+        /* alignment */
+        "and r12, %[pixels], #7         \n\t" /* byte offset of pixels inside its 8-byte word */
+        "bic %[pixels], %[pixels], #7           \n\t" /* round pixels down to 8-byte alignment */
+        "tmcr wcgr1, r12                \n\t" /* offset consumed by walignr1 (pixel x) */
+        "add r12, r12, #1               \n\t"
+        "tmcr wcgr2, r12                \n\t" /* offset + 1, consumed by walignr2 (pixel x+1) */
+
+        // [wr0 wr1 wr2 wr3] <= *
+        // [wr4 wr5 wr6 wr7]
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t" /* offset + 1 == 8: x+1 data is exactly the following doubleword */
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "wldrd wr14, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "pld [%[pixels]]                \n\t"
+        "walignr1 wr2, wr12, wr13       \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr3, wr13, wr14       \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "wmoveq wr11, wr14              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "walignr2ne wr11, wr13, wr14    \n\t"
+        "wunpckelub wr0, wr2            \n\t"
+        "wunpckehub wr1, wr2            \n\t"
+        "wunpckelub wr2, wr3            \n\t"
+        "wunpckehub wr3, wr3            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "wunpckelub wr10, wr11          \n\t"
+        "wunpckehub wr11, wr11          \n\t"
+        "waddhus wr0, wr0, wr8          \n\t"
+        "waddhus wr1, wr1, wr9          \n\t"
+        "waddhus wr2, wr2, wr10         \n\t"
+        "waddhus wr3, wr3, wr11         \n\t"
+
+        "1:                             \n\t"
+        // [wr0 wr1 wr2 wr3]
+        // [wr4 wr5 wr6 wr7] <= *
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "cmp r12, #8                    \n\t" /* flags stay valid for both halves of the loop body */
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "wldrd wr14, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "walignr1 wr6, wr12, wr13       \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr7, wr13, wr14       \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "wmoveq wr11, wr14              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "walignr2ne wr11, wr13, wr14    \n\t"
+        "wunpckelub wr4, wr6            \n\t"
+        "wunpckehub wr5, wr6            \n\t"
+        "wunpckelub wr6, wr7            \n\t"
+        "wunpckehub wr7, wr7            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "wunpckelub wr10, wr11          \n\t"
+        "wunpckehub wr11, wr11          \n\t"
+        "waddhus wr4, wr4, wr8          \n\t"
+        "waddhus wr5, wr5, wr9          \n\t"
+        "waddhus wr6, wr6, wr10         \n\t"
+        "waddhus wr7, wr7, wr11         \n\t"
+        "waddhus wr8, wr0, wr4          \n\t"
+        "waddhus wr9, wr1, wr5          \n\t"
+        "waddhus wr10, wr2, wr6         \n\t"
+        "waddhus wr11, wr3, wr7         \n\t"
+        "waddhus wr8, wr8, wr15         \n\t"
+        "waddhus wr9, wr9, wr15         \n\t"
+        "waddhus wr10, wr10, wr15       \n\t"
+        "waddhus wr11, wr11, wr15       \n\t"
+        "wsrlhg wr8, wr8, wcgr0         \n\t"
+        "wsrlhg wr9, wr9, wcgr0         \n\t"
+        "wldrd wr12, [%[block]]         \n\t" /* wr12/wr13 are reused for the existing block row */
+        "wldrd wr13, [%[block], #8]     \n\t"
+        "wsrlhg wr10, wr10, wcgr0       \n\t"
+        "wsrlhg wr11, wr11, wcgr0       \n\t"
+        "wpackhus wr8, wr8, wr9         \n\t"
+        "wpackhus wr9, wr10, wr11       \n\t"
+        WAVG2B" wr8, wr8, wr12          \n\t"
+        WAVG2B" wr9, wr9, wr13          \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "wstrd wr9, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+
+        // [wr0 wr1 wr2 wr3] <= *
+        // [wr4 wr5 wr6 wr7]
+        "wldrd wr12, [%[pixels]]        \n\t"
+        "pld [%[block]]                 \n\t"
+        "wldrd wr13, [%[pixels], #8]    \n\t"
+        "pld [%[block], #32]            \n\t"
+        "wldrd wr14, [%[pixels], #16]   \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "walignr1 wr2, wr12, wr13       \n\t"
+        "pld [%[pixels]]                \n\t"
+        "pld [%[pixels], #32]           \n\t"
+        "walignr1 wr3, wr13, wr14       \n\t"
+        "wmoveq wr10, wr13              \n\t"
+        "wmoveq wr11, wr14              \n\t"
+        "walignr2ne wr10, wr12, wr13    \n\t"
+        "walignr2ne wr11, wr13, wr14    \n\t"
+        "wunpckelub wr0, wr2            \n\t"
+        "wunpckehub wr1, wr2            \n\t"
+        "wunpckelub wr2, wr3            \n\t"
+        "wunpckehub wr3, wr3            \n\t"
+        "wunpckelub wr8, wr10           \n\t"
+        "wunpckehub wr9, wr10           \n\t"
+        "wunpckelub wr10, wr11          \n\t"
+        "wunpckehub wr11, wr11          \n\t"
+        "waddhus wr0, wr0, wr8          \n\t"
+        "waddhus wr1, wr1, wr9          \n\t"
+        "waddhus wr2, wr2, wr10         \n\t"
+        "waddhus wr3, wr3, wr11         \n\t"
+        "waddhus wr8, wr0, wr4          \n\t"
+        "waddhus wr9, wr1, wr5          \n\t"
+        "waddhus wr10, wr2, wr6         \n\t"
+        "waddhus wr11, wr3, wr7         \n\t"
+        "waddhus wr8, wr8, wr15         \n\t"
+        "waddhus wr9, wr9, wr15         \n\t"
+        "waddhus wr10, wr10, wr15       \n\t"
+        "waddhus wr11, wr11, wr15       \n\t"
+        "wsrlhg wr8, wr8, wcgr0         \n\t"
+        "wsrlhg wr9, wr9, wcgr0         \n\t"
+        "wldrd wr12, [%[block]]         \n\t"
+        "wldrd wr13, [%[block], #8]     \n\t"
+        "wsrlhg wr10, wr10, wcgr0       \n\t"
+        "wsrlhg wr11, wr11, wcgr0       \n\t"
+        "wpackhus wr8, wr8, wr9         \n\t"
+        "wpackhus wr9, wr10, wr11       \n\t"
+        WAVG2B" wr8, wr8, wr12          \n\t"
+        WAVG2B" wr9, wr9, wr13          \n\t"
+        "wstrd wr8, [%[block]]          \n\t"
+        "wstrd wr9, [%[block], #8]      \n\t"
+        "add %[block], %[block], %[line_size]   \n\t"
+        "subs %[h], %[h], #2            \n\t"
+        "pld [%[block]]                 \n\t"
+        "pld [%[block], #32]            \n\t"
+        "bne 1b                         \n\t"
+        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+        : [line_size]"r"(line_size)
+        : "r12", "cc", "memory"); /* "cc": cmp and subs rewrite the condition flags */
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1146 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+        preserve8
+        .text
+
+        .macro pixels16 avg=0 /* copies 16-byte rows from [r1] to [r0], four rows per pass; a nonzero avg blends them with the rows already at [r0]. Presumably r0 = destination, r1 = source, r2 = row stride in bytes, r3 = row count (callers are outside this view). */
+.if \avg
+        mov             ip,  r0                 @ second pointer used to read the existing destination rows
+.endif
+1:      vld1.64         {d0, d1},  [r1], r2     @ four source rows; r1 advances by r2 after each load
+        vld1.64         {d2, d3},  [r1], r2
+        vld1.64         {d4, d5},  [r1], r2
+        pld             [r1, r2, lsl #2]        @ prefetch hints for source rows read by the next pass
+        vld1.64         {d6, d7},  [r1], r2
+        pld             [r1]
+        pld             [r1, r2]
+        pld             [r1, r2, lsl #1]
+.if \avg
+        vld1.64         {d16,d17}, [ip,:128], r2        @ :128 requires a 16-byte aligned destination row
+        vrhadd.u8       q0,  q0,  q8            @ per byte: (src + dst + 1) >> 1
+        vld1.64         {d18,d19}, [ip,:128], r2
+        vrhadd.u8       q1,  q1,  q9
+        vld1.64         {d20,d21}, [ip,:128], r2
+        vrhadd.u8       q2,  q2,  q10
+        vld1.64         {d22,d23}, [ip,:128], r2
+        vrhadd.u8       q3,  q3,  q11
+.endif
+        subs            r3,  r3,  #4            @ four rows done; the flags set here are tested by bne below
+        vst1.64         {d0, d1},  [r0,:128], r2
+        vst1.64         {d2, d3},  [r0,:128], r2
+        vst1.64         {d4, d5},  [r0,:128], r2
+        vst1.64         {d6, d7},  [r0,:128], r2
+        bne             1b                      @ exits only when r3 reaches exactly zero, so r3 must start as a positive multiple of 4
+        bx              lr
+        .endm
+
+        .macro pixels16_x2 vhadd=vrhadd.u8 /* writes 16-byte rows to [r0] where each byte is the vhadd of a source byte at [r1] and the byte to its right; two rows per pass. The vhadd argument selects the halving-add instruction (default is the rounding form). */
+1:      vld1.64         {d0-d2},   [r1], r2     @ loads 24 bytes per row, although only the first 17 feed the result
+        vld1.64         {d4-d6},   [r1], r2
+        pld             [r1]
+        pld             [r1, r2]
+        subs            r3,  r3,  #2            @ two rows per pass; flags are tested by bne below
+        vext.8          q1,  q0,  q1,  #1       @ q1 = bytes 1..16 of the first row
+        \vhadd          q0,  q0,  q1            @ combine each byte with its right neighbour
+        vext.8          q3,  q2,  q3,  #1       @ same for the second row
+        \vhadd          q2,  q2,  q3
+        vst1.64         {d0, d1},  [r0,:128], r2
+        vst1.64         {d4, d5},  [r0,:128], r2
+        bne             1b                      @ exits only when r3 reaches exactly zero
+        bx              lr
+        .endm
+
+        .macro pixels16_y2 vhadd=vrhadd.u8 /* writes 16-byte rows to [r0] where each row is the vhadd of a source row at [r1] and the row below it; two output rows per pass. */
+        vld1.64         {d0, d1},  [r1], r2     @ prime the pipeline with the first two source rows
+        vld1.64         {d2, d3},  [r1], r2
+1:      subs            r3,  r3,  #2            @ two output rows per pass; flags are tested by bne below
+        \vhadd          q2,  q0,  q1            @ output row n from source rows n and n+1
+        vld1.64         {d0, d1},  [r1], r2     @ q0 now holds source row n+2
+        \vhadd          q3,  q0,  q1            @ output row n+1 from source rows n+2 and n+1
+        vld1.64         {d2, d3},  [r1], r2     @ source row n+3 for the next pass; on the final pass this load is never used
+        pld             [r1]
+        pld             [r1, r2]
+        vst1.64         {d4, d5},  [r0,:128], r2
+        vst1.64         {d6, d7},  [r0,:128], r2
+        bne             1b                      @ exits only when r3 reaches exactly zero
+        bx              lr
+        .endm
+
+        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 /* writes 16-byte rows to [r0] where each byte combines a 2x2 square of source bytes at [r1]: the four values are summed in 16 bits and narrowed with a shift by 2. With the defaults the rounding shift gives (sum + 2) >> 2; the no_rnd instantiation passes a plain vshrn and adds a bias of 1, giving (sum + 1) >> 2. */
+        vld1.64         {d0-d2},   [r1], r2     @ prime with source rows 0 and 1 (24 bytes each)
+        vld1.64         {d4-d6},   [r1], r2
+.if \no_rnd
+        vmov.i16        q13, #1                 @ bias of 1 in every 16-bit lane
+.endif
+        pld             [r1]
+        pld             [r1, r2]
+        vext.8          q1,  q0,  q1,  #1       @ rows shifted by one byte
+        vext.8          q3,  q2,  q3,  #1
+        vaddl.u8        q8,  d0,  d2            @ q8/q10 = widened horizontal pair sums of row 0 (low/high 8 pixels)
+        vaddl.u8        q10, d1,  d3
+        vaddl.u8        q9,  d4,  d6            @ q9/q11 = the same for row 1
+        vaddl.u8        q11, d5,  d7
+1:      subs            r3,  r3,  #2            @ two output rows per pass; flags are tested by bgt at the end
+        vld1.64         {d0-d2},   [r1], r2     @ next source row
+        vadd.u16        q12, q8,  q9            @ 2x2 sums, low 8 pixels
+        pld             [r1]
+.if \no_rnd
+        vadd.u16        q12, q12, q13
+.endif
+        vext.8          q15, q0,  q1,  #1       @ shifted copy of the new row; q1 is free to reuse after this
+        vadd.u16        q1 , q10, q11           @ 2x2 sums, high 8 pixels (q1 used as scratch)
+        \vshrn          d28, q12, #2            @ narrow to bytes
+.if \no_rnd
+        vadd.u16        q1,  q1,  q13
+.endif
+        \vshrn          d29, q1,  #2
+        vaddl.u8        q8,  d0,  d30           @ pair sums of the new row replace the oldest row in q8/q10
+        vld1.64         {d2-d4},   [r1], r2     @ following source row, held in d2-d4
+        vaddl.u8        q10, d1,  d31
+        vst1.64         {d28,d29}, [r0,:128], r2        @ first output row of this pass
+        vadd.u16        q12, q8,  q9
+        pld             [r1, r2]
+.if \no_rnd
+        vadd.u16        q12, q12, q13
+.endif
+        vext.8          q2,  q1,  q2,  #1       @ shifted copy of the row held in d2-d4
+        vadd.u16        q0,  q10, q11           @ q0 used as scratch
+        \vshrn          d30, q12, #2
+.if \no_rnd
+        vadd.u16        q0,  q0,  q13
+.endif
+        \vshrn          d31, q0,  #2
+        vaddl.u8        q9,  d2,  d4            @ refresh q9/q11 with the pair sums of the newest row
+        vaddl.u8        q11, d3,  d5
+        vst1.64         {d30,d31}, [r0,:128], r2        @ second output row of this pass
+        bgt             1b                      @ repeat while the remaining row count is positive
+        bx              lr
+        .endm
+
+        .macro pixels8 avg=0 /* copies 8-byte rows from [r1] to [r0], four rows per pass; a nonzero avg blends them with the rows already at [r0]. Presumably r0 = destination, r1 = source, r2 = row stride in bytes, r3 = row count (callers are outside this view). */
+1:      vld1.64         {d0}, [r1], r2          @ four source rows; r1 advances by r2 after each load
+        vld1.64         {d1}, [r1], r2
+        vld1.64         {d2}, [r1], r2
+        pld             [r1, r2, lsl #2]        @ prefetch hints for source rows read by the next pass
+        vld1.64         {d3}, [r1], r2
+        pld             [r1]
+        pld             [r1, r2]
+        pld             [r1, r2, lsl #1]
+.if \avg
+        vld1.64         {d4}, [r0,:64], r2      @ read the existing destination rows through r0 itself
+        vrhadd.u8       d0,  d0,  d4            @ per byte: (src + dst + 1) >> 1
+        vld1.64         {d5}, [r0,:64], r2
+        vrhadd.u8       d1,  d1,  d5
+        vld1.64         {d6}, [r0,:64], r2
+        vrhadd.u8       d2,  d2,  d6
+        vld1.64         {d7}, [r0,:64], r2
+        vrhadd.u8       d3,  d3,  d7
+        sub             r0,  r0,  r2,  lsl #2   @ rewind r0 by four rows so the stores overwrite the rows just read
+.endif
+        subs            r3,  r3,  #4            @ four rows done; the flags set here are tested by bne below
+        vst1.64         {d0}, [r0,:64], r2
+        vst1.64         {d1}, [r0,:64], r2
+        vst1.64         {d2}, [r0,:64], r2
+        vst1.64         {d3}, [r0,:64], r2
+        bne             1b                      @ exits only when r3 reaches exactly zero, so r3 must start as a positive multiple of 4
+        bx              lr
+        .endm
+
+        .macro pixels8_x2 vhadd=vrhadd.u8 /* writes 8-byte rows to [r0] where each byte is the vhadd of a source byte at [r1] and the byte to its right; two rows per pass. */
+1:      vld1.64         {d0, d1},  [r1], r2     @ loads 16 bytes per row, although only the first 9 feed the result
+        vext.8          d1,  d0,  d1,  #1       @ d1 = bytes 1..8 of the first row
+        vld1.64         {d2, d3},  [r1], r2
+        vext.8          d3,  d2,  d3,  #1       @ d3 = bytes 1..8 of the second row
+        pld             [r1]
+        pld             [r1, r2]
+        subs            r3,  r3,  #2            @ two rows per pass; flags are tested by bne below
+        vswp            d1,  d2                 @ regroup: q0 = both rows, q1 = both shifted rows
+        \vhadd          q0,  q0,  q1            @ one 16-byte operation handles both rows
+        vst1.64         {d0},      [r0,:64], r2
+        vst1.64         {d1},      [r0,:64], r2
+        bne             1b                      @ exits only when r3 reaches exactly zero
+        bx              lr
+        .endm
+
+        .macro pixels8_y2 vhadd=vrhadd.u8 /* writes 8-byte rows to [r0] where each row is the vhadd of a source row at [r1] and the row below it; two output rows per pass. */
+        vld1.64         {d0},      [r1], r2     @ prime the pipeline with the first two source rows
+        vld1.64         {d1},      [r1], r2
+1:      subs            r3,  r3,  #2            @ two output rows per pass; flags are tested by bne below
+        \vhadd          d4,  d0,  d1            @ output row n from source rows n and n+1
+        vld1.64         {d0},      [r1], r2     @ d0 now holds source row n+2
+        \vhadd          d5,  d0,  d1            @ output row n+1 from source rows n+2 and n+1
+        vld1.64         {d1},      [r1], r2     @ source row n+3 for the next pass; on the final pass this load is never used
+        pld             [r1]
+        pld             [r1, r2]
+        vst1.64         {d4},      [r0,:64], r2
+        vst1.64         {d5},      [r0,:64], r2
+        bne             1b                      @ exits only when r3 reaches exactly zero
+        bx              lr
+        .endm
+
+        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 /* writes 8-byte rows to [r0] where each byte combines a 2x2 square of source bytes at [r1]: the four values are summed in 16 bits and narrowed with a shift by 2. With the defaults the rounding shift gives (sum + 2) >> 2; the no_rnd instantiation passes a plain vshrn and adds a bias of 1, giving (sum + 1) >> 2. */
+        vld1.64         {d0, d1},  [r1], r2     @ prime with source rows 0 and 1 (16 bytes each)
+        vld1.64         {d2, d3},  [r1], r2
+.if \no_rnd
+        vmov.i16        q11, #1                 @ bias of 1 in every 16-bit lane
+.endif
+        pld             [r1]
+        pld             [r1, r2]
+        vext.8          d4,  d0,  d1,  #1       @ rows shifted by one byte
+        vext.8          d6,  d2,  d3,  #1
+        vaddl.u8        q8,  d0,  d4            @ q8 = widened horizontal pair sums of row 0
+        vaddl.u8        q9,  d2,  d6            @ q9 = the same for row 1
+1:      subs            r3,  r3,  #2            @ two output rows per pass; flags are tested by bgt at the end
+        vld1.64         {d0, d1},  [r1], r2     @ next source row
+        pld             [r1]
+        vadd.u16        q10, q8,  q9            @ 2x2 sums for the first output row
+        vext.8          d4,  d0,  d1,  #1
+.if \no_rnd
+        vadd.u16        q10, q10, q11
+.endif
+        vaddl.u8        q8,  d0,  d4            @ pair sums of the new row replace the oldest row in q8
+        \vshrn          d5,  q10, #2            @ narrow to bytes
+        vld1.64         {d2, d3},  [r1], r2     @ following source row
+        vadd.u16        q10, q8,  q9            @ 2x2 sums for the second output row
+        pld             [r1, r2]
+.if \no_rnd
+        vadd.u16        q10, q10, q11
+.endif
+        vst1.64         {d5},      [r0,:64], r2
+        \vshrn          d7,  q10, #2
+        vext.8          d6,  d2,  d3,  #1
+        vaddl.u8        q9,  d2,  d6            @ refresh q9 with the pair sums of the newest row
+        vst1.64         {d7},      [r0,:64], r2
+        bgt             1b                      @ repeat while the remaining row count is positive
+        bx              lr
+        .endm
+
+        .macro pixfunc pfx name suf rnd_op args:vararg /* emits one exported function named ff_<pfx><name><suf>_neon whose body is the macro called name, expanded with rnd_op and args as its arguments */
+function ff_\pfx\name\suf\()_neon, export=1 /* function and endfunc are defined outside this view (presumably in asm.S, included above) */
+        \name \rnd_op \args
+endfunc
+        .endm
+
+        .macro pixfunc2 pfx name args:vararg /* emits two functions from the same body macro: one with its default arguments and one built from args (a name suffix followed by the body macro arguments) */
+        pixfunc \pfx \name /* default variant: empty suffix, body macro defaults */
+        pixfunc \pfx \name \args /* variant with the caller-supplied suffix and arguments */
+        .endm
+
+function ff_put_h264_qpel16_mc00_neon, export=1 /* entry point that fixes the row count at 16 and then runs the code emitted directly below */
+        mov             r3,  #16                @ r3 = 16 rows; no return follows, so control continues into the next emitted function
+endfunc
+
+        pixfunc  put_ pixels16 /* emits ff_put_pixels16_neon, which the stub above falls into (this relies on function/endfunc adding no padding; they are defined outside this view) */
+        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8 /* ff_put_pixels16_x2_neon and ff_put_pixels16_x2_no_rnd_neon (the latter uses the truncating vhadd.u8) */
+        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8 /* ff_put_pixels16_y2_neon and ff_put_pixels16_y2_no_rnd_neon */
+        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 /* ff_put_pixels16_xy2_neon and ff_put_pixels16_xy2_no_rnd_neon (plain shift plus the no_rnd bias) */
+
+function ff_avg_h264_qpel16_mc00_neon, export=1 /* entry point that fixes the row count at 16 and then runs the code emitted directly below */
+        mov             r3,  #16                @ r3 = 16 rows; no return follows, so control continues into the next emitted function
+endfunc
+
+        pixfunc  avg_ pixels16,, 1 /* emits ff_avg_pixels16_neon: empty suffix, and the 1 lands in the avg argument of pixels16 */
+
+function ff_put_h264_qpel8_mc00_neon, export=1 /* entry point that fixes the row count at 8 and then runs the code emitted directly below */
+        mov             r3,  #8                 @ r3 = 8 rows; no return follows, so control continues into the next emitted function
+endfunc
+
+        pixfunc  put_ pixels8 /* emits ff_put_pixels8_neon, which the stub above falls into (this relies on function/endfunc adding no padding; they are defined outside this view) */
+        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8 /* ff_put_pixels8_x2_neon and ff_put_pixels8_x2_no_rnd_neon (the latter uses the truncating vhadd.u8) */
+        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8 /* ff_put_pixels8_y2_neon and ff_put_pixels8_y2_no_rnd_neon */
+        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1 /* ff_put_pixels8_xy2_neon and ff_put_pixels8_xy2_no_rnd_neon (plain shift plus the no_rnd bias) */
+
+function ff_avg_h264_qpel8_mc00_neon, export=1 /* entry point that fixes the row count at 8 and then runs the code emitted directly below */
+        mov             r3,  #8                 @ r3 = 8 rows; no return follows, so control continues into the next emitted function
+endfunc
+
+        pixfunc  avg_ pixels8,, 1 /* emits ff_avg_pixels8_neon: empty suffix, and the 1 lands in the avg argument of pixels8 */
+
+function ff_put_pixels_clamped_neon, export=1 /* reads 64 signed 16-bit values from [r0], clamps each to 0..255 and writes them as eight 8-byte rows to [r1] with a row step of r2 */
+        vld1.64         {d16-d19}, [r0,:128]!   @ each load brings in 16 values (two output rows)
+        vqmovun.s16     d0, q8                  @ saturating narrow: signed 16-bit to unsigned 8-bit
+        vld1.64         {d20-d23}, [r0,:128]!
+        vqmovun.s16     d1, q9
+        vld1.64         {d24-d27}, [r0,:128]!
+        vqmovun.s16     d2, q10
+        vld1.64         {d28-d31}, [r0,:128]!
+        vqmovun.s16     d3, q11
+        vst1.64         {d0},      [r1,:64], r2 @ stores are interleaved with the remaining narrowing operations
+        vqmovun.s16     d4, q12
+        vst1.64         {d1},      [r1,:64], r2
+        vqmovun.s16     d5, q13
+        vst1.64         {d2},      [r1,:64], r2
+        vqmovun.s16     d6, q14
+        vst1.64         {d3},      [r1,:64], r2
+        vqmovun.s16     d7, q15
+        vst1.64         {d4},      [r1,:64], r2
+        vst1.64         {d5},      [r1,:64], r2
+        vst1.64         {d6},      [r1,:64], r2
+        vst1.64         {d7},      [r1,:64], r2
+        bx              lr
+endfunc
+
+function ff_put_signed_pixels_clamped_neon, export=1 /* reads 64 signed 16-bit values from [r0], clamps each to -128..127, adds 128 and writes the bytes as eight 8-byte rows to [r1] with a row step of r2 */
+        vmov.u8         d31, #128               @ offset that maps -128..127 onto 0..255
+        vld1.64         {d16-d17}, [r0,:128]!   @ each load brings in 8 values (one output row)
+        vqmovn.s16      d0, q8                  @ saturating narrow: signed 16-bit to signed 8-bit
+        vld1.64         {d18-d19}, [r0,:128]!
+        vqmovn.s16      d1, q9
+        vld1.64         {d16-d17}, [r0,:128]!
+        vqmovn.s16      d2, q8
+        vld1.64         {d18-d19}, [r0,:128]!
+        vadd.u8         d0, d0, d31             @ modular byte add of the 128 offset
+        vld1.64         {d20-d21}, [r0,:128]!
+        vadd.u8         d1, d1, d31
+        vld1.64         {d22-d23}, [r0,:128]!
+        vadd.u8         d2, d2, d31
+        vst1.64         {d0},      [r1,:64], r2
+        vqmovn.s16      d3, q9
+        vst1.64         {d1},      [r1,:64], r2
+        vqmovn.s16      d4, q10
+        vst1.64         {d2},      [r1,:64], r2
+        vqmovn.s16      d5, q11
+        vld1.64         {d24-d25}, [r0,:128]!
+        vadd.u8         d3, d3, d31
+        vld1.64         {d26-d27}, [r0,:128]!
+        vadd.u8         d4, d4, d31
+        vadd.u8         d5, d5, d31
+        vst1.64         {d3},      [r1,:64], r2
+        vqmovn.s16      d6, q12
+        vst1.64         {d4},      [r1,:64], r2
+        vqmovn.s16      d7, q13
+        vst1.64         {d5},      [r1,:64], r2
+        vadd.u8         d6, d6, d31
+        vadd.u8         d7, d7, d31
+        vst1.64         {d6},      [r1,:64], r2
+        vst1.64         {d7},      [r1,:64], r2
+        bx              lr
+endfunc
+
+function ff_add_pixels_clamped_neon, export=1 /* for eight 8-byte rows at [r1] (row step r2): adds the matching eight signed 16-bit values from [r0] to the existing bytes, clamps to 0..255 and writes the row back in place */
+        mov             r3, r1                  @ r3 = write pointer; r1 keeps running ahead as the read pointer
+        vld1.64         {d16},   [r1,:64], r2   @ existing pixel row
+        vld1.64         {d0-d1}, [r0,:128]!     @ eight 16-bit values to add
+        vaddw.u8        q0, q0, d16             @ widen the bytes and add in 16-bit lanes
+        vld1.64         {d17},   [r1,:64], r2
+        vld1.64         {d2-d3}, [r0,:128]!
+        vqmovun.s16     d0, q0                  @ saturating narrow: signed 16-bit to unsigned 8-bit
+        vld1.64         {d18},   [r1,:64], r2
+        vaddw.u8        q1, q1, d17
+        vld1.64         {d4-d5}, [r0,:128]!
+        vaddw.u8        q2, q2, d18
+        vst1.64         {d0},    [r3,:64], r2   @ row 0 written back
+        vqmovun.s16     d2, q1
+        vld1.64         {d19},   [r1,:64], r2
+        vld1.64         {d6-d7}, [r0,:128]!
+        vaddw.u8        q3, q3, d19
+        vqmovun.s16     d4, q2
+        vst1.64         {d2},    [r3,:64], r2
+        vld1.64         {d16},   [r1,:64], r2   @ second half: rows 4..7 reuse the same registers
+        vqmovun.s16     d6, q3
+        vld1.64         {d0-d1}, [r0,:128]!
+        vaddw.u8        q0, q0, d16
+        vst1.64         {d4},    [r3,:64], r2
+        vld1.64         {d17},   [r1,:64], r2
+        vld1.64         {d2-d3}, [r0,:128]!
+        vaddw.u8        q1, q1, d17
+        vst1.64         {d6},    [r3,:64], r2
+        vqmovun.s16     d0, q0
+        vld1.64         {d18},   [r1,:64], r2
+        vld1.64         {d4-d5}, [r0,:128]!
+        vaddw.u8        q2, q2, d18
+        vst1.64         {d0},    [r3,:64], r2
+        vqmovun.s16     d2, q1
+        vld1.64         {d19},   [r1,:64], r2
+        vqmovun.s16     d4, q2
+        vld1.64         {d6-d7}, [r0,:128]!
+        vaddw.u8        q3, q3, d19
+        vst1.64         {d2},    [r3,:64], r2
+        vqmovun.s16     d6, q3
+        vst1.64         {d4},    [r3,:64], r2
+        vst1.64         {d6},    [r3,:64], r2
+        bx              lr
+endfunc
+
+function ff_float_to_int16_neon, export=1 /* converts r2 floats at [r1] to 16-bit integers at [r0]; both pointers need 16-byte alignment (:128). The control flow only works out for a count that is a positive multiple of 8 (TODO confirm against callers). */
+        subs            r2,  r2,  #8            @ the first 8 inputs are loaded and converted up front
+        vld1.64         {d0-d1},  [r1,:128]!
+        vcvt.s32.f32    q8,  q0,  #16           @ saturating convert to 32-bit fixed point with 16 fraction bits
+        vld1.64         {d2-d3},  [r1,:128]!
+        vcvt.s32.f32    q9,  q1,  #16
+        beq             3f                      @ exactly 8 inputs: just store the pending results
+        bics            ip,  r2,  #15           @ ip = remaining count rounded down to a multiple of 16
+        beq             2f                      @ fewer than 16 remain: handle the last 8 at label 2
+1:      subs            ip,  ip,  #16           @ main loop: 16 outputs per pass, with 8 converted inputs kept pending
+        vshrn.s32       d4,  q8,  #16           @ keep the upper 16 bits, i.e. the integer part clamped to the 16-bit range
+        vld1.64         {d0-d1},  [r1,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vshrn.s32       d5,  q9,  #16
+        vld1.64         {d2-d3},  [r1,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vshrn.s32       d6,  q0,  #16
+        vst1.64         {d4-d5},  [r0,:128]!
+        vshrn.s32       d7,  q1,  #16
+        vld1.64         {d16-d17},[r1,:128]!    @ preload and convert the 8 inputs that stay pending
+        vcvt.s32.f32    q8,  q8,  #16
+        vld1.64         {d18-d19},[r1,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vst1.64         {d6-d7},  [r0,:128]!
+        bne             1b
+        ands            r2,  r2,  #15           @ anything left beyond the pending 8?
+        beq             3f
+2:      vld1.64         {d0-d1},  [r1,:128]!    @ tail: the pending 8 plus 8 more inputs
+        vshrn.s32       d4,  q8,  #16
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r1,:128]!
+        vshrn.s32       d5,  q9,  #16
+        vcvt.s32.f32    q1,  q1,  #16
+        vshrn.s32       d6,  q0,  #16
+        vst1.64         {d4-d5},  [r0,:128]!
+        vshrn.s32       d7,  q1,  #16
+        vst1.64         {d6-d7},  [r0,:128]!
+        bx              lr
+3:      vshrn.s32       d4,  q8,  #16           @ tail: only the pending 8 remain
+        vshrn.s32       d5,  q9,  #16
+        vst1.64         {d4-d5},  [r0,:128]!
+        bx              lr
+endfunc
+
+function ff_float_to_int16_interleave_neon, export=1 /* converts floats from several per-channel arrays to interleaved 16-bit integers at [r0]. r1 points to a table of source pointers, r2 is the per-channel sample count and r3 the channel count. Sources are read with 16-byte alignment (:128); the loops assume a count that is a multiple of 8 (TODO confirm against callers). */
+        cmp             r3, #2
+        ldrlt           r1, [r1]                @ fewer than two channels: take the first source pointer ...
+        blt             ff_float_to_int16_neon  @ ... and tail-call the plain converter
+        bne             4f                      @ more than two channels: generic path below
+
+        ldr             r3, [r1]                @ exactly two channels: r3 = first source, r1 = second source
+        ldr             r1, [r1, #4]
+
+        subs            r2,  r2,  #8            @ the first 8 samples of each channel are loaded up front
+        vld1.64         {d0-d1},  [r3,:128]!
+        vcvt.s32.f32    q8,  q0,  #16           @ saturating convert to 32-bit fixed point with 16 fraction bits
+        vld1.64         {d2-d3},  [r3,:128]!
+        vcvt.s32.f32    q9,  q1,  #16
+        vld1.64         {d20-d21},[r1,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vld1.64         {d22-d23},[r1,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        beq             3f                      @ exactly 8 samples: store the pending results
+        bics            ip,  r2,  #15           @ ip = remaining count rounded down to a multiple of 16
+        beq             2f
+1:      subs            ip,  ip,  #16           @ main stereo loop: 16 sample pairs per pass
+        vld1.64         {d0-d1},  [r3,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vsri.32         q10, q8,  #16           @ each 32-bit lane = second channel in the upper half, first channel in the lower half
+        vld1.64         {d2-d3},  [r3,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vld1.64         {d24-d25},[r1,:128]!
+        vcvt.s32.f32    q12, q12, #16
+        vld1.64         {d26-d27},[r1,:128]!
+        vsri.32         q11, q9,  #16
+        vst1.64         {d20-d21},[r0,:128]!
+        vcvt.s32.f32    q13, q13, #16
+        vst1.64         {d22-d23},[r0,:128]!
+        vsri.32         q12, q0,  #16
+        vld1.64         {d16-d17},[r3,:128]!    @ preload the 8 samples per channel that stay pending
+        vsri.32         q13, q1,  #16
+        vst1.64         {d24-d25},[r0,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vld1.64         {d18-d19},[r3,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vld1.64         {d20-d21},[r1,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vld1.64         {d22-d23},[r1,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        vst1.64         {d26-d27},[r0,:128]!
+        bne             1b
+        ands            r2,  r2,  #15           @ anything left beyond the pending 8?
+        beq             3f
+2:      vsri.32         q10, q8,  #16           @ stereo tail: the pending 8 plus 8 more sample pairs
+        vld1.64         {d0-d1},  [r3,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r3,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vld1.64         {d24-d25},[r1,:128]!
+        vcvt.s32.f32    q12, q12, #16
+        vsri.32         q11, q9,  #16
+        vld1.64         {d26-d27},[r1,:128]!
+        vcvt.s32.f32    q13, q13, #16
+        vst1.64         {d20-d21},[r0,:128]!
+        vsri.32         q12, q0,  #16
+        vst1.64         {d22-d23},[r0,:128]!
+        vsri.32         q13, q1,  #16
+        vst1.64         {d24-d27},[r0,:128]!
+        bx              lr
+3:      vsri.32         q10, q8,  #16           @ stereo tail: only the pending 8 sample pairs remain
+        vsri.32         q11, q9,  #16
+        vst1.64         {d20-d23},[r0,:128]!
+        bx              lr
+
+4:      push            {r4-r8,lr}              @ generic path: channels are consumed in groups of 4, then 2, then 1
+        cmp             r3,  #4
+        lsl             ip,  r3,  #1            @ ip = channel count * 2 = byte step between output frames
+        blt             4f
+
+        @ 4 channels
+5:      ldmia           r1!, {r4-r7}            @ fetch the next four source pointers
+        mov             lr,  r2                 @ lr = samples left for this channel group
+        mov             r8,  r0                 @ r8 = output cursor for this channel group
+        vld1.64         {d16-d17},[r4,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vld1.64         {d20-d21},[r6,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vld1.64         {d22-d23},[r7,:128]!
+        vcvt.s32.f32    q11, q11, #16
+6:      subs            lr,  lr,  #8            @ 8 frames per pass: 4 pending plus 4 loaded here
+        vld1.64         {d0-d1},  [r4,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vsri.32         q9,  q8,  #16           @ pack the first and second channel of the group into 32-bit lanes
+        vld1.64         {d2-d3},  [r5,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vsri.32         q11, q10, #16           @ pack the third and fourth channel likewise
+        vld1.64         {d4-d5},  [r6,:128]!
+        vcvt.s32.f32    q2,  q2,  #16
+        vzip.32         d18, d22                @ regroup so each d register holds all four channels of one frame
+        vld1.64         {d6-d7},  [r7,:128]!
+        vcvt.s32.f32    q3,  q3,  #16
+        vzip.32         d19, d23
+        vst1.64         {d18},    [r8], ip      @ 8 bytes per frame, stepping one whole output frame
+        vsri.32         q1,  q0,  #16
+        vst1.64         {d22},    [r8], ip
+        vsri.32         q3,  q2,  #16
+        vst1.64         {d19},    [r8], ip
+        vzip.32         d2,  d6
+        vst1.64         {d23},    [r8], ip
+        vzip.32         d3,  d7
+        beq             7f                      @ flags still come from the subs at label 6
+        vld1.64         {d16-d17},[r4,:128]!    @ more to do: preload the next pending frames while storing
+        vcvt.s32.f32    q8,  q8,  #16
+        vst1.64         {d2},     [r8], ip
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vst1.64         {d6},     [r8], ip
+        vld1.64         {d20-d21},[r6,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vst1.64         {d3},     [r8], ip
+        vld1.64         {d22-d23},[r7,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        vst1.64         {d7},     [r8], ip
+        b               6b
+7:      vst1.64         {d2},     [r8], ip      @ last four frames of this channel group
+        vst1.64         {d6},     [r8], ip
+        vst1.64         {d3},     [r8], ip
+        vst1.64         {d7},     [r8], ip
+        subs            r3,  r3,  #4            @ four channels finished
+        popeq           {r4-r8,pc}              @ none left: return
+        cmp             r3,  #4
+        add             r0,  r0,  #8            @ move the output base past the four channels just written (does not alter the cmp flags)
+        bge             5b
+
+        @ 2 channels
+4:      cmp             r3,  #2
+        blt             4f                      @ only one channel left
+        ldmia           r1!, {r4-r5}            @ fetch the next two source pointers
+        mov             lr,  r2
+        mov             r8,  r0
+        tst             lr,  #8                 @ flags are consumed by the beq after the loads below
+        vld1.64         {d16-d17},[r4,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vld1.64         {d20-d21},[r4,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vld1.64         {d22-d23},[r5,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        beq             6f                      @ bit 3 of the count clear: go straight to the 16-frame loop
+        subs            lr,  lr,  #8            @ otherwise emit one block of 8 frames first
+        beq             7f                      @ that block was everything
+        vsri.32         d18, d16, #16           @ pack the channel pair into 32-bit lanes
+        vsri.32         d19, d17, #16
+        vld1.64         {d16-d17},[r4,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vst1.32         {d18[0]}, [r8], ip      @ 4 bytes per frame, stepping one whole output frame
+        vsri.32         d22, d20, #16
+        vst1.32         {d18[1]}, [r8], ip
+        vsri.32         d23, d21, #16
+        vst1.32         {d19[0]}, [r8], ip
+        vst1.32         {d19[1]}, [r8], ip
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vst1.32         {d22[0]}, [r8], ip
+        vst1.32         {d22[1]}, [r8], ip
+        vld1.64         {d20-d21},[r4,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vst1.32         {d23[0]}, [r8], ip
+        vst1.32         {d23[1]}, [r8], ip
+        vld1.64         {d22-d23},[r5,:128]!
+        vcvt.s32.f32    q11, q11, #16
+6:      subs            lr,  lr,  #16           @ 16 frames per pass: 8 pending plus 8 loaded here
+        vld1.64         {d0-d1},  [r4,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vsri.32         d18, d16, #16
+        vld1.64         {d2-d3},  [r5,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vsri.32         d19, d17, #16
+        vld1.64         {d4-d5},  [r4,:128]!
+        vcvt.s32.f32    q2,  q2,  #16
+        vld1.64         {d6-d7},  [r5,:128]!
+        vcvt.s32.f32    q3,  q3,  #16
+        vst1.32         {d18[0]}, [r8], ip
+        vsri.32         d22, d20, #16
+        vst1.32         {d18[1]}, [r8], ip
+        vsri.32         d23, d21, #16
+        vst1.32         {d19[0]}, [r8], ip
+        vsri.32         d2,  d0,  #16
+        vst1.32         {d19[1]}, [r8], ip
+        vsri.32         d3,  d1,  #16
+        vst1.32         {d22[0]}, [r8], ip
+        vsri.32         d6,  d4,  #16
+        vst1.32         {d22[1]}, [r8], ip
+        vsri.32         d7,  d5,  #16
+        vst1.32         {d23[0]}, [r8], ip
+        vst1.32         {d23[1]}, [r8], ip
+        beq             6f                      @ count exhausted: the forward label 6 stores the final 8 frames
+        vld1.64         {d16-d17},[r4,:128]!    @ more to do: preload the next pending frames while storing
+        vcvt.s32.f32    q8,  q8,  #16
+        vst1.32         {d2[0]},  [r8], ip
+        vst1.32         {d2[1]},  [r8], ip
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vst1.32         {d3[0]},  [r8], ip
+        vst1.32         {d3[1]},  [r8], ip
+        vld1.64         {d20-d21},[r4,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vst1.32         {d6[0]},  [r8], ip
+        vst1.32         {d6[1]},  [r8], ip
+        vld1.64         {d22-d23},[r5,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        vst1.32         {d7[0]},  [r8], ip
+        vst1.32         {d7[1]},  [r8], ip
+        bgt             6b                      @ back to the loop head (the nearest earlier label 6)
+6:      vst1.32         {d2[0]},  [r8], ip      @ final 8 frames of the 16-frame loop
+        vst1.32         {d2[1]},  [r8], ip
+        vst1.32         {d3[0]},  [r8], ip
+        vst1.32         {d3[1]},  [r8], ip
+        vst1.32         {d6[0]},  [r8], ip
+        vst1.32         {d6[1]},  [r8], ip
+        vst1.32         {d7[0]},  [r8], ip
+        vst1.32         {d7[1]},  [r8], ip
+        b               8f
+7:      vsri.32         d18, d16, #16           @ count was exactly 8: pack and store the pending frames
+        vsri.32         d19, d17, #16
+        vst1.32         {d18[0]}, [r8], ip
+        vsri.32         d22, d20, #16
+        vst1.32         {d18[1]}, [r8], ip
+        vsri.32         d23, d21, #16
+        vst1.32         {d19[0]}, [r8], ip
+        vst1.32         {d19[1]}, [r8], ip
+        vst1.32         {d22[0]}, [r8], ip
+        vst1.32         {d22[1]}, [r8], ip
+        vst1.32         {d23[0]}, [r8], ip
+        vst1.32         {d23[1]}, [r8], ip
+8:      subs            r3,  r3,  #2            @ two channels finished
+        add             r0,  r0,  #4            @ move the output base past them (does not alter the subs flags)
+        popeq           {r4-r8,pc}              @ none left: return
+
+        @ 1 channel
+4:      ldr             r4,  [r1],#4            @ last remaining source pointer
+        tst             r2,  #8                 @ flags are consumed by the bne after the loads below
+        mov             lr,  r2
+        mov             r5,  r0                 @ r5 = output cursor
+        vld1.64         {d0-d1},  [r4,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r4,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        bne             8f                      @ bit 3 of the count set: emit one block of 8 samples first
+6:      subs            lr,  lr,  #16           @ 16 samples per pass
+        vld1.64         {d4-d5},  [r4,:128]!
+        vcvt.s32.f32    q2,  q2,  #16
+        vld1.64         {d6-d7},  [r4,:128]!
+        vcvt.s32.f32    q3,  q3,  #16
+        vst1.16         {d0[1]},  [r5,:16], ip  @ odd 16-bit lanes are the upper halves of the 32-bit results
+        vst1.16         {d0[3]},  [r5,:16], ip
+        vst1.16         {d1[1]},  [r5,:16], ip
+        vst1.16         {d1[3]},  [r5,:16], ip
+        vst1.16         {d2[1]},  [r5,:16], ip
+        vst1.16         {d2[3]},  [r5,:16], ip
+        vst1.16         {d3[1]},  [r5,:16], ip
+        vst1.16         {d3[3]},  [r5,:16], ip
+        beq             7f                      @ count exhausted: skip the preload
+        vld1.64         {d0-d1},  [r4,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r4,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+7:      vst1.16         {d4[1]},  [r5,:16], ip
+        vst1.16         {d4[3]},  [r5,:16], ip
+        vst1.16         {d5[1]},  [r5,:16], ip
+        vst1.16         {d5[3]},  [r5,:16], ip
+        vst1.16         {d6[1]},  [r5,:16], ip
+        vst1.16         {d6[3]},  [r5,:16], ip
+        vst1.16         {d7[1]},  [r5,:16], ip
+        vst1.16         {d7[3]},  [r5,:16], ip
+        bgt             6b
+        pop             {r4-r8,pc}
+8:      subs            lr,  lr,  #8            @ leading block of 8 samples
+        vst1.16         {d0[1]},  [r5,:16], ip
+        vst1.16         {d0[3]},  [r5,:16], ip
+        vst1.16         {d1[1]},  [r5,:16], ip
+        vst1.16         {d1[3]},  [r5,:16], ip
+        vst1.16         {d2[1]},  [r5,:16], ip
+        vst1.16         {d2[3]},  [r5,:16], ip
+        vst1.16         {d3[1]},  [r5,:16], ip
+        vst1.16         {d3[3]},  [r5,:16], ip
+        popeq           {r4-r8,pc}              @ that block was everything: return
+        vld1.64         {d0-d1},  [r4,:128]!    @ otherwise reload q0/q1 and join the 16-sample loop
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r4,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        b               6b
+endfunc
+
+function ff_vector_fmul_neon, export=1 /* multiplies the r2 floats at [r0] element-wise by the floats at [r1] and writes the products back over [r0]; both arrays need 16-byte alignment (:128). The control flow only works out for a count that is a positive multiple of 8 (TODO confirm against callers). */
+        mov             r3,  r0                 @ r3 = write pointer; r0 keeps running ahead as a read pointer
+        subs            r2,  r2,  #8            @ the first 8 products are computed up front
+        vld1.64         {d0-d3},  [r0,:128]!
+        vld1.64         {d4-d7},  [r1,:128]!
+        vmul.f32        q8,  q0,  q2
+        vmul.f32        q9,  q1,  q3
+        beq             3f                      @ exactly 8 elements: just store the pending products
+        bics            ip,  r2,  #15           @ ip = remaining count rounded down to a multiple of 16
+        beq             2f
+1:      subs            ip,  ip,  #16           @ main loop: 16 products stored per pass, with 8 kept pending in q8/q9
+        vld1.64         {d0-d1},  [r0,:128]!
+        vld1.64         {d4-d5},  [r1,:128]!
+        vmul.f32        q10, q0,  q2
+        vld1.64         {d2-d3},  [r0,:128]!
+        vld1.64         {d6-d7},  [r1,:128]!
+        vmul.f32        q11, q1,  q3
+        vst1.64         {d16-d19},[r3,:128]!    @ store the products that were pending on entry to this pass
+        vld1.64         {d0-d1},  [r0,:128]!
+        vld1.64         {d4-d5},  [r1,:128]!
+        vmul.f32        q8,  q0,  q2
+        vld1.64         {d2-d3},  [r0,:128]!
+        vld1.64         {d6-d7},  [r1,:128]!
+        vmul.f32        q9,  q1,  q3
+        vst1.64         {d20-d23},[r3,:128]!
+        bne             1b
+        ands            r2,  r2,  #15           @ anything left beyond the pending 8?
+        beq             3f
+2:      vld1.64         {d0-d1},  [r0,:128]!    @ tail: store the pending 8 while computing 8 more
+        vld1.64         {d4-d5},  [r1,:128]!
+        vst1.64         {d16-d17},[r3,:128]!
+        vmul.f32        q8,  q0,  q2
+        vld1.64         {d2-d3},  [r0,:128]!
+        vld1.64         {d6-d7},  [r1,:128]!
+        vst1.64         {d18-d19},[r3,:128]!
+        vmul.f32        q9,  q1,  q3
+3:      vst1.64         {d16-d19},[r3,:128]!    @ store the final 8 products
+        bx              lr
+endfunc
+
+function ff_vector_fmul_window_neon, export=1 /* windowed combine of two float arrays: [r1] and [r3] are walked forwards while [r2] and a second pointer into the r3 array are walked backwards; one result stream is written forwards from r0 and a mirrored one backwards from the far end of the r0 array. A bias value is added to every result. The VFP/NOVFP line prefixes are defined outside this view; presumably exactly one line of each pair is assembled, depending on how float arguments are passed. */
+VFP     vdup.32         q8,  d0[0] /* bias taken from the first float register and broadcast to all lanes */
+NOVFP   vld1.32         {d16[],d17[]}, [sp,:32] /* bias taken from the first stack slot and broadcast to all lanes */
+        push            {r4,r5,lr}              @ 12 bytes pushed, so stack arguments move up by 12
+VFP     ldr             lr,  [sp, #12] /* count = first stack argument */
+NOVFP   ldr             lr,  [sp, #16] /* count = second stack argument (the first one was the bias) */
+        sub             r2,  r2,  #8
+        sub             r5,  lr,  #2
+        add             r2,  r2,  r5, lsl #2    @ r2 = original r2 + 4*count - 16: the last four floats of a count-long array
+        add             r4,  r3,  r5, lsl #3    @ r4 = r3 + 8*count - 16: the last four floats of a 2*count-long array
+        add             ip,  r0,  r5, lsl #3    @ ip = r0 + 8*count - 16: backward write position
+        mov             r5,  #-16               @ step for the pointers that walk backwards
+        vld1.64         {d0,d1},  [r1,:128]!
+        vld1.64         {d2,d3},  [r2,:128], r5
+        vld1.64         {d4,d5},  [r3,:128]!
+        vld1.64         {d6,d7},  [r4,:128], r5
+1:      subs            lr,  lr,  #4            @ four floats from each stream per pass; flags are tested by beq below
+        vmov            q11, q8                 @ both accumulators start from the bias
+        vmla.f32        d22, d0,  d4
+        vmov            q10, q8
+        vmla.f32        d23, d1,  d5
+        vrev64.32       q3,  q3                 @ reverse lane pairs of the backward-read blocks
+        vmla.f32        d20, d0,  d7
+        vrev64.32       q1,  q1
+        vmla.f32        d21, d1,  d6
+        beq             2f                      @ last block: finish without preloading
+        vmla.f32        d22, d3,  d7
+        vld1.64         {d0,d1},  [r1,:128]!
+        vmla.f32        d23, d2,  d6
+        vld1.64         {d18,d19},[r2,:128], r5 @ next blocks go to scratch registers while q1/q2 are still in use
+        vmls.f32        d20, d3,  d4
+        vld1.64         {d24,d25},[r3,:128]!
+        vmls.f32        d21, d2,  d5
+        vld1.64         {d6,d7},  [r4,:128], r5
+        vmov            q1,  q9
+        vrev64.32       q11, q11                @ together with the vswp below this reverses all four lanes of q11
+        vmov            q2,  q12
+        vswp            d22, d23
+        vst1.64         {d20,d21},[r0,:128]!    @ forward output
+        vst1.64         {d22,d23},[ip,:128], r5 @ mirrored output, written backwards
+        b               1b
+2:      vmla.f32        d22, d3,  d7
+        vmla.f32        d23, d2,  d6
+        vmls.f32        d20, d3,  d4
+        vmls.f32        d21, d2,  d5
+        vrev64.32       q11, q11
+        vswp            d22, d23
+        vst1.64         {d20,d21},[r0,:128]!
+        vst1.64         {d22,d23},[ip,:128], r5
+        pop             {r4,r5,pc}
+endfunc
+
+#if CONFIG_VORBIS_DECODER
+function ff_vorbis_inverse_coupling_neon, export=1 /* in-place transform of two float arrays at [r0] and [r1], four elements at a time, r2 elements in total. From the function name this is presumably the Vorbis magnitude/angle decoupling step with r0 = magnitude and r1 = angle (TODO confirm against the caller). */
+        vmov.i32        q10, #1<<31             @ sign-bit mask for 32-bit floats
+        subs            r2,  r2,  #4
+        mov             r3,  r0                 @ r3/r12 = write pointers; r0/r1 run ahead as read pointers
+        mov             r12, r1
+        beq             3f                      @ exactly four elements: single block at label 3
+
+        vld1.32         {d24-d25},[r1,:128]!
+        vld1.32         {d22-d23},[r0,:128]!
+        vcle.s32        q8,  q12, #0            @ mask: r1 value is <= 0 when its bits are compared as signed integers
+        vand            q9,  q11, q10           @ sign bit of the r0 value
+        veor            q12, q12, q9            @ flip the sign of the r1 value when the r0 value is negative
+        vand            q2,  q12, q8            @ q2 = flipped value where the mask is set, else 0
+        vbic            q3,  q12, q8            @ q3 = flipped value where the mask is clear, else 0
+        vadd.f32        q12, q11, q2            @ new r0-array value
+        vsub.f32        q11, q11, q3            @ new r1-array value
+1:      vld1.32         {d2-d3},  [r1,:128]!    @ loop: compute one block while storing the previous one
+        vld1.32         {d0-d1},  [r0,:128]!
+        vcle.s32        q8,  q1,  #0
+        vand            q9,  q0,  q10
+        veor            q1,  q1,  q9
+        vst1.32         {d24-d25},[r3, :128]!
+        vst1.32         {d22-d23},[r12,:128]!
+        vand            q2,  q1,  q8
+        vbic            q3,  q1,  q8
+        vadd.f32        q1,  q0,  q2
+        vsub.f32        q0,  q0,  q3
+        subs            r2,  r2,  #8            @ two blocks (8 elements) accounted for per full pass
+        ble             2f
+        vld1.32         {d24-d25},[r1,:128]!
+        vld1.32         {d22-d23},[r0,:128]!
+        vcle.s32        q8,  q12, #0
+        vand            q9,  q11, q10
+        veor            q12, q12, q9
+        vst1.32         {d2-d3},  [r3, :128]!
+        vst1.32         {d0-d1},  [r12,:128]!
+        vand            q2,  q12, q8
+        vbic            q3,  q12, q8
+        vadd.f32        q12, q11, q2
+        vsub.f32        q11, q11, q3
+        b               1b
+
+2:      vst1.32         {d2-d3},  [r3, :128]!   @ flush the block computed last
+        vst1.32         {d0-d1},  [r12,:128]!
+        bxlt            lr                      @ flags still come from the subs above: negative means done, zero means one block remains
+
+3:      vld1.32         {d2-d3},  [r1,:128]     @ final single block, read and written through r0/r1 directly
+        vld1.32         {d0-d1},  [r0,:128]
+        vcle.s32        q8,  q1,  #0
+        vand            q9,  q0,  q10
+        veor            q1,  q1,  q9
+        vand            q2,  q1,  q8
+        vbic            q3,  q1,  q8
+        vadd.f32        q1,  q0,  q2
+        vsub.f32        q0,  q0,  q3
+        vst1.32         {d2-d3},  [r0,:128]!
+        vst1.32         {d0-d1},  [r1,:128]!
+        bx              lr
+endfunc
+#endif
+
+function ff_vector_fmul_scalar_neon, export=1          /* multiplies the floats read from r1 by one scalar and stores the products at r0 */
+VFP     len .req r2                                     /* VFP / NOVFP prefixes come from asm.S (not shown); presumably hard-float vs soft-float argument layout -- confirm */
+NOVFP   len .req r3
+VFP     vdup.32         q8,  d0[0]                      /* q8 = the scalar in all 4 lanes */
+NOVFP   vdup.32         q8,  r2
+        bics            r12, len, #15                   /* r12 = len rounded down to a multiple of 16 */
+        beq             3f                              /* fewer than 16: only the 4-wide loop runs */
+        vld1.32         {q0},[r1,:128]!
+        vld1.32         {q1},[r1,:128]!
+1:      vmul.f32        q0,  q0,  q8                    /* loop: 16 floats per pass, loads and stores interleaved with the multiplies */
+        vld1.32         {q2},[r1,:128]!
+        vmul.f32        q1,  q1,  q8
+        vld1.32         {q3},[r1,:128]!
+        vmul.f32        q2,  q2,  q8
+        vst1.32         {q0},[r0,:128]!
+        vmul.f32        q3,  q3,  q8
+        vst1.32         {q1},[r0,:128]!
+        subs            r12, r12, #16
+        beq             2f
+        vld1.32         {q0},[r1,:128]!                 /* preload the first half of the next pass */
+        vst1.32         {q2},[r0,:128]!
+        vld1.32         {q1},[r1,:128]!
+        vst1.32         {q3},[r0,:128]!
+        b               1b
+2:      vst1.32         {q2},[r0,:128]!                 /* drain the last two products of the 16-wide loop */
+        vst1.32         {q3},[r0,:128]!
+        ands            len, len, #15                   /* leftover element count */
+        bxeq            lr
+3:      vld1.32         {q0},[r1,:128]!                 /* tail: 4 floats per pass until len <= 0 */
+        vmul.f32        q0,  q0,  q8
+        vst1.32         {q0},[r0,:128]!
+        subs            len, len, #4
+        bgt             3b
+        bx              lr
+        .unreq          len
+endfunc
+
+function ff_vector_fmul_sv_scalar_2_neon, export=1     /* stores at r0: float from r1 * scalar * float reached through the pointer table at r2 */
+VFP     vdup.32         d16, d0[0]                      /* d16 = the scalar in both lanes */
+NOVFP   vdup.32         d16, r3                         /* NOVFP variant: r3 carries the scalar ... */
+NOVFP   ldr             r3,  [sp]                       /* ... and the count used below is then fetched from the stack */
+        vld1.32         {d0},[r1,:64]!
+        vld1.32         {d1},[r1,:64]!
+1:      subs            r3,  r3,  #4                    /* 4 floats per pass; the loop ends only when the count reaches exactly 0 */
+        vmul.f32        d4,  d0,  d16
+        vmul.f32        d5,  d1,  d16
+        ldr             r12, [r2], #4                   /* next 32-bit pointer from the table */
+        vld1.32         {d2},[r12,:64]                  /* the 2 floats it points to */
+        ldr             r12, [r2], #4
+        vld1.32         {d3},[r12,:64]
+        vmul.f32        d4,  d4,  d2
+        vmul.f32        d5,  d5,  d3
+        beq             2f                              /* flags from the subs at the top of the pass */
+        vld1.32         {d0},[r1,:64]!
+        vld1.32         {d1},[r1,:64]!
+        vst1.32         {d4},[r0,:64]!
+        vst1.32         {d5},[r0,:64]!
+        b               1b
+2:      vst1.32         {d4},[r0,:64]!                  /* store the final 4 products */
+        vst1.32         {d5},[r0,:64]!
+        bx              lr
+endfunc
+
+function ff_vector_fmul_sv_scalar_4_neon, export=1     /* as the _2 variant, but each pointer in the r2 table is read for 4 floats */
+VFP     vdup.32         q10, d0[0]                      /* q10 = the scalar in all 4 lanes */
+NOVFP   vdup.32         q10, r3                         /* NOVFP variant: r3 carries the scalar ... */
+NOVFP   ldr             r3,  [sp]                       /* ... and the count is fetched from the stack (before the push below moves sp) */
+        push            {lr}                            /* lr is used as a scratch counter */
+        bics            lr,  r3,  #7                    /* lr = count rounded down to a multiple of 8 */
+        beq             3f                              /* fewer than 8: only the 4-wide loop runs */
+        vld1.32         {q0},[r1,:128]!
+        vld1.32         {q2},[r1,:128]!
+1:      ldr             r12, [r2], #4                   /* loop: 8 floats per pass, two table entries consumed */
+        vld1.32         {q1},[r12,:128]
+        ldr             r12, [r2], #4
+        vld1.32         {q3},[r12,:128]
+        vmul.f32        q8,  q0,  q10
+        vmul.f32        q8,  q8,  q1
+        vmul.f32        q9,  q2,  q10
+        vmul.f32        q9,  q9,  q3
+        subs            lr,  lr,  #8
+        beq             2f
+        vld1.32         {q0},[r1,:128]!
+        vld1.32         {q2},[r1,:128]!
+        vst1.32         {q8},[r0,:128]!
+        vst1.32         {q9},[r0,:128]!
+        b               1b
+2:      vst1.32         {q8},[r0,:128]!                 /* drain the last 8 products */
+        vst1.32         {q9},[r0,:128]!
+        ands            r3,  r3,  #7                    /* leftover element count */
+        popeq           {pc}
+3:      vld1.32         {q0},[r1,:128]!                 /* tail: 4 floats and one table entry per pass until the count is <= 0 */
+        ldr             r12, [r2], #4
+        vld1.32         {q1},[r12,:128]
+        vmul.f32        q0,  q0,  q10
+        vmul.f32        q0,  q0,  q1
+        vst1.32         {q0},[r0,:128]!
+        subs            r3,  r3,  #4
+        bgt             3b
+        pop             {pc}
+endfunc
+
+function ff_sv_fmul_scalar_2_neon, export=1            /* stores at r0: scalar * floats reached through the pointer table at r1 (2 floats per pointer) */
+VFP     len .req r2                                     /* VFP / NOVFP prefixes come from asm.S (not shown); presumably hard-float vs soft-float argument layout -- confirm */
+NOVFP   len .req r3
+VFP     vdup.32         q8,  d0[0]                      /* q8 = the scalar in all 4 lanes */
+NOVFP   vdup.32         q8,  r2
+        ldr             r12, [r1], #4                   /* next 32-bit pointer from the table */
+        vld1.32         {d0},[r12,:64]
+        ldr             r12, [r1], #4
+        vld1.32         {d1},[r12,:64]
+1:      vmul.f32        q1,  q0,  q8                    /* loop: 4 floats (two table entries) per pass */
+        subs            len, len, #4
+        beq             2f                              /* the loop ends only when len reaches exactly 0 */
+        ldr             r12, [r1], #4
+        vld1.32         {d0},[r12,:64]
+        ldr             r12, [r1], #4
+        vld1.32         {d1},[r12,:64]
+        vst1.32         {q1},[r0,:128]!
+        b               1b
+2:      vst1.32         {q1},[r0,:128]!                 /* store the final 4 products */
+        bx              lr
+        .unreq          len
+endfunc
+
+function ff_sv_fmul_scalar_4_neon, export=1            /* stores at r0: scalar * floats reached through the pointer table at r1 (4 floats per pointer) */
+VFP     len .req r2                                     /* VFP / NOVFP prefixes come from asm.S (not shown); presumably hard-float vs soft-float argument layout -- confirm */
+NOVFP   len .req r3
+VFP     vdup.32         q8,  d0[0]                      /* q8 = the scalar in all 4 lanes */
+NOVFP   vdup.32         q8,  r2
+1:      ldr             r12, [r1], #4                   /* loop: one table entry, 4 floats per pass */
+        vld1.32         {q0},[r12,:128]
+        vmul.f32        q0,  q0,  q8
+        vst1.32         {q0},[r0,:128]!
+        subs            len, len, #4
+        bgt             1b                              /* repeat until len <= 0 */
+        bx              lr
+        .unreq          len
+endfunc
+
+function ff_butterflies_float_neon, export=1           /* in place: r0 array receives a + b, r1 array receives a - b; r2 = element count */
+1:      vld1.32         {q0},[r0,:128]                  /* a: 4 floats from the r0 array (no writeback, rewritten below) */
+        vld1.32         {q1},[r1,:128]                  /* b: 4 floats from the r1 array */
+        vsub.f32        q2,  q0,  q1
+        vadd.f32        q1,  q0,  q1
+        vst1.32         {q2},[r1,:128]!                 /* difference goes to the r1 array */
+        vst1.32         {q1},[r0,:128]!                 /* sum goes to the r0 array */
+        subs            r2,  r2,  #4
+        bgt             1b                              /* 4 elements per pass until the count is <= 0 */
+        bx              lr
+endfunc
+
+function ff_scalarproduct_float_neon, export=1         /* sum of products of the float arrays at r0 and r1; r2 = element count */
+        vmov.f32        q2,  #0.0                       /* q2 = four running partial sums */
+1:      vld1.32         {q0},[r0,:128]!
+        vld1.32         {q1},[r1,:128]!
+        vmla.f32        q2,  q0,  q1                    /* accumulate 4 products per pass */
+        subs            r2,  r2,  #4
+        bgt             1b
+        vadd.f32        d0,  d4,  d5                    /* fold 4 partial sums into 2 */
+        vpadd.f32       d0,  d0,  d0                    /* fold into 1: both lanes of d0 now hold the total */
+NOVFP   vmov.32         r0,  d0[0]                      /* NOVFP variant additionally copies the result into r0 */
+        bx              lr
+endfunc
+
+function ff_int32_to_float_fmul_scalar_neon, export=1  /* stores at r0: (float) of each signed 32-bit integer from r1, times one scalar */
+VFP     vdup.32         q0,  d0[0]                      /* q0 = the scalar in all 4 lanes */
+VFP     len     .req    r2                              /* VFP / NOVFP prefixes come from asm.S (not shown); presumably hard-float vs soft-float argument layout -- confirm */
+NOVFP   vdup.32         q0,  r2
+NOVFP   len     .req    r3
+
+        vld1.32         {q1},[r1,:128]!
+        vcvt.f32.s32    q3,  q1                         /* signed 32-bit integer to float */
+        vld1.32         {q2},[r1,:128]!
+        vcvt.f32.s32    q8,  q2
+1:      subs            len, len, #8                    /* 8 values per pass; the loop ends only when len reaches exactly 0 */
+        pld             [r1, #16]
+        vmul.f32        q9,  q3,  q0
+        vmul.f32        q10, q8,  q0
+        beq             2f                              /* flags from the subs at the top of the pass */
+        vld1.32         {q1},[r1,:128]!                 /* load and convert the next 8 before storing the current 8 */
+        vcvt.f32.s32    q3,  q1
+        vld1.32         {q2},[r1,:128]!
+        vcvt.f32.s32    q8,  q2
+        vst1.32         {q9}, [r0,:128]!
+        vst1.32         {q10},[r0,:128]!
+        b               1b
+2:      vst1.32         {q9}, [r0,:128]!                /* store the final 8 products */
+        vst1.32         {q10},[r0,:128]!
+        bx              lr
+        .unreq  len
+endfunc
+
+function ff_vector_fmul_reverse_neon, export=1         /* stores at r0: r1 array read forwards times r2 array read backwards; r3 = element count */
+        add             r2,  r2,  r3,  lsl #2            /* r2 = one past the last float of its array */
+        sub             r2,  r2,  #32                    /* step back to the last group of 8 floats */
+        mov             r12, #-32                        /* r12 = backward stride in bytes */
+        vld1.32         {q0-q1},  [r1,:128]!
+        vld1.32         {q2-q3},  [r2,:128], r12
+1:      pld             [r1, #32]
+        vrev64.32       q3,  q3                          /* swap the two floats inside each d register ... */
+        vmul.f32        d16, d0,  d7                     /* ... and pair d registers in opposite order: together a full 8-element reversal */
+        vmul.f32        d17, d1,  d6
+        pld             [r2, #-32]
+        vrev64.32       q2,  q2
+        vmul.f32        d18, d2,  d5
+        vmul.f32        d19, d3,  d4
+        subs            r3,  r3,  #8                     /* 8 floats per pass; the loop ends only when the count reaches exactly 0 */
+        beq             2f
+        vld1.32         {q0-q1},  [r1,:128]!
+        vld1.32         {q2-q3},  [r2,:128], r12
+        vst1.32         {q8-q9},  [r0,:128]!
+        b               1b
+2:      vst1.32         {q8-q9},  [r0,:128]!             /* store the final 8 products */
+        bx              lr
+endfunc
+
+function ff_vector_fmul_add_neon, export=1             /* stores at r0: (float from r1) * (float from r2) + (float from r3) */
+        ldr             r12, [sp]                        /* r12 = element count, read from the stack */
+        vld1.32         {q0-q1},  [r1,:128]!
+        vld1.32         {q8-q9},  [r2,:128]!
+        vld1.32         {q2-q3},  [r3,:128]!
+        vmul.f32        q10, q0,  q8
+        vmul.f32        q11, q1,  q9
+1:      vadd.f32        q12, q2,  q10                    /* loop: 8 floats per pass; products were formed in the previous pass */
+        vadd.f32        q13, q3,  q11
+        pld             [r1, #16]
+        pld             [r2, #16]
+        pld             [r3, #16]
+        subs            r12, r12, #8
+        beq             2f                               /* the loop ends only when the count reaches exactly 0 */
+        vld1.32         {q0},     [r1,:128]!
+        vld1.32         {q8},     [r2,:128]!
+        vmul.f32        q10, q0,  q8
+        vld1.32         {q1},     [r1,:128]!
+        vld1.32         {q9},     [r2,:128]!
+        vmul.f32        q11, q1,  q9
+        vld1.32         {q2-q3},  [r3,:128]!
+        vst1.32         {q12-q13},[r0,:128]!
+        b               1b
+2:      vst1.32         {q12-q13},[r0,:128]!             /* store the final 8 results */
+        bx              lr
+endfunc
+
+function ff_vector_clipf_neon, export=1                /* stores at r0 the floats from r1 clamped between two scalar bounds */
+VFP     vdup.32         q1,  d0[1]                       /* q1 = bound applied with vmin (upper limit) */
+VFP     vdup.32         q0,  d0[0]                       /* q0 = bound applied with vmax (lower limit) */
+NOVFP   vdup.32         q0,  r2                          /* NOVFP variant: bounds arrive in r2 and r3 ... */
+NOVFP   vdup.32         q1,  r3
+NOVFP   ldr             r2,  [sp]                        /* ... and the count used below is then fetched from the stack */
+        vld1.f32        {q2},[r1,:128]!
+        vmin.f32        q10, q2,  q1
+        vld1.f32        {q3},[r1,:128]!
+        vmin.f32        q11, q3,  q1
+1:      vmax.f32        q8,  q10, q0                     /* loop: 8 floats per pass */
+        vmax.f32        q9,  q11, q0
+        subs            r2,  r2,  #8
+        beq             2f                               /* the loop ends only when the count reaches exactly 0 */
+        vld1.f32        {q2},[r1,:128]!
+        vmin.f32        q10, q2,  q1
+        vld1.f32        {q3},[r1,:128]!
+        vmin.f32        q11, q3,  q1
+        vst1.f32        {q8},[r0,:128]!
+        vst1.f32        {q9},[r0,:128]!
+        b               1b
+2:      vst1.f32        {q8},[r0,:128]!                  /* store the final 8 results */
+        vst1.f32        {q9},[r0,:128]!
+        bx              lr
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_vfp.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/dsputil_vfp.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+        .syntax unified
+/*
+ * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
+ * throughput for almost all the instructions (except for double precision
+ * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles
+ * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
+ * important for performance. One more interesting feature is that VFP has
+ * independent load/store and arithmetics pipelines, so it is possible to make
+ * them work simultaneously and get more than 1 operation per cycle. Load/store
+ * pipeline can process 2 single precision floating point values per cycle and
+ * supports bulk loads and stores for large sets of registers. Arithmetic operations
+ * can be done on vectors, which makes it possible to keep the arithmetic pipeline busy,
+ * while the processor may issue and execute other instructions. Detailed
+ * optimization manuals can be found at http://www.arm.com
+ */
+
+/**
+ * ARM VFP optimized implementation of the 'vector_fmul_c' function: dst[i] *= src[i].
+ * Assumes that len is positive and a multiple of 8.
+ */
+@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
+function ff_vector_fmul_vfp, export=1
+        vpush           {d8-d15}                 /* s16-s31 are used below; restored before returning */
+        mov             r3,  r0                  /* r3 reads dst while r0 stays the write pointer */
+        fmrx            r12, fpscr
+        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
+        fmxr            fpscr, r12
+
+        vldmia          r3!, {s0-s3}
+        vldmia          r1!, {s8-s11}
+        vldmia          r3!, {s4-s7}
+        vldmia          r1!, {s12-s15}
+        vmul.f32        s8,  s0,  s8             /* short-vector operation: s8-s11 = s0-s3 * s8-s11 */
+1:
+        subs            r2,  r2,  #16            /* up to 16 floats per pass; the ge/gt predicates below handle a final block of 8 */
+        vmul.f32        s12, s4,  s12
+        vldmiage        r3!, {s16-s19}           /* ge: at least 8 more floats remain, fetch them into s16-s31 */
+        vldmiage        r1!, {s24-s27}
+        vldmiage        r3!, {s20-s23}
+        vldmiage        r1!, {s28-s31}
+        vmulge.f32      s24, s16, s24
+        vstmia          r0!, {s8-s11}            /* store the 8 products of the current block */
+        vstmia          r0!, {s12-s15}
+        vmulge.f32      s28, s20, s28
+        vldmiagt        r3!, {s0-s3}             /* gt: another block follows, preload it for the next pass */
+        vldmiagt        r1!, {s8-s11}
+        vldmiagt        r3!, {s4-s7}
+        vldmiagt        r1!, {s12-s15}
+        vmulge.f32      s8,  s0,  s8             /* runs on ge, but its inputs are reloaded only on gt; when the count is exactly 0 the product is never stored */
+        vstmiage        r0!, {s24-s27}
+        vstmiage        r0!, {s28-s31}
+        bgt             1b
+
+        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
+        fmxr            fpscr, r12
+        vpop            {d8-d15}
+        bx              lr
+endfunc
+
+/**
+ * ARM VFP optimized implementation of the 'vector_fmul_reverse_c' function.
+ * Assumes that len is positive and a multiple of 8.
+ */
+@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+@                                 const float *src1, int len)
+function ff_vector_fmul_reverse_vfp, export=1
+        vpush           {d8-d15}                 /* s16-s31 are used below; restored before returning */
+        add             r2,  r2,  r3, lsl #2     /* r2 = one past the end of src1; it is read backwards with vldmdb */
+        vldmdb          r2!, {s0-s3}
+        vldmia          r1!, {s8-s11}
+        vldmdb          r2!, {s4-s7}
+        vldmia          r1!, {s12-s15}
+        vmul.f32        s8,  s3,  s8             /* one multiply per element: the src1 registers are used in descending order */
+        vmul.f32        s9,  s2,  s9
+        vmul.f32        s10, s1,  s10
+        vmul.f32        s11, s0,  s11
+1:
+        subs            r3,  r3,  #16            /* up to 16 floats per pass; the ge/gt predicates below handle a final block of 8 */
+        vldmdbge        r2!, {s16-s19}           /* ge: at least 8 more floats remain, fetch them into s16-s31 */
+        vmul.f32        s12, s7,  s12
+        vldmiage        r1!, {s24-s27}
+        vmul.f32        s13, s6,  s13
+        vldmdbge        r2!, {s20-s23}
+        vmul.f32        s14, s5,  s14
+        vldmiage        r1!, {s28-s31}
+        vmul.f32        s15, s4,  s15
+        vmulge.f32      s24, s19, s24
+        vldmdbgt        r2!, {s0-s3}             /* gt: another block follows, preload it for the next pass */
+        vmulge.f32      s25, s18, s25
+        vstmia          r0!, {s8-s13}            /* first 6 products of the current block */
+        vmulge.f32      s26, s17, s26
+        vldmiagt        r1!, {s8-s11}
+        vmulge.f32      s27, s16, s27
+        vmulge.f32      s28, s23, s28
+        vldmdbgt        r2!, {s4-s7}
+        vmulge.f32      s29, s22, s29
+        vstmia          r0!, {s14-s15}           /* remaining 2 products of the current block */
+        vmulge.f32      s30, s21, s30
+        vmulge.f32      s31, s20, s31
+        vmulge.f32      s8,  s3,  s8             /* runs on ge, but its inputs are reloaded only on gt; when the count is exactly 0 s8-s11 are never stored */
+        vldmiagt        r1!, {s12-s15}
+        vmulge.f32      s9,  s2,  s9
+        vmulge.f32      s10, s1,  s10
+        vstmiage        r0!, {s24-s27}
+        vmulge.f32      s11, s0,  s11
+        vstmiage        r0!, {s28-s31}
+        bgt             1b
+
+        vpop            {d8-d15}
+        bx              lr
+endfunc
+
+#if HAVE_ARMV6
+/**
+ * ARM VFP optimized float to int16 conversion.
+ * Assumes that len is positive and a multiple of 8, that the destination
+ * buffer is at least 4-byte aligned (8-byte alignment is better for
+ * performance), and little-endian byte order.
+ */
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
+function ff_float_to_int16_vfp, export=1
+        push            {r4-r8,lr}
+        vpush           {d8-d11}                 /* s16-s23 are used as the load buffer */
+        vldmia          r1!, {s16-s23}
+        vcvt.s32.f32    s0,  s16                 /* float to signed 32-bit integer, 8 values */
+        vcvt.s32.f32    s1,  s17
+        vcvt.s32.f32    s2,  s18
+        vcvt.s32.f32    s3,  s19
+        vcvt.s32.f32    s4,  s20
+        vcvt.s32.f32    s5,  s21
+        vcvt.s32.f32    s6,  s22
+        vcvt.s32.f32    s7,  s23
+1:
+        subs            r2,  r2,  #8             /* 8 samples per pass; gt below means another pass follows */
+        vmov            r3,  r4,  s0, s1         /* move the 8 converted integers into core registers */
+        vmov            r5,  r6,  s2, s3
+        vmov            r7,  r8,  s4, s5
+        vmov            ip,  lr,  s6, s7
+        vldmiagt        r1!, {s16-s23}           /* fetch the next 8 floats while the current ones are packed */
+        ssat            r4,  #16, r4             /* saturate to the signed 16-bit range */
+        ssat            r3,  #16, r3
+        ssat            r6,  #16, r6
+        ssat            r5,  #16, r5
+        pkhbt           r3,  r3,  r4, lsl #16    /* two samples per word: earlier sample in the low halfword */
+        pkhbt           r4,  r5,  r6, lsl #16
+        vcvtgt.s32.f32  s0,  s16                 /* convert the next 8 only when another pass follows */
+        vcvtgt.s32.f32  s1,  s17
+        vcvtgt.s32.f32  s2,  s18
+        vcvtgt.s32.f32  s3,  s19
+        vcvtgt.s32.f32  s4,  s20
+        vcvtgt.s32.f32  s5,  s21
+        vcvtgt.s32.f32  s6,  s22
+        vcvtgt.s32.f32  s7,  s23
+        ssat            r8,  #16, r8
+        ssat            r7,  #16, r7
+        ssat            lr,  #16, lr
+        ssat            ip,  #16, ip
+        pkhbt           r5,  r7,  r8, lsl #16
+        pkhbt           r6,  ip,  lr, lsl #16
+        stmia           r0!, {r3-r6}             /* 4 words = 8 int16 samples */
+        bgt             1b
+
+        vpop            {d8-d11}
+        pop             {r4-r8,pc}
+endfunc
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/fft_init_arm.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/fft_init_arm.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
+
+void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);   /* implemented in fft_neon.S */
+void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);      /* implemented in fft_neon.S */
+
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
+void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
+
+void ff_synth_filter_float_neon(FFTContext *imdct,        /* installed by ff_synth_filter_init_arm() below */
+                                float *synth_buf_ptr, int *synth_buf_offset,
+                                float synth_buf2[32], const float window[512],
+                                float out[32], const float in[32],
+                                float scale, float bias);
+
+av_cold void ff_fft_init_arm(FFTContext *s)   /* installs the NEON FFT/MDCT routines into *s */
+{
+    if (HAVE_NEON) {                          /* s is left untouched when HAVE_NEON is zero */
+        s->fft_permute  = ff_fft_permute_neon;
+        s->fft_calc     = ff_fft_calc_neon;
+        s->imdct_calc   = ff_imdct_calc_neon;
+        s->imdct_half   = ff_imdct_half_neon;
+        s->mdct_calc    = ff_mdct_calc_neon;
+        s->permutation  = FF_MDCT_PERM_INTERLEAVE;   /* presumably the data layout the NEON MDCT code expects -- confirm in fft.h */
+    }
+}
+
+#if CONFIG_RDFT
+av_cold void ff_rdft_init_arm(RDFTContext *s)   /* installs the NEON RDFT routine into *s */
+{
+    if (HAVE_NEON)                              /* s is left untouched when HAVE_NEON is zero */
+        s->rdft_calc    = ff_rdft_calc_neon;
+}
+#endif
+
+#if CONFIG_DCA_DECODER
+av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)   /* installs the NEON float synthesis filter into *s */
+{
+    if (HAVE_NEON)                                             /* s is left untouched when HAVE_NEON is zero */
+        s->synth_filter_float = ff_synth_filter_float_neon;
+}
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/fft_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/fft_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,371 @@
+/*
+ * ARM NEON optimised FFT
+ *
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2009 Naotoshi Nojiri
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define M_SQRT1_2 0.70710678118654752440
+
+        .text
+
+function fft4_neon                                      /* in-place transform of the 4 complex floats (32 bytes) at r0 */
+        vld1.32         {d0-d3}, [r0,:128]              /* d0..d3 = z[0]..z[3], one (re, im) pair per d register */
+
+        vext.32         q8,  q1,  q1,  #1       @ i2,r3 d3=i3,r2
+        vsub.f32        d6,  d0,  d1            @ r0-r1,i0-i1
+        vsub.f32        d7,  d16, d17           @ r3-r2,i2-i3
+        vadd.f32        d4,  d0,  d1            @ r0+r1,i0+i1
+        vadd.f32        d5,  d2,  d3            @ i2+i3,r2+r3
+        vadd.f32        d1,  d6,  d7
+        vsub.f32        d3,  d6,  d7
+        vadd.f32        d0,  d4,  d5
+        vsub.f32        d2,  d4,  d5
+
+        vst1.32         {d0-d3}, [r0,:128]              /* results overwrite the input */
+
+        bx              lr
+endfunc
+
+function fft8_neon                                      /* in-place transform of the 8 complex floats (64 bytes) at r0 */
+        mov             r1,  r0
+        vld1.32         {d0-d3},   [r1,:128]!           /* z[0..3]; r1 then points at z[4] */
+        vld1.32         {d16-d19}, [r1,:128]            /* z[4..7]; no writeback, r1 is reused for the store below */
+
+        movw            r2,  #0x04f3            @ sqrt(1/2)
+        movt            r2,  #0x3f35
+        eor             r3,  r2,  #1<<31                /* r3 = same constant with the sign bit flipped */
+        vdup.32         d31, r2                         /* d31 = {w, w} */
+
+        vext.32         q11, q1,  q1,  #1       @ i2,r3,i3,r2
+        vadd.f32        d4,  d16, d17           @ r4+r5,i4+i5
+        vmov            d28, r3,  r2                    /* d28 = {-w, w} */
+        vadd.f32        d5,  d18, d19           @ r6+r7,i6+i7
+        vsub.f32        d17, d16, d17           @ r4-r5,i4-i5
+        vsub.f32        d19, d18, d19           @ r6-r7,i6-i7
+        vrev64.32       d29, d28                        /* d29 = {w, -w} */
+        vadd.f32        d20, d0,  d1            @ r0+r1,i0+i1
+        vadd.f32        d21, d2,  d3            @ r2+r3,i2+i3
+        vmul.f32        d26, d17, d28           @ -a2r*w,a2i*w
+        vext.32         q3,  q2,  q2,  #1
+        vmul.f32        d27, d19, d29           @ a3r*w,-a3i*w
+        vsub.f32        d23, d22, d23           @ i2-i3,r3-r2
+        vsub.f32        d22, d0,  d1            @ r0-r1,i0-i1
+        vmul.f32        d24, d17, d31           @ a2r*w,a2i*w
+        vmul.f32        d25, d19, d31           @ a3r*w,a3i*w
+        vadd.f32        d0,  d20, d21
+        vsub.f32        d2,  d20, d21
+        vadd.f32        d1,  d22, d23
+        vrev64.32       q13, q13
+        vsub.f32        d3,  d22, d23
+        vsub.f32        d6,  d6,  d7
+        vadd.f32        d24, d24, d26           @ a2r+a2i,a2i-a2r   t1,t2
+        vadd.f32        d25, d25, d27           @ a3r-a3i,a3i+a3r   t5,t6
+        vadd.f32        d7,  d4,  d5
+        vsub.f32        d18, d2,  d6
+        vext.32         q13, q12, q12, #1
+        vadd.f32        d2,  d2,  d6
+        vsub.f32        d16, d0,  d7
+        vadd.f32        d5,  d25, d24
+        vsub.f32        d4,  d26, d27
+        vadd.f32        d0,  d0,  d7
+        vsub.f32        d17, d1,  d5
+        vsub.f32        d19, d3,  d4
+        vadd.f32        d3,  d3,  d4
+        vadd.f32        d1,  d1,  d5
+
+        vst1.32         {d16-d19}, [r1,:128]            /* upper half: z[4..7] */
+        vst1.32         {d0-d3},   [r0,:128]            /* lower half: z[0..3] */
+
+        bx              lr
+endfunc
+
+function fft16_neon                                     /* in-place transform of the 16 complex floats (128 bytes) at r0 */
+        movrel          r1, mppm
+        vld1.32         {d16-d19}, [r0,:128]!   @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
+        pld             [r0, #32]
+        vld1.32         {d2-d3}, [r1,:128]              /* q1 = the mppm constant table */
+        vext.32         q13, q9,  q9,  #1
+        vld1.32         {d22-d25}, [r0,:128]!   @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
+        vadd.f32        d4,  d16, d17
+        vsub.f32        d5,  d16, d17
+        vadd.f32        d18, d18, d19
+        vsub.f32        d19, d26, d27
+
+        vadd.f32        d20, d22, d23
+        vsub.f32        d22, d22, d23
+        vsub.f32        d23, d24, d25
+        vadd.f32        q8,  q2,  q9            @ {r0,i0,r1,i1}
+        vadd.f32        d21, d24, d25
+        vmul.f32        d24, d22, d2
+        vsub.f32        q9,  q2,  q9            @ {r2,i2,r3,i3}
+        vmul.f32        d25, d23, d3
+        vuzp.32         d16, d17                @ {r0,r1,i0,i1}
+        vmul.f32        q1,  q11, d2[1]
+        vuzp.32         d18, d19                @ {r2,r3,i2,i3}
+        vrev64.32       q12, q12
+        vadd.f32        q11, q12, q1            @ {t1a,t2a,t5,t6}
+        vld1.32         {d24-d27}, [r0,:128]!   @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
+        vzip.32         q10, q11
+        vld1.32         {d28-d31}, [r0,:128]    @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
+        vadd.f32        d0,  d22, d20
+        vadd.f32        d1,  d21, d23
+        vsub.f32        d2,  d21, d23
+        vsub.f32        d3,  d22, d20
+        sub             r0,  r0,  #96                   /* undo the three post-incremented 32-byte loads: r0 = start of z again */
+        vext.32         q13, q13, q13, #1
+        vsub.f32        q10, q8,  q0            @ {r4,r5,i4,i5}
+        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
+        vext.32         q15, q15, q15, #1
+        vsub.f32        q11, q9,  q1            @ {r6,r7,i6,i7}
+        vswp            d25, d26                @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
+        vadd.f32        q9,  q9,  q1            @ {r2,r3,i2,i3}
+        vswp            d29, d30                @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
+        vadd.f32        q0,  q12, q13           @ {t1,t2,t5,t6}
+        vadd.f32        q1,  q14, q15           @ {t1a,t2a,t5a,t6a}
+        movrel          r2,  X(ff_cos_16)
+        vsub.f32        q13, q12, q13           @ {t3,t4,t7,t8}
+        vrev64.32       d1,  d1
+        vsub.f32        q15, q14, q15           @ {t3a,t4a,t7a,t8a}
+        vrev64.32       d3,  d3
+        movrel          r3,  pmmp
+        vswp            d1,  d26                @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
+        vswp            d3,  d30                @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
+        vadd.f32        q12, q0,  q13           @ {r8,i8,r9,i9}
+        vadd.f32        q14, q1,  q15           @ {r12,i12,r13,i13}
+        vld1.32         {d4-d5},  [r2,:64]              /* q2 = first 4 floats of the ff_cos_16 table */
+        vsub.f32        q13, q0,  q13           @ {r10,i10,r11,i11}
+        vsub.f32        q15, q1,  q15           @ {r14,i14,r15,i15}
+        vswp            d25, d28                @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
+        vld1.32         {d6-d7},  [r3,:128]             /* q3 = the pmmp sign pattern */
+        vrev64.32       q1,  q14
+        vmul.f32        q14, q14, d4[1]
+        vmul.f32        q1,  q1,  q3
+        vmla.f32        q14, q1,  d5[1]         @ {t1a,t2a,t5a,t6a}
+        vswp            d27, d30                @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
+        vzip.32         q12, q14
+        vadd.f32        d0,  d28, d24
+        vadd.f32        d1,  d25, d29
+        vsub.f32        d2,  d25, d29
+        vsub.f32        d3,  d28, d24
+        vsub.f32        q12, q8,  q0            @ {r8,r9,i8,i9}
+        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
+        vsub.f32        q14, q10, q1            @ {r12,r13,i12,i13}
+        mov             r1,  #32                        /* r1 = store stride in bytes */
+        vadd.f32        q10, q10, q1            @ {r4,r5,i4,i5}
+        vrev64.32       q0,  q13
+        vmul.f32        q13, q13, d5[0]
+        vrev64.32       q1,  q15
+        vmul.f32        q15, q15, d5[1]
+        vst2.32         {d16-d17},[r0,:128], r1         /* vst2 re-interleaves the separated re/im lanes; stores land at byte offsets 0, 32, 64, 96 */
+        vmul.f32        q0,  q0,  q3
+        vst2.32         {d20-d21},[r0,:128], r1
+        vmul.f32        q1,  q1,  q3
+        vmla.f32        q13, q0,  d5[0]         @ {t1,t2,t5,t6}
+        vmla.f32        q15, q1,  d4[1]         @ {t1a,t2a,t5a,t6a}
+        vst2.32         {d24-d25},[r0,:128], r1
+        vst2.32         {d28-d29},[r0,:128]
+        vzip.32         q13, q15
+        sub             r0, r0, #80                     /* r0 = start of z + 16: the other 16 bytes of each 32-byte row */
+        vadd.f32        d0,  d30, d26
+        vadd.f32        d1,  d27, d31
+        vsub.f32        d2,  d27, d31
+        vsub.f32        d3,  d30, d26
+        vsub.f32        q13, q9,  q0            @ {r10,r11,i10,i11}
+        vadd.f32        q9,  q9,  q0            @ {r2,r3,i2,i3}
+        vsub.f32        q15, q11, q1            @ {r14,r15,i14,i15}
+        vadd.f32        q11, q11, q1            @ {r6,r7,i6,i7}
+        vst2.32         {d18-d19},[r0,:128], r1         /* stores land at byte offsets 16, 48, 80, 112 */
+        vst2.32         {d22-d23},[r0,:128], r1
+        vst2.32         {d26-d27},[r0,:128], r1
+        vst2.32         {d30-d31},[r0,:128]
+        bx              lr
+endfunc
+
+function fft_pass_neon                                  /* combining pass: r0 = z, r1 = wre table, r2 = n; the first iteration is peeled off before the loop */
+        push            {r4-r6,lr}
+        mov             r6,  r2                 @ n
+        lsl             r5,  r2,  #3            @ 2 * n * sizeof FFTSample
+        lsl             r4,  r2,  #4            @ 2 * n * sizeof FFTComplex
+        lsl             r2,  r2,  #5            @ 4 * n * sizeof FFTComplex
+        add             r3,  r2,  r4
+        add             r4,  r4,  r0            @ &z[o1]
+        add             r2,  r2,  r0            @ &z[o2]
+        add             r3,  r3,  r0            @ &z[o3]
+        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
+        movrel          r12, pmmp
+        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
+        add             r5,  r5,  r1            @ wim
+        vld1.32         {d6-d7},  [r12,:128]    @ pmmp
+        vswp            d21, d22
+        vld1.32         {d4},     [r1,:64]!     @ {wre[0],wre[1]}
+        sub             r5,  r5,  #4            @ wim--
+        vrev64.32       q1,  q11
+        vmul.f32        q11, q11, d4[1]
+        vmul.f32        q1,  q1,  q3
+        vld1.32         {d5[0]},  [r5,:32]      @ d5[0] = wim[-1]
+        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
+        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
+        sub             r6, r6, #1              @ n--
+        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
+        vzip.32         q10, q11
+        vadd.f32        d0,  d22, d20
+        vadd.f32        d1,  d21, d23
+        vsub.f32        d2,  d21, d23
+        vsub.f32        d3,  d22, d20
+        vsub.f32        q10, q8,  q0
+        vadd.f32        q8,  q8,  q0
+        vsub.f32        q11, q9,  q1
+        vadd.f32        q9,  q9,  q1
+        vst2.32         {d20-d21},[r2,:128]!    @ {z[o2],z[o2+1]}
+        vst2.32         {d16-d17},[r0,:128]!    @ {z[0],z[1]}
+        vst2.32         {d22-d23},[r3,:128]!    @ {z[o3],z[o3+1]}
+        vst2.32         {d18-d19},[r4,:128]!    @ {z[o1],z[o1+1]}
+        sub             r5,  r5,  #8            @ wim -= 2
+1:                                                      /* loop: two complex values from each of the four quarters per pass */
+        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
+        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
+        vswp            d21, d22
+        vld1.32         {d4}, [r1]!             @ {wre[0],wre[1]}
+        vrev64.32       q0,  q10
+        vmul.f32        q10, q10, d4[0]
+        vrev64.32       q1,  q11
+        vmul.f32        q11, q11, d4[1]
+        vld1.32         {d5}, [r5]              @ {wim[-1],wim[0]}
+        vmul.f32        q0,  q0,  q3
+        sub             r5,  r5,  #8            @ wim -= 2
+        vmul.f32        q1,  q1,  q3
+        vmla.f32        q10, q0,  d5[1]         @ {t1,t2,t5,t6}
+        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
+        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
+        subs            r6,  r6,  #1            @ n--
+        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
+        vzip.32         q10, q11
+        vadd.f32        d0,  d22, d20
+        vadd.f32        d1,  d21, d23
+        vsub.f32        d2,  d21, d23
+        vsub.f32        d3,  d22, d20
+        vsub.f32        q10, q8,  q0
+        vadd.f32        q8,  q8,  q0
+        vsub.f32        q11, q9,  q1
+        vadd.f32        q9,  q9,  q1
+        vst2.32         {d20-d21}, [r2,:128]!   @ {z[o2],z[o2+1]}
+        vst2.32         {d16-d17}, [r0,:128]!   @ {z[0],z[1]}
+        vst2.32         {d22-d23}, [r3,:128]!   @ {z[o3],z[o3+1]}
+        vst2.32         {d18-d19}, [r4,:128]!   @ {z[o1],z[o1+1]}
+        bne             1b                              /* flags come from the subs on r6 above; repeat until n reaches 0 */
+
+        pop             {r4-r6,pc}
+endfunc
+
+.macro  def_fft n, n2, n4                               /* emits the size-n routine from the size-n2 and size-n4 routines plus fft_pass_neon */
+        .align 6
+function fft\n\()_neon
+        push            {r4, lr}
+        mov             r4,  r0                         /* keep the data pointer across the calls */
+        bl              fft\n2\()_neon                  /* size-n2 transform at the start of the data */
+        add             r0,  r4,  #\n4*2*8              /* 8 bytes per complex value: second half of the data */
+        bl              fft\n4\()_neon
+        add             r0,  r4,  #\n4*3*8              /* last quarter of the data */
+        bl              fft\n4\()_neon
+        mov             r0,  r4
+        pop             {r4, lr}
+        movrel          r1,  X(ff_cos_\n)               /* cosine table for this size, passed as wre */
+        mov             r2,  #\n4/2                     /* iteration count n for fft_pass_neon */
+        b               fft_pass_neon                   /* tail call: fft_pass_neon returns directly to our caller */
+endfunc
+.endm
+
+        def_fft    32,    16,     8                     /* fft4_neon, fft8_neon and fft16_neon are hand-written above */
+        def_fft    64,    32,    16
+        def_fft   128,    64,    32
+        def_fft   256,   128,    64
+        def_fft   512,   256,   128
+        def_fft  1024,   512,   256
+        def_fft  2048,  1024,   512
+        def_fft  4096,  2048,  1024
+        def_fft  8192,  4096,  2048
+        def_fft 16384,  8192,  4096
+        def_fft 32768, 16384,  8192
+        def_fft 65536, 32768, 16384
+
+function ff_fft_calc_neon, export=1                     /* dispatches to the fixed-size routine selected by the first word of the context at r0 */
+        ldr             r2,  [r0]                       /* first context word; ff_fft_permute_neon labels the same load nbits */
+        sub             r2,  r2,  #2                    /* table index: entry 0 is fft4_neon */
+        movrel          r3,  fft_tab_neon
+        ldr             r3,  [r3, r2, lsl #2]           /* 4-byte table entries */
+        mov             r0,  r1                         /* the selected routine takes the data pointer in r0 */
+        bx              r3                              /* tail call */
+endfunc
+
+function ff_fft_permute_neon, export=1                  /* reorders the complex array at r1 via revtab, using tmp_buf as scratch */
+        push            {r4,lr}
+        mov             r12, #1
+        ldr             r2,  [r0]       @ nbits
+        ldr             r3,  [r0, #20]  @ tmp_buf
+        ldr             r0,  [r0, #8]   @ revtab
+        lsl             r12, r12, r2                    /* r12 = 1 << nbits = number of complex values */
+        mov             r2,  r12                        /* second copy of the count for the copy-back loop */
+1:
+        vld1.32         {d0-d1}, [r1,:128]!             /* two consecutive complex values from the input */
+        ldr             r4,  [r0], #4                   /* one word of revtab = two 16-bit destination indices */
+        uxth            lr,  r4                         /* index for the first value (low halfword) */
+        uxth            r4,  r4,  ror #16               /* index for the second value (high halfword) */
+        add             lr,  r3,  lr,  lsl #3           /* tmp_buf + index * 8 bytes */
+        add             r4,  r3,  r4,  lsl #3
+        vst1.32         {d0}, [lr,:64]
+        vst1.32         {d1}, [r4,:64]
+        subs            r12, r12, #2
+        bgt             1b
+
+        sub             r1,  r1,  r2,  lsl #3           /* rewind r1 to the start of the array */
+1:
+        vld1.32         {d0-d3}, [r3,:128]!             /* copy tmp_buf back, 4 complex values per pass */
+        vst1.32         {d0-d3}, [r1,:128]!
+        subs            r2,  r2,  #4
+        bgt             1b
+
+        pop             {r4,pc}
+endfunc
+
+        .section .rodata
+        .align 4
+fft_tab_neon:                                           /* indexed with nbits - 2 by ff_fft_calc_neon */
+        .word fft4_neon                                 /* nbits = 2 */
+        .word fft8_neon
+        .word fft16_neon
+        .word fft32_neon
+        .word fft64_neon
+        .word fft128_neon
+        .word fft256_neon
+        .word fft512_neon
+        .word fft1024_neon
+        .word fft2048_neon
+        .word fft4096_neon
+        .word fft8192_neon
+        .word fft16384_neon
+        .word fft32768_neon
+        .word fft65536_neon                             /* nbits = 16 */
+        .size fft_tab_neon, . - fft_tab_neon
+
+        .align 4                                        /* both tables below are loaded with 128-bit aligned vld1 */
+pmmp:   .float  +1.0, -1.0, -1.0, +1.0                  /* sign pattern: plus, minus, minus, plus */
+mppm:   .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264dsp_init_arm.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/h264dsp_init_arm.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/dsputil.h"
+#include "libavcodec/h264dsp.h"
+
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+                                       int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+                                       int beta, int8_t *tc0);
+
+void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
+                                      int weight, int offset);
+void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
+                                     int weight, int offset);
+void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
+                                     int weight, int offset);
+void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+
+void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                        int log2_den, int weightd, int weights,
+                                        int offset);
+void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                       int log2_den, int weightd, int weights,
+                                       int offset);
+void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                       int log2_den, int weightd, int weights,
+                                       int offset);
+void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+
+void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
+                             DCTELEM *block, int stride,
+                             const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
+                                  DCTELEM *block, int stride,
+                                  const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
+                            DCTELEM *block, int stride,
+                            const uint8_t nnzc[6*8]);
+
+/**
+ * Install the ARM-optimised H.264 DSP routines into @p c.
+ * Every NEON assignment sits under #if HAVE_NEON, so a build without NEON
+ * compiles to a no-op instead of calling a helper that was never declared.
+ */
+void ff_h264dsp_init_arm(H264DSPContext *c)
+{
+#if HAVE_NEON
+    c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
+    c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
+    c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+    c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+
+    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
+    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
+    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
+    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
+    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
+    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
+    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
+    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
+
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
+    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
+    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
+    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
+    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
+    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
+
+    c->h264_idct_add        = ff_h264_idct_add_neon;
+    c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
+    c->h264_idct_add16      = ff_h264_idct_add16_neon;
+    c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+    c->h264_idct_add8       = ff_h264_idct_add8_neon;
+#endif
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264dsp_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/h264dsp_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1883 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
+        vtrn.32         \r0, \r4        @ 8x8 byte transpose of the eight row registers, in three passes
+        vtrn.32         \r1, \r5        @ pass 1: swap 32-bit groups between rows n and n+4
+        vtrn.32         \r2, \r6
+        vtrn.32         \r3, \r7
+        vtrn.16         \r0, \r2        @ pass 2: swap 16-bit groups between rows n and n+2
+        vtrn.16         \r1, \r3
+        vtrn.16         \r4, \r6
+        vtrn.16         \r5, \r7
+        vtrn.8          \r0, \r1        @ pass 3: swap single bytes between adjacent rows
+        vtrn.8          \r2, \r3
+        vtrn.8          \r4, \r5
+        vtrn.8          \r6, \r7        @ with q registers each 64-bit half is transposed on its own
+        .endm
+
+        .macro transpose_4x4 r0 r1 r2 r3
+        vtrn.16         \r0, \r2        @ 4x4 byte transpose inside every 32-bit lane of the four registers
+        vtrn.16         \r1, \r3        @ pass 1: swap 16-bit groups between rows n and n+2
+        vtrn.8          \r0, \r1        @ pass 2: swap single bytes between adjacent rows
+        vtrn.8          \r2, \r3
+        .endm
+
+        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
+        vswp            \r0, \r4        @ exchange the first four registers with the last four, pairwise
+        vswp            \r1, \r5
+        vswp            \r2, \r6
+        vswp            \r3, \r7
+        .endm
+
+        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
+        vtrn.32         \r0, \r2        @ 4x4 transpose of 16-bit elements, done on the first four registers ...
+        vtrn.32         \r1, \r3
+        vtrn.32         \r4, \r6        @ ... and independently on the last four
+        vtrn.32         \r5, \r7
+        vtrn.16         \r0, \r1        @ second pass: swap 16-bit elements between adjacent rows
+        vtrn.16         \r2, \r3
+        vtrn.16         \r4, \r5
+        vtrn.16         \r6, \r7
+        .endm
+
+/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+        .macro  h264_chroma_mc8 type
+function ff_\type\()_h264_chroma_mc8_neon, export=1
+        push            {r4-r7, lr}             @ r0 = dst, r1 = src, r2 = stride, r3 = h
+        ldrd            r4,  [sp, #20]          @ r4 = x, r5 = y: stack arguments, 20 bytes up after the push
+.ifc \type,avg
+        mov             lr,  r0                 @ avg only: lr reads the old dst rows while r0 writes
+.endif
+        pld             [r1]
+        pld             [r1, r2]
+
+        muls            r7,  r4,  r5            @ r7 = x*y; Z is kept for the beq below
+        rsb             r6,  r7,  r5,  lsl #3   @ r6 = 8*y - x*y = (8-x)*y
+        rsb             ip,  r7,  r4,  lsl #3   @ ip = 8*x - x*y = x*(8-y)
+        sub             r4,  r7,  r4,  lsl #3
+        sub             r4,  r4,  r5,  lsl #3
+        add             r4,  r4,  #64           @ r4 = 64 - 8*x - 8*y + x*y = (8-x)*(8-y)
+
+        beq             2f                      @ x*y == 0: one of the 1-D paths is enough
+
+        add             r5,  r1,  r2            @ r5 = second source row
+
+        vdup.8          d0,  r4                 @ weight of the pixel itself
+        lsl             r4,  r2,  #1            @ r4 = 2 * stride: r1 and r5 each step two rows
+        vdup.8          d1,  ip                 @ weight of the pixel to the right
+        vld1.64         {d4, d5}, [r1], r4
+        vdup.8          d2,  r6                 @ weight of the pixel below
+        vld1.64         {d6, d7}, [r5], r4
+        vdup.8          d3,  r7                 @ weight of the pixel below and to the right
+
+        vext.8          d5,  d4,  d5,  #1       @ d5 = row in d4 shifted by one pixel
+        vext.8          d7,  d6,  d7,  #1       @ d7 = row in d6 shifted by one pixel
+
+1:      pld             [r5]
+        vmull.u8        q8,  d4,  d0            @ q8 accumulates the first output row
+        vmlal.u8        q8,  d5,  d1
+        vld1.64         {d4, d5}, [r1], r4
+        vmlal.u8        q8,  d6,  d2
+        vext.8          d5,  d4,  d5,  #1
+        vmlal.u8        q8,  d7,  d3
+        vmull.u8        q9,  d6,  d0            @ q9 accumulates the second output row
+        subs            r3,  r3,  #2            @ two output rows per pass; flags used by bgt below
+        vmlal.u8        q9,  d7,  d1
+        vmlal.u8        q9,  d4,  d2
+        vmlal.u8        q9,  d5,  d3
+        vrshrn.u16      d16, q8,  #6            @ rounding shift: (sum + 32) >> 6
+        vld1.64         {d6, d7}, [r5], r4
+        pld             [r1]
+        vrshrn.u16      d17, q9,  #6
+.ifc \type,avg
+        vld1.64         {d20}, [lr,:64], r2
+        vld1.64         {d21}, [lr,:64], r2
+        vrhadd.u8       q8,  q8,  q10           @ rounding average with the pixels already in dst
+.endif
+        vext.8          d7,  d6,  d7,  #1
+        vst1.64         {d16}, [r0,:64], r2
+        vst1.64         {d17}, [r0,:64], r2
+        bgt             1b
+
+        pop             {r4-r7, pc}
+
+2:      tst             r6,  r6                 @ x*y == 0 here, so r6 = 8*y: Z set when y == 0
+        add             ip,  ip,  r6            @ ip = 8*x + 8*y, of which at most one term is non-zero
+        vdup.8          d0,  r4                 @ weight of the pixel itself
+        vdup.8          d1,  ip                 @ weight of its single neighbour
+
+        beq             4f                      @ y == 0: filter along the row only
+
+        add             r5,  r1,  r2            @ vertical-only path: blend each row with the row below
+        lsl             r4,  r2,  #1
+        vld1.64         {d4}, [r1], r4
+        vld1.64         {d6}, [r5], r4
+
+3:      pld             [r5]
+        vmull.u8        q8,  d4,  d0
+        vmlal.u8        q8,  d6,  d1
+        vld1.64         {d4}, [r1], r4
+        vmull.u8        q9,  d6,  d0
+        vmlal.u8        q9,  d4,  d1
+        vld1.64         {d6}, [r5], r4
+        vrshrn.u16      d16, q8,  #6
+        vrshrn.u16      d17, q9,  #6
+.ifc \type,avg
+        vld1.64         {d20}, [lr,:64], r2
+        vld1.64         {d21}, [lr,:64], r2
+        vrhadd.u8       q8,  q8,  q10
+.endif
+        subs            r3,  r3,  #2
+        pld             [r1]
+        vst1.64         {d16}, [r0,:64], r2
+        vst1.64         {d17}, [r0,:64], r2
+        bgt             3b
+
+        pop             {r4-r7, pc}
+
+4:      vld1.64         {d4, d5}, [r1], r2      @ horizontal-only path: blend each pixel with its right neighbour
+        vld1.64         {d6, d7}, [r1], r2
+        vext.8          d5,  d4,  d5,  #1
+        vext.8          d7,  d6,  d7,  #1
+
+5:      pld             [r1]
+        subs            r3,  r3,  #2
+        vmull.u8        q8,  d4,  d0
+        vmlal.u8        q8,  d5,  d1
+        vld1.64         {d4, d5}, [r1], r2
+        vmull.u8        q9,  d6,  d0
+        vmlal.u8        q9,  d7,  d1
+        pld             [r1]
+        vext.8          d5,  d4,  d5,  #1
+        vrshrn.u16      d16, q8,  #6
+        vrshrn.u16      d17, q9,  #6
+.ifc \type,avg
+        vld1.64         {d20}, [lr,:64], r2
+        vld1.64         {d21}, [lr,:64], r2
+        vrhadd.u8       q8,  q8,  q10
+.endif
+        vld1.64         {d6, d7}, [r1], r2
+        vext.8          d7,  d6,  d7,  #1
+        vst1.64         {d16}, [r0,:64], r2
+        vst1.64         {d17}, [r0,:64], r2
+        bgt             5b
+
+        pop             {r4-r7, pc}
+endfunc
+        .endm
+
+/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+        .macro  h264_chroma_mc4 type
+function ff_\type\()_h264_chroma_mc4_neon, export=1
+        push            {r4-r7, lr}             @ r0 = dst, r1 = src, r2 = stride, r3 = h
+        ldrd            r4,  [sp, #20]          @ r4 = x, r5 = y: stack arguments, 20 bytes up after the push
+.ifc \type,avg
+        mov             lr,  r0                 @ avg only: lr reads the old dst rows while r0 writes
+.endif
+        pld             [r1]
+        pld             [r1, r2]
+
+        muls            r7,  r4,  r5            @ r7 = x*y; Z is kept for the beq below
+        rsb             r6,  r7,  r5,  lsl #3   @ r6 = (8-x)*y
+        rsb             ip,  r7,  r4,  lsl #3   @ ip = x*(8-y)
+        sub             r4,  r7,  r4,  lsl #3
+        sub             r4,  r4,  r5,  lsl #3
+        add             r4,  r4,  #64           @ r4 = (8-x)*(8-y)
+
+        beq             2f                      @ x*y == 0: one of the 1-D paths is enough
+
+        add             r5,  r1,  r2            @ r5 = second source row
+
+        vdup.8          d0,  r4
+        lsl             r4,  r2,  #1            @ r4 = 2 * stride: r1 and r5 each step two rows
+        vdup.8          d1,  ip
+        vld1.64         {d4},     [r1], r4
+        vdup.8          d2,  r6
+        vld1.64         {d6},     [r5], r4
+        vdup.8          d3,  r7
+
+        vext.8          d5,  d4,  d5,  #1       @ d5 = row shifted by one pixel
+        vext.8          d7,  d6,  d7,  #1
+        vtrn.32         d4,  d5                 @ d4 = first four pixels | the same four shifted by one
+        vtrn.32         d6,  d7
+
+        vtrn.32         d0,  d1                 @ d0 = own weight x4 | right-neighbour weight x4
+        vtrn.32         d2,  d3                 @ d2 = below weight x4 | below-right weight x4
+
+1:      pld             [r5]
+        vmull.u8        q8,  d4,  d0            @ both halves of q8 belong to the first output row
+        vmlal.u8        q8,  d6,  d2
+        vld1.64         {d4},     [r1], r4
+        vext.8          d5,  d4,  d5,  #1
+        vtrn.32         d4,  d5
+        vmull.u8        q9,  d6,  d0            @ both halves of q9 belong to the second output row
+        vmlal.u8        q9,  d4,  d2
+        vld1.64         {d6},     [r5], r4
+        vadd.i16        d16, d16, d17           @ fold the two partial sums of row one
+        vadd.i16        d17, d18, d19           @ fold the two partial sums of row two
+        vrshrn.u16      d16, q8,  #6            @ rounding shift: (sum + 32) >> 6
+        subs            r3,  r3,  #2            @ two output rows per pass; flags used by bgt below
+        pld             [r1]
+.ifc \type,avg
+        vld1.32         {d20[0]}, [lr,:32], r2
+        vld1.32         {d20[1]}, [lr,:32], r2
+        vrhadd.u8       d16, d16, d20           @ rounding average with the pixels already in dst
+.endif
+        vext.8          d7,  d6,  d7,  #1
+        vtrn.32         d6,  d7
+        vst1.32         {d16[0]}, [r0,:32], r2
+        vst1.32         {d16[1]}, [r0,:32], r2
+        bgt             1b
+
+        pop             {r4-r7, pc}
+
+2:      tst             r6,  r6                 @ x*y == 0 here, so r6 = 8*y: Z set when y == 0
+        add             ip,  ip,  r6            @ ip = 8*x + 8*y, of which at most one term is non-zero
+        vdup.8          d0,  r4
+        vdup.8          d1,  ip
+        vtrn.32         d0,  d1                 @ d0 = own weight x4 | neighbour weight x4
+
+        beq             4f                      @ y == 0: filter along the row only
+
+        vext.32         d1,  d0,  d1,  #1       @ vertical-only path: d1 = the two weight groups of d0 swapped
+        add             r5,  r1,  r2
+        lsl             r4,  r2,  #1
+        vld1.32         {d4[0]},  [r1], r4      @ d4 = one row | the row below it
+        vld1.32         {d4[1]},  [r5], r4
+
+3:      pld             [r5]
+        vmull.u8        q8,  d4,  d0
+        vld1.32         {d4[0]},  [r1], r4      @ low lane now holds the row after next, hence the swapped weights
+        vmull.u8        q9,  d4,  d1
+        vld1.32         {d4[1]},  [r5], r4
+        vadd.i16        d16, d16, d17
+        vadd.i16        d17, d18, d19
+        vrshrn.u16      d16, q8,  #6
+.ifc \type,avg
+        vld1.32         {d20[0]}, [lr,:32], r2
+        vld1.32         {d20[1]}, [lr,:32], r2
+        vrhadd.u8       d16, d16, d20
+.endif
+        subs            r3,  r3,  #2
+        pld             [r1]
+        vst1.32         {d16[0]}, [r0,:32], r2
+        vst1.32         {d16[1]}, [r0,:32], r2
+        bgt             3b
+
+        pop             {r4-r7, pc}
+
+4:      vld1.64         {d4},     [r1], r2      @ horizontal-only path: blend each pixel with its right neighbour
+        vld1.64         {d6},     [r1], r2
+        vext.8          d5,  d4,  d5,  #1
+        vext.8          d7,  d6,  d7,  #1
+        vtrn.32         d4,  d5
+        vtrn.32         d6,  d7
+
+5:      vmull.u8        q8,  d4,  d0
+        vmull.u8        q9,  d6,  d0
+        subs            r3,  r3,  #2
+        vld1.64         {d4},     [r1], r2
+        vext.8          d5,  d4,  d5,  #1
+        vtrn.32         d4,  d5
+        vadd.i16        d16, d16, d17
+        vadd.i16        d17, d18, d19
+        pld             [r1]
+        vrshrn.u16      d16, q8,  #6
+.ifc \type,avg
+        vld1.32         {d20[0]}, [lr,:32], r2
+        vld1.32         {d20[1]}, [lr,:32], r2
+        vrhadd.u8       d16, d16, d20
+.endif
+        vld1.64         {d6},     [r1], r2
+        vext.8          d7,  d6,  d7,  #1
+        vtrn.32         d6,  d7
+        pld             [r1]
+        vst1.32         {d16[0]}, [r0,:32], r2
+        vst1.32         {d16[1]}, [r0,:32], r2
+        bgt             5b
+
+        pop             {r4-r7, pc}
+endfunc
+        .endm
+
+        .macro  h264_chroma_mc2 type
+function ff_\type\()_h264_chroma_mc2_neon, export=1
+        push            {r4-r6, lr}             @ two-pixel-wide variant; r0 = dst, r1 = src, r2 = stride, r3 = rows
+        ldr             r4,  [sp, #16]          @ first stack argument (x in the chroma_mc8 signature comment)
+        ldr             lr,  [sp, #20]          @ second stack argument (y)
+        pld             [r1]
+        pld             [r1, r2]
+        orrs            r5,  r4,  lr
+        beq             2f                      @ both offsets zero: plain copy (put) or average (avg)
+
+        mul             r5,  r4,  lr            @ r5 = x*y
+        rsb             r6,  r5,  lr,  lsl #3   @ r6 = (8-x)*y
+        rsb             r12, r5,  r4,  lsl #3   @ r12 = x*(8-y)
+        sub             r4,  r5,  r4,  lsl #3
+        sub             r4,  r4,  lr,  lsl #3
+        add             r4,  r4,  #64           @ r4 = (8-x)*(8-y)
+        vdup.8          d0,  r4
+        vdup.8          d2,  r12
+        vdup.8          d1,  r6
+        vdup.8          d3,  r5
+        vtrn.16         q0,  q1                 @ d0 = the two upper-row weights interleaved, d1 = the two lower-row weights
+1:
+        vld1.32         {d4[0]},  [r1], r2      @ first source row
+        vld1.32         {d4[1]},  [r1], r2      @ second source row
+        vrev64.32       d5,  d4
+        vld1.32         {d5[1]},  [r1]          @ third source row, read without advancing r1
+        vext.8          q3,  q2,  q2,  #1       @ same data shifted by one pixel
+        vtrn.16         q2,  q3                 @ interleave plain and shifted pixel pairs to match the weights
+        vmull.u8        q8,  d4,  d0
+        vmlal.u8        q8,  d5,  d1
+.ifc \type,avg
+        vld1.16         {d18[0]}, [r0,:16], r2
+        vld1.16         {d18[1]}, [r0,:16]
+        sub             r0,  r0,  r2            @ back to the first of the two dst rows
+.endif
+        vtrn.32         d16, d17
+        vadd.i16        d16, d16, d17           @ add the partial sums of each output pixel
+        vrshrn.u16      d16, q8,  #6            @ rounding shift: (sum + 32) >> 6
+.ifc \type,avg
+        vrhadd.u8       d16, d16, d18           @ rounding average with the pixels already in dst
+.endif
+        vst1.16         {d16[0]}, [r0,:16], r2
+        vst1.16         {d16[1]}, [r0,:16], r2
+        subs            r3,  r3,  #2            @ two output rows per pass
+        bgt             1b
+        pop             {r4-r6, pc}
+2:
+.ifc \type,put
+        ldrh            r5,  [r1], r2           @ no filtering needed: move two bytes per row
+        strh            r5,  [r0], r2
+        ldrh            r6,  [r1], r2
+        strh            r6,  [r0], r2
+.else
+        vld1.16         {d16[0]}, [r1], r2      @ no filtering needed: average src with dst
+        vld1.16         {d16[1]}, [r1], r2
+        vld1.16         {d18[0]}, [r0,:16], r2
+        vld1.16         {d18[1]}, [r0,:16]
+        sub             r0,  r0,  r2
+        vrhadd.u8       d16, d16, d18
+        vst1.16         {d16[0]}, [r0,:16], r2
+        vst1.16         {d16[1]}, [r0,:16], r2
+.endif
+        subs            r3,  r3,  #2
+        bgt             2b
+        pop             {r4-r6, pc}
+endfunc
+.endm
+
+        .text                           @ code section for everything emitted below
+        .align
+
+        h264_chroma_mc8 put             @ each macro is expanded twice: a put and an avg routine
+        h264_chroma_mc8 avg
+        h264_chroma_mc4 put
+        h264_chroma_mc4 avg
+        h264_chroma_mc2 put
+        h264_chroma_mc2 avg
+
+        /* H.264 loop filter */
+
+        .macro h264_loop_filter_start
+        ldr             ip,  [sp]               @ first stack argument: the tc0 pointer
+        tst             r2,  r2                 @ Z set if alpha (r2) is zero
+        ldr             ip,  [ip]               @ all four tc0 bytes as one word
+        tstne           r3,  r3                 @ otherwise Z set if beta (r3) is zero
+        vmov.32         d24[0], ip              @ leave the tc0 bytes in d24 for the filter macros
+        and             ip,  ip,  ip, lsl #16   @ no flag update: Z still holds the alpha and beta test
+        bxeq            lr                      @ alpha == 0 or beta == 0: return without filtering
+        ands            ip,  ip,  ip, lsl #8    @ bit 31 = AND of the four tc0 sign bits
+        bxmi            lr                      @ all tc0 negative: return; test N only, ANDS leaves V as the caller left it
+        .endm
+
+        .macro align_push_regs
+        and             ip,  sp,  #15           @ save d8-d15 below sp at a 16-byte aligned address
+        add             ip,  ip,  #32           @ ip = misalignment of sp + 32; align_pop_regs needs it unchanged
+        sub             sp,  sp,  ip            @ sp is now 16-byte aligned
+        vst1.64         {d12-d15}, [sp,:128]
+        sub             sp,  sp,  #32
+        vst1.64         {d8-d11},  [sp,:128]
+        .endm
+
+        .macro align_pop_regs
+        vld1.64         {d8-d11},  [sp,:128]!       @ undo align_push_regs: reload d8-d11, sp += 32
+        vld1.64         {d12-d15}, [sp,:128], ip    @ reload d12-d15, then sp += ip restores the original sp
+        .endm
+
+        .macro h264_loop_filter_luma
+        vdup.8          q11, r2         @ alpha
+        vmovl.u8        q12, d24        @ widen the tc0 bytes that the start macro left in d24
+        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
+        vmovl.u16       q12, d24        @ one tc0 value per 32-bit lane
+        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
+        vsli.16         q12, q12, #8
+        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
+        vsli.32         q12, q12, #16   @ each tc0 byte now fills four adjacent byte lanes
+        vclt.u8         q6,  q6,  q11   @ < alpha
+        vdup.8          q11, r3         @ beta
+        vclt.s8         q7,  q12, #0    @ q7 = lanes whose tc0 is negative
+        vclt.u8         q14, q14, q11   @ < beta
+        vclt.u8         q15, q15, q11   @ < beta
+        vbic            q6,  q6,  q7    @ a negative tc0 disables the lane
+        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
+        vand            q6,  q6,  q14
+        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
+        vclt.u8         q4,  q4,  q11   @ < beta
+        vand            q6,  q6,  q15   @ q6 = lanes that get filtered at all
+        vclt.u8         q5,  q5,  q11   @ < beta
+        vand            q4,  q4,  q6    @ q4 = lanes where p1 is also modified
+        vand            q5,  q5,  q6    @ q5 = lanes where q1 is also modified
+        vand            q12, q12, q6    @ tc0 forced to zero in disabled lanes
+        vrhadd.u8       q14, q8,  q0    @ (p0 + q0 + 1) >> 1
+        vsub.i8         q6,  q12, q4    @ masks are 0 or -1, so this is tc0 + 1 where the p1 mask is set
+        vqadd.u8        q7,  q9,  q12   @ p1 + tc0, saturated
+        vhadd.u8        q10, q10, q14   @ (p2 + the p0,q0 average) >> 1
+        vsub.i8         q6,  q6,  q5    @ and + 1 again where the q1 mask is set: q6 = clamp limit for delta
+        vhadd.u8        q14, q2,  q14   @ (q2 + the p0,q0 average) >> 1
+        vmin.u8         q7,  q7,  q10
+        vqsub.u8        q11, q9,  q12   @ p1 - tc0, saturated
+        vqadd.u8        q2,  q1,  q12   @ q1 + tc0, saturated
+        vmax.u8         q7,  q7,  q11   @ q7 = candidate p1, kept within p1 +/- tc0
+        vqsub.u8        q11, q1,  q12   @ q1 - tc0, saturated
+        vmin.u8         q14, q2,  q14
+        vmovl.u8        q2,  d0         @ from here: delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3 in 16 bits
+        vmax.u8         q14, q14, q11   @ q14 = candidate q1, kept within q1 +/- tc0
+        vmovl.u8        q10, d1
+        vsubw.u8        q2,  q2,  d16
+        vsubw.u8        q10, q10, d17
+        vshl.i16        q2,  q2,  #2
+        vshl.i16        q10, q10, #2
+        vaddw.u8        q2,  q2,  d18
+        vaddw.u8        q10, q10, d19
+        vsubw.u8        q2,  q2,  d2
+        vsubw.u8        q10, q10, d3
+        vrshrn.i16      d4,  q2,  #3
+        vrshrn.i16      d5,  q10, #3
+        vbsl            q4,  q7,  q9    @ q4 = candidate p1 where its mask is set, else the original p1
+        vbsl            q5,  q14, q1    @ q5 = candidate q1 where its mask is set, else the original q1
+        vneg.s8         q7,  q6
+        vmovl.u8        q14, d16
+        vmin.s8         q2,  q2,  q6    @ clamp delta from above ...
+        vmovl.u8        q6,  d17
+        vmax.s8         q2,  q2,  q7    @ ... and from below
+        vmovl.u8        q11, d0
+        vmovl.u8        q12, d1
+        vaddw.s8        q14, q14, d4    @ p0 + delta
+        vaddw.s8        q6,  q6,  d5
+        vsubw.s8        q11, q11, d4    @ q0 - delta
+        vsubw.s8        q12, q12, d5
+        vqmovun.s16     d16, q14        @ saturate to bytes: new p0 ends up in q8
+        vqmovun.s16     d17, q6
+        vqmovun.s16     d0,  q11        @ new q0 ends up in q0
+        vqmovun.s16     d1,  q12
+        .endm
+
+function ff_h264_v_loop_filter_luma_neon, export=1
+        h264_loop_filter_start
+
+        vld1.64         {d0, d1},  [r0,:128], r1    @ r0 = pix, r1 = stride: rows pix, pix+stride, pix+2*stride
+        vld1.64         {d2, d3},  [r0,:128], r1    @ go to q0, q1, q2
+        vld1.64         {d4, d5},  [r0,:128], r1
+        sub             r0,  r0,  r1, lsl #2
+        sub             r0,  r0,  r1, lsl #1        @ r0 = pix - 3*stride
+        vld1.64         {d20,d21}, [r0,:128], r1    @ q10 = row at pix - 3*stride
+        vld1.64         {d18,d19}, [r0,:128], r1    @ q9  = row at pix - 2*stride
+        vld1.64         {d16,d17}, [r0,:128], r1    @ q8  = row at pix - stride; r0 is back at pix
+
+        align_push_regs
+
+        h264_loop_filter_luma
+
+        sub             r0,  r0,  r1, lsl #1        @ r0 = pix - 2*stride
+        vst1.64         {d8, d9},  [r0,:128], r1    @ write back four rows: q4, q8, q0, q5
+        vst1.64         {d16,d17}, [r0,:128], r1
+        vst1.64         {d0, d1},  [r0,:128], r1
+        vst1.64         {d10,d11}, [r0,:128]
+
+        align_pop_regs
+        bx              lr
+endfunc
+
+function ff_h264_h_loop_filter_luma_neon, export=1
+        h264_loop_filter_start
+
+        sub             r0,  r0,  #4            @ start four bytes to the left of pix
+        vld1.64         {d6},  [r0], r1         @ 16 rows of 8 bytes: rows 0-7 into the low halves ...
+        vld1.64         {d20}, [r0], r1
+        vld1.64         {d18}, [r0], r1
+        vld1.64         {d16}, [r0], r1
+        vld1.64         {d0},  [r0], r1
+        vld1.64         {d2},  [r0], r1
+        vld1.64         {d4},  [r0], r1
+        vld1.64         {d26}, [r0], r1
+        vld1.64         {d7},  [r0], r1         @ ... rows 8-15 into the high halves of the same q registers
+        vld1.64         {d21}, [r0], r1
+        vld1.64         {d19}, [r0], r1
+        vld1.64         {d17}, [r0], r1
+        vld1.64         {d1},  [r0], r1
+        vld1.64         {d3},  [r0], r1
+        vld1.64         {d5},  [r0], r1
+        vld1.64         {d27}, [r0], r1
+
+        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
+
+        align_push_regs
+
+        h264_loop_filter_luma
+
+        transpose_4x4   q4, q8, q0, q5
+
+        sub             r0,  r0,  r1, lsl #4    @ back up 16 rows
+        add             r0,  r0,  #2            @ r0 = pix - 2: only the middle four bytes of each row are written
+        vst1.32         {d8[0]},  [r0], r1      @ one 32-bit lane per row, 16 rows
+        vst1.32         {d16[0]}, [r0], r1
+        vst1.32         {d0[0]},  [r0], r1
+        vst1.32         {d10[0]}, [r0], r1
+        vst1.32         {d8[1]},  [r0], r1
+        vst1.32         {d16[1]}, [r0], r1
+        vst1.32         {d0[1]},  [r0], r1
+        vst1.32         {d10[1]}, [r0], r1
+        vst1.32         {d9[0]},  [r0], r1
+        vst1.32         {d17[0]}, [r0], r1
+        vst1.32         {d1[0]},  [r0], r1
+        vst1.32         {d11[0]}, [r0], r1
+        vst1.32         {d9[1]},  [r0], r1
+        vst1.32         {d17[1]}, [r0], r1
+        vst1.32         {d1[1]},  [r0], r1
+        vst1.32         {d11[1]}, [r0], r1
+
+        align_pop_regs
+        bx              lr
+endfunc
+
+        .macro h264_loop_filter_chroma
+        vdup.8          d22, r2         @ alpha
+        vmovl.u8        q12, d24        @ widen the tc0 bytes that the start macro left in d24
+        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
+        vmovl.u8        q2,  d0         @ q2 builds delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3 in 16 bits
+        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
+        vsubw.u8        q2,  q2,  d16
+        vsli.16         d24, d24, #8    @ each tc0 byte now fills two adjacent byte lanes
+        vshl.i16        q2,  q2,  #2
+        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
+        vaddw.u8        q2,  q2,  d18
+        vclt.u8         d26, d26, d22   @ < alpha
+        vsubw.u8        q2,  q2,  d2
+        vdup.8          d22, r3         @ beta
+        vclt.s8         d25, d24, #0    @ d25 = lanes whose tc0 is negative
+        vrshrn.i16      d4,  q2,  #3
+        vclt.u8         d28, d28, d22   @ < beta
+        vbic            d26, d26, d25   @ a negative tc0 disables the lane
+        vclt.u8         d30, d30, d22   @ < beta
+        vand            d26, d26, d28
+        vneg.s8         d25, d24        @ d25 reused: lower clamp bound
+        vand            d26, d26, d30   @ d26 = lanes that get filtered
+        vmin.s8         d4,  d4,  d24   @ clamp delta from above
+        vmovl.u8        q14, d16
+        vand            d4,  d4,  d26   @ delta forced to zero in disabled lanes
+        vmax.s8         d4,  d4,  d25   @ clamp delta from below
+        vmovl.u8        q11, d0
+        vaddw.s8        q14, q14, d4    @ p0 + delta
+        vsubw.s8        q11, q11, d4    @ q0 - delta
+        vqmovun.s16     d16, q14        @ saturate to bytes: new p0 in d16
+        vqmovun.s16     d0,  q11        @ new q0 in d0
+        .endm
+
+function ff_h264_v_loop_filter_chroma_neon, export=1
+        h264_loop_filter_start
+
+        sub             r0,  r0,  r1, lsl #1    @ r0 = pix - 2*stride
+        vld1.64         {d18}, [r0,:64], r1     @ four rows around pix: d18, d16 before it, d0, d2 from it on
+        vld1.64         {d16}, [r0,:64], r1
+        vld1.64         {d0},  [r0,:64], r1
+        vld1.64         {d2},  [r0,:64]         @ no post-increment: r0 stays at pix + stride
+
+        h264_loop_filter_chroma
+
+        sub             r0,  r0,  r1, lsl #1    @ r0 = pix - stride
+        vst1.64         {d16}, [r0,:64], r1     @ only the two rows next to pix are written back
+        vst1.64         {d0},  [r0,:64], r1
+
+        bx              lr
+endfunc
+
+function ff_h264_h_loop_filter_chroma_neon, export=1
+        h264_loop_filter_start
+
+        sub             r0,  r0,  #2            @ start two bytes to the left of pix
+        vld1.32         {d18[0]}, [r0], r1      @ eight rows of four bytes, one 32-bit lane per row
+        vld1.32         {d16[0]}, [r0], r1
+        vld1.32         {d0[0]},  [r0], r1
+        vld1.32         {d2[0]},  [r0], r1
+        vld1.32         {d18[1]}, [r0], r1
+        vld1.32         {d16[1]}, [r0], r1
+        vld1.32         {d0[1]},  [r0], r1
+        vld1.32         {d2[1]},  [r0], r1
+
+        vtrn.16         d18, d0                 @ 4x4 byte transpose per lane: each register now holds one column
+        vtrn.16         d16, d2
+        vtrn.8          d18, d16
+        vtrn.8          d0,  d2
+
+        h264_loop_filter_chroma
+
+        vtrn.16         d18, d0                 @ transpose back to row order
+        vtrn.16         d16, d2
+        vtrn.8          d18, d16
+        vtrn.8          d0,  d2
+
+        sub             r0,  r0,  r1, lsl #3    @ back up eight rows
+        vst1.32         {d18[0]}, [r0], r1      @ write all four bytes of every row back
+        vst1.32         {d16[0]}, [r0], r1
+        vst1.32         {d0[0]},  [r0], r1
+        vst1.32         {d2[0]},  [r0], r1
+        vst1.32         {d18[1]}, [r0], r1
+        vst1.32         {d16[1]}, [r0], r1
+        vst1.32         {d0[1]},  [r0], r1
+        vst1.32         {d2[1]},  [r0], r1
+
+        bx              lr
+endfunc
+
+        /* H.264 qpel MC */
+
+        .macro  lowpass_const r
+        movw            \r,  #5                 @ low halfword = 5
+        movt            \r,  #20                @ high halfword = 20
+        vmov.32         d6[0], \r               @ 16-bit lanes: d6[0] = 5, d6[1] = 20, as read by the lowpass macros
+        .endm
+
+        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
+.if \narrow
+        t0 .req q0
+        t1 .req q8
+.else
+        t0 .req \d0
+        t1 .req \d1
+.endif
+        vext.8          d2,  \r0, \r1, #2       @ six-tap filter on two rows at once; taps 0..5 are byte offsets into each row
+        vext.8          d3,  \r0, \r1, #3
+        vaddl.u8        q1,  d2,  d3            @ tap2 + tap3
+        vext.8          d4,  \r0, \r1, #1
+        vext.8          d5,  \r0, \r1, #4
+        vaddl.u8        q2,  d4,  d5            @ tap1 + tap4
+        vext.8          d30, \r0, \r1, #5
+        vaddl.u8        t0,  \r0, d30           @ tap0 + tap5
+        vext.8          d18, \r2, \r3, #2       @ second row: same steps, interleaved with the first
+        vmla.i16        t0,  q1,  d6[1]         @ + (tap2 + tap3) * d6[1]; d6 is set up by lowpass_const
+        vext.8          d19, \r2, \r3, #3
+        vaddl.u8        q9,  d18, d19
+        vext.8          d20, \r2, \r3, #1
+        vmls.i16        t0,  q2,  d6[0]         @ - (tap1 + tap4) * d6[0]
+        vext.8          d21, \r2, \r3, #4
+        vaddl.u8        q10, d20, d21
+        vext.8          d31, \r2, \r3, #5
+        vaddl.u8        t1,  \r2, d31
+        vmla.i16        t1,  q9,  d6[1]
+        vmls.i16        t1,  q10, d6[0]
+.if \narrow
+        vqrshrun.s16    \d0, t0,  #5            @ narrow only: (sum + 16) >> 5, saturated to unsigned bytes
+        vqrshrun.s16    \d1, t1,  #5
+.endif
+        .unreq  t0
+        .unreq  t1
+        .endm
+
+        .macro  lowpass_8_1 r0, r1, d0, narrow=1
+.if \narrow
+        t0 .req q0
+.else
+        t0 .req \d0
+.endif
+        vext.8          d2,  \r0, \r1, #2       @ single-row form of lowpass_8; taps 0..5 are byte offsets into the row
+        vext.8          d3,  \r0, \r1, #3
+        vaddl.u8        q1,  d2,  d3            @ tap2 + tap3
+        vext.8          d4,  \r0, \r1, #1
+        vext.8          d5,  \r0, \r1, #4
+        vaddl.u8        q2,  d4,  d5            @ tap1 + tap4
+        vext.8          d30, \r0, \r1, #5
+        vaddl.u8        t0,  \r0, d30           @ tap0 + tap5
+        vmla.i16        t0,  q1,  d6[1]         @ + (tap2 + tap3) * d6[1]; d6 is set up by lowpass_const
+        vmls.i16        t0,  q2,  d6[0]         @ - (tap1 + tap4) * d6[0]
+.if \narrow
+        vqrshrun.s16    \d0, t0,  #5            @ narrow only: (sum + 16) >> 5, saturated to unsigned bytes
+.endif
+        .unreq  t0
+        .endm
+
+        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
+        vext.16         q1,  \r0, \r1, #2       @ 16-bit samples in, 32-bit sums; taps are element offsets into the row
+        vext.16         q0,  \r0, \r1, #3
+        vaddl.s16       q9,  d2,  d0            @ tap2 + tap3, low four outputs
+        vext.16         q2,  \r0, \r1, #1
+        vaddl.s16       q1,  d3,  d1            @ tap2 + tap3, high four outputs
+        vext.16         q3,  \r0, \r1, #4
+        vaddl.s16       q10, d4,  d6            @ tap1 + tap4, low
+        vext.16         \r1, \r0, \r1, #5       @ note: overwrites the second row argument
+        vaddl.s16       q2,  d5,  d7            @ tap1 + tap4, high
+        vaddl.s16       q0,  \h0, \h1           @ sum of the caller-chosen high halves (presumably tap0 + tap5, confirm at call sites)
+        vaddl.s16       q8,  \l0, \l1           @ same for the low halves
+
+        vshl.i32        q3,  q9,  #4            @ low half: 16 * (tap2 + tap3)
+        vshl.i32        q9,  q9,  #2            @ 4 * (tap2 + tap3)
+        vshl.i32        q15, q10, #2            @ 4 * (tap1 + tap4)
+        vadd.i32        q9,  q9,  q3            @ 20 * (tap2 + tap3)
+        vadd.i32        q10, q10, q15           @ 5 * (tap1 + tap4)
+
+        vshl.i32        q3,  q1,  #4            @ high half: same multiplications by shift and add
+        vshl.i32        q1,  q1,  #2
+        vshl.i32        q15, q2,  #2
+        vadd.i32        q1,  q1,  q3
+        vadd.i32        q2,  q2,  q15
+
+        vadd.i32        q9,  q9,  q8
+        vsub.i32        q9,  q9,  q10           @ low: 20 * inner - 5 * middle + outer
+
+        vadd.i32        q1,  q1,  q0
+        vsub.i32        q1,  q1,  q2            @ high: the same combination
+
+        vrshrn.s32      d18, q9,  #10           @ (sum + 512) >> 10, narrowed to 16 bits
+        vrshrn.s32      d19, q1,  #10
+
+        vqmovun.s16     \d,  q9                 @ saturate to unsigned bytes
+        .endm
+
+function put_h264_qpel16_h_lowpass_neon_packed  @ 16x16 horizontal lowpass written as two consecutive 8-wide column blocks (r0 is never rewound)
+        mov             r4,  lr  @ keep the return address in r4 across the call
+        mov             ip,  #16  @ 16 rows
+        mov             r3,  #8  @ the callee uses r3 as dst stride, so rows are packed 8 bytes apart
+        bl              put_h264_qpel8_h_lowpass_neon
+        sub             r1,  r1,  r2, lsl #4  @ rewind src by 16 rows
+        add             r1,  r1,  #8  @ move src to the right 8 columns
+        mov             ip,  #16
+        mov             lr,  r4
+        b               put_h264_qpel8_h_lowpass_neon  @ tail call, the callee returns to our caller
+endfunc
+
+        .macro h264_qpel_h_lowpass type  @ emits the 16-wide and 8-wide horizontal lowpass functions for type = put or avg
+function \type\()_h264_qpel16_h_lowpass_neon
+        push            {lr}
+        mov             ip,  #16  @ 16 rows for the left 8 columns
+        bl              \type\()_h264_qpel8_h_lowpass_neon
+        sub             r0,  r0,  r3, lsl #4  @ rewind dst by 16 rows of stride r3
+        sub             r1,  r1,  r2, lsl #4  @ rewind src by 16 rows of stride r2
+        add             r0,  r0,  #8  @ step both pointers to the right 8 columns
+        add             r1,  r1,  #8
+        mov             ip,  #16
+        pop             {lr}  @ no return here: presumably falls through into the 8-wide function below (assumes endfunc emits no instructions)
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon
+1:      vld1.64         {d0, d1},  [r1], r2  @ 16 source bytes of one row, src advances by r2
+        vld1.64         {d16,d17}, [r1], r2  @ next row
+        subs            ip,  ip,  #2  @ two rows per iteration, flags are consumed by the bne below
+        lowpass_8       d0,  d1,  d16, d17, d0,  d16  @ filtered rows end up in d0 and d16
+.ifc \type,avg
+        vld1.8          {d2},     [r0,:64], r3  @ avg only: fetch the two existing dst rows
+        vrhadd.u8       d0,  d0,  d2  @ rounding average with dst
+        vld1.8          {d3},     [r0,:64]
+        vrhadd.u8       d16, d16, d3
+        sub             r0,  r0,  r3  @ back to the first of the two dst rows
+.endif
+        vst1.64         {d0},     [r0,:64], r3  @ dst is accessed with a 64-bit alignment hint
+        vst1.64         {d16},    [r0,:64], r3
+        bne             1b  @ loop until ip reaches zero
+        bx              lr
+endfunc
+        .endm
+
+        h264_qpel_h_lowpass put
+        h264_qpel_h_lowpass avg
+
+        .macro h264_qpel_h_lowpass_l2 type  @ horizontal lowpass averaged with a second source at r3, for type = put or avg
+function \type\()_h264_qpel16_h_lowpass_l2_neon
+        push            {lr}
+        mov             ip,  #16  @ 16 rows for the left 8 columns
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
+        sub             r0,  r0,  r2, lsl #4  @ dst, src and second source all use stride r2 here, rewind each by 16 rows
+        sub             r1,  r1,  r2, lsl #4
+        sub             r3,  r3,  r2, lsl #4
+        add             r0,  r0,  #8  @ step all three pointers to the right 8 columns
+        add             r1,  r1,  #8
+        add             r3,  r3,  #8
+        mov             ip,  #16
+        pop             {lr}  @ no return here: presumably falls through into the 8-wide function below (assumes endfunc emits no instructions)
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon
+1:      vld1.64         {d0, d1},  [r1], r2  @ two source rows of 16 bytes each
+        vld1.64         {d16,d17}, [r1], r2
+        vld1.64         {d28},     [r3], r2  @ matching two rows of the second source into q14
+        vld1.64         {d29},     [r3], r2
+        subs            ip,  ip,  #2  @ two rows per iteration, flags are consumed by the bne below
+        lowpass_8       d0,  d1,  d16, d17, d0,  d1  @ filtered rows end up in d0 and d1, i.e. q0
+        vrhadd.u8       q0,  q0,  q14  @ rounding average of the filtered rows with the second source
+.ifc \type,avg
+        vld1.8          {d2},      [r0,:64], r2  @ avg only: also average with the existing dst rows
+        vrhadd.u8       d0,  d0,  d2
+        vld1.8          {d3},      [r0,:64]
+        vrhadd.u8       d1,  d1,  d3
+        sub             r0,  r0,  r2  @ back to the first of the two dst rows
+.endif
+        vst1.64         {d0},      [r0,:64], r2
+        vst1.64         {d1},      [r0,:64], r2
+        bne             1b  @ loop until ip reaches zero
+        bx              lr
+endfunc
+        .endm
+
+        h264_qpel_h_lowpass_l2 put
+        h264_qpel_h_lowpass_l2 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed  @ 16x16 vertical lowpass written as four consecutive 8x8 blocks (r0 is never rewound)
+        mov             r4,  lr  @ keep the return address in r4 across the calls
+        mov             r2,  #8  @ the callee uses r2 as dst stride, so rows are packed 8 bytes apart
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             r1,  r1,  r3, lsl #2  @ the callee advanced src by 12 rows, go back 4 for a net step of 8 rows
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             r1,  r1,  r3, lsl #4  @ back 16 + 4 rows: src returns to its row at entry
+        sub             r1,  r1,  r3, lsl #2
+        add             r1,  r1,  #8  @ move src to the right 8 columns
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             r1,  r1,  r3, lsl #2
+        mov             lr,  r4
+        b               put_h264_qpel8_v_lowpass_neon  @ tail call for the last 8x8 block
+endfunc
+
+        .macro h264_qpel_v_lowpass type  @ emits the 16-wide and 8-wide vertical lowpass functions for type = put or avg
+function \type\()_h264_qpel16_v_lowpass_neon
+        mov             r4,  lr  @ return address kept in r4, so callers must treat r4 as clobbered
+        bl              \type\()_h264_qpel8_v_lowpass_neon
+        sub             r1,  r1,  r3, lsl #2  @ the callee advanced src by 12 rows, go back 4 for a net step of 8 rows
+        bl              \type\()_h264_qpel8_v_lowpass_neon
+        sub             r0,  r0,  r2, lsl #4  @ rewind dst by 16 rows and move to the right 8 columns
+        add             r0,  r0,  #8
+        sub             r1,  r1,  r3, lsl #4  @ back 16 + 4 rows: src returns to its row at entry
+        sub             r1,  r1,  r3, lsl #2
+        add             r1,  r1,  #8
+        bl              \type\()_h264_qpel8_v_lowpass_neon
+        sub             r1,  r1,  r3, lsl #2
+        mov             lr,  r4  @ no return here: presumably falls through into the 8-wide function below (assumes endfunc emits no instructions)
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon
+        vld1.64         {d8},  [r1], r3  @ load 13 source rows of 8 bytes each, src stride r3
+        vld1.64         {d10}, [r1], r3
+        vld1.64         {d12}, [r1], r3
+        vld1.64         {d14}, [r1], r3
+        vld1.64         {d22}, [r1], r3
+        vld1.64         {d24}, [r1], r3
+        vld1.64         {d26}, [r1], r3
+        vld1.64         {d28}, [r1], r3
+        vld1.64         {d9},  [r1], r3
+        vld1.64         {d11}, [r1], r3
+        vld1.64         {d13}, [r1], r3
+        vld1.64         {d15}, [r1], r3
+        vld1.64         {d23}, [r1]  @ last row without post-increment, so r1 ends 12 rows past its start
+
+        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14  @ macro defined elsewhere, presumably turns columns into rows so the row filter can be reused
+        lowpass_8       d8,  d9,  d10, d11, d8,  d10  @ each call filters two rows
+        lowpass_8       d12, d13, d14, d15, d12, d14
+        lowpass_8       d22, d23, d24, d25, d22, d24
+        lowpass_8       d26, d27, d28, d29, d26, d28
+        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28  @ presumably transposes the 8x8 result back
+
+.ifc \type,avg
+        vld1.8          {d9},  [r0,:64], r2  @ avg only: rounding average of each result row with the existing dst row
+        vrhadd.u8       d8,  d8,  d9
+        vld1.8          {d11}, [r0,:64], r2
+        vrhadd.u8       d10, d10, d11
+        vld1.8          {d13}, [r0,:64], r2
+        vrhadd.u8       d12, d12, d13
+        vld1.8          {d15}, [r0,:64], r2
+        vrhadd.u8       d14, d14, d15
+        vld1.8          {d23}, [r0,:64], r2
+        vrhadd.u8       d22, d22, d23
+        vld1.8          {d25}, [r0,:64], r2
+        vrhadd.u8       d24, d24, d25
+        vld1.8          {d27}, [r0,:64], r2
+        vrhadd.u8       d26, d26, d27
+        vld1.8          {d29}, [r0,:64], r2
+        vrhadd.u8       d28, d28, d29
+        sub             r0,  r0,  r2,  lsl #3  @ rewind dst by the 8 rows just read
+.endif
+
+        vst1.64         {d8},  [r0,:64], r2  @ store 8 result rows, dst stride r2
+        vst1.64         {d10}, [r0,:64], r2
+        vst1.64         {d12}, [r0,:64], r2
+        vst1.64         {d14}, [r0,:64], r2
+        vst1.64         {d22}, [r0,:64], r2
+        vst1.64         {d24}, [r0,:64], r2
+        vst1.64         {d26}, [r0,:64], r2
+        vst1.64         {d28}, [r0,:64], r2
+
+        bx              lr
+endfunc
+        .endm
+
+        h264_qpel_v_lowpass put
+        h264_qpel_v_lowpass avg
+
+        .macro h264_qpel_v_lowpass_l2 type  @ vertical lowpass averaged with a second source at ip (stride r2), for type = put or avg
+function \type\()_h264_qpel16_v_lowpass_l2_neon
+        mov             r4,  lr  @ return address kept in r4, so callers must treat r4 as clobbered
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        sub             r1,  r1,  r3, lsl #2  @ the callee advanced src by 12 rows, go back 4 for a net step of 8 rows
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        sub             r0,  r0,  r3, lsl #4  @ rewind dst (stride r3) by 16 rows
+        sub             ip,  ip,  r2, lsl #4  @ rewind the second source (stride r2) by 16 rows
+        add             r0,  r0,  #8  @ move dst and second source to the right 8 columns
+        add             ip,  ip,  #8
+        sub             r1,  r1,  r3, lsl #4  @ back 16 + 4 rows: src returns to its row at entry
+        sub             r1,  r1,  r3, lsl #2
+        add             r1,  r1,  #8
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        sub             r1,  r1,  r3, lsl #2
+        mov             lr,  r4  @ no return here: presumably falls through into the 8-wide function below (assumes endfunc emits no instructions)
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_l2_neon
+        vld1.64         {d8},  [r1], r3  @ load 13 source rows of 8 bytes each, src stride r3
+        vld1.64         {d10}, [r1], r3
+        vld1.64         {d12}, [r1], r3
+        vld1.64         {d14}, [r1], r3
+        vld1.64         {d22}, [r1], r3
+        vld1.64         {d24}, [r1], r3
+        vld1.64         {d26}, [r1], r3
+        vld1.64         {d28}, [r1], r3
+        vld1.64         {d9},  [r1], r3
+        vld1.64         {d11}, [r1], r3
+        vld1.64         {d13}, [r1], r3
+        vld1.64         {d15}, [r1], r3
+        vld1.64         {d23}, [r1]  @ last row without post-increment, so r1 ends 12 rows past its start
+
+        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14  @ macro defined elsewhere, presumably turns columns into rows so the row filter can be reused
+        lowpass_8       d8,  d9,  d10, d11, d8,  d9  @ results are packed pairwise into q4, q6, q11, q13
+        lowpass_8       d12, d13, d14, d15, d12, d13
+        lowpass_8       d22, d23, d24, d25, d22, d23
+        lowpass_8       d26, d27, d28, d29, d26, d27
+        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27  @ presumably transposes the 8x8 result back
+
+        vld1.64         {d0},  [ip], r2  @ load 8 rows of the second source, interleaved with the averaging
+        vld1.64         {d1},  [ip], r2
+        vld1.64         {d2},  [ip], r2
+        vld1.64         {d3},  [ip], r2
+        vld1.64         {d4},  [ip], r2
+        vrhadd.u8       q0,  q0,  q4  @ rounding average of second source and filtered rows
+        vld1.64         {d5},  [ip], r2
+        vrhadd.u8       q1,  q1,  q6
+        vld1.64         {d10}, [ip], r2
+        vrhadd.u8       q2,  q2,  q11
+        vld1.64         {d11}, [ip], r2
+        vrhadd.u8       q5,  q5,  q13
+
+.ifc \type,avg
+        vld1.8          {d16}, [r0,:64], r3  @ avg only: also average each row with the existing dst row
+        vrhadd.u8       d0,  d0,  d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d1,  d1,  d17
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d2,  d2,  d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d3,  d3,  d17
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d4,  d4,  d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d5,  d5,  d17
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d10, d10, d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d11, d11, d17
+        sub             r0,  r0,  r3,  lsl #3  @ rewind dst by the 8 rows just read
+.endif
+
+        vst1.64         {d0},  [r0,:64], r3  @ store 8 result rows, dst stride r3
+        vst1.64         {d1},  [r0,:64], r3
+        vst1.64         {d2},  [r0,:64], r3
+        vst1.64         {d3},  [r0,:64], r3
+        vst1.64         {d4},  [r0,:64], r3
+        vst1.64         {d5},  [r0,:64], r3
+        vst1.64         {d10}, [r0,:64], r3
+        vst1.64         {d11}, [r0,:64], r3
+
+        bx              lr
+endfunc
+        .endm
+
+        h264_qpel_v_lowpass_l2 put
+        h264_qpel_v_lowpass_l2 avg
+
+function put_h264_qpel8_hv_lowpass_neon_top  @ shared 8x8 two-pass core: 16-bit row filter into scratch at r4, then lowpass_8.16, result left in d12-d15, d8-d11
+        lowpass_const   ip  @ macro defined elsewhere, presumably loads the filter constants read via d6, ip is reloaded on the next line
+        mov             ip,  #12  @ 12 source rows in the loop, a 13th is handled after it
+1:      vld1.64         {d0, d1},  [r1], r3  @ two source rows of 16 bytes, src stride r3
+        vld1.64         {d16,d17}, [r1], r3
+        subs            ip,  ip,  #2
+        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0  @ keep the unnarrowed 16-bit sums in q11 and q12
+        vst1.64         {d22-d25}, [r4,:128]!  @ append both rows (32 bytes) to the scratch buffer
+        bne             1b
+
+        vld1.64         {d0, d1},  [r1]  @ 13th row, no post-increment, so r1 ends 12 rows past its start
+        lowpass_8_1     d0,  d1,  q12, narrow=0  @ its 16-bit sums stay in q12
+
+        mov             ip,  #-16  @ from here on ip is a negative 16-byte step
+        add             r4,  r4,  ip  @ r4 now points at the last row stored by the loop
+        vld1.64         {d30,d31}, [r4,:128], ip  @ reload the 12 stored rows walking backwards through scratch
+        vld1.64         {d20,d21}, [r4,:128], ip
+        vld1.64         {d18,d19}, [r4,:128], ip
+        vld1.64         {d16,d17}, [r4,:128], ip
+        vld1.64         {d14,d15}, [r4,:128], ip
+        vld1.64         {d12,d13}, [r4,:128], ip
+        vld1.64         {d10,d11}, [r4,:128], ip
+        vld1.64         {d8, d9},  [r4,:128], ip
+        vld1.64         {d6, d7},  [r4,:128], ip
+        vld1.64         {d4, d5},  [r4,:128], ip
+        vld1.64         {d2, d3},  [r4,:128], ip
+        vld1.64         {d0, d1},  [r4,:128]  @ first stored row, r4 is back at the start of scratch
+
+        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14  @ swap4 and transpose16_4x4 are defined elsewhere, presumably together they transpose the 16-bit data
+        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
+
+        swap4           d17, d19, d21, d31, d24, d26, d28, d22
+        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
+
+        vst1.64         {d30,d31}, [r4,:128]!  @ spill eight q registers to scratch, they are reloaded in pairs further down
+        vst1.64         {d6, d7},  [r4,:128]!
+        vst1.64         {d20,d21}, [r4,:128]!
+        vst1.64         {d4, d5},  [r4,:128]!
+        vst1.64         {d18,d19}, [r4,:128]!
+        vst1.64         {d2, d3},  [r4,:128]!
+        vst1.64         {d16,d17}, [r4,:128]!
+        vst1.64         {d0, d1},  [r4,:128]  @ no writeback: r4 stays on the last spilled entry
+
+        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8  @ second pass on the register-resident lines, outputs d8-d11
+        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
+        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
+        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
+
+        vld1.64         {d16,d17}, [r4,:128], ip  @ reload the spilled lines pairwise, walking backwards, outputs d12-d15
+        vld1.64         {d30,d31}, [r4,:128], ip
+        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
+        vld1.64         {d16,d17}, [r4,:128], ip
+        vld1.64         {d30,d31}, [r4,:128], ip
+        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
+        vld1.64         {d16,d17}, [r4,:128], ip
+        vld1.64         {d30,d31}, [r4,:128], ip
+        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
+        vld1.64         {d16,d17}, [r4,:128], ip
+        vld1.64         {d30,d31}, [r4,:128]  @ r4 is back at the start of scratch
+        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
+
+        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11  @ callers in this file store the rows in exactly this register order
+
+        bx              lr
+endfunc
+
+        .macro h264_qpel8_hv_lowpass type  @ 8x8 two-pass lowpass written to dst r0 (stride r2), for type = put or avg
+function \type\()_h264_qpel8_hv_lowpass_neon
+        mov             r10, lr  @ return address kept in r10 across the call
+        bl              put_h264_qpel8_hv_lowpass_neon_top  @ leaves the 8 result rows in d12-d15, d8-d11
+.ifc \type,avg
+        vld1.8          {d0},      [r0,:64], r2  @ avg only: rounding average of each result row with the existing dst row
+        vrhadd.u8       d12, d12, d0
+        vld1.8          {d1},      [r0,:64], r2
+        vrhadd.u8       d13, d13, d1
+        vld1.8          {d2},      [r0,:64], r2
+        vrhadd.u8       d14, d14, d2
+        vld1.8          {d3},      [r0,:64], r2
+        vrhadd.u8       d15, d15, d3
+        vld1.8          {d4},      [r0,:64], r2
+        vrhadd.u8       d8,  d8,  d4
+        vld1.8          {d5},      [r0,:64], r2
+        vrhadd.u8       d9,  d9,  d5
+        vld1.8          {d6},      [r0,:64], r2
+        vrhadd.u8       d10, d10, d6
+        vld1.8          {d7},      [r0,:64], r2
+        vrhadd.u8       d11, d11, d7
+        sub             r0,  r0,  r2,  lsl #3  @ rewind dst by the 8 rows just read
+.endif
+        vst1.64         {d12},     [r0,:64], r2  @ store 8 rows, r0 ends 8 rows further down
+        vst1.64         {d13},     [r0,:64], r2
+        vst1.64         {d14},     [r0,:64], r2
+        vst1.64         {d15},     [r0,:64], r2
+        vst1.64         {d8},      [r0,:64], r2
+        vst1.64         {d9},      [r0,:64], r2
+        vst1.64         {d10},     [r0,:64], r2
+        vst1.64         {d11},     [r0,:64], r2
+
+        mov             lr,  r10
+        bx              lr
+endfunc
+        .endm
+
+        h264_qpel8_hv_lowpass put
+        h264_qpel8_hv_lowpass avg
+
+        .macro h264_qpel8_hv_lowpass_l2 type  @ 8x8 two-pass lowpass averaged with a packed 8x8 block at r2, dst r0 with stride r3
+function \type\()_h264_qpel8_hv_lowpass_l2_neon
+        mov             r10, lr  @ return address kept in r10 across the call
+        bl              put_h264_qpel8_hv_lowpass_neon_top  @ leaves the 8 result rows in d12-d15, d8-d11
+
+        vld1.64         {d0, d1},  [r2,:128]!  @ second source: 64 contiguous bytes, r2 advances by 64 in total
+        vld1.64         {d2, d3},  [r2,:128]!
+        vrhadd.u8       q0,  q0,  q6  @ rounding average with result rows 0-1 (d12, d13)
+        vld1.64         {d4, d5},  [r2,:128]!
+        vrhadd.u8       q1,  q1,  q7  @ rows 2-3 (d14, d15)
+        vld1.64         {d6, d7},  [r2,:128]!
+        vrhadd.u8       q2,  q2,  q4  @ rows 4-5 (d8, d9)
+        vrhadd.u8       q3,  q3,  q5  @ rows 6-7 (d10, d11)
+.ifc \type,avg
+        vld1.8          {d16},     [r0,:64], r3  @ avg only: also average each row with the existing dst row
+        vrhadd.u8       d0,  d0,  d16
+        vld1.8          {d17},     [r0,:64], r3
+        vrhadd.u8       d1,  d1,  d17
+        vld1.8          {d18},     [r0,:64], r3
+        vrhadd.u8       d2,  d2,  d18
+        vld1.8          {d19},     [r0,:64], r3
+        vrhadd.u8       d3,  d3,  d19
+        vld1.8          {d20},     [r0,:64], r3
+        vrhadd.u8       d4,  d4,  d20
+        vld1.8          {d21},     [r0,:64], r3
+        vrhadd.u8       d5,  d5,  d21
+        vld1.8          {d22},     [r0,:64], r3
+        vrhadd.u8       d6,  d6,  d22
+        vld1.8          {d23},     [r0,:64], r3
+        vrhadd.u8       d7,  d7,  d23
+        sub             r0,  r0,  r3,  lsl #3  @ rewind dst by the 8 rows just read
+.endif
+        vst1.64         {d0},      [r0,:64], r3  @ store 8 rows, r0 ends 8 rows further down
+        vst1.64         {d1},      [r0,:64], r3
+        vst1.64         {d2},      [r0,:64], r3
+        vst1.64         {d3},      [r0,:64], r3
+        vst1.64         {d4},      [r0,:64], r3
+        vst1.64         {d5},      [r0,:64], r3
+        vst1.64         {d6},      [r0,:64], r3
+        vst1.64         {d7},      [r0,:64], r3
+
+        mov             lr,  r10
+        bx              lr
+endfunc
+        .endm
+
+        h264_qpel8_hv_lowpass_l2 put
+        h264_qpel8_hv_lowpass_l2 avg
+
+        .macro h264_qpel16_hv type  @ 16x16 two-pass lowpass built from four 8x8 calls, plain and l2 flavours
+function \type\()_h264_qpel16_hv_lowpass_neon
+        mov             r9,  lr  @ return address kept in r9, the 8x8 callee itself uses r10 for the same purpose
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
+        sub             r1,  r1,  r3, lsl #2  @ the callee advanced src by 12 rows, go back 4 for a net step of 8 rows
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
+        sub             r1,  r1,  r3, lsl #4  @ back 16 + 4 rows: src returns to its row at entry
+        sub             r1,  r1,  r3, lsl #2
+        add             r1,  r1,  #8  @ move src to the right 8 columns
+        sub             r0,  r0,  r2, lsl #4  @ rewind dst (stride r2) by 16 rows and move right 8 columns
+        add             r0,  r0,  #8
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
+        sub             r1,  r1,  r3, lsl #2
+        mov             lr,  r9
+        b               \type\()_h264_qpel8_hv_lowpass_neon  @ tail call for the last 8x8 block
+endfunc
+
+function \type\()_h264_qpel16_hv_lowpass_l2_neon
+        mov             r9,  lr
+        sub             r2,  r4,  #256  @ second source is the 256 bytes just below the scratch at r4, any incoming r2 is discarded
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon  @ each call consumes 64 bytes at r2, so the four blocks are read back to back
+        sub             r1,  r1,  r3, lsl #2  @ the callee advanced src by 12 rows, go back 4 for a net step of 8 rows
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+        sub             r1,  r1,  r3, lsl #4  @ back 16 + 4 rows: src returns to its row at entry
+        sub             r1,  r1,  r3, lsl #2
+        add             r1,  r1,  #8  @ move src to the right 8 columns
+        sub             r0,  r0,  r3, lsl #4  @ dst uses stride r3 here, rewind 16 rows and move right 8 columns
+        add             r0,  r0,  #8
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+        sub             r1,  r1,  r3, lsl #2
+        mov             lr,  r9
+        b               \type\()_h264_qpel8_hv_lowpass_l2_neon  @ tail call for the last 8x8 block
+endfunc
+        .endm
+
+        h264_qpel16_hv put
+        h264_qpel16_hv avg
+
+        .macro h264_qpel8 type  @ exported 8x8 entry points for type = put or avg, by usage r0 = dst, r1 = src, r2 = stride, the mcXY suffix presumably encodes the quarter-pel offset
+function ff_\type\()_h264_qpel8_mc10_neon, export=1
+        lowpass_const   r3  @ macro defined elsewhere, presumably loads the filter constants read via d6, r3 is reloaded on the next line
+        mov             r3,  r1  @ second source for the l2 average is the unfiltered src
+        sub             r1,  r1,  #2  @ the filter window starts two pixels to the left
+        mov             ip,  #8  @ 8 rows
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc20_neon, export=1
+        lowpass_const   r3
+        sub             r1,  r1,  #2  @ the filter window starts two pixels to the left
+        mov             r3,  r2  @ the callee takes the dst stride in r3
+        mov             ip,  #8
+        b               \type\()_h264_qpel8_h_lowpass_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc30_neon, export=1
+        lowpass_const   r3
+        add             r3,  r1,  #1  @ second source is src shifted one pixel to the right
+        sub             r1,  r1,  #2
+        mov             ip,  #8
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc01_neon, export=1
+        push            {lr}
+        mov             ip,  r1  @ second source for the l2 average is the unfiltered src
+\type\()_h264_qpel8_mc01:
+        lowpass_const   r3  @ shared tail, also entered from mc03 with ip already set
+        mov             r3,  r2  @ the callee takes the src and dst stride in r3, r2 stays the stride of ip
+        sub             r1,  r1,  r2, lsl #1  @ the filter window starts two rows above
+        vpush           {d8-d15}  @ the callee overwrites d8-d15, preserve them for our caller
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        vpop            {d8-d15}
+        pop             {pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc11_neon, export=1
+        push            {r0, r1, r11, lr}  @ r0 and r1 are saved so they can be reloaded through r11 later
+\type\()_h264_qpel8_mc11:
+        lowpass_const   r3  @ shared tail, also entered from mc31, mc13 and mc33 after their own push
+        mov             r11, sp  @ r11 remembers the unaligned sp, which points at the saved r0 and r1
+        bic             sp,  sp,  #15  @ align the stack to 16 bytes
+        sub             sp,  sp,  #64  @ 8x8 temporary for the horizontal pass
+        mov             r0,  sp  @ horizontal pass writes to the temporary
+        sub             r1,  r1,  #2
+        mov             r3,  #8  @ temporary rows are 8 bytes apart
+        mov             ip,  #8  @ 8 rows
+        vpush           {d8-d15}  @ 64 bytes, so the temporary now sits at sp + 64
+        bl              put_h264_qpel8_h_lowpass_neon
+        ldrd            r0,  [r11]  @ reload the saved r0 and r1
+        mov             r3,  r2  @ src and dst stride for the vertical pass
+        add             ip,  sp,  #64  @ second source is the horizontal result in the temporary
+        sub             r1,  r1,  r2, lsl #1  @ the filter window starts two rows above
+        mov             r2,  #8  @ stride of the temporary
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        vpop            {d8-d15}
+        add             sp,  r11, #8  @ drop the temporary and the saved r0 and r1
+        pop             {r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc21_neon, export=1
+        push            {r0, r1, r4, r10, r11, lr}  @ r4 and r10 are clobbered by the hv helpers
+\type\()_h264_qpel8_mc21:
+        lowpass_const   r3  @ shared tail, also entered from mc23 after its own push
+        mov             r11, sp
+        bic             sp,  sp,  #15
+        sub             sp,  sp,  #(8*8+16*12)  @ 64-byte temporary followed by 192 bytes of hv scratch
+        sub             r1,  r1,  #2
+        mov             r3,  #8  @ temporary rows are 8 bytes apart
+        mov             r0,  sp
+        mov             ip,  #8
+        vpush           {d8-d15}
+        bl              put_h264_qpel8_h_lowpass_neon
+        mov             r4,  r0  @ r0 ended 64 bytes past the temporary, which is where the hv scratch starts
+        ldrd            r0,  [r11]  @ reload the saved r0 and r1
+        sub             r1,  r1,  r2, lsl #1  @ hv window starts two rows above and two pixels to the left
+        sub             r1,  r1,  #2
+        mov             r3,  r2  @ src and dst stride for the hv pass
+        sub             r2,  r4,  #64  @ second source is the temporary holding the horizontal result
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+        vpop            {d8-d15}
+        add             sp,  r11,  #8  @ drop the buffers and the saved r0 and r1
+        pop             {r4, r10, r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc31_neon, export=1
+        add             r1,  r1,  #1  @ the saved r1 (used for the vertical pass in mc11) is src + 1
+        push            {r0, r1, r11, lr}
+        sub             r1,  r1,  #1  @ the horizontal pass still starts from the original src
+        b               \type\()_h264_qpel8_mc11
+endfunc
+
+function ff_\type\()_h264_qpel8_mc02_neon, export=1
+        push            {lr}
+        lowpass_const   r3
+        sub             r1,  r1,  r2, lsl #1  @ the filter window starts two rows above
+        mov             r3,  r2  @ the callee takes the src stride in r3 and the dst stride in r2
+        vpush           {d8-d15}
+        bl              \type\()_h264_qpel8_v_lowpass_neon
+        vpop            {d8-d15}
+        pop             {pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc12_neon, export=1
+        push            {r0, r1, r4, r10, r11, lr}
+\type\()_h264_qpel8_mc12:
+        lowpass_const   r3  @ shared tail, also entered from mc32 after its own push
+        mov             r11, sp
+        bic             sp,  sp,  #15
+        sub             sp,  sp,  #(8*8+16*12)  @ 64-byte temporary followed by 192 bytes of hv scratch
+        sub             r1,  r1,  r2, lsl #1  @ the filter window starts two rows above
+        mov             r3,  r2  @ src stride, r3 keeps the stride for the rest of the function
+        mov             r2,  #8  @ temporary rows are 8 bytes apart
+        mov             r0,  sp
+        vpush           {d8-d15}
+        bl              put_h264_qpel8_v_lowpass_neon
+        mov             r4,  r0  @ r0 ended 64 bytes past the temporary, which is where the hv scratch starts
+        ldrd            r0,  [r11]  @ reload the saved r0 and r1
+        sub             r1,  r1,  r3, lsl #1  @ hv window starts two rows above and two pixels to the left
+        sub             r1,  r1,  #2
+        sub             r2,  r4,  #64  @ second source is the temporary holding the vertical result
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+        vpop            {d8-d15}
+        add             sp,  r11,  #8
+        pop             {r4, r10, r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc22_neon, export=1
+        push            {r4, r10, r11, lr}
+        mov             r11, sp
+        bic             sp,  sp,  #15
+        sub             r1,  r1,  r2, lsl #1  @ hv window starts two rows above and two pixels to the left
+        sub             r1,  r1,  #2
+        mov             r3,  r2  @ the callee takes the src stride in r3 and the dst stride in r2
+        sub             sp,  sp,  #(16*12)  @ 192 bytes of hv scratch
+        mov             r4,  sp
+        vpush           {d8-d15}
+        bl              \type\()_h264_qpel8_hv_lowpass_neon  @ no lowpass_const here, the hv core invokes it itself
+        vpop            {d8-d15}
+        mov             sp,  r11  @ nothing extra was pushed, so sp is restored as is
+        pop             {r4, r10, r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc32_neon, export=1
+        push            {r0, r1, r4, r10, r11, lr}  @ the saved r1 (used for the hv pass in mc12) is the original src
+        add             r1,  r1,  #1  @ the vertical pass runs one pixel to the right
+        b               \type\()_h264_qpel8_mc12
+endfunc
+
+function ff_\type\()_h264_qpel8_mc03_neon, export=1
+        push            {lr}
+        add             ip,  r1,  r2  @ second source is src one row further down
+        b               \type\()_h264_qpel8_mc01
+endfunc
+
+function ff_\type\()_h264_qpel8_mc13_neon, export=1
+        push            {r0, r1, r11, lr}  @ the saved r1 (used for the vertical pass in mc11) is the original src
+        add             r1,  r1,  r2  @ the horizontal pass runs one row further down
+        b               \type\()_h264_qpel8_mc11
+endfunc
+
+function ff_\type\()_h264_qpel8_mc23_neon, export=1
+        push            {r0, r1, r4, r10, r11, lr}  @ the saved r1 (used for the hv pass in mc21) is the original src
+        add             r1,  r1,  r2  @ the horizontal pass runs one row further down
+        b               \type\()_h264_qpel8_mc21
+endfunc
+
+function ff_\type\()_h264_qpel8_mc33_neon, export=1
+        add             r1,  r1,  #1  @ the saved r1 (used for the vertical pass in mc11) is src + 1
+        push            {r0, r1, r11, lr}
+        add             r1,  r1,  r2  @ the horizontal pass runs from the original column, one row further down
+        sub             r1,  r1,  #1
+        b               \type\()_h264_qpel8_mc11
+endfunc
+        .endm
+
+        h264_qpel8 put
+        h264_qpel8 avg
+
+        .macro h264_qpel16 type  @ exported 16x16 entry points for type = put or avg, by usage r0 = dst, r1 = src, r2 = stride, the mcXY suffix presumably encodes the quarter-pel offset
+function ff_\type\()_h264_qpel16_mc10_neon, export=1
+        lowpass_const   r3  @ macro defined elsewhere, presumably loads the filter constants read via d6, r3 is reloaded on the next line
+        mov             r3,  r1  @ second source for the l2 average is the unfiltered src
+        sub             r1,  r1,  #2  @ the filter window starts two pixels to the left
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon  @ the 16-wide helper sets the row count itself
+endfunc
+
+function ff_\type\()_h264_qpel16_mc20_neon, export=1
+        lowpass_const   r3
+        sub             r1,  r1,  #2
+        mov             r3,  r2  @ the callee takes the dst stride in r3
+        b               \type\()_h264_qpel16_h_lowpass_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc30_neon, export=1
+        lowpass_const   r3
+        add             r3,  r1,  #1  @ second source is src shifted one pixel to the right
+        sub             r1,  r1,  #2
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc01_neon, export=1
+        push            {r4, lr}  @ r4 is clobbered by the 16-wide vertical helper
+        mov             ip,  r1  @ second source for the l2 average is the unfiltered src
+\type\()_h264_qpel16_mc01:
+        lowpass_const   r3  @ shared tail, also entered from mc03 with ip already set
+        mov             r3,  r2  @ the callee takes the src and dst stride in r3, r2 stays the stride of ip
+        sub             r1,  r1,  r2, lsl #1  @ the filter window starts two rows above
+        vpush           {d8-d15}  @ the callee overwrites d8-d15, preserve them for our caller
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
+        vpop            {d8-d15}
+        pop             {r4, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc11_neon, export=1
+        push            {r0, r1, r4, r11, lr}  @ r0 and r1 are saved so they can be reloaded through r11 later
+\type\()_h264_qpel16_mc11:
+        lowpass_const   r3  @ shared tail, also entered from mc31, mc13 and mc33 after their own push
+        mov             r11, sp  @ r11 remembers the unaligned sp, which points at the saved r0 and r1
+        bic             sp,  sp,  #15  @ align the stack to 16 bytes
+        sub             sp,  sp,  #256  @ 16x16 temporary for the horizontal pass
+        mov             r0,  sp
+        sub             r1,  r1,  #2
+        mov             r3,  #16  @ temporary rows are 16 bytes apart
+        vpush           {d8-d15}  @ 64 bytes, so the temporary now sits at sp + 64
+        bl              put_h264_qpel16_h_lowpass_neon
+        ldrd            r0,  [r11]  @ reload the saved r0 and r1
+        mov             r3,  r2  @ src and dst stride for the vertical pass
+        add             ip,  sp,  #64  @ second source is the horizontal result in the temporary
+        sub             r1,  r1,  r2, lsl #1  @ the filter window starts two rows above
+        mov             r2,  #16  @ stride of the temporary
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
+        vpop            {d8-d15}
+        add             sp,  r11, #8  @ drop the temporary and the saved r0 and r1
+        pop             {r4, r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc21_neon, export=1
+        push            {r0, r1, r4-r5, r9-r11, lr}  @ r4, r9 and r10 are clobbered by the helpers called below
+\type\()_h264_qpel16_mc21:
+        lowpass_const   r3  @ shared tail, also entered from mc23 after its own push
+        mov             r11, sp
+        bic             sp,  sp,  #15
+        sub             sp,  sp,  #(16*16+16*12)  @ 256-byte packed temporary followed by 192 bytes of hv scratch
+        sub             r1,  r1,  #2
+        mov             r0,  sp
+        vpush           {d8-d15}
+        bl              put_h264_qpel16_h_lowpass_neon_packed  @ writes the temporary as packed 8-wide blocks, sets r3 and the row count itself
+        mov             r4,  r0  @ r0 ended 256 bytes past the temporary, which is where the hv scratch starts
+        ldrd            r0,  [r11]  @ reload the saved r0 and r1
+        sub             r1,  r1,  r2, lsl #1  @ hv window starts two rows above and two pixels to the left
+        sub             r1,  r1,  #2
+        mov             r3,  r2  @ src and dst stride for the hv pass
+        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon  @ derives its second source pointer from r4
+        vpop            {d8-d15}
+        add             sp,  r11,  #8  @ drop the buffers and the saved r0 and r1
+        pop             {r4-r5, r9-r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc31_neon, export=1
+        add             r1,  r1,  #1  @ the saved r1 (used for the vertical pass in mc11) is src + 1
+        push            {r0, r1, r4, r11, lr}
+        sub             r1,  r1,  #1  @ the horizontal pass still starts from the original src
+        b               \type\()_h264_qpel16_mc11
+endfunc
+
+function ff_\type\()_h264_qpel16_mc02_neon, export=1
+        push            {r4, lr}  @ r4 is clobbered by the 16-wide vertical helper
+        lowpass_const   r3
+        sub             r1,  r1,  r2, lsl #1  @ the filter window starts two rows above
+        mov             r3,  r2  @ the callee takes the src stride in r3 and the dst stride in r2
+        vpush           {d8-d15}
+        bl              \type\()_h264_qpel16_v_lowpass_neon
+        vpop            {d8-d15}
+        pop             {r4, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc12_neon, export=1
+        push            {r0, r1, r4-r5, r9-r11, lr}
+\type\()_h264_qpel16_mc12:
+        lowpass_const   r3  @ shared tail, also entered from mc32 after its own push
+        mov             r11, sp
+        bic             sp,  sp,  #15
+        sub             sp,  sp,  #(16*16+16*12)  @ 256-byte packed temporary followed by 192 bytes of hv scratch
+        sub             r1,  r1,  r2, lsl #1  @ the filter window starts two rows above
+        mov             r0,  sp
+        mov             r3,  r2  @ src stride, r3 keeps the stride for the rest of the function
+        vpush           {d8-d15}
+        bl              put_h264_qpel16_v_lowpass_neon_packed  @ writes the temporary as packed 8x8 blocks, sets r2 to 8 itself
+        mov             r4,  r0  @ r0 ended 256 bytes past the temporary, which is where the hv scratch starts
+        ldrd            r0,  [r11]  @ reload the saved r0 and r1
+        sub             r1,  r1,  r3, lsl #1  @ hv window starts two rows above and two pixels to the left
+        sub             r1,  r1,  #2
+        mov             r2,  r3  @ NOTE(review): the callee overwrites r2 on entry (r2 = r4 - 256), so this looks redundant
+        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
+        vpop            {d8-d15}
+        add             sp,  r11,  #8
+        pop             {r4-r5, r9-r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc22_neon, export=1
+        push            {r4, r9-r11, lr}
+        lowpass_const   r3
+        mov             r11, sp
+        bic             sp,  sp,  #15
+        sub             r1,  r1,  r2, lsl #1  @ hv window starts two rows above and two pixels to the left
+        sub             r1,  r1,  #2
+        mov             r3,  r2  @ the callee takes the src stride in r3 and the dst stride in r2
+        sub             sp,  sp,  #(16*12)  @ 192 bytes of hv scratch
+        mov             r4,  sp
+        vpush           {d8-d15}
+        bl              \type\()_h264_qpel16_hv_lowpass_neon
+        vpop            {d8-d15}
+        mov             sp,  r11  @ nothing extra was pushed, so sp is restored as is
+        pop             {r4, r9-r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc32_neon, export=1
+        push            {r0, r1, r4-r5, r9-r11, lr}  @ the saved r1 (used for the hv pass in mc12) is the original src
+        add             r1,  r1,  #1  @ the vertical pass runs one pixel to the right
+        b               \type\()_h264_qpel16_mc12
+endfunc
+
+function ff_\type\()_h264_qpel16_mc03_neon, export=1
+        push            {r4, lr}
+        add             ip,  r1,  r2  @ second source is src one row further down
+        b               \type\()_h264_qpel16_mc01
+endfunc
+
+function ff_\type\()_h264_qpel16_mc13_neon, export=1
+        push            {r0, r1, r4, r11, lr}  @ the saved r1 (used for the vertical pass in mc11) is the original src
+        add             r1,  r1,  r2  @ the horizontal pass runs one row further down
+        b               \type\()_h264_qpel16_mc11
+endfunc
+
+function ff_\type\()_h264_qpel16_mc23_neon, export=1
+        push            {r0, r1, r4-r5, r9-r11, lr}  @ the saved r1 (used for the hv pass in mc21) is the original src
+        add             r1,  r1,  r2  @ the horizontal pass runs one row further down
+        b               \type\()_h264_qpel16_mc21
+endfunc
+
+function ff_\type\()_h264_qpel16_mc33_neon, export=1
+        add             r1,  r1,  #1  @ the saved r1 (used for the vertical pass in mc11) is src + 1
+        push            {r0, r1, r4, r11, lr}
+        add             r1,  r1,  r2  @ the horizontal pass runs from the original column, one row further down
+        sub             r1,  r1,  #1
+        b               \type\()_h264_qpel16_mc11
+endfunc
+        .endm
+
+        h264_qpel16 put
+        h264_qpel16 avg
+
+@ Biweighted prediction
+
+        .macro  biweight_16 macs, macd  @ 16-wide weighted sum of the rows at r0 and r1, macd and macs are the accumulate mnemonics applied to each
+        vdup.8          d0,  r4  @ weight applied to the r0 rows, one copy per byte lane
+        vdup.8          d1,  r5  @ weight applied to the r1 rows
+        vmov            q2,  q8  @ accumulators start from q8, which the caller must have set up (presumably the rounding offset)
+        vmov            q3,  q8
+1:      subs            ip,  ip,  #2  @ two rows per iteration, flags are consumed by the bne below
+        vld1.8          {d20-d21},[r0,:128], r2  @ row from r0, both pointers use stride r2
+        \macd           q2,  d0,  d20
+        pld             [r0]
+        \macd           q3,  d0,  d21
+        vld1.8          {d22-d23},[r1,:128], r2  @ matching row from r1
+        \macs           q2,  d1,  d22
+        pld             [r1]
+        \macs           q3,  d1,  d23
+        vmov            q12, q8  @ fresh accumulators for the second row
+        vld1.8          {d28-d29},[r0,:128], r2
+        vmov            q13, q8
+        \macd           q12, d0,  d28
+        pld             [r0]
+        \macd           q13, d0,  d29
+        vld1.8          {d30-d31},[r1,:128], r2
+        \macs           q12, d1,  d30
+        pld             [r1]
+        \macs           q13, d1,  d31
+        vshl.s16        q2,  q2,  q9  @ per-lane shift by q9, set up by the caller (presumably negative counts, i.e. a right shift)
+        vshl.s16        q3,  q3,  q9
+        vqmovun.s16     d4,  q2  @ saturate to unsigned 8 bit
+        vqmovun.s16     d5,  q3
+        vshl.s16        q12, q12, q9
+        vshl.s16        q13, q13, q9
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d25, q13
+        vmov            q3,  q8  @ reset the accumulators for the next iteration
+        vst1.8          {d4- d5}, [r6,:128], r2  @ results are written through r6
+        vmov            q2,  q8
+        vst1.8          {d24-d25},[r6,:128], r2
+        bne             1b
+        pop             {r4-r6, pc}  @ returns directly, the matching push is expected in the code that invokes this macro
+        .endm
+
+        .macro  biweight_8 macs, macd  @ 8-wide weighted sum of the rows at r0 and r1, macd and macs are the accumulate mnemonics applied to each
+        vdup.8          d0,  r4  @ weight applied to the r0 rows, one copy per byte lane
+        vdup.8          d1,  r5  @ weight applied to the r1 rows
+        vmov            q1,  q8  @ accumulators start from q8, which the caller must have set up (presumably the rounding offset)
+        vmov            q10, q8
+1:      subs            ip,  ip,  #2  @ two rows per iteration, flags are consumed by the bne below
+        vld1.8          {d4},[r0,:64], r2  @ row from r0, both pointers use stride r2
+        \macd           q1,  d0,  d4
+        pld             [r0]
+        vld1.8          {d5},[r1,:64], r2  @ matching row from r1
+        \macs           q1,  d1,  d5
+        pld             [r1]
+        vld1.8          {d6},[r0,:64], r2  @ second row pair accumulates into q10
+        \macd           q10, d0,  d6
+        pld             [r0]
+        vld1.8          {d7},[r1,:64], r2
+        \macs           q10, d1,  d7
+        pld             [r1]
+        vshl.s16        q1,  q1,  q9  @ per-lane shift by q9, set up by the caller (presumably negative counts, i.e. a right shift)
+        vqmovun.s16     d2,  q1  @ saturate to unsigned 8 bit
+        vshl.s16        q10, q10, q9
+        vqmovun.s16     d4,  q10
+        vmov            q10, q8  @ reset the accumulators for the next iteration
+        vst1.8          {d2},[r6,:64], r2  @ results are written through r6
+        vmov            q1,  q8
+        vst1.8          {d4},[r6,:64], r2
+        bne             1b
+        pop             {r4-r6, pc}  @ returns directly, the matching push is expected in the code that invokes this macro
+        .endm
+
+        .macro  biweight_4 macs, macd
+        vdup.8          d0,  r4                 @ 4-byte-wide loop; two rows are packed into each d register
+        vdup.8          d1,  r5
+        vmov            q1,  q8                 @ both accumulators start from q8
+        vmov            q10, q8
+1:      subs            ip,  ip,  #4            @ up to four rows per pass; these flags feed both blt and bne below
+        vld1.32         {d4[0]},[r0,:32], r2    @ two consecutive rows via r0 into the two words of d4
+        vld1.32         {d4[1]},[r0,:32], r2
+        \macd           q1,  d0,  d4
+        pld             [r0]
+        vld1.32         {d5[0]},[r1,:32], r2    @ two consecutive rows via r1 into d5
+        vld1.32         {d5[1]},[r1,:32], r2
+        \macs           q1,  d1,  d5
+        pld             [r1]
+        blt             2f                      @ fewer than four rows were left: finish these two rows at 2:
+        vld1.32         {d6[0]},[r0,:32], r2
+        vld1.32         {d6[1]},[r0,:32], r2
+        \macd           q10, d0,  d6
+        pld             [r0]
+        vld1.32         {d7[0]},[r1,:32], r2
+        vld1.32         {d7[1]},[r1,:32], r2
+        \macs           q10, d1,  d7
+        pld             [r1]
+        vshl.s16        q1,  q1,  q9            @ shift by the per-lane counts held in q9
+        vqmovun.s16     d2,  q1                 @ saturating narrow to unsigned bytes
+        vshl.s16        q10, q10, q9
+        vqmovun.s16     d4,  q10
+        vmov            q10, q8                 @ re-seed the accumulators for the next pass
+        vst1.32         {d2[0]},[r6,:32], r2    @ four rows written through r6, one word each
+        vst1.32         {d2[1]},[r6,:32], r2
+        vmov            q1,  q8
+        vst1.32         {d4[0]},[r6,:32], r2
+        vst1.32         {d4[1]},[r6,:32], r2
+        bne             1b
+        pop             {r4-r6, pc}
+2:      vshl.s16        q1,  q1,  q9            @ tail: only the first two rows of this pass exist
+        vqmovun.s16     d2,  q1
+        vst1.32         {d2[0]},[r6,:32], r2
+        vst1.32         {d2[1]},[r6,:32], r2
+        pop             {r4-r6, pc}
+        .endm
+
+        .macro  biweight_func w
+function biweight_h264_pixels_\w\()_neon
+        push            {r4-r6, lr}             @ shared body for one width; ip must already hold the row count
+        add             r4,  sp,  #16           @ step over the four words just pushed
+        ldm             r4,  {r4-r6}            @ r4, r5, r6 = the three stack words that follow
+        lsr             lr,  r4,  #31           @ lr = sign bit of r4
+        add             r6,  r6,  #1
+        eors            lr,  lr,  r5,  lsr #30  @ fold in the top two bits of r5; the Z flag set here drives the first beq
+        orr             r6,  r6,  #1            @ r6 = (r6 + 1) | 1 ...
+        vdup.16         q9,  r3
+        lsl             r6,  r6,  r3            @ ... shifted left by r3
+        vmvn            q9,  q9                 @ q9 lanes = ~r3 = -(r3 + 1); vshl.s16 with a negative count shifts right
+        vdup.16         q8,  r6                 @ q8 = starting value of every accumulator lane
+        mov             r6,  r0                 @ write pointer starts where r0 starts, so results replace the rows read via r0
+        beq             10f                     @ lr == 0
+        subs            lr,  lr,  #1
+        beq             20f                     @ lr == 1
+        subs            lr,  lr,  #1
+        beq             30f                     @ lr == 2
+        b               40f
+10:     biweight_\w     vmlal.u8, vmlal.u8
+20:     rsb             r4,  r4,  #0            @ negate r4; its products are subtracted instead of added
+        biweight_\w     vmlal.u8, vmlsl.u8
+30:     rsb             r4,  r4,  #0            @ negate both multipliers; both products are subtracted
+        rsb             r5,  r5,  #0
+        biweight_\w     vmlsl.u8, vmlsl.u8
+40:     rsb             r5,  r5,  #0            @ negate r5; its products are subtracted instead of added
+        biweight_\w     vmlsl.u8, vmlal.u8
+endfunc
+        .endm
+
+        .macro  biweight_entry w, h, b=1
+function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
+        mov             ip,  #\h                @ ip = row count for the loop macros; with b=0 no branch is emitted after this
+.if \b
+        b               biweight_h264_pixels_\w\()_neon
+.endif
+endfunc
+        .endm
+
+        biweight_entry  16, 8
+        biweight_entry  16, 16, b=0
+        biweight_func   16
+@ In every group the b=0 entry is emitted directly ahead of the shared body, so it runs straight into it.
+        biweight_entry  8,  16
+        biweight_entry  8,  4
+        biweight_entry  8,  8,  b=0
+        biweight_func   8
+@ The 4x2 entry relies on the two-row tail (label 2) of biweight_4.
+        biweight_entry  4,  8
+        biweight_entry  4,  2
+        biweight_entry  4,  4,  b=0
+        biweight_func   4
+
+@ Weighted prediction
+
+        .macro  weight_16 add
+        vdup.8          d0,  r3                 @ 16-byte-wide loop; d0 = low byte of r3 in every lane (the multiplier)
+1:      subs            ip,  ip,  #2            @ two rows per pass; flags reach the bne below unchanged
+        vld1.8          {d20-d21},[r0,:128], r1 @ row via r0, then r0 += r1
+        vmull.u8        q2,  d0,  d20
+        pld             [r0]
+        vmull.u8        q3,  d0,  d21
+        vld1.8          {d28-d29},[r0,:128], r1
+        vmull.u8        q12, d0,  d28
+        pld             [r0]
+        vmull.u8        q13, d0,  d29
+        \add            q2,  q8,  q2
+        vrshl.s16       q2,  q2,  q9            @ rounding shift by the per-lane counts held in q9
+        \add            q3,  q8,  q3
+        vrshl.s16       q3,  q3,  q9
+        vqmovun.s16     d4,  q2                 @ saturating narrow to unsigned bytes
+        vqmovun.s16     d5,  q3
+        \add            q12, q8,  q12
+        vrshl.s16       q12, q12, q9
+        \add            q13, q8,  q13
+        vrshl.s16       q13, q13, q9
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d25, q13
+        vst1.8          {d4- d5}, [r4,:128], r1 @ written back through r4, then r4 += r1
+        vst1.8          {d24-d25},[r4,:128], r1
+        bne             1b
+        pop             {r4, pc}                @ the macro returns on behalf of the function that expands it
+        .endm
+
+        .macro  weight_8 add
+        vdup.8          d0,  r3                 @ 8-byte-wide loop; d0 = low byte of r3 in every lane (the multiplier)
+1:      subs            ip,  ip,  #2            @ two rows per pass; flags reach the bne below unchanged
+        vld1.8          {d4},[r0,:64], r1       @ row via r0, then r0 += r1
+        vmull.u8        q1,  d0,  d4
+        pld             [r0]
+        vld1.8          {d6},[r0,:64], r1
+        vmull.u8        q10, d0,  d6
+        \add            q1,  q8,  q1
+        pld             [r0]
+        vrshl.s16       q1,  q1,  q9            @ rounding shift by the per-lane counts held in q9
+        vqmovun.s16     d2,  q1                 @ saturating narrow to unsigned bytes
+        \add            q10, q8,  q10
+        vrshl.s16       q10, q10, q9
+        vqmovun.s16     d4,  q10
+        vst1.8          {d2},[r4,:64], r1       @ written back through r4, then r4 += r1
+        vst1.8          {d4},[r4,:64], r1
+        bne             1b
+        pop             {r4, pc}                @ the macro returns on behalf of the function that expands it
+        .endm
+
+        .macro  weight_4 add
+        vdup.8          d0,  r3                 @ 4-byte-wide loop; d0 = low byte of r3 in every lane (the multiplier)
+        vmov            q1,  q8                 @ NOTE(review): q1 and q10 are fully overwritten by the vmull below before being read
+        vmov            q10, q8
+1:      subs            ip,  ip,  #4            @ up to four rows per pass; these flags feed both blt and bne below
+        vld1.32         {d4[0]},[r0,:32], r1    @ two consecutive rows via r0 into the two words of d4
+        vld1.32         {d4[1]},[r0,:32], r1
+        vmull.u8        q1,  d0,  d4
+        pld             [r0]
+        blt             2f                      @ fewer than four rows were left: finish these two rows at 2:
+        vld1.32         {d6[0]},[r0,:32], r1
+        vld1.32         {d6[1]},[r0,:32], r1
+        vmull.u8        q10, d0,  d6
+        pld             [r0]
+        \add            q1,  q8,  q1
+        vrshl.s16       q1,  q1,  q9            @ rounding shift by the per-lane counts held in q9
+        vqmovun.s16     d2,  q1                 @ saturating narrow to unsigned bytes
+        \add            q10, q8,  q10
+        vrshl.s16       q10, q10, q9
+        vqmovun.s16     d4,  q10
+        vmov            q10, q8
+        vst1.32         {d2[0]},[r4,:32], r1    @ four rows written back through r4, one word each
+        vst1.32         {d2[1]},[r4,:32], r1
+        vmov            q1,  q8
+        vst1.32         {d4[0]},[r4,:32], r1
+        vst1.32         {d4[1]},[r4,:32], r1
+        bne             1b
+        pop             {r4, pc}
+2:      \add            q1,  q8,  q1
+        vrshl.s16       q1,  q1,  q9            @ tail: only the first two rows of this pass exist
+        vqmovun.s16     d2,  q1
+        vst1.32         {d2[0]},[r4,:32], r1
+        vst1.32         {d2[1]},[r4,:32], r1
+        pop             {r4, pc}
+        .endm
+
+        .macro  weight_func w
+function weight_h264_pixels_\w\()_neon
+        push            {r4, lr}                @ shared body for one width; ip must already hold the row count
+        ldr             r4,  [sp, #8]           @ r4 = first stack word above the two registers just pushed
+        cmp             r2,  #1                 @ these flags are consumed by the ble below
+        lsl             r4,  r4,  r2
+        vdup.16         q8,  r4                 @ q8 lanes = (stack word) << r2
+        mov             r4,  r0                 @ write pointer starts where the read pointer r0 starts
+        ble             20f                     @ r2 <= 1: use the plain add/sub variants
+        rsb             lr,  r2,  #1
+        vdup.16         q9,  lr                 @ q9 lanes = 1 - r2; the halving add/sub already removes one bit
+        cmp             r3,  #0
+        blt             10f                     @ negative r3: negate it and subtract the products instead
+        weight_\w       vhadd.s16
+10:     rsb             r3,  r3,  #0
+        weight_\w       vhsub.s16
+20:     rsb             lr,  r2,  #0
+        vdup.16         q9,  lr                 @ q9 lanes = -r2
+        cmp             r3,  #0
+        blt             10f                     @ resolves to the second local label 10, just below
+        weight_\w       vadd.s16
+10:     rsb             r3,  r3,  #0
+        weight_\w       vsub.s16
+endfunc
+        .endm
+
+        .macro  weight_entry w, h, b=1
+function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
+        mov             ip,  #\h                @ ip = row count for the loop macros; with b=0 no branch is emitted after this
+.if \b
+        b               weight_h264_pixels_\w\()_neon
+.endif
+endfunc
+        .endm
+
+        weight_entry    16, 8
+        weight_entry    16, 16, b=0
+        weight_func     16
+@ In every group the b=0 entry is emitted directly ahead of the shared body, so it runs straight into it.
+        weight_entry    8,  16
+        weight_entry    8,  4
+        weight_entry    8,  8,  b=0
+        weight_func     8
+@ The 4x2 entry relies on the two-row tail (label 2) of weight_4.
+        weight_entry    4,  8
+        weight_entry    4,  2
+        weight_entry    4,  4,  b=0
+        weight_func     4
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264idct_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/h264idct_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        preserve8
+        .text
+
+function ff_h264_idct_add_neon, export=1
+        vld1.64         {d0-d3},  [r1,:128]     @ d0-d3 = the 16 halfwords at r1, four per register
+        @ First 1-D pass across the four registers.
+        vswp            d1,  d2
+        vadd.i16        d4,  d0,  d1
+        vshr.s16        q8,  q1,  #1            @ d16/d17 = halved copies of d2/d3
+        vsub.i16        d5,  d0,  d1
+        vadd.i16        d6,  d2,  d17
+        vsub.i16        d7,  d16, d3
+        vadd.i16        q0,  q2,  q3
+        vsub.i16        q1,  q2,  q3
+        @ Transpose the 4x4 halfword matrix held in d0, d1, d3, d2.
+        vtrn.16         d0,  d1
+        vtrn.16         d3,  d2
+        vtrn.32         d0,  d3
+        vtrn.32         d1,  d2
+        @ Second 1-D pass, interleaved with loading four 4-byte rows via r0 (stride r2).
+        vadd.i16        d4,  d0,  d3
+        vld1.32         {d18[0]}, [r0,:32], r2  @ rows land in d18[0], d19[1], d18[1], d19[0]
+        vswp            d1,  d3
+        vshr.s16        q8,  q1,  #1
+        vld1.32         {d19[1]}, [r0,:32], r2
+        vsub.i16        d5,  d0,  d1
+        vld1.32         {d18[1]}, [r0,:32], r2
+        vadd.i16        d6,  d16, d3
+        vld1.32         {d19[0]}, [r0,:32], r2
+        vsub.i16        d7,  d2,  d17
+        sub             r0,  r0,  r2, lsl #2    @ rewind r0 by the four rows just read
+        vadd.i16        q0,  q2,  q3
+        vsub.i16        q1,  q2,  q3
+
+        vrshr.s16       q0,  q0,  #6            @ rounding shift right by 6
+        vrshr.s16       q1,  q1,  #6
+
+        vaddw.u8        q0,  q0,  d18           @ add the loaded bytes, widened to halfwords
+        vaddw.u8        q1,  q1,  d19
+
+        vqmovun.s16     d0,  q0                 @ saturating narrow to unsigned bytes
+        vqmovun.s16     d1,  q1
+
+        vst1.32         {d0[0]},  [r0,:32], r2  @ stores use the same lane order as the loads above
+        vst1.32         {d1[1]},  [r0,:32], r2
+        vst1.32         {d0[1]},  [r0,:32], r2
+        vst1.32         {d1[0]},  [r0,:32], r2
+
+        bx              lr
+endfunc
+
+function ff_h264_idct_dc_add_neon, export=1
+        vld1.16         {d2[],d3[]}, [r1,:16]   @ replicate the single halfword at r1 into every lane of q1
+        vrshr.s16       q1,  q1,  #6            @ rounding shift right by 6
+        vld1.32         {d0[0]},  [r0,:32], r2  @ four rows of four bytes via r0, stride r2
+        vld1.32         {d0[1]},  [r0,:32], r2
+        vaddw.u8        q2,  q1,  d0            @ rows 0-1 plus the replicated value
+        vld1.32         {d1[0]},  [r0,:32], r2
+        vld1.32         {d1[1]},  [r0,:32], r2
+        vaddw.u8        q1,  q1,  d1            @ rows 2-3 plus the replicated value
+        vqmovun.s16     d0,  q2                 @ saturating narrow to unsigned bytes
+        vqmovun.s16     d1,  q1
+        sub             r0,  r0,  r2, lsl #2    @ rewind r0 by the four rows just read
+        vst1.32         {d0[0]},  [r0,:32], r2
+        vst1.32         {d0[1]},  [r0,:32], r2
+        vst1.32         {d1[0]},  [r0,:32], r2
+        vst1.32         {d1[1]},  [r0,:32], r2
+        bx              lr
+endfunc
+
+function ff_h264_idct_add16_neon, export=1
+        push            {r4-r8,lr}
+        mov             r4,  r0                 @ r4 = base pointer added to every per-block offset
+        mov             r5,  r1                 @ r5 walks a table of 32-bit offsets
+        mov             r1,  r2                 @ r1 walks the coefficients, 32 bytes per block
+        mov             r2,  r3                 @ r2 = stride, as the two callees expect
+        ldr             r6,  [sp, #24]          @ first stack argument: byte table indexed by scan8 values
+        movrel          r7,  scan8
+        mov             ip,  #16                @ 16 blocks; the callees leave ip untouched
+1:      ldrb            r8,  [r7], #1
+        ldr             r0,  [r5], #4
+        ldrb            r8,  [r6, r8]           @ r8 = table byte for this block
+        subs            r8,  r8,  #1
+        blt             2f                      @ table byte was 0: nothing to do for this block
+        ldrsh           lr,  [r1]               @ lr = first coefficient (flags from subs are still live)
+        add             r0,  r0,  r4
+        movne           lr,  #0                 @ table byte was not 1: force the full-transform choice
+        cmp             lr,  #0
+        adrne           lr,  ff_h264_idct_dc_add_neon   @ byte == 1 and first coefficient nonzero
+        adreq           lr,  ff_h264_idct_add_neon
+        blx             lr
+2:      subs            ip,  ip,  #1
+        add             r1,  r1,  #32
+        bne             1b
+        pop             {r4-r8,pc}
+endfunc
+
+function ff_h264_idct_add16intra_neon, export=1
+        push            {r4-r8,lr}
+        mov             r4,  r0                 @ r4 = base pointer added to every per-block offset
+        mov             r5,  r1                 @ r5 walks a table of 32-bit offsets
+        mov             r1,  r2                 @ r1 walks the coefficients, 32 bytes per block
+        mov             r2,  r3                 @ r2 = stride, as the two callees expect
+        ldr             r6,  [sp, #24]          @ first stack argument: byte table indexed by scan8 values
+        movrel          r7,  scan8
+        mov             ip,  #16                @ 16 blocks; the callees leave ip untouched
+1:      ldrb            r8,  [r7], #1
+        ldr             r0,  [r5], #4
+        ldrb            r8,  [r6, r8]           @ r8 = table byte for this block
+        add             r0,  r0,  r4
+        cmp             r8,  #0
+        ldrsh           r8,  [r1]               @ r8 = first coefficient; the load leaves the flags alone
+        adrne           lr,  ff_h264_idct_add_neon      @ table byte nonzero: full transform
+        adreq           lr,  ff_h264_idct_dc_add_neon   @ table byte zero: DC-only routine ...
+        cmpeq           r8,  #0                 @ ... but only if the first coefficient is nonzero
+        blxne           lr
+        subs            ip,  ip,  #1
+        add             r1,  r1,  #32
+        bne             1b
+        pop             {r4-r8,pc}
+endfunc
+
+function ff_h264_idct_add8_neon, export=1
+        push            {r4-r10,lr}
+        ldm             r0,  {r4,r9}            @ r4, r9 = the two base pointers stored at r0
+        add             r5,  r1,  #16*4         @ start at word 16 of the offset table passed in r1
+        add             r1,  r2,  #16*32        @ start 16 coefficient blocks (32 bytes each) into r2
+        mov             r2,  r3                 @ r2 = stride, as the two callees expect
+        ldr             r6,  [sp, #32]          @ first stack argument: byte table indexed by scan8 values
+        movrel          r7,  scan8+16
+        mov             ip,  #7                 @ count 7..0 so that bit 2 of ip is set for exactly the first four blocks
+1:      ldrb            r8,  [r7], #1
+        ldr             r0,  [r5], #4
+        ldrb            r8,  [r6, r8]           @ r8 = table byte for this block
+        tst             ip,  #4
+        addne           r0,  r0,  r4            @ blocks 0-3 (ip = 7..4) use the first base pointer
+        addeq           r0,  r0,  r9            @ blocks 4-7 (ip = 3..0) use the second base pointer
+        cmp             r8,  #0
+        ldrsh           r8,  [r1]               @ r8 = first coefficient; the load leaves the flags alone
+        adrne           lr,  ff_h264_idct_add_neon      @ table byte nonzero: full transform
+        adreq           lr,  ff_h264_idct_dc_add_neon   @ table byte zero: DC-only routine ...
+        cmpeq           r8,  #0                 @ ... but only if the first coefficient is nonzero
+        blxne           lr
+        subs            ip,  ip,  #1
+        add             r1,  r1,  #32
+        bge             1b                      @ eight passes: stop once ip has gone below zero
+        pop             {r4-r10,pc}
+endfunc
+
+        .section .rodata
+scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8      @ entries 0-15: walked by idct_add16 and idct_add16intra
+        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8
+        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8
+        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8
+        .byte           1+1*8, 2+1*8                    @ entries 16-23: walked by idct_add8 through scan8+16
+        .byte           1+2*8, 2+2*8
+        .byte           1+4*8, 2+4*8
+        .byte           1+5*8, 2+5*8
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264pred_init_arm.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/h264pred_init_arm.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/h264pred.h"
+
+void ff_pred16x16_vert_neon(uint8_t *src, int stride);    /* 16x16 routines from h264pred_neon.S */
+void ff_pred16x16_hor_neon(uint8_t *src, int stride);
+void ff_pred16x16_plane_neon(uint8_t *src, int stride);
+void ff_pred16x16_dc_neon(uint8_t *src, int stride);
+void ff_pred16x16_128_dc_neon(uint8_t *src, int stride);
+void ff_pred16x16_left_dc_neon(uint8_t *src, int stride);
+void ff_pred16x16_top_dc_neon(uint8_t *src, int stride);
+
+void ff_pred8x8_vert_neon(uint8_t *src, int stride);      /* 8x8 routines from h264pred_neon.S */
+void ff_pred8x8_hor_neon(uint8_t *src, int stride);
+void ff_pred8x8_plane_neon(uint8_t *src, int stride);
+void ff_pred8x8_dc_neon(uint8_t *src, int stride);
+void ff_pred8x8_128_dc_neon(uint8_t *src, int stride);
+void ff_pred8x8_left_dc_neon(uint8_t *src, int stride);
+void ff_pred8x8_top_dc_neon(uint8_t *src, int stride);
+void ff_pred8x8_l0t_dc_neon(uint8_t *src, int stride);
+void ff_pred8x8_0lt_dc_neon(uint8_t *src, int stride);
+void ff_pred8x8_l00_dc_neon(uint8_t *src, int stride);
+void ff_pred8x8_0l0_dc_neon(uint8_t *src, int stride);
+
+#if HAVE_NEON /* the NEON installer below is compiled only for NEON builds */
+static void ff_h264_pred_init_neon(H264PredContext *h)
+{
+    h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
+    h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
+    h->pred8x8[PLANE_PRED8x8    ] = ff_pred8x8_plane_neon;
+    h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
+    h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
+    h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
+    h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
+    h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
+    h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
+    h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
+    h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+    /* 16x16 table */
+    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
+    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
+    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
+    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
+    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
+    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
+    h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+}
+#endif
+/* Installs the ARM predictors into h; does nothing when built without NEON. */
+void ff_h264_pred_init_arm(H264PredContext *h)
+{
+#if HAVE_NEON /* same guard as the definition, so non-NEON builds never reference it */
+    ff_h264_pred_init_neon(h);
+#endif
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264pred_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/h264pred_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
+.if \n == 8 || \hi == 0
+        vld1.8          {\rd[0]}, [\rs], \rt    @ one byte per lane; the address register advances by rt after each load
+        vld1.8          {\rd[1]}, [\rs], \rt
+        vld1.8          {\rd[2]}, [\rs], \rt
+        vld1.8          {\rd[3]}, [\rs], \rt    @ lanes 0-3: emitted when n is 8 or hi is 0
+.endif
+.if \n == 8 || \hi == 1
+        vld1.8          {\rd[4]}, [\rs], \rt    @ lanes 4-7: emitted when n is 8 or hi is 1
+        vld1.8          {\rd[5]}, [\rs], \rt
+        vld1.8          {\rd[6]}, [\rs], \rt
+        vld1.8          {\rd[7]}, [\rs], \rt
+.endif
+        .endm
+
+        .macro add16x8  dq,  dl,  dh,  rl,  rh
+        vaddl.u8        \dq, \rl, \rh           @ add the two 8-byte inputs, widened to halfwords
+        vadd.u16        \dl, \dl, \dh           @ fold the high half of the sums into the low half
+        vpadd.u16       \dl, \dl, \dl           @ two pairwise adds leave the grand total in every lane of dl
+        vpadd.u16       \dl, \dl, \dl
+        .endm
+
+function ff_pred16x16_128_dc_neon, export=1
+        vmov.i8         q0,  #128               @ fill value is the constant 128
+        b               .L_pred16x16_dc_end     @ shared store loop inside ff_pred16x16_dc_neon
+endfunc
+
+function ff_pred16x16_top_dc_neon, export=1
+        sub             r2,  r0,  r1            @ r2 = one stride before r0 (the row above)
+        vld1.8          {q0},     [r2,:128]     @ its 16 bytes
+        add16x8         q0,  d0,  d1,  d0,  d1
+        vrshrn.u16      d0,  q0,  #4            @ rounded divide of the sum by 16
+        vdup.8          q0,  d0[0]
+        b               .L_pred16x16_dc_end     @ shared store loop inside ff_pred16x16_dc_neon
+endfunc
+
+function ff_pred16x16_left_dc_neon, export=1
+        sub             r2,  r0,  #1            @ r2 = the byte just left of r0; the two loads below read 16 rows of it
+        ldcol.8         d0,  r2,  r1
+        ldcol.8         d1,  r2,  r1
+        add16x8         q0,  d0,  d1,  d0,  d1
+        vrshrn.u16      d0,  q0,  #4            @ rounded divide of the sum by 16
+        vdup.8          q0,  d0[0]
+        b               .L_pred16x16_dc_end     @ shared store loop inside ff_pred16x16_dc_neon
+endfunc
+
+function ff_pred16x16_dc_neon, export=1
+        sub             r2,  r0,  r1            @ r2 = one stride before r0 (the row above)
+        vld1.8          {q0},     [r2,:128]     @ q0 = its 16 bytes
+        sub             r2,  r0,  #1            @ r2 = the byte just left of r0; d2/d3 collect 16 rows of it
+        ldcol.8         d2,  r2,  r1
+        ldcol.8         d3,  r2,  r1
+        vaddl.u8        q0,  d0,  d1
+        vaddl.u8        q1,  d2,  d3
+        vadd.u16        q0,  q0,  q1            @ combine the row sums with the column sums
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0,  d0
+        vpadd.u16       d0,  d0,  d0            @ every lane of d0 = sum of all 32 bytes
+        vrshrn.u16      d0,  q0,  #5            @ rounded divide by 32
+        vdup.8          q0,  d0[0]
+.L_pred16x16_dc_end:
+        mov             r3,  #8                 @ 8 passes of two rows: q0 is written to 16 rows at r0, stride r1
+6:      vst1.8          {q0},     [r0,:128], r1
+        vst1.8          {q0},     [r0,:128], r1
+        subs            r3,  r3,  #1
+        bne             6b
+        bx              lr
+endfunc
+
+function ff_pred16x16_hor_neon, export=1
+        sub             r2,  r0,  #1            @ r2 = the byte just left of r0
+        mov             r3,  #16                @ 16 rows
+1:      vld1.8          {d0[],d1[]},[r2],      r1       @ replicate that byte across q0, then step r2 by the stride
+        vst1.8          {q0},       [r0,:128], r1       @ fill the 16-byte row at r0
+        subs            r3,  r3,  #1
+        bne             1b
+        bx              lr
+endfunc
+
+function ff_pred16x16_vert_neon, export=1
+        sub             r0,  r0,  r1            @ step back one stride to the row above
+        vld1.8          {q0},     [r0,:128], r1 @ q0 = that row; r0 is back at the first output row
+        mov             r3,  #8                 @ 8 passes of two rows
+1:      vst1.8          {q0},     [r0,:128], r1
+        vst1.8          {q0},     [r0,:128], r1
+        subs            r3,  r3,  #1
+        bne             1b
+        bx              lr
+endfunc
+
+function ff_pred16x16_plane_neon, export=1
+        sub             r3,  r0,  r1            @ r3 = one stride before r0 (the row above)
+        add             r2,  r3,  #8            @ r2 = byte 8 of that row
+        sub             r3,  r3,  #1            @ r3 = one byte left of that row
+        vld1.8          {d0},     [r3]          @ 8 bytes starting one byte left of the row above
+        vld1.8          {d2},     [r2,:64], r1  @ bytes 8-15 of the row above
+        ldcol.8         d1,  r3,  r1
+        add             r3,  r3,  r1            @ skip one row between the two column loads
+        ldcol.8         d3,  r3,  r1
+        vrev64.8        q0,  q0                 @ reverse the byte order inside d0 and inside d1
+        vaddl.u8        q8,  d2,  d3
+        vsubl.u8        q2,  d2,  d0            @ widened differences between opposite edge samples
+        vsubl.u8        q3,  d3,  d1
+        movrel          r3,  p16weight
+        vld1.8          {q0},     [r3,:128]     @ q0 = the eight halfword weights 1..8
+        vmul.s16        q2,  q2,  q0
+        vmul.s16        q3,  q3,  q0
+        vadd.i16        d4,  d4,  d5
+        vadd.i16        d5,  d6,  d7
+        vpadd.i16       d4,  d4,  d5
+        vpadd.i16       d4,  d4,  d4            @ weighted sums reduced to one value per direction
+        vshl.i16        d5,  d4,  #2
+        vaddl.s16       q2,  d4,  d5            @ sum + 4*sum = 5*sum, widened to 32 bits
+        vrshrn.s32      d4,  q2,  #6            @ rounded >> 6, narrowed back to halfwords
+        mov             r3,  #0
+        vtrn.16         d4,  d5
+        vadd.i16        d2,  d4,  d5
+        vshl.i16        d3,  d2,  #3
+        vrev64.16       d16, d17
+        vsub.i16        d3,  d3,  d2
+        vadd.i16        d16, d16, d0
+        vshl.i16        d2,  d16, #4
+        vsub.i16        d2,  d2,  d3
+        vshl.i16        d3,  d4,  #4
+        vext.16         q0,  q0,  q0,  #7       @ rotate the weight vector by one lane ...
+        vsub.i16        d6,  d5,  d3
+        vmov.16         d0[0], r3               @ ... and zero lane 0, giving 0..7
+        vmul.i16        q0,  q0,  d4[0]
+        vdup.16         q1,  d2[0]
+        vdup.16         q2,  d4[0]
+        vdup.16         q3,  d6[0]
+        vshl.i16        q2,  q2,  #3            @ q2 = step from the left eight pixels to the right eight
+        vadd.i16        q1,  q1,  q0            @ q1 = values for the left half of the first row
+        vadd.i16        q3,  q3,  q2
+        mov             r3,  #16                @ 16 output rows
+1:
+        vqshrun.s16     d0,  q1,  #5            @ left eight pixels: >> 5 with unsigned saturation
+        vadd.i16        q1,  q1,  q2
+        vqshrun.s16     d1,  q1,  #5            @ right eight pixels
+        vadd.i16        q1,  q1,  q3            @ advance to the left half of the next row
+        vst1.8          {q0},     [r0,:128], r1
+        subs            r3,  r3,  #1
+        bne             1b
+        bx              lr
+endfunc
+
+        .section        .rodata
+        .align          4                       @ the plane predictors load this table with a :128 alignment hint
+p16weight:
+        .short          1,2,3,4,5,6,7,8
+
+        .text
+
+function ff_pred8x8_hor_neon, export=1
+        sub             r2,  r0,  #1            @ r2 = the byte just left of r0
+        mov             r3,  #8                 @ 8 rows
+1:      vld1.8          {d0[]},   [r2],     r1  @ replicate that byte across d0, then step r2 by the stride
+        vst1.8          {d0},     [r0,:64], r1  @ fill the 8-byte row at r0
+        subs            r3,  r3,  #1
+        bne             1b
+        bx              lr
+endfunc
+
+function ff_pred8x8_vert_neon, export=1
+        sub             r0,  r0,  r1            @ step back one stride to the row above
+        vld1.8          {d0},     [r0,:64], r1  @ d0 = that row; r0 is back at the first output row
+        mov             r3,  #4                 @ 4 passes of two rows
+1:      vst1.8          {d0},     [r0,:64], r1
+        vst1.8          {d0},     [r0,:64], r1
+        subs            r3,  r3,  #1
+        bne             1b
+        bx              lr
+endfunc
+
+function ff_pred8x8_plane_neon, export=1
+        sub             r3,  r0,  r1            @ r3 = one stride before r0 (the row above)
+        add             r2,  r3,  #4            @ r2 = byte 4 of that row
+        sub             r3,  r3,  #1            @ r3 = one byte left of that row
+        vld1.32         {d0[0]},  [r3]          @ 4 bytes starting one byte left of the row above
+        vld1.32         {d2[0]},  [r2,:32], r1  @ bytes 4-7 of the row above
+        ldcol.8         d0,  r3,  r1,  4,  hi=1
+        add             r3,  r3,  r1            @ skip one row between the two column loads
+        ldcol.8         d3,  r3,  r1,  4
+        vaddl.u8        q8,  d2,  d3
+        vrev32.8        d0,  d0                 @ reverse the byte order inside each word of d0
+        vtrn.32         d2,  d3
+        vsubl.u8        q2,  d2,  d0            @ widened differences between opposite edge samples
+        movrel          r3,  p16weight
+        vld1.16         {q0},     [r3,:128]     @ q0 = the eight halfword weights; only d0 (1..4) is used as a multiplier
+        vmul.s16        d4,  d4,  d0
+        vmul.s16        d5,  d5,  d0
+        vpadd.i16       d4,  d4,  d5
+        vpaddl.s16      d4,  d4                 @ weighted sums reduced to one 32-bit value per direction
+        vshl.i32        d5,  d4,  #4
+        vadd.s32        d4,  d4,  d5            @ sum + 16*sum = 17*sum
+        vrshrn.s32      d4,  q2,  #5            @ rounded >> 5, narrowed to halfwords
+        mov             r3,  #0
+        vtrn.16         d4,  d5
+        vadd.i16        d2,  d4,  d5
+        vshl.i16        d3,  d2,  #2
+        vrev64.16       d16, d16
+        vsub.i16        d3,  d3,  d2
+        vadd.i16        d16, d16, d0
+        vshl.i16        d2,  d16, #4
+        vsub.i16        d2,  d2,  d3
+        vshl.i16        d3,  d4,  #3
+        vext.16         q0,  q0,  q0,  #7       @ rotate the weight vector by one lane ...
+        vsub.i16        d6,  d5,  d3
+        vmov.16         d0[0], r3               @ ... and zero lane 0, giving 0..7
+        vmul.i16        q0,  q0,  d4[0]
+        vdup.16         q1,  d2[0]
+        vdup.16         q2,  d4[0]
+        vdup.16         q3,  d6[0]
+        vshl.i16        q2,  q2,  #3
+        vadd.i16        q1,  q1,  q0            @ q1 = values for the first row
+        vadd.i16        q3,  q3,  q2            @ q3 = per-row increment applied in the loop
+        mov             r3,  #8                 @ 8 output rows
+1:
+        vqshrun.s16     d0,  q1,  #5            @ >> 5 with unsigned saturation
+        vadd.i16        q1,  q1,  q3
+        vst1.8          {d0},     [r0,:64], r1
+        subs            r3,  r3,  #1
+        bne             1b
+        bx              lr
+endfunc
+
+function ff_pred8x8_128_dc_neon, export=1
+        vmov.i8         q0,  #128               @ d0 and d1 both hold the constant 128
+        b               .L_pred8x8_dc_end       @ shared store loop inside ff_pred8x8_dc_neon
+endfunc
+
+function ff_pred8x8_top_dc_neon, export=1
+        sub             r2,  r0,  r1            @ r2 = one stride before r0 (the row above)
+        vld1.8          {d0},     [r2,:64]
+        vpaddl.u8       d0,  d0                 @ pairwise sums, widened to halfwords
+        vpadd.u16       d0,  d0,  d0            @ lane 0 = sum of bytes 0-3, lane 1 = sum of bytes 4-7
+        vrshrn.u16      d0,  q0,  #2            @ rounded divide by 4, narrowed to bytes
+        vdup.8          d1,  d0[1]
+        vdup.8          d0,  d0[0]
+        vtrn.32         d0,  d1                 @ d0 = d1 = left value x4 followed by right value x4
+        b               .L_pred8x8_dc_end       @ shared store loop inside ff_pred8x8_dc_neon
+endfunc
+
+function ff_pred8x8_left_dc_neon, export=1
+        sub             r2,  r0,  #1            @ r2 = the byte just left of r0; the load below reads 8 rows of it
+        ldcol.8         d0,  r2,  r1
+        vpaddl.u8       d0,  d0                 @ pairwise sums, widened to halfwords
+        vpadd.u16       d0,  d0,  d0            @ lane 0 = sum of rows 0-3, lane 1 = sum of rows 4-7
+        vrshrn.u16      d0,  q0,  #2            @ rounded divide by 4, narrowed to bytes
+        vdup.8          d1,  d0[1]              @ d1 (rows 4-7) = value from the lower four samples
+        vdup.8          d0,  d0[0]              @ d0 (rows 0-3) = value from the upper four samples
+        b               .L_pred8x8_dc_end       @ shared store loop inside ff_pred8x8_dc_neon
+endfunc
+
+function ff_pred8x8_dc_neon, export=1
+        sub             r2,  r0,  r1            @ r2 = one stride before r0 (the row above)
+        vld1.8          {d0},     [r2,:64]      @ d0 = its 8 bytes
+        sub             r2,  r0,  #1            @ r2 = the byte just left of r0; d1 collects 8 rows of it
+        ldcol.8         d1,  r2,  r1
+        vtrn.32         d0,  d1                 @ d0 = top 0-3 | left 0-3, d1 = top 4-7 | left 4-7
+        vpaddl.u8       q0,  q0
+        vpadd.u16       d0,  d0,  d1            @ d0 = four sums of four samples each
+        vpadd.u16       d1,  d0,  d0            @ d1 = pairwise sums of those (eight samples each)
+        vrshrn.u16      d2,  q0,  #3            @ rounded /8; bytes 4-5 hold the eight-sample averages
+        vrshrn.u16      d3,  q0,  #2            @ rounded /4; bytes 0-3 hold the four-sample averages
+        vdup.8          d0,  d2[4]
+        vdup.8          d1,  d3[3]
+        vdup.8          d4,  d3[2]
+        vdup.8          d5,  d2[5]
+        vtrn.32         q0,  q2                 @ d0 = upper-row pattern (left x4, right x4), d1 = lower-row pattern
+.L_pred8x8_dc_end:
+        mov             r3,  #4                 @ 4 passes: d0 goes to rows 0-3 via r0, d1 to rows 4-7 via r2
+        add             r2,  r0,  r1,  lsl #2
+6:      vst1.8          {d0},     [r0,:64], r1
+        vst1.8          {d1},     [r2,:64], r1
+        subs            r3,  r3,  #1
+        bne             6b
+        bx              lr
+endfunc
+
+function ff_pred8x8_l0t_dc_neon, export=1
+        sub             r2,  r0,  r1            @ r2 = one stride before r0 (the row above)
+        vld1.8          {d0},     [r2,:64]      @ d0 = its 8 bytes
+        sub             r2,  r0,  #1            @ r2 = the byte just left of r0; only rows 0-3 of it are loaded
+        ldcol.8         d1,  r2,  r1,  4
+        vtrn.32         d0,  d1                 @ d0 = top 0-3 | left 0-3, d1 = top 4-7 | unloaded lanes
+        vpaddl.u8       q0,  q0
+        vpadd.u16       d0,  d0,  d1
+        vpadd.u16       d1,  d0,  d0
+        vrshrn.u16      d2,  q0,  #3            @ rounded /8 variants
+        vrshrn.u16      d3,  q0,  #2            @ rounded /4 variants
+        vdup.8          d0,  d2[4]
+        vdup.8          d1,  d3[0]
+        vdup.8          q2,  d3[2]
+        vtrn.32         q0,  q2                 @ d0 = upper-row pattern, d1 = lower-row pattern
+        b               .L_pred8x8_dc_end       @ shared store loop inside ff_pred8x8_dc_neon
+endfunc
+
+function ff_pred8x8_l00_dc_neon, export=1
+        sub             r2,  r0,  #1            @ r2 = the byte just left of r0; only rows 0-3 of it are loaded
+        ldcol.8         d0,  r2,  r1,  4
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0,  d0            @ lane 0 = sum of the four loaded bytes
+        vrshrn.u16      d0,  q0,  #2            @ rounded divide by 4
+        vmov.i8         d1,  #128               @ rows 4-7 get the constant 128
+        vdup.8          d0,  d0[0]              @ rows 0-3 get the average
+        b               .L_pred8x8_dc_end       @ shared store loop inside ff_pred8x8_dc_neon
+endfunc
+
+function ff_pred8x8_0lt_dc_neon, export=1
+        sub             r2,  r0,  r1            @ r2 = one stride before r0 (the row above)
+        vld1.8          {d0},     [r2,:64]      @ d0 = its 8 bytes
+        add             r2,  r0,  r1,  lsl #2   @ r2 = four strides past r0 ...
+        sub             r2,  r2,  #1            @ ... one byte to the left: left column, rows 4-7
+        ldcol.8         d1,  r2,  r1,  4,  hi=1
+        vtrn.32         d0,  d1                 @ d0 = top 0-3 | unloaded lanes, d1 = top 4-7 | left 4-7
+        vpaddl.u8       q0,  q0
+        vpadd.u16       d0,  d0,  d1
+        vpadd.u16       d1,  d0,  d0
+        vrshrn.u16      d3,  q0,  #2            @ rounded /4 variants
+        vrshrn.u16      d2,  q0,  #3            @ rounded /8 variants
+        vdup.8          d0,  d3[0]
+        vdup.8          d1,  d3[3]
+        vdup.8          d4,  d3[2]
+        vdup.8          d5,  d2[5]
+        vtrn.32         q0,  q2                 @ d0 = upper-row pattern, d1 = lower-row pattern
+        b               .L_pred8x8_dc_end       @ shared store loop inside ff_pred8x8_dc_neon
+endfunc
+
+function ff_pred8x8_0l0_dc_neon, export=1
+        add             r2,  r0,  r1,  lsl #2   @ r2 = four strides past r0 ...
+        sub             r2,  r2,  #1            @ ... one byte to the left: left column, rows 4-7
+        ldcol.8         d1,  r2,  r1,  4
+        vpaddl.u8       d2,  d1
+        vpadd.u16       d2,  d2,  d2            @ lane 0 = sum of the four loaded bytes
+        vrshrn.u16      d1,  q1,  #2            @ rounded divide by 4
+        vmov.i8         d0,  #128               @ rows 0-3 get the constant 128
+        vdup.8          d1,  d1[0]              @ rows 4-7 get the average
+        b               .L_pred8x8_dc_end       @ shared store loop inside ff_pred8x8_dc_neon
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/int_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/int_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,118 @@
+/*
+ * ARM NEON optimised integer operations
+ * Copyright (c) 2009 Kostya Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        preserve8
+        .fpu neon
+        .text
+
+function ff_scalarproduct_int16_neon, export=1
+        vmov.i16        q0,  #0                 @ q0-q3 = sixteen 32-bit partial sums, cleared
+        vmov.i16        q1,  #0                 @ in: r0, r1 = int16 vectors (r1 loaded with a 128-bit alignment hint)
+        vmov.i16        q2,  #0                 @     r2 = element count, consumed 16 per iteration
+        vmov.i16        q3,  #0                 @     r3 = shift applied to each product
+        negs            r3,  r3                 @ vshl with a negative count shifts right
+        beq             2f                      @ shift == 0: plain multiply-accumulate loop
+        @ Shifted variant: every 32-bit product is shifted before it is accumulated.
+        vdup.s32        q15, r3                 @ q15 = -shift in every lane; never written inside the loop
+1:      vld1.16         {d16-d17}, [r0]!
+        vld1.16         {d20-d21}, [r1,:128]!
+        vmull.s16       q12, d16,  d20
+        vld1.16         {d18-d19}, [r0]!
+        vmull.s16       q13, d17,  d21
+        vld1.16         {d22-d23}, [r1,:128]!
+        vmull.s16       q14, d18,  d22
+        vmull.s16       q8,  d19,  d23          @ d16/d17 are dead by now, so q8 can take the 4th product
+        vshl.s32        q12, q12,  q15
+        vshl.s32        q13, q13,  q15
+        vadd.s32        q0,  q0,   q12
+        vshl.s32        q14, q14,  q15
+        vadd.s32        q1,  q1,   q13
+        vshl.s32        q8,  q8,   q15
+        vadd.s32        q2,  q2,   q14
+        vadd.s32        q3,  q3,   q8
+        subs            r2,  r2,   #16
+        bne             1b
+        b               3f
+        @ Unshifted variant: widening multiply-accumulate straight into q0-q3.
+2:      vld1.16         {d16-d17}, [r0]!
+        vld1.16         {d20-d21}, [r1,:128]!
+        vmlal.s16       q0,  d16,  d20
+        vld1.16         {d18-d19}, [r0]!
+        vmlal.s16       q1,  d17,  d21
+        vld1.16         {d22-d23}, [r1,:128]!
+        vmlal.s16       q2,  d18,  d22
+        vmlal.s16       q3,  d19,  d23
+        subs            r2,  r2,   #16
+        bne             2b
+        @ Horizontal reduction; uses only caller-saved scratch (d8-d15 must be preserved).
+3:      vpadd.s32       d16, d0,   d1
+        vpadd.s32       d17, d2,   d3
+        vpadd.s32       d18, d4,   d5
+        vpadd.s32       d19, d6,   d7
+        vpadd.s32       d0,  d16,  d17
+        vpadd.s32       d1,  d18,  d19
+        vpadd.s32       d2,  d0,   d1
+        vpaddl.s32      d3,  d2                 @ final pairwise add, widened to 64 bits
+        vmov.32         r0,  d3[0]              @ return the low 32 bits in r0
+        bx              lr
+endfunc
+
+@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
+function ff_scalarproduct_and_madd_int16_neon, export=1
+        vld1.16         {d28[],d29[]}, [sp]     @ q14 = 16-bit value at [sp] (5th argument) in every lane
+        vmov.i16        q0,  #0                 @ q0-q3 = sixteen 32-bit partial sums, cleared
+        vmov.i16        q1,  #0
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+        mov             r12, r0                 @ r12 = write pointer into v0 (r0 is the read pointer)
+        @ Per iteration (16 elements): sum += v0*v1, then v0 += v2*mul, stored back in place.
+1:      vld1.16         {d16-d17}, [r0,:128]!
+        vld1.16         {d18-d19}, [r1]!
+        vld1.16         {d20-d21}, [r2]!
+        vld1.16         {d22-d23}, [r0,:128]!
+        vld1.16         {d24-d25}, [r1]!
+        vld1.16         {d26-d27}, [r2]!
+        vmul.s16        q10, q10,  q14          @ v2 * mul (16-bit, low half kept)
+        vmul.s16        q13, q13,  q14
+        vmlal.s16       q0,  d16,  d18          @ accumulate v0 * v1, using the values loaded before the update
+        vmlal.s16       q1,  d17,  d19
+        vadd.s16        q10, q8,   q10          @ v0 + v2 * mul
+        vadd.s16        q13, q11,  q13
+        vmlal.s16       q2,  d22,  d24
+        vmlal.s16       q3,  d23,  d25
+        vst1.16         {q10},     [r12,:128]!
+        subs            r3,  r3,   #16          @ r3 counts remaining elements
+        vst1.16         {q13},     [r12,:128]!
+        bne             1b
+        @ Horizontal reduction; uses only caller-saved scratch (d8-d15 must be preserved).
+        vpadd.s32       d16, d0,   d1
+        vpadd.s32       d17, d2,   d3
+        vpadd.s32       d18, d4,   d5
+        vpadd.s32       d19, d6,   d7
+        vpadd.s32       d0,  d16,  d17
+        vpadd.s32       d1,  d18,  d19
+        vpadd.s32       d2,  d0,   d1
+        vpaddl.s32      d3,  d2                 @ final pairwise add, widened to 64 bits
+        vmov.32         r0,  d3[0]              @ return the low 32 bits in r0
+        bx              lr
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/jrevdct_arm.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/jrevdct_arm.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,388 @@
+/*
+   C-like prototype :
+        void j_rev_dct_arm(DCTBLOCK data)
+
+   With DCTBLOCK being a pointer to an array of 64 'signed shorts'
+
+   Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+   COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+   IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+
+#include "asm.S"
+
+#define FIX_0_298631336 2446 /* each FIX_x is x * 2^13, rounded to an integer */
+#define FIX_0_541196100 4433
+#define FIX_0_765366865 6270
+#define FIX_1_175875602 9633
+#define FIX_1_501321110 12299
+#define FIX_2_053119869 16819
+#define FIX_3_072711026 25172
+#define FIX_M_0_390180644 -3196 /* FIX_M_x is the negated constant */
+#define FIX_M_0_899976223 -7373
+#define FIX_M_1_847759065 -15137
+#define FIX_M_1_961570560 -16069
+#define FIX_M_2_562915447 -20995
+#define FIX_0xFFFF 0xFFFF
+
+#define FIX_0_298631336_ID      0 /* *_ID = byte offset of the matching .word in const_array */
+#define FIX_0_541196100_ID      4
+#define FIX_0_765366865_ID      8
+#define FIX_1_175875602_ID     12
+#define FIX_1_501321110_ID     16
+#define FIX_2_053119869_ID     20
+#define FIX_3_072711026_ID     24
+#define FIX_M_0_390180644_ID   28
+#define FIX_M_0_899976223_ID   32
+#define FIX_M_1_847759065_ID   36
+#define FIX_M_1_961570560_ID   40
+#define FIX_M_2_562915447_ID   44
+#define FIX_0xFFFF_ID          48
+        .text
+        .align
+
+function ff_j_rev_dct_arm, export=1
+        stmdb   sp!, { r4 - r12, lr }   @ all callee saved regs
+
+        sub sp, sp, #4                  @ reserve some space on the stack
+        str r0, [ sp ]                  @ save the DCT pointer to the stack
+        @ Pass 1: process the 8 rows in place (16 bytes per row)
+        mov lr, r0                      @ lr = pointer to the current row
+        mov r12, #8                     @ r12 = row-counter
+        adr r11, const_array            @ r11 = base pointer to the constants array
+row_loop:
+        ldrsh r0, [lr, # 0]             @ r0 = 'd0'
+        ldrsh r2, [lr, # 2]             @ r2 = 'd2'
+
+        @ Optimization for row that have all items except the first set to 0
+        @ (this works as the DCTELEMS are always 4-byte aligned)
+        ldr r5, [lr, # 0]
+        ldr r6, [lr, # 4]
+        ldr r3, [lr, # 8]
+        ldr r4, [lr, #12]
+        orr r3, r3, r4
+        orr r3, r3, r6
+        orrs r5, r3, r5
+        beq end_of_row_loop             @ nothing to be done as ALL of them are '0'
+        orrs r3, r3, r2
+        beq empty_row
+
+        ldrsh r1, [lr, # 8]             @ r1 = 'd1'
+        ldrsh r4, [lr, # 4]             @ r4 = 'd4'
+        ldrsh r6, [lr, # 6]             @ r6 = 'd6'
+
+        ldr r3, [r11, #FIX_0_541196100_ID]
+        add r7, r2, r6
+        ldr r5, [r11, #FIX_M_1_847759065_ID]
+        mul r7, r3, r7                      @ r7 = z1
+        ldr r3, [r11, #FIX_0_765366865_ID]
+        mla r6, r5, r6, r7                  @ r6 = tmp2
+        add r5, r0, r4                      @ r5 = tmp0
+        mla r2, r3, r2, r7                  @ r2 = tmp3
+        sub r3, r0, r4                      @ r3 = tmp1
+
+        add r0, r2, r5, lsl #13             @ r0 = tmp10
+        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
+        add r4, r6, r3, lsl #13             @ r4 = tmp11
+        rsb r3, r6, r3, lsl #13             @ r3 = tmp12
+
+        stmdb   sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
+
+        ldrsh r3, [lr, #10]             @ r3 = 'd3'
+        ldrsh r5, [lr, #12]             @ r5 = 'd5'
+        ldrsh r7, [lr, #14]             @ r7 = 'd7'
+
+        add r0, r3, r5                        @ r0 = 'z2'
+        add r2, r1, r7                  @ r2 = 'z1'
+        add r4, r3, r7                  @ r4 = 'z3'
+        add r6, r1, r5                  @ r6 = 'z4'
+        ldr r9, [r11, #FIX_1_175875602_ID]
+        add r8, r4, r6                  @ r8 = z3 + z4
+        ldr r10, [r11, #FIX_M_0_899976223_ID]
+        mul r8, r9, r8                  @ r8 = 'z5'
+        ldr r9, [r11, #FIX_M_2_562915447_ID]
+        mul r2, r10, r2                 @ r2 = 'z1'
+        ldr r10, [r11, #FIX_M_1_961570560_ID]
+        mul r0, r9, r0                  @ r0 = 'z2'
+        ldr r9, [r11, #FIX_M_0_390180644_ID]
+        mla r4, r10, r4, r8             @ r4 = 'z3'
+        ldr r10, [r11, #FIX_0_298631336_ID]
+        mla r6, r9, r6, r8              @ r6 = 'z4'
+        ldr r9, [r11, #FIX_2_053119869_ID]
+        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
+        ldr r10, [r11, #FIX_3_072711026_ID]
+        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
+        ldr r9, [r11, #FIX_1_501321110_ID]
+        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
+        add r7, r7, r4                  @ r7 = tmp0
+        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
+        add r5,        r5, r6                  @ r5 = tmp1
+        add r3, r3, r4                  @ r3 = tmp2
+        add r1, r1, r6                  @ r1 = tmp3
+
+        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
+                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
+
+        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
+        add r8, r0, r1
+        add r8, r8, #(1<<10)
+        mov r8, r8, asr #11
+        strh r8, [lr, # 0]
+
+        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
+        sub r8, r0, r1
+        add r8, r8, #(1<<10)
+        mov r8, r8, asr #11
+        strh r8, [lr, #14]
+
+        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
+        add r8, r6, r3
+        add r8, r8, #(1<<10)
+        mov r8, r8, asr #11
+        strh r8, [lr, # 2]
+
+        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
+        sub r8, r6, r3
+        add r8, r8, #(1<<10)
+        mov r8, r8, asr #11
+        strh r8, [lr, #12]
+
+        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
+        add r8, r4, r5
+        add r8, r8, #(1<<10)
+        mov r8, r8, asr #11
+        strh r8, [lr, # 4]
+
+        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
+        sub r8, r4, r5
+        add r8, r8, #(1<<10)
+        mov r8, r8, asr #11
+        strh r8, [lr, #10]
+
+        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
+        add r8, r2, r7
+        add r8, r8, #(1<<10)
+        mov r8, r8, asr #11
+        strh r8, [lr, # 6]
+
+        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
+        sub r8, r2, r7
+        add r8, r8, #(1<<10)
+        mov r8, r8, asr #11
+        strh r8, [lr, # 8]
+
+        @ End of row loop
+        add lr, lr, #16
+        subs r12, r12, #1
+        bne row_loop
+        beq start_column_loop           @ Z is set here, so this is always taken: skips the empty_row path
+
+empty_row:
+        ldr r1, [r11, #FIX_0xFFFF_ID]   @ r1 = 0xFFFF mask
+        mov r0, r0, lsl #2              @ r0 = 'd0' << 2
+        and r0, r0, r1                  @ keep the low 16 bits
+        add r0, r0, r0, lsl #16         @ same value in both halfwords
+        str r0, [lr, # 0]               @ four word stores fill all 8 shorts of the row
+        str r0, [lr, # 4]
+        str r0, [lr, # 8]
+        str r0, [lr, #12]
+
+end_of_row_loop:
+        @ End of loop
+        add lr, lr, #16
+        subs r12, r12, #1
+        bne row_loop
+
+start_column_loop:
+        @ Start of column loop
+        ldr lr, [ sp ]
+        mov r12, #8
+column_loop:
+        ldrsh r0, [lr, #( 0*8)]             @ r0 = 'd0'
+        ldrsh r2, [lr, #( 4*8)]             @ r2 = 'd2'
+        ldrsh r4, [lr, #( 8*8)]             @ r4 = 'd4'
+        ldrsh r6, [lr, #(12*8)]             @ r6 = 'd6'
+
+        ldr r3, [r11, #FIX_0_541196100_ID]
+        add r1, r2, r6
+        ldr r5, [r11, #FIX_M_1_847759065_ID]
+        mul r1, r3, r1                      @ r1 = z1
+        ldr r3, [r11, #FIX_0_765366865_ID]
+        mla r6, r5, r6, r1                  @ r6 = tmp2
+        add r5, r0, r4                      @ r5 = tmp0
+        mla r2, r3, r2, r1                  @ r2 = tmp3
+        sub r3, r0, r4                      @ r3 = tmp1
+
+        add r0, r2, r5, lsl #13             @ r0 = tmp10
+        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
+        add r4, r6, r3, lsl #13             @ r4 = tmp11
+        rsb r6, r6, r3, lsl #13             @ r6 = tmp12
+
+        ldrsh r1, [lr, #( 2*8)]             @ r1 = 'd1'
+        ldrsh r3, [lr, #( 6*8)]             @ r3 = 'd3'
+        ldrsh r5, [lr, #(10*8)]             @ r5 = 'd5'
+        ldrsh r7, [lr, #(14*8)]             @ r7 = 'd7'
+
+        @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
+        orr r9, r1, r3
+        orr r10, r5, r7
+        orrs r10, r9, r10
+        beq empty_odd_column
+
+        stmdb   sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp11, tmp12
+
+        add r0, r3, r5                  @ r0 = 'z2'
+        add r2, r1, r7                  @ r2 = 'z1'
+        add r4, r3, r7                  @ r4 = 'z3'
+        add r6, r1, r5                  @ r6 = 'z4'
+        ldr r9, [r11, #FIX_1_175875602_ID]
+        add r8, r4, r6
+        ldr r10, [r11, #FIX_M_0_899976223_ID]
+        mul r8, r9, r8                  @ r8 = 'z5'
+        ldr r9, [r11, #FIX_M_2_562915447_ID]
+        mul r2, r10, r2                 @ r2 = 'z1'
+        ldr r10, [r11, #FIX_M_1_961570560_ID]
+        mul r0, r9, r0                  @ r0 = 'z2'
+        ldr r9, [r11, #FIX_M_0_390180644_ID]
+        mla r4, r10, r4, r8             @ r4 = 'z3'
+        ldr r10, [r11, #FIX_0_298631336_ID]
+        mla r6, r9, r6, r8              @ r6 = 'z4'
+        ldr r9, [r11, #FIX_2_053119869_ID]
+        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
+        ldr r10, [r11, #FIX_3_072711026_ID]
+        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
+        ldr r9, [r11, #FIX_1_501321110_ID]
+        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
+        add r7, r7, r4                  @ r7 = tmp0
+        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
+        add r5,        r5, r6                  @ r5 = tmp1
+        add r3, r3, r4                  @ r3 = tmp2
+        add r1, r1, r6                  @ r1 = tmp3
+
+        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
+                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
+
+        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
+        add r8, r0, r1
+        add r8, r8, #(1<<17)
+        mov r8, r8, asr #18
+        strh r8, [lr, #( 0*8)]
+
+        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
+        sub r8, r0, r1
+        add r8, r8, #(1<<17)
+        mov r8, r8, asr #18
+        strh r8, [lr, #(14*8)]
+
+        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
+        add r8, r4, r3
+        add r8, r8, #(1<<17)
+        mov r8, r8, asr #18
+        strh r8, [lr, #( 2*8)]
+
+        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
+        sub r8, r4, r3
+        add r8, r8, #(1<<17)
+        mov r8, r8, asr #18
+        strh r8, [lr, #(12*8)]
+
+        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
+        add r8, r6, r5
+        add r8, r8, #(1<<17)
+        mov r8, r8, asr #18
+        strh r8, [lr, #( 4*8)]
+
+        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
+        sub r8, r6, r5
+        add r8, r8, #(1<<17)
+        mov r8, r8, asr #18
+        strh r8, [lr, #(10*8)]
+
+        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
+        add r8, r2, r7
+        add r8, r8, #(1<<17)
+        mov r8, r8, asr #18
+        strh r8, [lr, #( 6*8)]
+
+        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
+        sub r8, r2, r7
+        add r8, r8, #(1<<17)
+        mov r8, r8, asr #18
+        strh r8, [lr, #( 8*8)]
+
+        @ End of column loop
+        add lr, lr, #2
+        subs r12, r12, #1
+        bne column_loop
+        beq the_end                     @ Z is set here, so this is always taken: skips empty_odd_column
+
+empty_odd_column:
+        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
+        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
+        add r0, r0, #(1<<17)
+        mov r0, r0, asr #18
+        strh r0, [lr, #( 0*8)]
+        strh r0, [lr, #(14*8)]
+
+        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
+        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
+        add r4, r4, #(1<<17)
+        mov r4, r4, asr #18
+        strh r4, [lr, #( 2*8)]
+        strh r4, [lr, #(12*8)]
+
+        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
+        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
+        add r6, r6, #(1<<17)
+        mov r6, r6, asr #18
+        strh r6, [lr, #( 4*8)]
+        strh r6, [lr, #(10*8)]
+
+        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
+        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
+        add r2, r2, #(1<<17)
+        mov r2, r2, asr #18
+        strh r2, [lr, #( 6*8)]
+        strh r2, [lr, #( 8*8)]
+
+        @ End of column loop
+        add lr, lr, #2
+        subs r12, r12, #1
+        bne column_loop
+
+the_end:
+        @ The end....
+        add sp, sp, #4
+        ldmia   sp!, { r4 - r12, pc }   @ restore callee saved regs and return
+
+const_array:
+        .align
+        .word FIX_0_298631336
+        .word FIX_0_541196100
+        .word FIX_0_765366865
+        .word FIX_1_175875602
+        .word FIX_1_501321110
+        .word FIX_2_053119869
+        .word FIX_3_072711026
+        .word FIX_M_0_390180644
+        .word FIX_M_0_899976223
+        .word FIX_M_1_847759065
+        .word FIX_M_1_961570560
+        .word FIX_M_2_562915447
+        .word FIX_0xFFFF
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mathops.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/mathops.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,116 @@
+/*
+ * simple math operations
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_MATHOPS_H
+#define AVCODEC_ARM_MATHOPS_H
+
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/common.h"
+
+#if HAVE_INLINE_ASM
+
+#   define MULL MULL
+static inline av_const int MULL(int a, int b, unsigned shift)
+{
+    int prod_lo, prod_hi;                     /* halves of the 64-bit product */
+    __asm__("smull %0, %1, %2, %3     \n\t"   /* prod_hi:prod_lo = b * a */
+            "mov   %0, %0,     lsr %4 \n\t"   /* prod_lo >>= shift (logical) */
+            "add   %1, %0, %1, lsl %5 \n\t"   /* prod_hi = prod_lo + (prod_hi << (32 - shift)) */
+            : "=&r"(prod_lo), "=&r"(prod_hi)
+            : "r"(b), "r"(a), "ir"(shift), "ir"(32-shift));
+    return prod_hi;                           /* i.e. the 64-bit product shifted right by `shift` */
+}
+
+#define MULH MULH
+#if HAVE_ARMV6
+static inline av_const int MULH(int a, int b)
+{
+    int top;    /* most significant 32 bits of a * b, produced directly by smmul */
+    __asm__ ("smmul %0, %1, %2" : "=r"(top) : "r"(a), "r"(b));
+    return top;
+}
+#else
+static inline av_const int MULH(int a, int b)
+{
+    int bottom, top;    /* smull yields both halves; only the upper one is returned */
+    __asm__ ("smull %0, %1, %2, %3" : "=&r"(bottom), "=&r"(top) : "r"(b), "r"(a));
+    return top;
+}
+#endif
+
+static inline av_const int64_t MUL64(int a, int b)
+{
+    union { uint64_t whole; unsigned word[2]; } prod;   /* word[0] takes RdLo, word[1] takes RdHi */
+    __asm__ ("smull %0, %1, %2, %3"
+             : "=r"(prod.word[0]), "=r"(prod.word[1]) : "r"(a), "r"(b));
+    return prod.whole;                                  /* full signed 64-bit product a * b */
+}
+#define MUL64 MUL64
+
+static inline av_const int64_t MAC64(int64_t d, int a, int b)
+{
+    union { uint64_t whole; unsigned word[2]; } acc = { d };   /* accumulator seeded with d */
+    __asm__ ("smlal %0, %1, %2, %3"
+             : "+r"(acc.word[0]), "+r"(acc.word[1]) : "r"(a), "r"(b));
+    return acc.whole;                                          /* d + a * b */
+}
+#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
+#define MLS64(d, a, b) MAC64(d, -(a), b)
+
+#if HAVE_ARMV5TE
+
+/* rt += ra * rb, using the signed bottom 16 bits of ra and rb (smlabb) */
+#   define MAC16(rt, ra, rb)                                            \
+    __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
+
+/* returns ra * rb, using the signed bottom 16 bits of each operand (smulbb) */
+#   define MUL16 MUL16
+static inline av_const int MUL16(int ra, int rb)
+{
+    int product;
+    __asm__ ("smulbb %0, %1, %2" : "=r"(product) : "r"(ra), "r"(rb));
+    return product;
+}
+
+#endif
+
+#define mid_pred mid_pred
+static inline av_const int mid_pred(int a, int b, int c)
+{
+    int m;                      /* result: the median of a, b and c */
+    __asm__ volatile (
+        "mov   %0, %2  \n\t"    /* m = b */
+        "cmp   %1, %2  \n\t"
+        "movgt %0, %1  \n\t"    /* a > b: m = a, so m = max(a, b) */
+        "movgt %1, %2  \n\t"    /* a > b: a = b, so a = min(a, b) */
+        "cmp   %1, %3  \n\t"
+        "movle %1, %3  \n\t"    /* a <= c: a = c, so a = max(min(a, b), c) */
+        "cmp   %0, %1  \n\t"
+        "movgt %0, %1  \n\t"    /* m = min(m, a) */
+        : "=&r"(m), "+r"(a)
+        : "r"(b), "r"(c) : "cc");   /* cmp rewrites the condition flags, so declare them clobbered */
+    return m;
+}
+
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_ARM_MATHOPS_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mdct_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/mdct_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,303 @@
+/*
+ * ARM NEON optimised MDCT
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        preserve8
+
+        .text
+
+#define ff_fft_calc_neon X(ff_fft_calc_neon)
+
+function ff_imdct_half_neon, export=1
+        push            {r4-r8,lr}
+        @ Stage 1: combine input (r2) with tcos and scatter the results into r1 using revtab indices
+        mov             r12, #1
+        ldr             lr,  [r0, #28]          @ mdct_bits
+        ldr             r4,  [r0, #32]          @ tcos
+        ldr             r3,  [r0, #8]           @ revtab
+        lsl             r12, r12, lr            @ n  = 1 << nbits
+        lsr             lr,  r12, #2            @ n4 = n >> 2
+        add             r7,  r2,  r12,  lsl #1  @ r7 = r2 + 2*n bytes
+        mov             r12, #-16               @ r7 walks backwards, 16 bytes per load
+        sub             r7,  r7,  #16
+
+        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
+        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
+        vrev64.32       d17, d17
+        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
+        vmul.f32        d6,  d17, d2
+        vmul.f32        d7,  d0,  d2
+1:
+        subs            lr,  lr,  #2            @ two outputs per iteration
+        ldr             r6,  [r3], #4           @ one word of revtab = two 16-bit indices
+        vmul.f32        d4,  d0,  d3
+        vmul.f32        d5,  d17, d3
+        vsub.f32        d4,  d6,  d4
+        vadd.f32        d5,  d5,  d7
+        uxth            r8,  r6,  ror #16       @ r8 = high halfword (second index)
+        uxth            r6,  r6                 @ r6 = low halfword (first index)
+        add             r8,  r1,  r8,  lsl #3   @ destination = r1 + index * 8 bytes
+        add             r6,  r1,  r6,  lsl #3
+        beq             1f                      @ last pair: store without preloading more input
+        vld2.32         {d16-d17},[r7,:128],r12
+        vld2.32         {d0-d1},  [r2,:128]!
+        vrev64.32       d17, d17
+        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
+        vmul.f32        d6,  d17, d2
+        vmul.f32        d7,  d0,  d2
+        vst2.32         {d4[0],d5[0]}, [r6,:64]
+        vst2.32         {d4[1],d5[1]}, [r8,:64]
+        b               1b
+1:
+        vst2.32         {d4[0],d5[0]}, [r6,:64]
+        vst2.32         {d4[1],d5[1]}, [r8,:64]
+
+        mov             r4,  r0                 @ keep the context pointer across the call
+        mov             r6,  r1                 @ keep the output pointer across the call
+        bl              ff_fft_calc_neon
+        @ Stage 2: rotate the FFT output by tcos, working outwards from the middle in both directions
+        mov             r12, #1
+        ldr             lr,  [r4, #28]          @ mdct_bits
+        ldr             r4,  [r4, #32]          @ tcos
+        lsl             r12, r12, lr            @ n  = 1 << nbits
+        lsr             lr,  r12, #3            @ n8 = n >> 3
+
+        add             r4,  r4,  lr,  lsl #3   @ r4 = tcos   + n8 * 8 bytes (ascending reads)
+        add             r6,  r6,  lr,  lsl #3   @ r6 = output + n8 * 8 bytes (ascending reads)
+        sub             r1,  r4,  #16           @ r1 = descending tcos pointer
+        sub             r3,  r6,  #16           @ r3 = descending output read pointer
+
+        mov             r7,  #-16
+        mov             r8,  r6                 @ r8 = ascending store pointer
+        mov             r0,  r3                 @ r0 = descending store pointer
+
+        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
+        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
+        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
+1:
+        subs            lr,  lr,  #2
+        vmul.f32        d7,  d0,  d18
+        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
+        vmul.f32        d4,  d1,  d18
+        vmul.f32        d5,  d21, d19
+        vmul.f32        d6,  d20, d19
+        vmul.f32        d22, d1,  d16
+        vmul.f32        d23, d21, d17
+        vmul.f32        d24, d0,  d16
+        vmul.f32        d25, d20, d17
+        vadd.f32        d7,  d7,  d22
+        vadd.f32        d6,  d6,  d23
+        vsub.f32        d4,  d4,  d24
+        vsub.f32        d5,  d5,  d25
+        beq             1f
+        vld2.32         {d0-d1},  [r3,:128], r7
+        vld2.32         {d20-d21},[r6,:128]!
+        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
+        vrev64.32       q3,  q3
+        vst2.32         {d4,d6},  [r0,:128], r7
+        vst2.32         {d5,d7},  [r8,:128]!
+        b               1b
+1:
+        vrev64.32       q3,  q3
+        vst2.32         {d4,d6},  [r0,:128]
+        vst2.32         {d5,d7},  [r8,:128]
+
+        pop             {r4-r8,pc}
+endfunc
+
+function ff_imdct_calc_neon, export=1
+        push            {r4-r6,lr}
+        @ Runs ff_imdct_half_neon into r1 + n bytes, then mirrors that data into the rest of the buffer
+        ldr             r3,  [r0, #28]          @ same context offset the sibling functions label mdct_bits
+        mov             r4,  #1
+        mov             r5,  r1                 @ r5 = start of the output buffer
+        lsl             r4,  r4,  r3            @ r4 = n = 1 << bits
+        add             r1,  r1,  r4            @ half transform writes at output + n bytes
+
+        bl              ff_imdct_half_neon
+
+        add             r0,  r5,  r4,  lsl #2   @ r0 = output + 4*n bytes
+        add             r1,  r5,  r4,  lsl #1   @ r1 = output + 2*n bytes (ascending reads)
+        sub             r0,  r0,  #8            @ r0 = descending 8-byte store pointer
+        sub             r2,  r1,  #16           @ r2 = descending 16-byte read pointer
+        mov             r3,  #-16
+        mov             r6,  #-8
+        vmov.i32        d30, #1<<31             @ bit 31 of each 32-bit lane; veor with it flips the sign bit
+1:
+        vld1.32         {d0-d1},  [r2,:128], r3
+        pld             [r0, #-16]
+        vrev64.32       q0,  q0
+        vld1.32         {d2-d3},  [r1,:128]!
+        veor            d4,  d1,  d30           @ reversed and sign-flipped copy for the front of the buffer
+        pld             [r2, #-16]
+        vrev64.32       q1,  q1
+        veor            d5,  d0,  d30
+        vst1.32         {d2},     [r0,:64], r6  @ reversed copy stored descending at the back
+        vst1.32         {d3},     [r0,:64], r6
+        vst1.32         {d4-d5},  [r5,:128]!    @ stored ascending from the start of the buffer
+        subs            r4,  r4,  #16           @ 16 bytes handled per direction per iteration
+        bgt             1b
+
+        pop             {r4-r6,pc}
+endfunc
+
+function ff_mdct_calc_neon, export=1
+        push            {r4-r10,lr}
+        @ Stage 1: fold the input (r2), rotate by tcos and scatter into r1 using revtab indices
+        mov             r12, #1
+        ldr             lr,  [r0, #28]          @ mdct_bits
+        ldr             r4,  [r0, #32]          @ tcos
+        ldr             r3,  [r0, #8]           @ revtab
+        lsl             lr,  r12, lr            @ n  = 1 << nbits
+        add             r7,  r2,  lr            @ in4u
+        sub             r9,  r7,  #16           @ in4d
+        add             r2,  r7,  lr,  lsl #1   @ in3u
+        add             r8,  r9,  lr,  lsl #1   @ in3d
+        add             r5,  r4,  lr,  lsl #1   @ r5 = tcos + 2*n bytes
+        sub             r5,  r5,  #16           @ ... read descending, 16 bytes at a time
+        sub             r3,  r3,  #4            @ pre-bias revtab for the pre-indexed load in the loop
+        mov             r12, #-16
+
+        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
+        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
+        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
+        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
+        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
+        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
+        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
+        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
+        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
+        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
+        vsub.f32        d16, d16, d2            @ in0u-in2d      R
+        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
+1:
+        vmul.f32        d7,  d0,  d21           @  I*s
+        ldr             r10, [r3, lr, lsr #1]
+        vmul.f32        d6,  d1,  d20           @ -R*c
+        ldr             r6,  [r3, #4]!
+        vmul.f32        d4,  d1,  d21           @ -R*s
+        vmul.f32        d5,  d0,  d20           @  I*c
+        vmul.f32        d24, d16, d30           @  R*c
+        vmul.f32        d25, d17, d31           @ -I*s
+        vmul.f32        d22, d16, d31           @  R*s
+        vmul.f32        d23, d17, d30           @  I*c
+        subs            lr,  lr,  #16
+        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
+        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
+        vsub.f32        d24, d25, d24           @ I*s-R*c
+        vadd.f32        d25, d22, d23           @ R*s-I*c
+        beq             1f
+        mov             r12, #-16
+        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
+        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
+        vneg.f32        d7,  d7                 @  R*s-I*c
+        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
+        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
+        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
+        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
+        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
+        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
+        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
+        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
+        vsub.f32        d16, d16, d2            @ in0u-in2d      R
+        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
+        uxth            r12, r6,  ror #16       @ r12 = high 16-bit index of the revtab word in r6
+        uxth            r6,  r6                 @ r6  = low 16-bit index
+        add             r12, r1,  r12, lsl #3   @ destination = r1 + index * 8 bytes
+        add             r6,  r1,  r6,  lsl #3
+        vst2.32         {d6[0],d7[0]}, [r6,:64]
+        vst2.32         {d6[1],d7[1]}, [r12,:64]
+        uxth            r6,  r10, ror #16       @ same split for the second revtab word in r10
+        uxth            r10, r10
+        add             r6 , r1,  r6,  lsl #3
+        add             r10, r1,  r10, lsl #3
+        vst2.32         {d24[0],d25[0]},[r10,:64]
+        vst2.32         {d24[1],d25[1]},[r6,:64]
+        b               1b
+1:
+        vneg.f32        d7,  d7                 @  R*s-I*c
+        uxth            r12, r6,  ror #16
+        uxth            r6,  r6
+        add             r12, r1,  r12, lsl #3
+        add             r6,  r1,  r6,  lsl #3
+        vst2.32         {d6[0],d7[0]}, [r6,:64]
+        vst2.32         {d6[1],d7[1]}, [r12,:64]
+        uxth            r6,  r10, ror #16
+        uxth            r10, r10
+        add             r6 , r1,  r6,  lsl #3
+        add             r10, r1,  r10, lsl #3
+        vst2.32         {d24[0],d25[0]},[r10,:64]
+        vst2.32         {d24[1],d25[1]},[r6,:64]
+
+        mov             r4,  r0                 @ keep the context pointer across the call
+        mov             r6,  r1                 @ keep the output pointer across the call
+        bl              ff_fft_calc_neon
+        @ Stage 2: rotate the FFT output by tcos, working outwards from the middle in both directions
+        mov             r12, #1
+        ldr             lr,  [r4, #28]          @ mdct_bits
+        ldr             r4,  [r4, #32]          @ tcos
+        lsl             r12, r12, lr            @ n  = 1 << nbits
+        lsr             lr,  r12, #3            @ n8 = n >> 3
+
+        add             r4,  r4,  lr,  lsl #3   @ r4 = tcos   + n8 * 8 bytes (ascending reads)
+        add             r6,  r6,  lr,  lsl #3   @ r6 = output + n8 * 8 bytes (ascending reads)
+        sub             r1,  r4,  #16           @ r1 = descending tcos pointer
+        sub             r3,  r6,  #16           @ r3 = descending output read pointer
+
+        mov             r7,  #-16
+        mov             r8,  r6                 @ r8 = ascending store pointer
+        mov             r0,  r3                 @ r0 = descending store pointer
+
+        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
+        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
+        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
+1:
+        subs            lr,  lr,  #2
+        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
+        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
+        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
+        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
+        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
+        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
+        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
+        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
+        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
+        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
+        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
+        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
+        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
+        vneg.f32        q2,  q2                 @ negate the d4/d5 sums
+        beq             1f
+        vld2.32         {d0-d1},  [r3,:128], r7
+        vld2.32         {d20-d21},[r6,:128]!
+        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
+        vrev64.32       q3,  q3
+        vst2.32         {d4,d6},  [r0,:128], r7
+        vst2.32         {d5,d7},  [r8,:128]!
+        b               1b
+1:
+        vrev64.32       q3,  q3
+        vst2.32         {d4,d6},  [r0,:128]
+        vst2.32         {d5,d7},  [r8,:128]
+
+        pop             {r4-r10,pc}
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_arm.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/mpegvideo_arm.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2002 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+#include "mpegvideo_arm.h"
+
+/* Install the ARM-specific function overrides into s. IWMMXT support is a
+ * superset of armv5te, so the armv5te functions are installed first and the
+ * iwmmxt init, called second, replaces them where a better version exists.
+ */
+void MPV_common_init_arm(MpegEncContext *s)
+{
+#if HAVE_ARMV5TE
+    MPV_common_init_armv5te(s);
+#endif
+#if HAVE_IWMMXT
+    MPV_common_init_iwmmxt(s);
+#endif
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_arm.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/mpegvideo_arm.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,27 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_MPEGVIDEO_H
+#define AVCODEC_ARM_MPEGVIDEO_H
+
+#include "libavcodec/mpegvideo.h"
+
+void MPV_common_init_iwmmxt(MpegEncContext *s);  /* defined in mpegvideo_iwmmxt.c */
+void MPV_common_init_armv5te(MpegEncContext *s); /* defined in mpegvideo_armv5te.c */
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_armv5te.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/mpegvideo_armv5te.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,101 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+#include "mpegvideo_arm.h"
+
+void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count);
+
+#ifdef ENABLE_ARM_TESTS
+/**
+ * Plain C H.263 dequantizer helper. It is performance critical, so each
+ * architecture needs an optimized implementation; this version also serves
+ * as the reference implementation in regression tests.
+ */
+static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count)
+{
+    DCTELEM *coef = block;
+    int remaining;
+
+    /* Zero coefficients are left alone; the others are scaled by qmul and
+     * then have qadd subtracted (negative input) or added (positive). */
+    for (remaining = count; remaining > 0; remaining--, coef++) {
+        const int value = *coef;
+
+        if (!value)
+            continue;
+        *coef = value < 0 ? value * qmul - qadd : value * qmul + qadd;
+    }
+}
+#endif
+
+static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
+                                  DCTELEM *block, int n, int qscale)
+{
+    const int qmul = qscale << 1;
+    int dc, qadd, last;
+
+    assert(s->block_last_index[n]>=0);
+
+    /* The DC coefficient has its own scaling: work it out before the
+     * helper rewrites block[0] and store it back afterwards. */
+    if (s->h263_aic) {
+        qadd = 0;
+        dc   = block[0];
+    } else {
+        if (n < 4)
+            dc = block[0] * s->y_dc_scale;
+        else
+            dc = block[0] * s->c_dc_scale;
+        qadd = (qscale - 1) | 1;
+    }
+
+    /* With ac_pred set all 64 coefficients are processed; otherwise the
+     * index of the last one is looked up through raster_end. */
+    last = s->ac_pred ? 63 : s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, last + 1);
+    block[0] = dc;
+}
+
+static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
+                                  DCTELEM *block, int n, int qscale)
+{
+    int last;
+
+    assert(s->block_last_index[n]>=0);
+
+    /* No separate DC handling here: the helper is simply given the block
+     * and a count of last + 1 coefficients. */
+    last = s->inter_scantable.raster_end[s->block_last_index[n]];
+    ff_dct_unquantize_h263_armv5te(block,
+                                   qscale << 1,       /* qmul */
+                                   (qscale - 1) | 1,  /* qadd */
+                                   last + 1);
+}
+
+void MPV_common_init_armv5te(MpegEncContext *s)
+{
+    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te; /* no DC special case */
+    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te; /* writes back scaled DC */
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_armv5te_s.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,117 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+/*
+ * Special optimized version of dct_unquantize_h263_helper_c, called with
+ * r0 = block, r1 = qmul (low 16 bits are used), r2 = qadd, r3 = count.
+ * The block must be at least 8 bytes aligned. More elements than requested
+ * may be processed, but never more than 64 provided that the count
+ * argument is <= 64, so it is safe. This function is optimized for a
+ * common distribution of values for nCoeffs (mostly multiples of 8 plus
+ * one or two extra elements): it processes 8 elements per loop iteration
+ * and has an optional 2-element step at the end, which is the only part
+ * executed when count <= 2.
+ *
+ * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
+ */
+function ff_dct_unquantize_h263_armv5te, export=1
+        push            {r4-r9,lr}
+        mov             ip, #0                  /* ip = 0, subtracted in the rsbs sign tests */
+        subs            r3, r3, #2              /* r3 = count - 2 */
+        ble             2f                      /* count <= 2: only the 2-element step */
+        ldrd            r4, [r0, #0]            /* r4, r5 = first four 16-bit elements */
+1:
+        ldrd            r6, [r0, #8]            /* r6, r7 = next four 16-bit elements */
+
+        rsbs            r9, ip, r4, asr #16     /* r9 = top half of r4, flags from its value */
+        addgt           r9, r2, #0              /* positive: r9 = qadd */
+        rsblt           r9, r2, #0              /* negative: r9 = -qadd */
+        smlatbne        r9, r4, r1, r9          /* nonzero: r9 = top(r4) * qmul + r9 */
+
+        rsbs            lr, ip, r5, asr #16     /* same for the top half of r5 */
+        addgt           lr, r2, #0
+        rsblt           lr, r2, #0
+        smlatbne        lr, r5, r1, lr
+
+        rsbs            r8, ip, r4, asl #16     /* flags from the bottom half of r4 */
+        addgt           r8, r2, #0              /* r8 = qadd or -qadd as above */
+        rsblt           r8, r2, #0
+        smlabbne        r4, r4, r1, r8          /* nonzero: r4 = bottom(r4) * qmul + r8 */
+
+        rsbs            r8, ip, r5, asl #16     /* same for the bottom half of r5 */
+        addgt           r8, r2, #0
+        rsblt           r8, r2, #0
+        smlabbne        r5, r5, r1, r8
+
+        strh            r4, [r0], #2            /* store the four results, r0 += 2 each */
+        strh            r9, [r0], #2
+        strh            r5, [r0], #2
+        strh            lr, [r0], #2
+
+        rsbs            r9, ip, r6, asr #16     /* second group of four: r6, r7 */
+        addgt           r9, r2, #0
+        rsblt           r9, r2, #0
+        smlatbne        r9, r6, r1, r9
+
+        rsbs            lr, ip, r7, asr #16
+        addgt           lr, r2, #0
+        rsblt           lr, r2, #0
+        smlatbne        lr, r7, r1, lr
+
+        rsbs            r8, ip, r6, asl #16
+        addgt           r8, r2, #0
+        rsblt           r8, r2, #0
+        smlabbne        r6, r6, r1, r8
+
+        rsbs            r8, ip, r7, asl #16
+        addgt           r8, r2, #0
+        rsblt           r8, r2, #0
+        smlabbne        r7, r7, r1, r8
+
+        strh            r6, [r0], #2
+        strh            r9, [r0], #2
+        strh            r7, [r0], #2
+        strh            lr, [r0], #2
+
+        subs            r3, r3, #8              /* 8 elements done */
+        ldrgtd          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
+        bgt             1b
+
+        adds            r3, r3, #2              /* undo the initial -2 */
+        pople           {r4-r9,pc}              /* nothing left: return */
+2:
+        ldrsh           r9, [r0, #0]            /* final step: two elements, sign-extended */
+        ldrsh           lr, [r0, #2]
+        mov             r8, r2                  /* r8 = qadd */
+        cmp             r9, #0
+        rsblt           r8, r2, #0              /* negative: r8 = -qadd */
+        smlabbne        r9, r9, r1, r8          /* nonzero: r9 = r9 * qmul + r8 */
+        mov             r8, r2
+        cmp             lr, #0
+        rsblt           r8, r2, #0
+        smlabbne        lr, lr, r1, r8
+        strh            r9, [r0], #2
+        strh            lr, [r0], #2
+        pop             {r4-r9,pc}
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_iwmmxt.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/mpegvideo_iwmmxt.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,120 @@
+/*
+ * copyright (c) 2004 AGAWA Koji
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+#include "mpegvideo_arm.h"
+
+/* IWMMXT H.263 intra dequantizer. The DC value is computed in C and written
+ * back after the SIMD loop, which rescales the block in place and always
+ * works on whole groups of 8 coefficients (two 64-bit registers).
+ */
+static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
+                                             DCTELEM *block, int n, int qscale)
+{
+    int level, qmul, qadd;
+    int nCoeffs, i;
+    DCTELEM *block_orig = block;
+
+    assert(s->block_last_index[n]>=0);
+
+    qmul = qscale << 1;
+
+    if (!s->h263_aic) {
+        if (n < 4)
+            level = block[0] * s->y_dc_scale;
+        else
+            level = block[0] * s->c_dc_scale;
+        qadd = (qscale - 1) | 1;
+    }else{
+        qadd = 0;
+        level = block[0];
+    }
+    if(s->ac_pred)
+        nCoeffs=63;
+    else
+        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+    i = (nCoeffs + 8) / 8; /* iterations: nCoeffs + 1 coefficients rounded up to 8 */
+
+    /* Per 16-bit lane: p = in * qmul; the result is p - qadd for in < 0,
+     * p + qadd for in >= 0, and 0 where p is 0.  i is a read-write operand
+     * because the loop counts it down, and subs changes the flags ("cc"). */
+    __asm__ volatile (
+        "tbcsth wr6, %[qmul]            \n\t" /* wr6 = qmul in every 16-bit lane */
+        "tbcsth wr5, %[qadd]            \n\t" /* wr5 = qadd in every 16-bit lane */
+        "wzero wr7                      \n\t"
+        "wzero wr4                      \n\t" /* wr4 = 0, reference for the sign test */
+        "wsubh wr7, wr5, wr7            \n\t" /* wr7 = wr5 - 0 = qadd */
+        "1:                             \n\t"
+        "wldrd wr2, [%[block]]          \n\t" /* wr2 = coefficients 0..3 */
+        "wldrd wr3, [%[block], #8]      \n\t" /* wr3 = coefficients 4..7 */
+        "wmulsl wr0, wr6, wr2           \n\t" /* wr0 = low 16 bits of qmul * wr2 */
+        "wmulsl wr1, wr6, wr3           \n\t" /* wr1 = low 16 bits of qmul * wr3 */
+        "wcmpgtsh wr2, wr4, wr2         \n\t" /* wr2 = (0 > wr2) ? -1 : 0 */
+        "wcmpgtsh wr3, wr4, wr3         \n\t" /* wr3 = (0 > wr3) ? -1 : 0, from its own lanes */
+        "wxor wr0, wr2, wr0             \n\t" /* complement the products of negative inputs */
+        "wxor wr1, wr3, wr1             \n\t"
+        "waddh wr0, wr7, wr0            \n\t" /* add qadd */
+        "waddh wr1, wr7, wr1            \n\t"
+        "wxor wr2, wr0, wr2             \n\t" /* complement back: wr2 = p - qadd or p + qadd */
+        "wxor wr3, wr1, wr3             \n\t"
+        "wcmpeqh wr0, wr7, wr0          \n\t" /* wr0 = -1 in lanes that still equal qadd */
+        "wcmpeqh wr1, wr7, wr1          \n\t"
+        "wandn wr0, wr2, wr0            \n\t" /* wr0 = wr2 & ~wr0: those lanes become 0 */
+        "wandn wr1, wr3, wr1            \n\t"
+        "wstrd wr0, [%[block]]          \n\t"
+        "wstrd wr1, [%[block], #8]      \n\t"
+        "add %[block], %[block], #16    \n\t" /* next 8 coefficients */
+        "subs %[i], %[i], #1            \n\t"
+        "bne 1b                         \n\t"
+        :[block]"+r"(block), [i]"+r"(i)
+        :[qmul]"r"(qmul), [qadd]"r"(qadd)
+        :"memory", "cc");
+
+    block_orig[0] = level;
+}
+
+#if 0 /* NOTE(review): compiled out, presumably because ippiQuantInvInter_Compact_H263_16s_I is not available to this build - confirm */
+static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
+                                             DCTELEM *block, int n, int qscale)
+{
+    int nCoeffs;
+
+    assert(s->block_last_index[n]>=0);
+
+    if(s->ac_pred)
+        nCoeffs=63;
+    else
+        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+    ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale);
+}
+#endif /* disabled inter dequantizer */
+
+void MPV_common_init_iwmmxt(MpegEncContext *s)
+{
+    if (mm_flags & FF_MM_IWMMXT) { /* only hook in when this bit of mm_flags is set */
+        s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;
+#if 0
+        s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
+#endif
+    }
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/rdft_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/rdft_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,151 @@
+/*
+ * ARM NEON optimised RDFT
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        preserve8
+
+function ff_rdft_calc_neon, export=1
+        push            {r4-r8,lr}
+
+        ldr             r6,  [r0, #4]           @ inverse
+        mov             r4,  r0                 @ r4 = first argument (context pointer)
+        mov             r5,  r1                 @ r5 = second argument (data pointer)
+
+        lsls            r6,  r6,  #31           @ r6 = inverse << 31, sets Z if that is 0
+        bne             1f                      @ nonzero: no FFT before the main loop
+        add             r0,  r4,  #20           @ r0 = context + 20, first argument of the FFT calls
+        bl              X(ff_fft_permute_neon)
+        add             r0,  r4,  #20
+        mov             r1,  r5
+        bl              X(ff_fft_calc_neon)
+1:
+        ldr             r12, [r4, #0]           @ nbits
+        mov             r2,  #1
+        lsl             r12, r2,  r12           @ r12 = 1 << nbits
+        add             r0,  r5,  #8            @ r0 = data + 8 bytes, read pointer going up
+        add             r1,  r5,  r12, lsl #2   @ r1 = data + 4 * (1 << nbits) bytes
+        lsr             r12, r12, #2            @ r12 = (1 << nbits) / 4
+        ldr             r2,  [r4, #12]          @ tcos
+        sub             r12, r12, #2            @ loop counter, reduced by 2 per iteration
+        ldr             r3,  [r4, #16]          @ tsin
+        mov             r7,  r0                 @ r7 = write pointer going up
+        sub             r1,  r1,  #8            @ r1 = read pointer going down
+        mov             lr,  r1                 @ lr = write pointer going down
+        mov             r8,  #-8                @ post-index step for r1 and lr
+        vld1.32         {d0},     [r0,:64]!     @ d1[0,1]
+        vld1.32         {d1},     [r1,:64], r8  @ d2[0,1]
+        vld1.32         {d4},     [r2,:64]!     @ tcos[i]
+        vld1.32         {d5},     [r3,:64]!     @ tsin[i]
+        vmov.f32        d18, #0.5               @ k1
+        vdup.32         d19, r6                 @ d19 = inverse << 31 in both lanes
+        pld             [r0, #32]
+        veor            d19, d18, d19           @ k2
+        vmov.i32        d16, #0
+        vmov.i32        d17, #1<<31
+        pld             [r1, #-32]
+        vtrn.32         d16, d17
+        pld             [r2, #32]
+        vrev64.32       d16, d16                @ d16=1,0 d17=0,1
+        pld             [r3, #32]
+2:
+        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
+        vld1.32         {d24},    [r0,:64]!     @  d1[0,1]
+        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
+        vld1.32         {d25},    [r1,:64], r8  @  d2[0,1]
+        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
+        veor            q3,  q12, q8            @ -d1[0],d1[1], d2[0],-d2[1]
+        pld             [r0, #32]
+        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
+        pld             [r1, #-32]
+        vadd.f32        d0,  d24, d7            @  d1[0]+d2[0], d1[1]-d2[1]
+        vadd.f32        d1,  d6,  d25           @ -d1[0]+d2[0], d1[1]+d2[1]
+        vmul.f32        q11, q0,  q9            @  ev.re, ev.im, od.im, od.re
+        veor            d7,  d21, d16           @ -od.im, od.re
+        vrev64.32       d3,  d21                @  od.re, od.im
+        veor            d6,  d20, d17           @  ev.re,-ev.im
+        veor            d2,  d3,  d16           @ -od.re, od.im
+        vmla.f32        d20, d3,  d4[1]
+        vmla.f32        d20, d7,  d5[1]
+        vmla.f32        d6,  d2,  d4[1]
+        vmla.f32        d6,  d21, d5[1]
+        vld1.32         {d4},     [r2,:64]!     @  tcos[i]
+        veor            d7,  d23, d16           @ -od.im, od.re
+        vld1.32         {d5},     [r3,:64]!     @  tsin[i]
+        veor            d24, d22, d17           @  ev.re,-ev.im
+        vrev64.32       d3,  d23                @  od.re, od.im
+        pld             [r2, #32]
+        veor            d2,  d3,  d16           @ -od.re, od.im
+        pld             [r3, #32]
+        vmla.f32        d22, d3,  d4[0]
+        vmla.f32        d22, d7,  d5[0]
+        vmla.f32        d24, d2,  d4[0]
+        vmla.f32        d24, d23, d5[0]
+        vld1.32         {d0},     [r0,:64]!     @  d1[0,1]
+        vld1.32         {d1},     [r1,:64], r8  @  d2[0,1]
+        vst1.32         {d20},    [r7,:64]!     @ results go out upwards through r7 ...
+        vst1.32         {d6},     [lr,:64], r8  @ ... and downwards through lr
+        vst1.32         {d22},    [r7,:64]!
+        vst1.32         {d24},    [lr,:64], r8
+        subs            r12, r12, #2            @ two stores through each pointer per iteration
+        bgt             2b
+
+        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
+        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
+        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
+        ldr             r2,  [r4, #8]           @  sign_convention
+        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
+        add             r0,  r0,  #4            @ r0 += 4: word that is loaded, sign-adjusted and stored back
+        bfc             r2,  #0,  #31           @ keep only bit 31 of sign_convention
+        vld1.32         {d0[0]},  [r0,:32]
+        veor            d7,  d21, d16           @ -od.im, od.re
+        vrev64.32       d3,  d21                @  od.re, od.im
+        veor            d6,  d20, d17           @  ev.re,-ev.im
+        vld1.32         {d22},    [r5,:64]      @ d22 = data[0], data[1]
+        vdup.32         d1,  r2
+        vmov            d23, d22
+        veor            d2,  d3,  d16           @ -od.re, od.im
+        vtrn.32         d22, d23                @ d22 = data[0],data[0]  d23 = data[1],data[1]
+        veor            d0,  d0,  d1            @ flip the sign of d0[0] if bit 31 of sign_convention is set
+        veor            d23, d23, d17           @ d23 = data[1],-data[1]
+        vmla.f32        d20, d3,  d4[1]
+        vmla.f32        d20, d7,  d5[1]
+        vmla.f32        d6,  d2,  d4[1]
+        vmla.f32        d6,  d21, d5[1]
+        vadd.f32        d22, d22, d23           @ d22 = data[0]+data[1], data[0]-data[1]
+        vst1.32         {d20},    [r7,:64]
+        vst1.32         {d6},     [lr,:64]
+        vst1.32         {d0[0]},  [r0,:32]
+        vst1.32         {d22},    [r5,:64]
+
+        cmp             r6,  #0
+        popeq           {r4-r8,pc}              @ inverse << 31 was 0: done
+
+        vmul.f32        d22, d22, d18           @ otherwise scale data[0], data[1] by k1 (0.5)
+        vst1.32         {d22},    [r5,:64]
+        add             r0,  r4,  #20
+        mov             r1,  r5
+        bl              X(ff_fft_permute_neon)
+        add             r0,  r4,  #20
+        mov             r1,  r5
+        pop             {r4-r8,lr}
+        b               X(ff_fft_calc_neon)     @ tail call, returns to our caller
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/simple_idct_arm.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/simple_idct_arm.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,486 @@
+/*
+ * simple_idct_arm.S
+ * Copyright (C) 2002 Frederic 'dilb' Boulay
+ *
+ * Author: Frederic Boulay <dilb@handhelds.org>
+ *
+ * The function defined in this file is derived from the simple_idct function
+ * from the libavcodec library part of the FFmpeg project.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+/* useful constants for the algorithm, they are saved in __constant_ptr__ at */
+/* the end of the source code. */
+#define W1  22725
+#define W2  21407
+#define W3  19266
+#define W4  16383
+#define W5  12873
+#define W6  8867
+#define W7  4520
+#define MASK_MSHW 0xFFFF0000
+
+/* offsets of the constants in the vector */
+#define offW1  0
+#define offW2  4
+#define offW3  8
+#define offW4  12
+#define offW5  16
+#define offW6  20
+#define offW7  24
+#define offMASK_MSHW 28
+
+#define ROW_SHIFT 11
+#define ROW_SHIFT2MSHW (16-11)
+#define COL_SHIFT 20
+#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
+#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
+
+
+        .text
+
+function ff_simple_idct_arm, export=1
+        @@ void simple_idct_arm(int16_t *block)
+        @@ save stack for reg needed (take all of them),
+        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
+        @@ so it must not be overwritten, if it is not saved!!
+        @@ R12 is another scratch register, so it should not be saved too
+        @@ save all registers
+        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
+        @@ at this point, R0=block, other registers are free.
+        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block.
+        adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
+        @@ add 2 temporary variables in the stack: R0 and R14
+        sub sp, sp, #8          @ allow 2 local variables
+        str r0, [sp, #0]        @ save block in sp[0]
+        @@ stack status
+        @@ sp+4   free
+        @@ sp+0   R0  (block)
+
+
+        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
+
+
+__row_loop:
+        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
+        ldr r1, [r14, #0]        @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
+        ldr r2, [r14, #4]        @ R2=(int32)(R12)[1]=ROWr32[1]
+        ldr r3, [r14, #8]        @ R3=ROWr32[2]
+        ldr r4, [r14, #12]       @ R4=ROWr32[3]
+        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
+        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
+        @@ else follow the complete algorithm.
+        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
+        orr r5, r4, r3           @ R5=R4 | R3
+        orr r5, r5, r2           @ R5=R4 | R3 | R2
+        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
+        beq __end_row_loop
+        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
+        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
+        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
+        beq __almost_empty_row
+
+__b_evaluation:
+        @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
+        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
+
+        @@ MUL16(b0, W1, row[1]);
+        @@ MUL16(b1, W3, row[1]);
+        @@ MUL16(b2, W5, row[1]);
+        @@ MUL16(b3, W7, row[1]);
+        @@ MAC16(b0, W3, row[3]);
+        @@ MAC16(b1, -W7, row[3]);
+        @@ MAC16(b2, -W1, row[3]);
+        @@ MAC16(b3, -W5, row[3]);
+        ldr r8, [r12, #offW1]    @ R8=W1
+        mov r2, r2, asr #16      @ R2=ROWr16[3]
+        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r9, [r12, #offW3]    @ R9=W3
+        ldr r10, [r12, #offW5]   @ R10=W5
+        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r11, [r12, #offW7]   @ R11=W7
+        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+                teq r2, #0               @ if null avoid muls
+                mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        rsbne r2, r2, #0         @ R2=-ROWr16[3]
+        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+
+        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+        @@ if (temp != 0) {}
+        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
+        beq __end_b_evaluation
+
+        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ MAC16(b0, W5, row[5]);
+        @@ MAC16(b2, W7, row[5]);
+        @@ MAC16(b3, W3, row[5]);
+        @@ MAC16(b1, -W1, row[5]);
+        @@ MAC16(b0, W7, row[7]);
+        @@ MAC16(b2, W3, row[7]);
+        @@ MAC16(b3, -W1, row[7]);
+        @@ MAC16(b1, -W5, row[7]);
+        mov r3, r3, asr #16      @ R3=ROWr16[5]
+                teq r3, #0               @ if null avoid muls
+        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
+        mov r4, r4, asr #16      @ R4=ROWr16[7]
+        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
+        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
+        rsbne r3, r3, #0         @ R3=-ROWr16[5]
+        mlane r1, r8, r3, r1     @ R7-=W1*ROWr16[5]=b1
+        @@ R3 is free now
+                teq r4, #0               @ if null avoid muls
+        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
+        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
+        rsbne r4, r4, #0         @ R4=-ROWr16[7]
+        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
+        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
+        @@ R4 is free now
+__end_b_evaluation:
+        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
+        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+
+__a_evaluation:
+        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+        @@ a1 = a0 + W6 * row[2];
+        @@ a2 = a0 - W6 * row[2];
+        @@ a3 = a0 - W2 * row[2];
+        @@ a0 = a0 + W2 * row[2];
+        ldr r9, [r12, #offW4]    @ R9=W4
+        mul r6, r9, r6           @ R6=W4*ROWr16[0]
+        ldr r10, [r12, #offW6]   @ R10=W6
+        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
+        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
+
+        mul r11, r10, r4         @ R11=W6*ROWr16[2]
+        ldr r8, [r12, #offW2]    @ R8=W2
+        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
+        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+        @@ if (temp != 0) {}
+        teq r2, #0
+        beq __end_bef_a_evaluation
+
+        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
+        mul r11, r8, r4          @ R11=W2*ROWr16[2]
+        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
+        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
+
+
+        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
+        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+
+
+        @@ a0 += W4*row[4]
+        @@ a1 -= W4*row[4]
+        @@ a2 -= W4*row[4]
+        @@ a3 += W4*row[4]
+        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
+                teq r11, #0              @ if null avoid muls
+        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
+        @@ R9 is free now
+        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
+        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
+        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
+        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
+        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
+        @@ W6 alone is no longer needed: R10 is reused below to hold W2*ROWr16[6]
+                teq r9, #0               @ if null avoid muls
+        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
+        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
+        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
+        @@ a0 += W6*row[6];
+        @@ a3 -= W6*row[6];
+        @@ a1 -= W2*row[6];
+        @@ a2 += W2*row[6];
+        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
+        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
+        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
+
+__end_a_evaluation:
+        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
+        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
+        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
+        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
+        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
+        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
+        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
+        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
+        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
+        add r8, r6, r0           @ R8=a0+b0
+        add r9, r2, r1           @ R9=a1+b1
+        @@ put 2 16 bits half-words in a 32bits word
+        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
+        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
+        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #0]
+
+        add r8, r3, r5           @ R8=a2+b2
+        add r9, r4, r7           @ R9=a3+b3
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #4]
+
+        sub r8, r4, r7           @ R8=a3-b3
+        sub r9, r3, r5           @ R9=a2-b2
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #8]
+
+        sub r8, r2, r1           @ R8=a1-b1
+        sub r9, r6, r0           @ R9=a0-b0
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #12]
+
+        bal __end_row_loop
+
+__almost_empty_row:
+        @@ the row was empty, except ROWr16[0], now, management of this special case
+        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
+        @@                R8=0xFFFF (temp), R9-R11 free
+        mov r8, #0x10000         @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
+        sub r8, r8, #1           @ R8 is now ready.
+        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
+        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
+        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
+        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
+        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
+        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
+
+__end_row_loop:
+        @@ at this point, R0-R11 (free)
+        @@     R12=__const_ptr_, R14=&block[n]
+        ldr r0, [sp, #0]         @ R0=block
+        teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
+        sub r14, r14, #16
+        bne __row_loop
+
+
+
+        @@ at this point, R0=block, R1-R11 (free)
+        @@     R12=__const_ptr_, R14=&block[n]
+        add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
+__col_loop:
+
+__b_evaluation2:
+        @@ at this point, R0=block (temp),  R1-R11 (free)
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ proceed with b0-b3 first, followed by a0-a3
+        @@ MUL16(b0, W1, col[8x1]);
+        @@ MUL16(b1, W3, col[8x1]);
+        @@ MUL16(b2, W5, col[8x1]);
+        @@ MUL16(b3, W7, col[8x1]);
+        @@ MAC16(b0, W3, col[8x3]);
+        @@ MAC16(b1, -W7, col[8x3]);
+        @@ MAC16(b2, -W1, col[8x3]);
+        @@ MAC16(b3, -W5, col[8x3]);
+        ldr r8, [r12, #offW1]    @ R8=W1
+        ldrsh r7, [r14, #16]     @ R7=COLr16[8x1] (byte offset 16 = 8 halfwords)
+        mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r9, [r12, #offW3]    @ R9=W3
+        ldr r10, [r12, #offW5]   @ R10=W5
+        mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r11, [r12, #offW7]   @ R11=W7
+        mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
+        ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
+        mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
+        teq r2, #0               @ if 0, then avoid muls
+        mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
+        rsbne r2, r2, #0         @ R2=-COLr16[8x3]
+        mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
+
+        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
+        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ MAC16(b0, W5, col[5x8]);
+        @@ MAC16(b2, W7, col[5x8]);
+        @@ MAC16(b3, W3, col[5x8]);
+        @@ MAC16(b1, -W1, col[5x8]);
+        @@ MAC16(b0, W7, col[7x8]);
+        @@ MAC16(b2, W3, col[7x8]);
+        @@ MAC16(b3, -W1, col[7x8]);
+        @@ MAC16(b1, -W5, col[7x8]);
+        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
+        teq r3, #0               @ if 0 then avoid muls
+        mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
+        mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
+        mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
+        rsbne r3, r3, #0         @ R3=-COLr16[5x8]
+        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
+        mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
+        @@ R3 is free now
+        teq r4, #0               @ if 0 then avoid muls
+        mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
+        mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
+        rsbne r4, r4, #0         @ R4=-COLr16[7x8]
+        mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
+        mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
+        @@ R4 is free now
+__end_b_evaluation2:
+        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
+        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+
+__a_evaluation2:
+        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
+        @@ a1 = a0 + W6 * col[8x2];
+        @@ a2 = a0 - W6 * col[8x2];
+        @@ a3 = a0 - W2 * col[8x2];
+        @@ a0 = a0 + W2 * col[8x2];
+        ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
+        ldr r9, [r12, #offW4]    @ R9=W4
+        mul r6, r9, r6           @ R6=W4*COLr16[8x0]
+        ldr r10, [r12, #offW6]   @ R10=W6
+        ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (a3 not defined yet)
+        add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
+        mul r11, r10, r4         @ R11=W6*COLr16[8x2]
+        ldr r8, [r12, #offW2]    @ R8=W2
+        add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
+        sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
+        mul r11, r8, r4          @ R11=W2*COLr16[8x2]
+        sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
+        add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)
+
+        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
+        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ a0 += W4*col[8x4]
+        @@ a1 -= W4*col[8x4]
+        @@ a2 -= W4*col[8x4]
+        @@ a3 += W4*col[8x4]
+        ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
+        teq r11, #0              @ if null avoid muls
+        mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
+        @@ R9 is free now
+        addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
+        subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
+        subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
+        ldrsh r9, [r14, #96]     @ R9=COLr16[8x6]
+        addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
+        @@ W6 alone is no longer needed: R10 is reused below to hold W2*COLr16[8x6]
+        teq r9, #0               @ if null avoid muls
+        mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
+        addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
+        mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
+        @@ a0 += W6*col[8x6];
+        @@ a3 -= W6*col[8x6];
+        @@ a1 -= W2*col[8x6];
+        @@ a2 += W2*col[8x6];
+        subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
+        subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
+        addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
+__end_a_evaluation2:
+        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
+        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
+        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
+        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
+        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
+        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
+        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
+        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
+        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
+        @@@@@ no optimization here @@@@@
+        add r8, r6, r0           @ R8=a0+b0
+        add r9, r2, r1           @ R9=a1+b1
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #0]
+        strh r9, [r14, #16]
+        add r8, r3, r5           @ R8=a2+b2
+        add r9, r4, r7           @ R9=a3+b3
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #32]
+        strh r9, [r14, #48]
+        sub r8, r4, r7           @ R8=a3-b3
+        sub r9, r3, r5           @ R9=a2-b2
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #64]
+        strh r9, [r14, #80]
+        sub r8, r2, r1           @ R8=a1-b1
+        sub r9, r6, r0           @ R9=a0-b0
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #96]
+        strh r9, [r14, #112]
+
+__end_col_loop:
+        @@ at this point, R0-R11 (free)
+        @@     R12=__const_ptr_, R14=&block[n]
+        ldr r0, [sp, #0]         @ R0=block
+        teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
+        sub r14, r14, #2
+        bne __col_loop
+
+
+
+
+__end_simple_idct_arm:
+        @@ restore registers to previous status!
+        add sp, sp, #8 @@ the local variables!
+        ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
+
+
+
+@@ kind of sub-function, here not to overload the common case.
+__end_bef_a_evaluation:
+        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
+        mul r11, r8, r4          @ R11=W2*ROWr16[2]
+        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
+        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
+        bal __end_a_evaluation
+
+
+__constant_ptr__:  @@ see #defines at the beginning of the source code for values.
+        .align
+        .word   W1
+        .word   W2
+        .word   W3
+        .word   W4
+        .word   W5
+        .word   W6
+        .word   W7
+        .word   MASK_MSHW
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/simple_idct_armv5te.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/simple_idct_armv5te.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,703 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define W1  22725   /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W2  21407   /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W3  19266   /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W4  16383   /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 evaluates to 16384; 16383 is what the code uses (built as 16384-1) */
+#define W5  12873   /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W6  8867    /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W7  4520    /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define ROW_SHIFT 11 /* row pass: rounding bias is 1<<(ROW_SHIFT-1); the shift itself is hard-coded as #11 in idct_row_armv5te */
+#define COL_SHIFT 20 /* column pass: rounding bias derives from 1<<(COL_SHIFT-1); the shift itself is hard-coded as #20 */
+
+#define W13 (W1 | (W3 << 16)) /* W1 in the low halfword, W3 in the high halfword */
+#define W26 (W2 | (W6 << 16)) /* W2 in the low halfword, W6 in the high halfword */
+#define W57 (W5 | (W7 << 16)) /* W5 in the low halfword, W7 in the high halfword */
+
+        .text
+        .align
+w13:    .long W13             /* packed coefficient pairs placed in .text; the code loads them with "ldr rX, w13" etc. */
+w26:    .long W26
+w57:    .long W57
+
+function idct_row_armv5te            /* In-place row pass over the 8 int16 values at a1 (a1 is not modified). A0..A3 / B0..B3 in the comments are the even / odd partial sums. */
+        str    lr, [sp, #-4]!        /* push return address; lr is used as scratch below */
+
+        ldrd   v1, [a1, #8]          /* v1 = row[5:4], v2 = row[7:6] */
+        ldrd   a3, [a1]              /* a3 = row[1:0], a4 = row[3:2] */
+        orrs   v1, v1, v2            /* v1 = 0 and Z set iff row[4..7] are all zero */
+        cmpeq  v1, a4                /* if so, Z stays set only when row[3:2] == 0 */
+        cmpeq  v1, a3, lsr #16       /* if so, Z stays set only when row[1] == 0 */
+        beq    row_dc_only           /* everything except row[0] is zero */
+
+        mov    v1, #(1<<(ROW_SHIFT-1)) /* rounding bias */
+        mov    ip, #16384
+        sub    ip, ip, #1            /* ip = W4 */
+        smlabb v1, ip, a3, v1        /* v1 = W4*row[0]+(1<<(RS-1)) */
+        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
+        smultb a2, ip, a4            /* a2 = W6*row[2] */
+        smulbb lr, ip, a4            /* lr = W2*row[2] */
+        add    v2, v1, a2            /* v2 = A1 = v1 + W6*row[2] */
+        sub    v3, v1, a2            /* v3 = A2 = v1 - W6*row[2] */
+        sub    v4, v1, lr            /* v4 = A3 = v1 - W2*row[2] */
+        add    v1, v1, lr            /* v1 = A0 = v1 + W2*row[2] */
+
+        ldr    ip, w13               /* ip = W1 | (W3 << 16) */
+        ldr    lr, w57               /* lr = W5 | (W7 << 16) */
+        smulbt v5, ip, a3            /* v5 = W1*row[1] */
+        smultt v6, lr, a4            /* v6 = W7*row[3] */
+        smlatt v5, ip, a4, v5        /* v5 = B0 = W1*row[1] + W3*row[3] */
+        smultt a2, ip, a3            /* a2 = W3*row[1] */
+        smulbt v7, lr, a3            /* v7 = W5*row[1] */
+        sub    v6, v6, a2            /* v6 = -B1 = W7*row[3] - W3*row[1] (kept negated) */
+        smulbt a2, ip, a4            /* a2 = W1*row[3] */
+        smultt fp, lr, a3            /* fp = W7*row[1] */
+        sub    v7, v7, a2            /* v7 = B2 = W5*row[1] - W1*row[3] */
+        smulbt a2, lr, a4            /* a2 = W5*row[3] */
+        ldrd   a3, [a1, #8]          /* a3=row[5:4] a4=row[7:6] */
+        sub    fp, fp, a2            /* fp = B3 = W7*row[1] - W5*row[3] */
+
+        orrs   a2, a3, a4            /* row[4..7] all zero? */
+        beq    1f                    /* yes: skip their contributions */
+
+        smlabt v5, lr, a3, v5        /* B0    += W5*row[5] */
+        smlabt v6, ip, a3, v6        /* (-B1) += W1*row[5] */
+        smlatt v5, lr, a4, v5        /* B0    += W7*row[7] */
+        smlabt v6, lr, a4, v6        /* (-B1) += W5*row[7] */
+        smlatt v7, lr, a3, v7        /* B2    += W7*row[5] */
+        smlatt fp, ip, a3, fp        /* B3    += W3*row[5] */
+        smulbt a2, ip, a4            /* a2 = W1*row[7] */
+        smlatt v7, ip, a4, v7        /* B2    += W3*row[7] */
+        sub    fp, fp, a2            /* B3    -= W1*row[7] */
+
+        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
+        mov    a2, #16384
+        sub    a2, a2, #1            /* a2 =  W4 */
+        smulbb a2, a2, a3            /* a2 =  W4*row[4] */
+        smultb lr, ip, a4            /* lr =  W6*row[6] */
+        add    v1, v1, a2            /* v1 += W4*row[4] */
+        add    v1, v1, lr            /* v1 += W6*row[6] */
+        add    v4, v4, a2            /* v4 += W4*row[4] */
+        sub    v4, v4, lr            /* v4 -= W6*row[6] */
+        smulbb lr, ip, a4            /* lr =  W2*row[6] */
+        sub    v2, v2, a2            /* v2 -= W4*row[4] */
+        sub    v2, v2, lr            /* v2 -= W2*row[6] */
+        sub    v3, v3, a2            /* v3 -= W4*row[4] */
+        add    v3, v3, lr            /* v3 += W2*row[6] */
+
+1:      add    a2, v1, v5            /* A0 + B0 */
+        mov    a3, a2, lsr #11       /* shift by 11 (= ROW_SHIFT) */
+        bic    a3, a3, #0x1f0000     /* clear bits 16-20 so only the low halfword remains: row[0] */
+        sub    a2, v2, v6            /* A1 - (-B1) = A1 + B1 */
+        mov    a2, a2, lsr #11
+        add    a3, a3, a2, lsl #16   /* high halfword: row[1] */
+        add    a2, v3, v7            /* A2 + B2 */
+        mov    a4, a2, lsr #11
+        bic    a4, a4, #0x1f0000     /* low halfword: row[2] */
+        add    a2, v4, fp            /* A3 + B3 */
+        mov    a2, a2, lsr #11
+        add    a4, a4, a2, lsl #16   /* high halfword: row[3] */
+        strd   a3, [a1]              /* store row[0..3] */
+
+        sub    a2, v4, fp            /* A3 - B3 */
+        mov    a3, a2, lsr #11
+        bic    a3, a3, #0x1f0000     /* low halfword: row[4] */
+        sub    a2, v3, v7            /* A2 - B2 */
+        mov    a2, a2, lsr #11
+        add    a3, a3, a2, lsl #16   /* high halfword: row[5] */
+        add    a2, v2, v6            /* A1 + (-B1) = A1 - B1 */
+        mov    a4, a2, lsr #11
+        bic    a4, a4, #0x1f0000     /* low halfword: row[6] */
+        sub    a2, v1, v5            /* A0 - B0 */
+        mov    a2, a2, lsr #11
+        add    a4, a4, a2, lsl #16   /* high halfword: row[7] */
+        strd   a3, [a1, #8]          /* store row[4..7] */
+
+        ldr    pc, [sp], #4          /* pop the saved return address into pc */
+
+row_dc_only:
+        orr    a3, a3, a3, lsl #16   /* row[1] is zero here, so a3 = row[0] in both halfwords */
+        bic    a3, a3, #0xe000       /* clear bits 13-15 so the shift below cannot spill the low halfword into the high one */
+        mov    a3, a3, lsl #3        /* both halfwords = row[0] << 3 (truncated to 16 bits) */
+        mov    a4, a3
+        strd   a3, [a1]              /* row[0..3] */
+        strd   a3, [a1, #8]          /* row[4..7] */
+
+        ldr    pc, [sp], #4          /* pop the saved return address into pc */
+endfunc
+
+        .macro idct_col              /* Column pass for two adjacent columns starting at a1: c0 = low halfword, c1 = high halfword of each loaded word; rows are 16 bytes apart. Pushes 8 words (even sums A0..A3 as c0,c1 pairs) and leaves the odd sums in v1..v7,fp. */
+        ldr    a4, [a1]              /* a4 = col[1:0] */
+        mov    ip, #16384
+        sub    ip, ip, #1            /* ip = W4 */
+#if 0
+        mov    v1, #(1<<(COL_SHIFT-1))
+        smlabt v2, ip, a4, v1        /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
+        smlabb v1, ip, a4, v1        /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
+        ldr    a4, [a1, #(16*4)]
+#else
+        mov    v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
+        add    v2, v1, a4, asr #16   /* v2 = bias + row 0 of c1 */
+        rsb    v2, v2, v2, lsl #14   /* v2 = v2*16384 - v2 = v2 * W4 */
+        mov    a4, a4, lsl #16       /* move the c0 value to the top so asr sign-extends it */
+        add    v1, v1, a4, asr #16   /* v1 = bias + row 0 of c0 */
+        ldr    a4, [a1, #(16*4)]     /* a4 = row 4 of both columns */
+        rsb    v1, v1, v1, lsl #14   /* v1 = v1 * W4 */
+#endif
+
+        smulbb lr, ip, a4            /* lr = W4*row4 (c0) */
+        smulbt a3, ip, a4            /* a3 = W4*row4 (c1) */
+        sub    v3, v1, lr            /* v3 = A1 (c0) */
+        sub    v5, v1, lr            /* v5 = A2 (c0) */
+        add    v7, v1, lr            /* v7 = A3 (c0) */
+        add    v1, v1, lr            /* v1 = A0 (c0) */
+        sub    v4, v2, a3            /* v4 = A1 (c1) */
+        sub    v6, v2, a3            /* v6 = A2 (c1) */
+        add    fp, v2, a3            /* fp = A3 (c1) */
+        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
+        ldr    a4, [a1, #(16*2)]     /* a4 = row 2 of both columns */
+        add    v2, v2, a3            /* v2 = A0 (c1) */
+
+        smulbb lr, ip, a4            /* lr = W2*row2 (c0) */
+        smultb a3, ip, a4            /* a3 = W6*row2 (c0) */
+        add    v1, v1, lr            /* A0 += W2*row2 */
+        sub    v7, v7, lr            /* A3 -= W2*row2 */
+        add    v3, v3, a3            /* A1 += W6*row2 */
+        sub    v5, v5, a3            /* A2 -= W6*row2 */
+        smulbt lr, ip, a4            /* lr = W2*row2 (c1) */
+        smultt a3, ip, a4            /* a3 = W6*row2 (c1) */
+        add    v2, v2, lr            /* same four updates for c1 */
+        sub    fp, fp, lr
+        add    v4, v4, a3
+        ldr    a4, [a1, #(16*6)]     /* a4 = row 6 of both columns */
+        sub    v6, v6, a3
+
+        smultb lr, ip, a4            /* lr = W6*row6 (c0) */
+        smulbb a3, ip, a4            /* a3 = W2*row6 (c0) */
+        add    v1, v1, lr            /* A0 += W6*row6 */
+        sub    v7, v7, lr            /* A3 -= W6*row6 */
+        sub    v3, v3, a3            /* A1 -= W2*row6 */
+        add    v5, v5, a3            /* A2 += W2*row6 */
+        smultt lr, ip, a4            /* lr = W6*row6 (c1) */
+        smulbt a3, ip, a4            /* a3 = W2*row6 (c1) */
+        add    v2, v2, lr            /* same four updates for c1 */
+        sub    fp, fp, lr
+        sub    v4, v4, a3
+        add    v6, v6, a3
+
+        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp} /* park the even sums; users pop them pairwise as A0, A1, A2, A3 (c0 then c1) */
+
+        ldr    ip, w13               /* ip = W1 | (W3 << 16) */
+        ldr    a4, [a1, #(16*1)]     /* a4 = row 1 of both columns */
+        ldr    lr, w57               /* lr = W5 | (W7 << 16) */
+        smulbb v1, ip, a4            /* v1 = B0 (c0) = W1*row1 */
+        smultb v3, ip, a4            /* v3 = W3*row1 (c0), negated below */
+        smulbb v5, lr, a4            /* v5 = B2 (c0) = W5*row1 */
+        smultb v7, lr, a4            /* v7 = B3 (c0) = W7*row1 */
+        smulbt v2, ip, a4            /* v2, v4, v6, fp: the same products for c1 */
+        smultt v4, ip, a4
+        smulbt v6, lr, a4
+        smultt fp, lr, a4
+        rsb    v4, v4, #0            /* v4 = -B1 (c1): B1 is carried negated from here on */
+        ldr    a4, [a1, #(16*3)]     /* a4 = row 3 of both columns */
+        rsb    v3, v3, #0            /* v3 = -B1 (c0) */
+
+        smlatb v1, ip, a4, v1        /* B0    += W3*row3 */
+        smlatb v3, lr, a4, v3        /* (-B1) += W7*row3 */
+        smulbb a3, ip, a4            /* a3 = W1*row3 (c0) */
+        smulbb a2, lr, a4            /* a2 = W5*row3 (c0) */
+        sub    v5, v5, a3            /* B2 -= W1*row3 */
+        sub    v7, v7, a2            /* B3 -= W5*row3 */
+        smlatt v2, ip, a4, v2        /* same updates for c1 */
+        smlatt v4, lr, a4, v4
+        smulbt a3, ip, a4
+        smulbt a2, lr, a4
+        sub    v6, v6, a3
+        ldr    a4, [a1, #(16*5)]     /* a4 = row 5 of both columns */
+        sub    fp, fp, a2
+
+        smlabb v1, lr, a4, v1        /* B0    += W5*row5 */
+        smlabb v3, ip, a4, v3        /* (-B1) += W1*row5 */
+        smlatb v5, lr, a4, v5        /* B2    += W7*row5 */
+        smlatb v7, ip, a4, v7        /* B3    += W3*row5 */
+        smlabt v2, lr, a4, v2        /* same updates for c1 */
+        smlabt v4, ip, a4, v4
+        smlatt v6, lr, a4, v6
+        ldr    a3, [a1, #(16*7)]     /* a3 = row 7 of both columns */
+        smlatt fp, ip, a4, fp
+
+        smlatb v1, lr, a3, v1        /* B0    += W7*row7 */
+        smlabb v3, lr, a3, v3        /* (-B1) += W5*row7 */
+        smlatb v5, ip, a3, v5        /* B2    += W3*row7 */
+        smulbb a4, ip, a3            /* a4 = W1*row7 (c0) */
+        smlatt v2, lr, a3, v2        /* c1: B0 += W7*row7 */
+        sub    v7, v7, a4            /* B3 -= W1*row7 */
+        smlabt v4, lr, a3, v4        /* c1: (-B1) += W5*row7 */
+        smulbt a4, ip, a3            /* a4 = W1*row7 (c1) */
+        smlatt v6, ip, a3, v6        /* c1: B2 += W3*row7 */
+        sub    fp, fp, a4            /* c1: B3 -= W1*row7 */
+        .endm
+
+function idct_col_armv5te            /* Column pass for the two columns at a1, results written back in place as int16. After idct_col: v1/v2 = B0, v3/v4 = -B1, v5/v6 = B2, v7/fp = B3 (c0/c1). */
+        str    lr, [sp, #-4]!        /* push return address; popped into pc at the end */
+
+        idct_col
+
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A0 (c0/c1) */
+        adds   a2, a3, v1            /* A0 + B0 (c0); N flag is consumed by orrmi below */
+        mov    a2, a2, lsr #20       /* shift by 20 (= COL_SHIFT); leaves 12 bits, flags untouched */
+        orrmi  a2, a2, #0xf000       /* negative sum: set bits 12-15 to sign-extend to 16 bits */
+        add    ip, a4, v2            /* A0 + B0 (c1) */
+        mov    ip, ip, asr #20
+        orr    a2, a2, ip, lsl #16   /* pack c1 into the high halfword */
+        str    a2, [a1]              /* row 0 */
+        subs   a3, a3, v1            /* A0 - B0 (c0) */
+        mov    a2, a3, lsr #20
+        orrmi  a2, a2, #0xf000
+        sub    a4, a4, v2            /* A0 - B0 (c1) */
+        mov    a4, a4, asr #20
+        orr    a2, a2, a4, lsl #16
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A1 (c0/c1) */
+        str    a2, [a1, #(16*7)]     /* row 7 */
+
+        subs   a2, a3, v3            /* A1 - (-B1) = A1 + B1 (c0) */
+        mov    a2, a2, lsr #20
+        orrmi  a2, a2, #0xf000
+        sub    ip, a4, v4            /* A1 + B1 (c1) */
+        mov    ip, ip, asr #20
+        orr    a2, a2, ip, lsl #16
+        str    a2, [a1, #(16*1)]     /* row 1 */
+        adds   a3, a3, v3            /* A1 + (-B1) = A1 - B1 (c0) */
+        mov    a2, a3, lsr #20
+        orrmi  a2, a2, #0xf000
+        add    a4, a4, v4            /* A1 - B1 (c1) */
+        mov    a4, a4, asr #20
+        orr    a2, a2, a4, lsl #16
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A2 (c0/c1) */
+        str    a2, [a1, #(16*6)]     /* row 6 */
+
+        adds   a2, a3, v5            /* A2 + B2 (c0) */
+        mov    a2, a2, lsr #20
+        orrmi  a2, a2, #0xf000
+        add    ip, a4, v6            /* A2 + B2 (c1) */
+        mov    ip, ip, asr #20
+        orr    a2, a2, ip, lsl #16
+        str    a2, [a1, #(16*2)]     /* row 2 */
+        subs   a3, a3, v5            /* A2 - B2 (c0) */
+        mov    a2, a3, lsr #20
+        orrmi  a2, a2, #0xf000
+        sub    a4, a4, v6            /* A2 - B2 (c1) */
+        mov    a4, a4, asr #20
+        orr    a2, a2, a4, lsl #16
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A3 (c0/c1) */
+        str    a2, [a1, #(16*5)]     /* row 5 */
+
+        adds   a2, a3, v7            /* A3 + B3 (c0) */
+        mov    a2, a2, lsr #20
+        orrmi  a2, a2, #0xf000
+        add    ip, a4, fp            /* A3 + B3 (c1) */
+        mov    ip, ip, asr #20
+        orr    a2, a2, ip, lsl #16
+        str    a2, [a1, #(16*3)]     /* row 3 */
+        subs   a3, a3, v7            /* A3 - B3 (c0) */
+        mov    a2, a3, lsr #20
+        orrmi  a2, a2, #0xf000
+        sub    a4, a4, fp            /* A3 - B3 (c1) */
+        mov    a4, a4, asr #20
+        orr    a2, a2, a4, lsl #16
+        str    a2, [a1, #(16*4)]     /* row 4 */
+
+        ldr    pc, [sp], #4          /* pop the saved return address into pc */
+endfunc
+
+function idct_col_put_armv5te        /* Column pass for the two columns at a1; each result is clamped to 0..255 and stored as a byte pair. Reads the output pointer and stride from the words its caller pushed (a1, a2 of ff_simple_idct_put_armv5te). */
+        str    lr, [sp, #-4]!        /* push return address */
+
+        idct_col
+
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A0 (c0/c1); 6 even-sum words + saved lr remain below the caller frame */
+        ldr    lr, [sp, #32]         /* lr = caller-saved a2, used below as the byte stride between output rows */
+        add    a2, a3, v1            /* A0 + B0 (c0) */
+        movs   a2, a2, asr #20       /* shift by 20 (= COL_SHIFT) */
+        movmi  a2, #0                /* clamp below at 0 */
+        cmp    a2, #255
+        movgt  a2, #255              /* clamp above at 255 */
+        add    ip, a4, v2            /* A0 + B0 (c1), clamped the same way */
+        movs   ip, ip, asr #20
+        movmi  ip, #0
+        cmp    ip, #255
+        movgt  ip, #255
+        orr    a2, a2, ip, lsl #8    /* a2 = row 0 byte pair (c0 low byte, c1 high byte) */
+        sub    a3, a3, v1            /* A0 - B0 (c0) */
+        movs   a3, a3, asr #20
+        movmi  a3, #0
+        cmp    a3, #255
+        movgt  a3, #255
+        sub    a4, a4, v2            /* A0 - B0 (c1) */
+        movs   a4, a4, asr #20
+        movmi  a4, #0
+        cmp    a4, #255
+        ldr    v1, [sp, #28]         /* v1 = caller-saved a1 = output pointer for this column pair (ldr leaves the cmp flags intact) */
+        movgt  a4, #255
+        strh   a2, [v1]              /* store row 0 */
+        add    a2, v1, #2
+        str    a2, [sp, #28]         /* advance the saved output pointer by 2 bytes for the next call */
+        orr    a2, a3, a4, lsl #8    /* a2 = row 7 byte pair */
+        rsb    v2, lr, lr, lsl #3    /* v2 = 7 * stride */
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A1 (c0/c1) */
+        strh   a2, [v2, v1]!         /* store row 7; v2 becomes the row 7 address */
+
+        sub    a2, a3, v3            /* A1 - (-B1) = A1 + B1 (c0) */
+        movs   a2, a2, asr #20
+        movmi  a2, #0
+        cmp    a2, #255
+        movgt  a2, #255
+        sub    ip, a4, v4            /* A1 + B1 (c1) */
+        movs   ip, ip, asr #20
+        movmi  ip, #0
+        cmp    ip, #255
+        movgt  ip, #255
+        orr    a2, a2, ip, lsl #8
+        strh   a2, [v1, lr]!         /* store row 1; v1 steps down one row */
+        add    a3, a3, v3            /* A1 + (-B1) = A1 - B1 (c0) */
+        movs   a2, a3, asr #20
+        movmi  a2, #0
+        cmp    a2, #255
+        movgt  a2, #255
+        add    a4, a4, v4            /* A1 - B1 (c1) */
+        movs   a4, a4, asr #20
+        movmi  a4, #0
+        cmp    a4, #255
+        movgt  a4, #255
+        orr    a2, a2, a4, lsl #8
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A2 (c0/c1) */
+        strh   a2, [v2, -lr]!        /* store row 6; v2 steps up one row */
+
+        add    a2, a3, v5            /* A2 + B2 (c0) */
+        movs   a2, a2, asr #20
+        movmi  a2, #0
+        cmp    a2, #255
+        movgt  a2, #255
+        add    ip, a4, v6            /* A2 + B2 (c1) */
+        movs   ip, ip, asr #20
+        movmi  ip, #0
+        cmp    ip, #255
+        movgt  ip, #255
+        orr    a2, a2, ip, lsl #8
+        strh   a2, [v1, lr]!         /* store row 2 */
+        sub    a3, a3, v5            /* A2 - B2 (c0) */
+        movs   a2, a3, asr #20
+        movmi  a2, #0
+        cmp    a2, #255
+        movgt  a2, #255
+        sub    a4, a4, v6            /* A2 - B2 (c1) */
+        movs   a4, a4, asr #20
+        movmi  a4, #0
+        cmp    a4, #255
+        movgt  a4, #255
+        orr    a2, a2, a4, lsl #8
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A3 (c0/c1); the macro's 8 words are now all popped */
+        strh   a2, [v2, -lr]!        /* store row 5 */
+
+        add    a2, a3, v7            /* A3 + B3 (c0) */
+        movs   a2, a2, asr #20
+        movmi  a2, #0
+        cmp    a2, #255
+        movgt  a2, #255
+        add    ip, a4, fp            /* A3 + B3 (c1) */
+        movs   ip, ip, asr #20
+        movmi  ip, #0
+        cmp    ip, #255
+        movgt  ip, #255
+        orr    a2, a2, ip, lsl #8
+        strh   a2, [v1, lr]          /* store row 3 (no writeback needed any more) */
+        sub    a3, a3, v7            /* A3 - B3 (c0) */
+        movs   a2, a3, asr #20
+        movmi  a2, #0
+        cmp    a2, #255
+        movgt  a2, #255
+        sub    a4, a4, fp            /* A3 - B3 (c1) */
+        movs   a4, a4, asr #20
+        movmi  a4, #0
+        cmp    a4, #255
+        movgt  a4, #255
+        orr    a2, a2, a4, lsl #8
+        strh   a2, [v2, -lr]         /* store row 4 */
+
+        ldr    pc, [sp], #4          /* pop the saved return address into pc */
+endfunc
+
+function idct_col_add_armv5te        /* Column pass for the two columns at a1; each result is added to the byte already in the output, clamped to 0..255 and stored back. Output pointer and stride come from the words pushed by ff_simple_idct_add_armv5te (a1, a2). */
+        str    lr, [sp, #-4]!        /* push return address */
+
+        idct_col
+
+        ldr    lr, [sp, #36]         /* lr = caller-saved a1 = output pointer (8 even-sum words + saved lr lie in between) */
+
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A0 (c0/c1) */
+        ldrh   ip, [lr]              /* ip = the two existing output bytes of row 0 */
+        add    a2, a3, v1            /* A0 + B0 (c0) */
+        mov    a2, a2, asr #20       /* shift by 20 (= COL_SHIFT) */
+        sub    a3, a3, v1            /* A0 - B0 (c0), finished further down; v1 is free after this */
+        and    v1, ip, #255          /* existing byte for c0 */
+        adds   a2, a2, v1            /* add it to the result */
+        movmi  a2, #0                /* clamp to 0..255 */
+        cmp    a2, #255
+        movgt  a2, #255
+        add    v1, a4, v2            /* A0 + B0 (c1) */
+        mov    v1, v1, asr #20
+        adds   v1, v1, ip, lsr #8    /* add the existing byte for c1 */
+        movmi  v1, #0
+        cmp    v1, #255
+        movgt  v1, #255
+        orr    a2, a2, v1, lsl #8    /* a2 = row 0 byte pair */
+        ldr    v1, [sp, #32]         /* v1 = caller-saved a2, used below as the byte stride between output rows */
+        sub    a4, a4, v2            /* A0 - B0 (c1) */
+        rsb    v2, v1, v1, lsl #3    /* v2 = 7 * stride */
+        ldrh   ip, [v2, lr]!         /* ip = existing row 7 bytes; v2 becomes the row 7 address */
+        strh   a2, [lr]              /* store row 0 */
+        mov    a3, a3, asr #20
+        and    a2, ip, #255
+        adds   a3, a3, a2
+        movmi  a3, #0
+        cmp    a3, #255
+        movgt  a3, #255
+        mov    a4, a4, asr #20
+        adds   a4, a4, ip, lsr #8
+        movmi  a4, #0
+        cmp    a4, #255
+        movgt  a4, #255
+        add    a2, lr, #2
+        str    a2, [sp, #28]         /* advance the saved output pointer by 2 bytes for the next call (same slot as [sp, #36] before the pop) */
+        orr    a2, a3, a4, lsl #8
+        strh   a2, [v2]              /* store row 7 */
+
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A1 (c0/c1) */
+        ldrh   ip, [lr, v1]!         /* lr steps down to row 1; ip = its existing bytes */
+        sub    a2, a3, v3            /* A1 - (-B1) = A1 + B1 (c0) */
+        mov    a2, a2, asr #20
+        add    a3, a3, v3            /* A1 - B1 (c0); v3 is free after this and reused as scratch */
+        and    v3, ip, #255
+        adds   a2, a2, v3
+        movmi  a2, #0
+        cmp    a2, #255
+        movgt  a2, #255
+        sub    v3, a4, v4            /* A1 + B1 (c1) */
+        mov    v3, v3, asr #20
+        adds   v3, v3, ip, lsr #8
+        movmi  v3, #0
+        cmp    v3, #255
+        movgt  v3, #255
+        orr    a2, a2, v3, lsl #8
+        add    a4, a4, v4            /* A1 - B1 (c1) */
+        ldrh   ip, [v2, -v1]!        /* v2 steps up to row 6; ip = its existing bytes */
+        strh   a2, [lr]              /* store row 1 */
+        mov    a3, a3, asr #20
+        and    a2, ip, #255
+        adds   a3, a3, a2
+        movmi  a3, #0
+        cmp    a3, #255
+        movgt  a3, #255
+        mov    a4, a4, asr #20
+        adds   a4, a4, ip, lsr #8
+        movmi  a4, #0
+        cmp    a4, #255
+        movgt  a4, #255
+        orr    a2, a3, a4, lsl #8
+        strh   a2, [v2]              /* store row 6 */
+
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A2 (c0/c1) */
+        ldrh   ip, [lr, v1]!         /* lr steps down to row 2 */
+        add    a2, a3, v5            /* A2 + B2 (c0) */
+        mov    a2, a2, asr #20
+        sub    a3, a3, v5            /* A2 - B2 (c0) */
+        and    v3, ip, #255
+        adds   a2, a2, v3
+        movmi  a2, #0
+        cmp    a2, #255
+        movgt  a2, #255
+        add    v3, a4, v6            /* A2 + B2 (c1) */
+        mov    v3, v3, asr #20
+        adds   v3, v3, ip, lsr #8
+        movmi  v3, #0
+        cmp    v3, #255
+        movgt  v3, #255
+        orr    a2, a2, v3, lsl #8
+        sub    a4, a4, v6            /* A2 - B2 (c1) */
+        ldrh   ip, [v2, -v1]!        /* v2 steps up to row 5 */
+        strh   a2, [lr]              /* store row 2 */
+        mov    a3, a3, asr #20
+        and    a2, ip, #255
+        adds   a3, a3, a2
+        movmi  a3, #0
+        cmp    a3, #255
+        movgt  a3, #255
+        mov    a4, a4, asr #20
+        adds   a4, a4, ip, lsr #8
+        movmi  a4, #0
+        cmp    a4, #255
+        movgt  a4, #255
+        orr    a2, a3, a4, lsl #8
+        strh   a2, [v2]              /* store row 5 */
+
+        ldmfd  sp!, {a3, a4}         /* a3/a4 = A3 (c0/c1); the macro's 8 words are now all popped */
+        ldrh   ip, [lr, v1]!         /* lr steps down to row 3 */
+        add    a2, a3, v7            /* A3 + B3 (c0) */
+        mov    a2, a2, asr #20
+        sub    a3, a3, v7            /* A3 - B3 (c0) */
+        and    v3, ip, #255
+        adds   a2, a2, v3
+        movmi  a2, #0
+        cmp    a2, #255
+        movgt  a2, #255
+        add    v3, a4, fp            /* A3 + B3 (c1) */
+        mov    v3, v3, asr #20
+        adds   v3, v3, ip, lsr #8
+        movmi  v3, #0
+        cmp    v3, #255
+        movgt  v3, #255
+        orr    a2, a2, v3, lsl #8
+        sub    a4, a4, fp            /* A3 - B3 (c1) */
+        ldrh   ip, [v2, -v1]!        /* v2 steps up to row 4 */
+        strh   a2, [lr]              /* store row 3 */
+        mov    a3, a3, asr #20
+        and    a2, ip, #255
+        adds   a3, a3, a2
+        movmi  a3, #0
+        cmp    a3, #255
+        movgt  a3, #255
+        mov    a4, a4, asr #20
+        adds   a4, a4, ip, lsr #8
+        movmi  a4, #0
+        cmp    a4, #255
+        movgt  a4, #255
+        orr    a2, a3, a4, lsl #8
+        strh   a2, [v2]              /* store row 4 */
+
+        ldr    pc, [sp], #4          /* pop the saved return address into pc */
+endfunc
+
+function ff_simple_idct_armv5te, export=1 /* In-place IDCT of the 8x8 int16 block at a1: eight row passes, then four column passes of two columns each. */
+        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} /* the helpers clobber v1-v7 and fp; lr is popped into pc at the end */
+
+        bl     idct_row_armv5te      /* row 0; the helper leaves a1 unchanged */
+        add    a1, a1, #16           /* next row: 8 coefficients * 2 bytes */
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te      /* row 7 */
+
+        sub    a1, a1, #(16*7)       /* rewind a1 to the start of the block */
+
+        bl     idct_col_armv5te      /* columns 0 and 1 */
+        add    a1, a1, #4            /* next column pair: 2 coefficients * 2 bytes */
+        bl     idct_col_armv5te
+        add    a1, a1, #4
+        bl     idct_col_armv5te
+        add    a1, a1, #4
+        bl     idct_col_armv5te      /* columns 6 and 7 */
+
+        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */
+endfunc
+
+function ff_simple_idct_add_armv5te, export=1 /* IDCT of the 8x8 int16 block at a3; the column helper adds the result to the bytes at a1 (rows a2 bytes apart) with clamping to 0..255. */
+        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 and a2 are pushed so idct_col_add_armv5te can read and update them through sp */
+
+        mov    a1, a3                /* a1 = coefficient block, as the helpers expect */
+
+        bl     idct_row_armv5te      /* row 0; the helper leaves a1 unchanged */
+        add    a1, a1, #16           /* next row: 8 coefficients * 2 bytes */
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te      /* row 7 */
+
+        sub    a1, a1, #(16*7)       /* rewind a1 to the start of the block */
+
+        bl     idct_col_add_armv5te  /* columns 0 and 1; the helper advances the saved output pointer by 2 */
+        add    a1, a1, #4            /* next column pair: 2 coefficients * 2 bytes */
+        bl     idct_col_add_armv5te
+        add    a1, a1, #4
+        bl     idct_col_add_armv5te
+        add    a1, a1, #4
+        bl     idct_col_add_armv5te  /* columns 6 and 7 */
+
+        add    sp, sp, #8            /* drop the saved a1/a2 words */
+        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */
+endfunc
+
+function ff_simple_idct_put_armv5te, export=1 /* IDCT of the 8x8 int16 block at a3; the column helper stores the result, clamped to 0..255, as bytes at a1 (rows a2 bytes apart). */
+        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 and a2 are pushed so idct_col_put_armv5te can read and update them through sp */
+
+        mov    a1, a3                /* a1 = coefficient block, as the helpers expect */
+
+        bl     idct_row_armv5te      /* row 0; the helper leaves a1 unchanged */
+        add    a1, a1, #16           /* next row: 8 coefficients * 2 bytes */
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te
+        add    a1, a1, #16
+        bl     idct_row_armv5te      /* row 7 */
+
+        sub    a1, a1, #(16*7)       /* rewind a1 to the start of the block */
+
+        bl     idct_col_put_armv5te  /* columns 0 and 1; the helper advances the saved output pointer by 2 */
+        add    a1, a1, #4            /* next column pair: 2 coefficients * 2 bytes */
+        bl     idct_col_put_armv5te
+        add    a1, a1, #4
+        bl     idct_col_put_armv5te
+        add    a1, a1, #4
+        bl     idct_col_put_armv5te  /* columns 6 and 7 */
+
+        add    sp, sp, #8            /* drop the saved a1/a2 words */
+        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/simple_idct_armv6.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/simple_idct_armv6.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,433 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define W1  22725   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W2  21407   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W3  19266   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W4  16383   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 gives 16384 for i = 4; 16383 is the value used */
+#define W5  12873   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W6  8867    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W7  4520    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define ROW_SHIFT 11 /* shift passed to the macros by idct_row_armv6 */
+#define COL_SHIFT 20 /* shift passed to the macros by the idct_col_* functions */
+
+#define W13 (W1 | (W3 << 16)) /* packed pair: first weight in bits 0-15, second in bits 16-31 */
+#define W26 (W2 | (W6 << 16))
+#define W42 (W4 | (W2 << 16))
+#define W42n (-W4&0xffff | (-W2 << 16)) /* negated pair: -W4 in bits 0-15, -W2 in bits 16-31 */
+#define W46 (W4 | (W6 << 16))
+#define W57 (W5 | (W7 << 16))
+
+        .text
+        .align
+w13:    .long W13   /* literal words loaded with pc-relative ldr by the macros below */
+w26:    .long W26
+w42:    .long W42
+w42n:   .long W42n
+w46:    .long W46
+w57:    .long W57
+
+/*
+  Compute partial IDCT of single row: even-part sums A0-A3, odd-part sums B0-B3.
+  shift = shift amount; only used here for the rounding bias 1 << (shift - 1)
+  r0 = source address (row[7,5] and row[6,4] are loaded from it)
+  r2 = row[2,0] <= 2 cycles
+  r3 = row[3,1]
+  ip = w42      <= 2 cycles
+  Clobbers r1, r2, r3, ip, lr
+  Output in registers r4--r11: r4-r7 = A0-A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
+*/
+        .macro idct_row shift
+        ldr    lr, w46               /* lr  = W4 | (W6 << 16) */
+        mov    r1, #(1<<(\shift-1))  /* r1  = rounding bias */
+        smlad  r4, r2, ip, r1        /* r4  =  A0 = W4*row[0] + W2*row[2] + bias */
+        smlsd  r7, r2, ip, r1        /* r7  =  A3 = W4*row[0] - W2*row[2] + bias */
+        ldr    ip, w13               /* ip  = W1 | (W3 << 16) */
+        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
+        smlad  r5, r2, lr, r1        /* r5  =  A1 = W4*row[0] + W6*row[2] + bias */
+        smlsd  r6, r2, lr, r1        /* r6  =  A2 = W4*row[0] - W6*row[2] + bias */
+
+        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
+        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
+        ldr    lr, [r0, #12]         /* lr  =  row[7,5] */
+        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
+        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
+        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
+        smlad  r8, lr, r10,r8        /* B0  +=      W5*row[5] + W7*row[7] */
+        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
+
+        ldr    r3, w42n              /* r3 =  -W4 | (-W2 << 16) */
+        smlad  r10,lr, r2, r10       /* B2 +=  W7*row[5] + W3*row[7] */
+        ldr    r2, [r0, #4]          /* r2 =   row[6,4] */
+        smlsdx r11,lr, ip, r11       /* B3 +=  W3*row[5] - W1*row[7] */
+        ldr    ip, w46               /* ip =   W4 | (W6 << 16) */
+        smlad  r9, lr, r1, r9        /* B1 -=  W1*row[5] + W5*row[7] (r9 holds -B1, so this adds) */
+
+        smlad  r5, r2, r3, r5        /* A1 += -W4*row[4] - W2*row[6] */
+        smlsd  r6, r2, r3, r6        /* A2 += -W4*row[4] + W2*row[6] */
+        smlad  r4, r2, ip, r4        /* A0 +=  W4*row[4] + W6*row[6] */
+        smlsd  r7, r2, ip, r7        /* A3 +=  W4*row[4] - W6*row[6] */
+        .endm
+
+/*
+  Compute partial IDCT of half row (only row[0..3] contribute).
+  shift = shift amount; only used here for the rounding bias 1 << (shift - 1)
+  r2 = row[2,0]
+  r3 = row[3,1]
+  ip = w42
+  Clobbers r1, r2, ip, lr
+  Output in registers r4--r11: r4-r7 = A0-A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
+*/
+        .macro idct_row4 shift
+        ldr    lr, w46               /* lr =  W4 | (W6 << 16) */
+        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
+        mov    r1, #(1<<(\shift-1))  /* r1  = rounding bias */
+        smlad  r4, r2, ip, r1        /* r4  =  A0 = W4*row[0] + W2*row[2] + bias */
+        smlsd  r7, r2, ip, r1        /* r7  =  A3 = W4*row[0] - W2*row[2] + bias */
+        ldr    ip, w13               /* ip =  W1 | (W3 << 16) */
+        smlad  r5, r2, lr, r1        /* r5  =  A1 = W4*row[0] + W6*row[2] + bias */
+        smlsd  r6, r2, lr, r1        /* r6  =  A2 = W4*row[0] - W6*row[2] + bias */
+        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
+        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
+        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
+        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
+        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
+        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
+        .endm
+
+/*
+  Compute final part of IDCT single row without shift (butterfly A +/- B).
+  Input in registers r4--r11 (r9 holds -B1, so A1 + B1 is formed with sub)
+  Output in registers ip, r4--r6 (sums), lr, r8--r10 (differences)
+*/
+        .macro idct_finish
+        add    ip, r4, r8            /* ip  = A0 + B0 */
+        sub    lr, r4, r8            /* lr  = A0 - B0 */
+        sub    r4, r5, r9            /* r4  = A1 + B1 */
+        add    r8, r5, r9            /* r8  = A1 - B1 */
+        add    r5, r6, r10           /* r5  = A2 + B2 */
+        sub    r9, r6, r10           /* r9  = A2 - B2 */
+        add    r6, r7, r11           /* r6  = A3 + B3 */
+        sub    r10,r7, r11           /* r10 = A3 - B3 */
+        .endm
+
+/*
+  Compute final part of IDCT single row (butterfly A +/- B, then arithmetic shift right).
+  shift = right-shift amount
+  Input/output in registers r4--r11: r4-r7 = sums 0-3, r8-r11 = differences 0-3; clobbers r2, r3
+*/
+        .macro idct_finish_shift shift
+        add    r3, r4, r8            /* r3 = A0 + B0 */
+        sub    r2, r4, r8            /* r2 = A0 - B0 */
+        mov    r4, r3, asr #\shift
+        mov    r8, r2, asr #\shift
+
+        sub    r3, r5, r9            /* r3 = A1 + B1 (r9 holds -B1 on entry) */
+        add    r2, r5, r9            /* r2 = A1 - B1 */
+        mov    r5, r3, asr #\shift
+        mov    r9, r2, asr #\shift
+
+        add    r3, r6, r10           /* r3 = A2 + B2 */
+        sub    r2, r6, r10           /* r2 = A2 - B2 */
+        mov    r6, r3, asr #\shift
+        mov    r10,r2, asr #\shift
+
+        add    r3, r7, r11           /* r3 = A3 + B3 */
+        sub    r2, r7, r11           /* r2 = A3 - B3 */
+        mov    r7, r3, asr #\shift
+        mov    r11,r2, asr #\shift
+        .endm
+
+/*
+  Compute final part of IDCT single row, saturating results at 8 bits (usat to 0..255).
+  shift = right-shift amount
+  Input/output in registers r4--r11: r4-r7 = sums 0-3, r8-r11 = differences 0-3; clobbers r3, ip
+*/
+        .macro idct_finish_shift_sat shift
+        add    r3, r4, r8            /* r3 = A0 + B0 */
+        sub    ip, r4, r8            /* ip = A0 - B0 */
+        usat   r4, #8, r3, asr #\shift
+        usat   r8, #8, ip, asr #\shift
+
+        sub    r3, r5, r9            /* r3 = A1 + B1 (r9 holds -B1 on entry) */
+        add    ip, r5, r9            /* ip = A1 - B1 */
+        usat   r5, #8, r3, asr #\shift
+        usat   r9, #8, ip, asr #\shift
+
+        add    r3, r6, r10           /* r3 = A2 + B2 */
+        sub    ip, r6, r10           /* ip = A2 - B2 */
+        usat   r6, #8, r3, asr #\shift
+        usat   r10,#8, ip, asr #\shift
+
+        add    r3, r7, r11           /* r3 = A3 + B3 */
+        sub    ip, r7, r11           /* ip = A3 - B3 */
+        usat   r7, #8, r3, asr #\shift
+        usat   r11,#8, ip, asr #\shift
+        .endm
+
+/*
+  Compute IDCT of single row, storing as column (halfwords written 16 bytes apart).
+  r0 = source (not modified)
+  r1 = dest (same value on return); r2, r3, ip and r4--r11 are used as scratch
+*/
+function idct_row_armv6
+        push   {lr}                  /* lr is reused as a scratch register below */
+
+        ldr    lr, [r0, #12]         /* lr = row[7,5] */
+        ldr    ip, [r0, #4]          /* ip = row[6,4] */
+        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
+        ldr    r2, [r0]              /* r2 = row[2,0] */
+        orrs   lr, lr, ip            /* lr = row[7,5] | row[6,4]; Z set if all four are zero */
+        cmpeq  lr, r3                /* if so (lr == 0), also test row[3,1] against zero */
+        cmpeq  lr, r2, lsr #16       /* ... and then row[2] */
+        beq    1f                    /* only row[0] can be non-zero: take the shortcut at 1: */
+        push   {r1}                  /* idct_row and idct_row4 overwrite r1 */
+        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
+        cmp    lr, #0                /* row[4..7] all zero? */
+        beq    2f                    /* then the half-row variant is enough */
+
+        idct_row   ROW_SHIFT
+        b      3f
+
+2:      idct_row4  ROW_SHIFT
+
+3:      pop    {r1}
+        idct_finish_shift ROW_SHIFT
+
+        strh   r4, [r1]              /* sums 0-3 go to slots 0, 2, 4, 6 (slot = 16 bytes) */
+        strh   r5, [r1, #(16*2)]
+        strh   r6, [r1, #(16*4)]
+        strh   r7, [r1, #(16*6)]
+        strh   r11,[r1, #(16*1)]     /* differences 3, 2, 1, 0 go to slots 1, 3, 5, 7 */
+        strh   r10,[r1, #(16*3)]
+        strh   r9, [r1, #(16*5)]
+        strh   r8, [r1, #(16*7)]
+
+        pop    {pc}
+
+1:      mov    r2, r2, lsl #3        /* row[2] is zero on this path, so the low halfword is row[0] << 3 */
+        strh   r2, [r1]              /* the same halfword is written to all eight slots */
+        strh   r2, [r1, #(16*2)]
+        strh   r2, [r1, #(16*4)]
+        strh   r2, [r1, #(16*6)]
+        strh   r2, [r1, #(16*1)]
+        strh   r2, [r1, #(16*3)]
+        strh   r2, [r1, #(16*5)]
+        strh   r2, [r1, #(16*7)]
+        pop    {pc}
+endfunc
+
+/*
+  Compute IDCT of single column, read as row; results stored as halfwords 16 bytes apart.
+  r0 = source (not modified)
+  r1 = dest (same value on return); r2, r3, ip and r4--r11 are used as scratch
+*/
+function idct_col_armv6
+        push   {r1, lr}              /* idct_row overwrites both r1 and lr */
+
+        ldr    r2, [r0]              /* r2 = row[2,0] */
+        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
+        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
+        idct_row COL_SHIFT
+        pop    {r1}
+        idct_finish_shift COL_SHIFT
+
+        strh   r4, [r1]              /* sums 0-3 go to slots 0-3 (slot = 16 bytes) */
+        strh   r5, [r1, #(16*1)]
+        strh   r6, [r1, #(16*2)]
+        strh   r7, [r1, #(16*3)]
+        strh   r11,[r1, #(16*4)]     /* differences 3, 2, 1, 0 go to slots 4-7 */
+        strh   r10,[r1, #(16*5)]
+        strh   r9, [r1, #(16*6)]
+        strh   r8, [r1, #(16*7)]
+
+        pop    {pc}                  /* return through the lr saved at entry */
+endfunc
+
+/*
+  Compute IDCT of single column, read as row, store saturated 8-bit.
+  r0 = source (not modified)
+  r1 = dest (one byte is written on each of eight lines; same value on return)
+  r2 = line size (same value on return)
+*/
+function idct_col_put_armv6
+        push   {r1, r2, lr}          /* idct_row overwrites r1, r2 and lr */
+
+        ldr    r2, [r0]              /* r2 = row[2,0] */
+        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
+        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
+        idct_row COL_SHIFT
+        pop    {r1, r2}
+        idct_finish_shift_sat COL_SHIFT
+
+        strb   r4, [r1], r2          /* sums 0-3 to lines 0-3, r1 stepped by one line each time */
+        strb   r5, [r1], r2
+        strb   r6, [r1], r2
+        strb   r7, [r1], r2
+        strb   r11,[r1], r2          /* differences 3, 2, 1, 0 to lines 4-7 */
+        strb   r10,[r1], r2
+        strb   r9, [r1], r2
+        strb   r8, [r1], r2
+
+        sub    r1, r1, r2, lsl #3    /* undo the eight line steps */
+
+        pop    {pc}
+endfunc
+
+/*
+  Compute IDCT of single column, read as row, add/store saturated 8-bit.
+  r0 = source (not modified)
+  r1 = dest (one byte is read and rewritten on each of eight lines; same value on return)
+  r2 = line size (same value on return)
+*/
+function idct_col_add_armv6
+        push   {r1, r2, lr}          /* idct_row overwrites r1, r2 and lr */
+
+        ldr    r2, [r0]              /* r2 = row[2,0] */
+        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
+        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
+        idct_row COL_SHIFT
+        pop    {r1, r2}
+        idct_finish
+
+        ldrb   r3, [r1]              /* r3  = dest line 0 */
+        ldrb   r7, [r1, r2]          /* r7  = dest line 1 */
+        ldrb   r11,[r1, r2, lsl #2]  /* dest line 4; r11 is reloaded below before it is used */
+        add    ip, r3, ip, asr #COL_SHIFT /* line 0 + ((A0 + B0) >> COL_SHIFT) */
+        usat   ip, #8, ip            /* clamp to 0..255 */
+        add    r4, r7, r4, asr #COL_SHIFT /* line 1 + ((A1 + B1) >> COL_SHIFT) */
+        strb   ip, [r1], r2          /* store line 0; r1 -> line 1 */
+        ldrb   ip, [r1, r2]          /* ip  = dest line 2 */
+        usat   r4, #8, r4
+        ldrb   r11,[r1, r2, lsl #2]  /* r11 = dest line 5 */
+        add    r5, ip, r5, asr #COL_SHIFT /* line 2 + ((A2 + B2) >> COL_SHIFT) */
+        usat   r5, #8, r5
+        strb   r4, [r1], r2          /* store line 1; r1 -> line 2 */
+        ldrb   r3, [r1, r2]          /* r3  = dest line 3 */
+        ldrb   ip, [r1, r2, lsl #2]  /* ip  = dest line 6 */
+        strb   r5, [r1], r2          /* store line 2; r1 -> line 3 */
+        ldrb   r7, [r1, r2]          /* r7  = dest line 4 */
+        ldrb   r4, [r1, r2, lsl #2]  /* r4  = dest line 7 */
+        add    r6, r3, r6, asr #COL_SHIFT /* line 3 + ((A3 + B3) >> COL_SHIFT) */
+        usat   r6, #8, r6
+        add    r10,r7, r10,asr #COL_SHIFT /* line 4 + ((A3 - B3) >> COL_SHIFT) */
+        usat   r10,#8, r10
+        add    r9, r11,r9, asr #COL_SHIFT /* line 5 + ((A2 - B2) >> COL_SHIFT) */
+        usat   r9, #8, r9
+        add    r8, ip, r8, asr #COL_SHIFT /* line 6 + ((A1 - B1) >> COL_SHIFT) */
+        usat   r8, #8, r8
+        add    lr, r4, lr, asr #COL_SHIFT /* line 7 + ((A0 - B0) >> COL_SHIFT) */
+        usat   lr, #8, lr
+        strb   r6, [r1], r2          /* store lines 3-7 */
+        strb   r10,[r1], r2
+        strb   r9, [r1], r2
+        strb   r8, [r1], r2
+        strb   lr, [r1], r2
+
+        sub    r1, r1, r2, lsl #3    /* undo the eight line steps */
+
+        pop    {pc}
+endfunc
+
+/*
+  Compute 8 IDCT row transforms, visiting source offsets 0, 32, 64, 96, 16, 48, 80, 112.
+  func = IDCT row->col function (the macro relies on it leaving r0 and r1 intact)
+  width = width of columns in bytes; r0 is restored at the end, r1 ends up advanced by 7*width
+*/
+        .macro idct_rows func width
+        bl     \func
+        add    r0, r0, #(16*2)
+        add    r1, r1, #\width
+        bl     \func
+        add    r0, r0, #(16*2)
+        add    r1, r1, #\width
+        bl     \func
+        add    r0, r0, #(16*2)
+        add    r1, r1, #\width
+        bl     \func
+        sub    r0, r0, #(16*5)       /* from offset 96 back to offset 16 */
+        add    r1, r1, #\width
+        bl     \func
+        add    r0, r0, #(16*2)
+        add    r1, r1, #\width
+        bl     \func
+        add    r0, r0, #(16*2)
+        add    r1, r1, #\width
+        bl     \func
+        add    r0, r0, #(16*2)
+        add    r1, r1, #\width
+        bl     \func
+
+        sub    r0, r0, #(16*7)       /* from offset 112 back to the start */
+        .endm
+
+/* void ff_simple_idct_armv6(DCTELEM *data); in-place transform through a 128-byte stack temporary */
+function ff_simple_idct_armv6, export=1
+        push   {r4-r11, lr}
+        sub    sp, sp, #128          /* reserve the temporary */
+
+        mov    r1, sp                /* first pass: data (r0) -> temporary */
+        idct_rows idct_row_armv6, 2
+        mov    r1, r0                /* second pass: temporary -> data (idct_rows restored r0) */
+        mov    r0, sp
+        idct_rows idct_col_armv6, 2
+
+        add    sp, sp, #128          /* release the temporary */
+        pop    {r4-r11, pc}
+endfunc
+
+/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
+function ff_simple_idct_add_armv6, export=1
+        push   {r0, r1, r4-r11, lr}  /* dest and line_size are saved lowest on the stack */
+        sub    sp, sp, #128          /* 128-byte temporary below them */
+
+        mov    r0, r2                /* r0 = data */
+        mov    r1, sp                /* first pass: data -> temporary */
+        idct_rows idct_row_armv6, 2
+        mov    r0, sp                /* second pass reads the temporary */
+        ldr    r1, [sp, #128]        /* r1 = saved dest */
+        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
+        idct_rows idct_col_add_armv6, 1
+
+        add    sp, sp, #(128+8)      /* drop the temporary and the saved r0, r1 */
+        pop    {r4-r11, pc}
+endfunc
+
+/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
+function ff_simple_idct_put_armv6, export=1
+        push   {r0, r1, r4-r11, lr}  /* dest and line_size are saved lowest on the stack */
+        sub    sp, sp, #128          /* 128-byte temporary below them */
+
+        mov    r0, r2                /* r0 = data */
+        mov    r1, sp                /* first pass: data -> temporary */
+        idct_rows idct_row_armv6, 2
+        mov    r0, sp                /* second pass reads the temporary */
+        ldr    r1, [sp, #128]        /* r1 = saved dest */
+        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
+        idct_rows idct_col_put_armv6, 1
+
+        add    sp, sp, #(128+8)      /* drop the temporary and the saved r0, r1 */
+        pop    {r4-r11, pc}
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/simple_idct_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/simple_idct_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,373 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 gives 16384 for i = 4; 16383 is the value used
+#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4c ((1<<(COL_SHIFT-1))/W4) /* column rounding bias divided by W4 (integer division) */
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define w1 d0[0] /* w1-w4c: 16-bit lanes of d0, d1, which idct_start loads from idct_coeff_neon */
+#define w2 d0[1]
+#define w3 d0[2]
+#define w4 d0[3]
+#define w5 d1[0]
+#define w6 d1[1]
+#define w7 d1[2]
+#define w4c d1[3]
+
+        .macro idct_col4_top
+        vmull.s16       q7,  d6,  w2    /* q7   = W2 * col[2] */
+        vmull.s16       q8,  d6,  w6    /* q8   = W6 * col[2] */
+        vmull.s16       q9,  d4,  w1    /* q9   = W1 * col[1] */
+        vadd.i32        q11, q15, q7    /* q11  = q15 + W2 * col[2] (q15 is set up by the caller) */
+        vmull.s16       q10, d4,  w3    /* q10  = W3 * col[1] */
+        vadd.i32        q12, q15, q8    /* q12  = q15 + W6 * col[2] */
+        vmull.s16       q5,  d4,  w5    /* q5   = W5 * col[1] */
+        vsub.i32        q13, q15, q8    /* q13  = q15 - W6 * col[2] */
+        vmull.s16       q6,  d4,  w7    /* q6   = W7 * col[1] */
+        vsub.i32        q14, q15, q7    /* q14  = q15 - W2 * col[2] */
+
+        vmlal.s16       q9,  d8,  w3    /* q9  += W3 * col[3] */
+        vmlsl.s16       q10, d8,  w7    /* q10 -= W7 * col[3] */
+        vmlsl.s16       q5,  d8,  w1    /* q5  -= W1 * col[3] */
+        vmlsl.s16       q6,  d8,  w5    /* q6  -= W5 * col[3] */
+        .endm
+
+        .text
+        .align 6
+
+function idct_row4_pld_neon
+        pld             [r0]                    /* preload r0 + k*r1 for k = 0..7; clobbers r3 */
+        add             r3,  r0,  r1,  lsl #2   /* r3 = r0 + 4*r1 */
+        pld             [r0, r1]
+        pld             [r0, r1, lsl #1]
+        pld             [r3, -r1]               /* r0 + 3*r1 */
+        pld             [r3]
+        pld             [r3, r1]
+        add             r3,  r3,  r1,  lsl #1   /* r3 = r0 + 6*r1 */
+        pld             [r3]
+        pld             [r3, r1]                /* NOTE(review): no return instruction follows; this appears to fall through into idct_row4_neon - confirm */
+endfunc
+
+function idct_row4_neon
+        vmov.i32        q15, #(1<<(ROW_SHIFT-1)) /* rounding bias in every 32-bit lane */
+        vld1.64         {d2-d5},  [r2,:128]!    /* 64 bytes are loaded from r2 in three steps */
+        vmlal.s16       q15, d2,  w4    /* q15  += W4 * col[0] */
+        vld1.64         {d6,d7},  [r2,:128]!
+        vorr            d10, d3,  d5
+        vld1.64         {d8,d9},  [r2,:128]!
+        add             r2,  r2,  #-64          /* rewind r2 to the start of those 64 bytes */
+
+        vorr            d11, d7,  d9
+        vorr            d10, d10, d11           /* d10 = d3 | d5 | d7 | d9 */
+        vmov            r3,  r4,  d10           /* clobbers r3, r4 */
+
+        idct_col4_top
+
+        orrs            r3,  r3,  r4            /* Z set when d3, d5, d7 and d9 are all zero */
+        beq             1f                      /* then skip the terms that use them */
+
+        vmull.s16       q7,  d3,  w4    /* q7   = W4 * col[4] */
+        vmlal.s16       q9,  d5,  w5    /* q9  += W5 * col[5] */
+        vmlsl.s16       q10, d5,  w1    /* q10 -= W1 * col[5] */
+        vmull.s16       q8,  d7,  w2    /* q8   = W2 * col[6] */
+        vmlal.s16       q5,  d5,  w7    /* q5  += W7 * col[5] */
+        vadd.i32        q11, q11, q7
+        vsub.i32        q12, q12, q7
+        vsub.i32        q13, q13, q7
+        vadd.i32        q14, q14, q7
+        vmlal.s16       q6,  d5,  w3    /* q6  += W3 * col[5] */
+        vmull.s16       q7,  d7,  w6    /* q7   = W6 * col[6] */
+        vmlal.s16       q9,  d9,  w7    /* q9  += W7 * col[7] */
+        vmlsl.s16       q10, d9,  w5    /* q10 -= W5 * col[7] */
+        vmlal.s16       q5,  d9,  w3    /* q5  += W3 * col[7] */
+        vmlsl.s16       q6,  d9,  w1    /* q6  -= W1 * col[7] */
+        vadd.i32        q11, q11, q7
+        vsub.i32        q12, q12, q8
+        vadd.i32        q13, q13, q8
+        vsub.i32        q14, q14, q7
+
+1:      vadd.i32        q3,  q11, q9            /* sums: q11..q14 plus q9, q10, q5, q6 */
+        vadd.i32        q4,  q12, q10
+        vshrn.i32       d2,  q3,  #ROW_SHIFT    /* shift right by ROW_SHIFT and narrow to 16 bit */
+        vshrn.i32       d4,  q4,  #ROW_SHIFT
+        vadd.i32        q7,  q13, q5
+        vadd.i32        q8,  q14, q6
+        vtrn.16         d2,  d4                 /* vtrn.16 / vtrn.32 pairs transpose the 4x4 halfword blocks */
+        vshrn.i32       d6,  q7,  #ROW_SHIFT
+        vshrn.i32       d8,  q8,  #ROW_SHIFT
+        vsub.i32        q14, q14, q6            /* differences: q11..q14 minus q9, q10, q5, q6 */
+        vsub.i32        q11, q11, q9
+        vtrn.16         d6,  d8
+        vsub.i32        q13, q13, q5
+        vshrn.i32       d3,  q14, #ROW_SHIFT
+        vtrn.32         d2,  d6
+        vsub.i32        q12, q12, q10
+        vtrn.32         d4,  d8
+        vshrn.i32       d5,  q13, #ROW_SHIFT
+        vshrn.i32       d7,  q12, #ROW_SHIFT
+        vshrn.i32       d9,  q11, #ROW_SHIFT
+
+        vtrn.16         d3,  d5
+        vtrn.16         d7,  d9
+        vtrn.32         d3,  d7
+        vtrn.32         d5,  d9
+
+        vst1.64         {d2-d5},  [r2,:128]!    /* write the 64 bytes back in place */
+        vst1.64         {d6-d9},  [r2,:128]!    /* r2 ends up 64 bytes past where it started */
+
+        bx              lr
+endfunc
+
+function idct_col4_neon
+        mov             ip,  #16                /* post-increment step: 16 bytes per load */
+        vld1.64         {d2}, [r2,:64], ip /* d2 = col[0] */
+        vdup.16         d30, w4c
+        vld1.64         {d4}, [r2,:64], ip /* d4 = col[1] */
+        vadd.i16        d30, d30, d2
+        vld1.64         {d6}, [r2,:64], ip /* d6 = col[2] */
+        vmull.s16       q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/
+        vld1.64         {d8}, [r2,:64], ip /* d8 = col[3] */
+
+        ldrd            r4,  [r2]               /* r4, r5 = raw 8 bytes of col[4] */
+        ldrd            r6,  [r2, #16]          /* r6, r7 = raw 8 bytes of col[5] */
+        orrs            r4,  r4,  r5            /* Z set when col[4] is all zero */
+
+        idct_col4_top
+        addeq           r2,  r2,  #16           /* zero: step over col[4] without loading it */
+        beq             1f
+
+        vld1.64         {d3}, [r2,:64], ip /* d3 = col[4] */
+        vmull.s16       q7,  d3,  w4    /* q7   = W4 * col[4] */
+        vadd.i32        q11, q11, q7
+        vsub.i32        q12, q12, q7
+        vsub.i32        q13, q13, q7
+        vadd.i32        q14, q14, q7
+
+1:      orrs            r6,  r6,  r7            /* Z set when col[5] is all zero */
+        ldrd            r4,  [r2, #16]          /* r2 is at col[5] here, so this reads col[6] */
+        addeq           r2,  r2,  #16
+        beq             2f
+
+        vld1.64         {d5}, [r2,:64], ip /* d5 = col[5] */
+        vmlal.s16       q9,  d5,  w5    /* q9  += W5 * col[5] */
+        vmlsl.s16       q10, d5,  w1    /* q10 -= W1 * col[5] */
+        vmlal.s16       q5,  d5,  w7    /* q5  += W7 * col[5] */
+        vmlal.s16       q6,  d5,  w3    /* q6  += W3 * col[5] */
+
+2:      orrs            r4,  r4,  r5            /* Z set when col[6] is all zero */
+        ldrd            r4,  [r2, #16]          /* r2 is at col[6] here, so this reads col[7] */
+        addeq           r2,  r2,  #16
+        beq             3f
+
+        vld1.64         {d7}, [r2,:64], ip /* d7 = col[6] */
+        vmull.s16       q7,  d7,  w6    /* q7   = W6 * col[6] */
+        vmull.s16       q8,  d7,  w2    /* q8   = W2 * col[6] */
+        vadd.i32        q11, q11, q7
+        vsub.i32        q14, q14, q7
+        vsub.i32        q12, q12, q8
+        vadd.i32        q13, q13, q8
+
+3:      orrs            r4,  r4,  r5            /* Z set when col[7] is all zero */
+        addeq           r2,  r2,  #16
+        beq             4f
+
+        vld1.64         {d9}, [r2,:64], ip /* d9 = col[7] */
+        vmlal.s16       q9,  d9,  w7    /* q9  += W7 * col[7] */
+        vmlsl.s16       q10, d9,  w5    /* q10 -= W5 * col[7] */
+        vmlal.s16       q5,  d9,  w3    /* q5  += W3 * col[7] */
+        vmlsl.s16       q6,  d9,  w1    /* q6  -= W1 * col[7] */
+
+4:      vaddhn.i32      d2,  q11, q9            /* d2-d5: high halves of the sums (a shift right by 16) */
+        vaddhn.i32      d3,  q12, q10
+        vaddhn.i32      d4,  q13, q5
+        vaddhn.i32      d5,  q14, q6
+        vsubhn.i32      d9,  q11, q9            /* d9-d6: high halves of the differences */
+        vsubhn.i32      d8,  q12, q10
+        vsubhn.i32      d7,  q13, q5
+        vsubhn.i32      d6,  q14, q6
+
+        bx              lr                      /* r2 has advanced by 8 * 16 = 128 bytes in total */
+endfunc
+
+        .align 6
+
+function idct_col4_st8_neon
+        vqshrun.s16     d2,  q1,  #COL_SHIFT-16 /* shift right by the remaining COL_SHIFT-16 bits, saturate to unsigned 8 bit */
+        vqshrun.s16     d3,  q2,  #COL_SHIFT-16
+        vqshrun.s16     d4,  q3,  #COL_SHIFT-16
+        vqshrun.s16     d5,  q4,  #COL_SHIFT-16
+        vst1.32         {d2[0]}, [r0,:32], r1   /* 4 bytes per line for 8 lines; r0 advances by r1 each time */
+        vst1.32         {d2[1]}, [r0,:32], r1
+        vst1.32         {d3[0]}, [r0,:32], r1
+        vst1.32         {d3[1]}, [r0,:32], r1
+        vst1.32         {d4[0]}, [r0,:32], r1
+        vst1.32         {d4[1]}, [r0,:32], r1
+        vst1.32         {d5[0]}, [r0,:32], r1
+        vst1.32         {d5[1]}, [r0,:32], r1
+
+        bx              lr                      /* r0 is left 8 lines past its entry value */
+endfunc
+
+        .section .rodata
+        .align 4
+idct_coeff_neon:                                /* eight halfwords, loaded into d0, d1 by idct_start */
+        .short W1, W2, W3, W4, W5, W6, W7, W4c
+        .previous
+
+        .macro idct_start data                  /* common prologue; data = register holding the coefficient block address */
+        push            {r4-r7, lr}
+        pld             [\data]
+        pld             [\data, #64]
+        vpush           {d8-d15}                /* restored by idct_end */
+        movrel          r3,  idct_coeff_neon    /* clobbers r3 */
+        vld1.64         {d0,d1}, [r3,:128]      /* d0, d1 = W1..W7, W4c (the w1..w4c lanes) */
+        .endm
+
+        .macro idct_end                         /* common epilogue matching idct_start */
+        vpop            {d8-d15}
+        pop             {r4-r7, pc}             /* returns to the caller by loading pc */
+        .endm
+
+/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+function ff_simple_idct_put_neon, export=1
+        idct_start      r2
+
+        bl              idct_row4_pld_neon      /* preloads 8 dst lines; NOTE(review): appears to continue into idct_row4_neon for the first 64 bytes - confirm */
+        bl              idct_row4_neon          /* next 64 bytes (the callee post-increments r2) */
+        add             r2,  r2,  #-128         /* rewind r2 by 128 bytes */
+        bl              idct_col4_neon          /* first column group; advances r2 by 128 */
+        bl              idct_col4_st8_neon      /* writes 4 bytes on each of 8 lines; advances r0 by 8 lines */
+        sub             r0,  r0,  r1, lsl #3    /* back to the first line */
+        add             r0,  r0,  #4            /* next 4 bytes of each line */
+        add             r2,  r2,  #-120         /* 8 bytes past the start of data: second column group */
+        bl              idct_col4_neon
+        bl              idct_col4_st8_neon
+
+        idct_end
+endfunc
+
+        .align 6
+
+function idct_col4_add8_neon
+        mov             ip,  r0                 /* ip = store pointer; r0 is used for the loads */
+
+        vld1.32         {d10[0]}, [r0,:32], r1  /* loads: 4 bytes from each of 8 lines into d10-d13 */
+        vshr.s16        q1,  q1,  #COL_SHIFT-16 /* shift right by the remaining COL_SHIFT-16 bits */
+        vld1.32         {d10[1]}, [r0,:32], r1
+        vshr.s16        q2,  q2,  #COL_SHIFT-16
+        vld1.32         {d11[0]}, [r0,:32], r1
+        vshr.s16        q3,  q3,  #COL_SHIFT-16
+        vld1.32         {d11[1]}, [r0,:32], r1
+        vshr.s16        q4,  q4,  #COL_SHIFT-16
+        vld1.32         {d12[0]}, [r0,:32], r1
+        vaddw.u8        q1,  q1,  d10           /* add the loaded bytes, widened to 16 bit */
+        vld1.32         {d12[1]}, [r0,:32], r1
+        vaddw.u8        q2,  q2,  d11
+        vld1.32         {d13[0]}, [r0,:32], r1
+        vqmovun.s16     d2,  q1                 /* saturate to unsigned 8 bit */
+        vld1.32         {d13[1]}, [r0,:32], r1
+        vaddw.u8        q3,  q3,  d12
+        vst1.32         {d2[0]},  [ip,:32], r1  /* stores: the same 8 lines, through ip */
+        vqmovun.s16     d3,  q2
+        vst1.32         {d2[1]},  [ip,:32], r1
+        vaddw.u8        q4,  q4,  d13
+        vst1.32         {d3[0]},  [ip,:32], r1
+        vqmovun.s16     d4,  q3
+        vst1.32         {d3[1]},  [ip,:32], r1
+        vqmovun.s16     d5,  q4
+        vst1.32         {d4[0]},  [ip,:32], r1
+        vst1.32         {d4[1]},  [ip,:32], r1
+        vst1.32         {d5[0]},  [ip,:32], r1
+        vst1.32         {d5[1]},  [ip,:32], r1
+
+        bx              lr                      /* r0 is left 8 lines past its entry value; d10-d13 are clobbered */
+endfunc
+
+/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+function ff_simple_idct_add_neon, export=1
+        idct_start      r2
+
+        bl              idct_row4_pld_neon      /* preloads 8 dst lines; NOTE(review): appears to continue into idct_row4_neon for the first 64 bytes - confirm */
+        bl              idct_row4_neon          /* next 64 bytes (the callee post-increments r2) */
+        add             r2,  r2,  #-128         /* rewind r2 by 128 bytes */
+        bl              idct_col4_neon          /* first column group; advances r2 by 128 */
+        bl              idct_col4_add8_neon     /* adds into 4 bytes on each of 8 lines; advances r0 by 8 lines */
+        sub             r0,  r0,  r1, lsl #3    /* back to the first line */
+        add             r0,  r0,  #4            /* next 4 bytes of each line */
+        add             r2,  r2,  #-120         /* 8 bytes past the start of data: second column group */
+        bl              idct_col4_neon
+        bl              idct_col4_add8_neon
+
+        idct_end
+endfunc
+
+        .align 6
+
+function idct_col4_st16_neon
+        mov             ip,  #16                /* post-increment step: 16 bytes per store */
+
+        vshr.s16        q1,  q1,  #COL_SHIFT-16 /* shift right by the remaining COL_SHIFT-16 bits */
+        vshr.s16        q2,  q2,  #COL_SHIFT-16
+        vst1.64         {d2}, [r2,:64], ip      /* d2-d9 are stored 16 bytes apart, 8 bytes each */
+        vshr.s16        q3,  q3,  #COL_SHIFT-16
+        vst1.64         {d3}, [r2,:64], ip
+        vshr.s16        q4,  q4,  #COL_SHIFT-16
+        vst1.64         {d4}, [r2,:64], ip
+        vst1.64         {d5}, [r2,:64], ip
+        vst1.64         {d6}, [r2,:64], ip
+        vst1.64         {d7}, [r2,:64], ip
+        vst1.64         {d8}, [r2,:64], ip
+        vst1.64         {d9}, [r2,:64], ip
+
+        bx              lr                      /* r2 has advanced by 8 * 16 = 128 bytes */
+endfunc
+
+/* void ff_simple_idct_neon(DCTELEM *data); */
+function ff_simple_idct_neon, export=1
+        idct_start      r0
+
+        mov             r2,  r0                 /* the helpers address the block through r2 */
+        bl              idct_row4_neon          /* first 64 bytes (the callee post-increments r2) */
+        bl              idct_row4_neon          /* second 64 bytes */
+        add             r2,  r2,  #-128         /* rewind r2 by 128 bytes */
+        bl              idct_col4_neon          /* first column group; advances r2 by 128 */
+        add             r2,  r2,  #-128
+        bl              idct_col4_st16_neon     /* store back in place; advances r2 by 128 */
+        add             r2,  r2,  #-120         /* 8 bytes past the start of data: second column group */
+        bl              idct_col4_neon
+        add             r2,  r2,  #-128
+        bl              idct_col4_st16_neon
+
+        idct_end
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/synth_filter_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/synth_filter_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        preserve8
+
+function ff_synth_filter_float_neon, export=1
+        push            {r3-r11,lr}             @ r3 is pushed too and popped again after the call below
+
+        ldr             r4,  [r2]               @ synth_buf_offset
+        add             r1,  r1,  r4,  lsl #2   @ synth_buf
+        sub             r12, r4,  #32
+        bfc             r12, #9,  #23           @ keep bits 0-8: r12 = (offset - 32) & 511
+        bic             r4,  r4,  #63           @ r4 = offset with its low 6 bits cleared
+        str             r12, [r2]               @ write the updated offset back
+
+        ldr             r2,  [sp, #12*4]        @ in
+        mov             r9,  r1                 @ synth_buf
+
+VFP     vpush           {d0}
+        bl              ff_imdct_half_neon
+VFP     vpop            {d0}
+        pop             {r3}                    @ restore the r3 saved at entry
+
+        ldr             r5,  [sp, #9*4]         @ window
+        ldr             r2,  [sp, #10*4]        @ out
+NOVFP   vldr            d0,  [sp, #12*4]        @ scale, bias
+        add             r8,  r9,  #12*4
+
+        mov             lr,  #64*4              @ post-increment step for the loads in loop 2
+        mov             r1,  #4                 @ outer loop (label 1) runs four times
+1:
+        add             r10, r9,  #16*4         @ synth_buf
+        add             r11, r8,  #16*4
+        add             r0,  r5,  #16*4         @ window
+        add             r6,  r5,  #32*4
+        add             r7,  r5,  #48*4
+
+        vld1.32         {q10},    [r3,:128]     @ a
+        add             r3,  r3,  #16*4
+        vld1.32         {q1},     [r3,:128]     @ b
+        vmov.f32        q2,  #0.0               @ c
+        vmov.f32        q3,  #0.0               @ d
+
+        mov             r12, #512               @ inner loop counter, decremented by 64 per pass
+2:
+        vld1.32         {q9},     [r8, :128], lr
+        vrev64.32       q9,  q9
+        vld1.32         {q8},     [r5, :128], lr
+        vmls.f32        d20, d16, d19
+        vld1.32         {q11},    [r0, :128], lr
+        vmls.f32        d21, d17, d18
+        vld1.32         {q12},    [r9, :128], lr
+        vmla.f32        d2,  d22, d24
+        vld1.32         {q8},     [r6, :128], lr
+        vmla.f32        d3,  d23, d25
+        vld1.32         {q9},     [r10,:128], lr
+        vmla.f32        d4,  d16, d18
+        vld1.32         {q12},    [r11,:128], lr
+        vmla.f32        d5,  d17, d19
+        vrev64.32       q12, q12
+        vld1.32         {q11},    [r7, :128], lr
+        vmla.f32        d6,  d22, d25
+        vmla.f32        d7,  d23, d24
+        subs            r12, r12, #64
+        beq             3f                      @ eight passes done
+        cmp             r12, r4
+        bne             2b
+        sub             r8,  r8,  #512*4        @ counter reached r4: move the four synth_buf pointers back by 512*4 bytes
+        sub             r9,  r9,  #512*4
+        sub             r10, r10, #512*4
+        sub             r11, r11, #512*4
+        b               2b
+3:
+        vdup.32         q8,  d0[1]
+        vdup.32         q9,  d0[1]
+        vmla.f32        q8,  q10, d0[0]         @ q8 = d0[1] + a * d0[0]
+        vmla.f32        q9,  q1,  d0[0]         @ q9 = d0[1] + b * d0[0]
+        vst1.32         {q3},     [r3,:128]     @ d is stored where b was loaded from
+        sub             r3,  r3,  #16*4
+        vst1.32         {q2},     [r3,:128]     @ c is stored where a was loaded from
+        vst1.32         {q8},     [r2,:128]
+        add             r2,  r2,  #16*4
+        vst1.32         {q9},     [r2,:128]
+
+        subs            r1,  r1,  #1
+        popeq           {r4-r11,pc}             @ return after the fourth outer pass
+
+        cmp             r4,  #0
+        subeq           r8,  r8,  #512*4
+        subeq           r9,  r9,  #512*4
+        sub             r5,  r5,  #512*4        @ undo the eight post-increments of loop 2
+        sub             r2,  r2,  #12*4         @ out
+        add             r3,  r3,  #4*4          @ synth_buf2
+        add             r5,  r5,  #4*4          @ window
+        add             r9,  r9,  #4*4          @ synth_buf
+        sub             r8,  r8,  #4*4          @ synth_buf
+        b               1b
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/vp3dsp_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/arm/vp3dsp_neon.S	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+.section .rodata
+.align 4
+// IDCT multipliers: entry k-1 equals round(65536 * cos(k * pi / 16)) for k = 1..7.
+vp3_idct_constants:
+.short 64277, 60547, 54491, 46341, 36410, 25080, 12785
+// NOTE(review): 7 halfwords (14 bytes) are defined, but vp3_idct_start_neon loads 16 bytes from this label - confirm the trailing 2 bytes are always readable.
+#define xC1S7 d0[0]     // 64277; exceeds 32767, so vmull.s16 sees (C - 65536) and the IDCT adds the multiplicand back
+#define xC2S6 d0[1]     // 60547; same add-back correction
+#define xC3S5 d0[2]     // 54491; same add-back correction
+#define xC4S4 d0[3]     // 46341; same add-back correction
+#define xC5S3 d1[0]     // 36410; same add-back correction
+#define xC6S2 d1[1]     // 25080; fits in s16, product is used as is
+#define xC7S1 d1[2]     // 12785; fits in s16, product is used as is
+
+.text
+// vp3_loop_filter: in d16-d19 = four 8-pixel vectors on consecutive lines across the edge, r2 = limit; out d0/d1 = new values for the d17/d18 lines. Clobbers q0-q3, q9, q15.
+.macro vp3_loop_filter
+    vsubl.u8        q3,  d18, d17               // d18 - d17, widened to 16 bit
+    vsubl.u8        q2,  d16, d19               // d16 - d19, widened to 16 bit
+    vadd.i16        q1,  q3,  q3
+    vadd.i16        q2,  q2,  q3
+    vadd.i16        q0,  q1,  q2                // f = (d16 - d19) + 3 * (d18 - d17)
+    vrshr.s16       q0,  q0,  #3                // f = (f + 4) >> 3 (rounding shift)
+    vmovl.u8        q9,  d18                    // 16-bit copy of d18 for the final subtraction
+    vdup.u16        q15, r2                     // broadcast the limit to every lane
+
+    vabs.s16        q1,  q0                     // |f|
+    vshr.s16        q0,  q0,  #15               // sign mask of f: 0 or -1 per lane
+    vqsub.u16       q2,  q15, q1                // max(limit - |f|, 0)
+    vqsub.u16       q3,  q2,  q1                // max(q2 - |f|, 0)
+    vsub.i16        q1,  q2,  q3                // bounded magnitude
+    veor            q1,  q1,  q0                // xor with the mask, then subtract it ...
+    vsub.i16        q0,  q1,  q0                // ... which negates the lanes where f was negative
+
+    vaddw.u8        q2,  q0,  d17               // d17 + f
+    vsub.i16        q3,  q9,  q0                // d18 - f
+    vqmovun.s16     d0,  q2                     // saturate both results to unsigned 8 bit
+    vqmovun.s16     d1,  q3
+.endm
+// Filters 8 pixels across the boundary above row r0: reads the four rows r0-2*r1 .. r0+r1 (row step r1) and rewrites the middle two. r2: table pointer, presumably the bounding values - TODO confirm against the C caller.
+function ff_vp3_v_loop_filter_neon, export=1
+    sub             ip,  r0,  r1                // ip = one row above r0 (first row to be stored)
+    sub             r0,  r0,  r1,  lsl #1       // r0 = two rows above (first row to be loaded)
+    vld1.64         {d16}, [r0,:64], r1
+    vld1.64         {d17}, [r0,:64], r1
+    vld1.64         {d18}, [r0,:64], r1
+    vld1.64         {d19}, [r0,:64], r1
+    ldrb            r2,    [r2, #129*4]         // filter limit = byte at offset 516 from r2
+
+    vp3_loop_filter
+
+    vst1.64         {d0},  [ip,:64], r1         // replaces the row that was loaded into d17
+    vst1.64         {d1},  [ip,:64], r1         // replaces the row that was loaded into d18
+    bx              lr
+endfunc
+// Filters across the boundary left of column r0 for 8 consecutive rows (row step r1): reads pixels r0-2 .. r0+1 of each row and rewrites the middle two. r2 is used as in ff_vp3_v_loop_filter_neon.
+function ff_vp3_h_loop_filter_neon, export=1
+    sub             ip,  r0,  #1                // ip = first pixel to be stored
+    sub             r0,  r0,  #2                // r0 = first pixel to be loaded
+    vld1.32         {d16[]},  [r0], r1          // rows 0-3: 4 pixels each, duplicated into both 32-bit lanes
+    vld1.32         {d17[]},  [r0], r1
+    vld1.32         {d18[]},  [r0], r1
+    vld1.32         {d19[]},  [r0], r1
+    vld1.32         {d16[1]}, [r0], r1          // rows 4-7 overwrite lane 1
+    vld1.32         {d17[1]}, [r0], r1
+    vld1.32         {d18[1]}, [r0], r1
+    vld1.32         {d19[1]}, [r0], r1
+    ldrb            r2,  [r2, #129*4]           // filter limit = byte at offset 516 from r2
+
+    vtrn.8          d16, d17                    // transpose: afterwards d16-d19 each hold one pixel column,
+    vtrn.8          d18, d19                    // with rows 0-7 in byte-lane order
+    vtrn.16         d16, d18
+    vtrn.16         d17, d19
+
+    vp3_loop_filter
+
+    vtrn.8          d0,  d1                     // interleave: each 16-bit lane becomes the output pixel pair of one row
+
+    vst1.16         {d0[0]}, [ip], r1           // row 0
+    vst1.16         {d1[0]}, [ip], r1           // row 1
+    vst1.16         {d0[1]}, [ip], r1           // row 2
+    vst1.16         {d1[1]}, [ip], r1           // row 3
+    vst1.16         {d0[2]}, [ip], r1           // row 4
+    vst1.16         {d1[2]}, [ip], r1           // row 5
+    vst1.16         {d0[3]}, [ip], r1           // row 6
+    vst1.16         {d1[3]}, [ip], r1           // row 7
+    bx              lr
+endfunc
+// Prologue of the first IDCT pass: saves d8-d15, loads the constants into d0-d1 and the 64 halfword coefficients at r2 into q8-q15.
+// There is no return instruction: execution continues into vp3_idct_core_neon below (assumes function/endfunc emit no instructions - TODO confirm in asm.S).
+function vp3_idct_start_neon
+    vpush           {d8-d15}                    // popped again by each ff_vp3_idct*_neon entry point
+    movrel          r3,  vp3_idct_constants
+    vld1.64         {d0-d1},   [r3,:128]
+    vld1.64         {d16-d19}, [r2,:128]!
+    vld1.64         {d20-d23}, [r2,:128]!
+    vld1.64         {d24-d27}, [r2,:128]!
+    vadd.s16        q1,  q8,  q12               // ip[0] + ip[4], the first operand the core needs
+    vsub.s16        q8,  q8,  q12               // ip[0] - ip[4]
+    vld1.64         {d28-d31}, [r2,:128]!
+endfunc
+// One 8-point IDCT pass over 8 lanes. In: q1 = ip[0]+ip[4], q8 = ip[0]-ip[4], q9-q11 = ip[1..3], q13-q15 = ip[5..7]. Out: q12 = E, q8 = F, q15 = G, q14 = H, q10 = Ad, q11 = Cd, q13 = Dd, q9 = B-D, q5:q6 = ((B-D)*C4) << 16.
+function vp3_idct_core_neon
+    vmull.s16       q2,  d18, xC1S7     // (ip[1] * C1) << 16
+    vmull.s16       q3,  d19, xC1S7
+    vmull.s16       q4,  d2,  xC4S4     // ((ip[0] + ip[4]) * C4) << 16
+    vmull.s16       q5,  d3,  xC4S4
+    vmull.s16       q6,  d16, xC4S4     // ((ip[0] - ip[4]) * C4) << 16
+    vmull.s16       q7,  d17, xC4S4
+    vshrn.s32       d4,  q2,  #16       // keep the high halves; q2, q3, q4 now hold the three products
+    vshrn.s32       d5,  q3,  #16
+    vshrn.s32       d6,  q4,  #16
+    vshrn.s32       d7,  q5,  #16
+    vshrn.s32       d8,  q6,  #16
+    vshrn.s32       d9,  q7,  #16
+    vadd.s16        q12, q1,  q3        // E = (ip[0] + ip[4]) * C4
+    vadd.s16        q8,  q8,  q4        // F = (ip[0] - ip[4]) * C4
+    vadd.s16        q1,  q2,  q9        // ip[1] * C1
+
+    vmull.s16       q2,  d30, xC1S7     // (ip[7] * C1) << 16
+    vmull.s16       q3,  d31, xC1S7
+    vmull.s16       q4,  d30, xC7S1     // (ip[7] * C7) << 16
+    vmull.s16       q5,  d31, xC7S1
+    vmull.s16       q6,  d18, xC7S1     // (ip[1] * C7) << 16
+    vmull.s16       q7,  d19, xC7S1
+    vshrn.s32       d4,  q2,  #16
+    vshrn.s32       d5,  q3,  #16
+    vshrn.s32       d6,  q4,  #16       // ip[7] * C7
+    vshrn.s32       d7,  q5,  #16
+    vshrn.s32       d8,  q6,  #16       // ip[1] * C7
+    vshrn.s32       d9,  q7,  #16
+    vadd.s16        q2,  q2,  q15       // ip[7] * C1
+    vadd.s16        q9,  q1,  q3        // A = ip[1] * C1 + ip[7] * C7
+    vsub.s16        q15, q4,  q2        // B = ip[1] * C7 - ip[7] * C1
+
+    vmull.s16       q2,  d22, xC5S3     // (ip[3] * C5) << 16
+    vmull.s16       q3,  d23, xC5S3
+    vmull.s16       q4,  d22, xC3S5     // (ip[3] * C3) << 16
+    vmull.s16       q5,  d23, xC3S5
+    vmull.s16       q6,  d26, xC5S3     // (ip[5] * C5) << 16
+    vmull.s16       q7,  d27, xC5S3
+    vshrn.s32       d4,  q2,  #16
+    vshrn.s32       d5,  q3,  #16
+    vshrn.s32       d6,  q4,  #16
+    vshrn.s32       d7,  q5,  #16
+    vshrn.s32       d8,  q6,  #16
+    vshrn.s32       d9,  q7,  #16
+    vadd.s16        q3,  q3,  q11       // ip[3] * C3
+    vadd.s16        q4,  q4,  q13       // ip[5] * C5
+    vadd.s16        q1,  q2,  q11       // ip[3] * C5
+    vadd.s16        q11, q3,  q4        // C = ip[3] * C3 + ip[5] * C5
+
+    vmull.s16       q2,  d26, xC3S5     // (ip[5] * C3) << 16
+    vmull.s16       q3,  d27, xC3S5
+    vmull.s16       q4,  d20, xC2S6     // (ip[2] * C2) << 16
+    vmull.s16       q5,  d21, xC2S6
+    vmull.s16       q6,  d28, xC6S2     // (ip[6] * C6) << 16
+    vmull.s16       q7,  d29, xC6S2
+    vshrn.s32       d4,  q2,  #16
+    vshrn.s32       d5,  q3,  #16
+    vshrn.s32       d6,  q4,  #16
+    vshrn.s32       d7,  q5,  #16
+    vshrn.s32       d8,  q6,  #16       // ip[6] * C6
+    vshrn.s32       d9,  q7,  #16
+    vadd.s16        q2,  q2,  q13       // ip[5] * C3
+    vadd.s16        q3,  q3,  q10       // ip[2] * C2
+    vsub.s16        q13, q2,  q1        // D = ip[5] * C3 - ip[3] * C5
+    vsub.s16        q1,  q9,  q11       // (A - C)
+    vadd.s16        q11, q9,  q11       // Cd = A + C
+    vsub.s16        q9,  q15, q13       // (B - D)
+    vadd.s16        q13, q15, q13       // Dd = B + D
+    vadd.s16        q15, q3,  q4        // G = ip[2] * C2 + ip[6] * C6
+
+    vmull.s16       q2,  d2,  xC4S4     // ((A - C) * C4) << 16
+    vmull.s16       q3,  d3,  xC4S4
+    vmull.s16       q4,  d28, xC2S6     // (ip[6] * C2) << 16
+    vmull.s16       q5,  d29, xC2S6
+    vmull.s16       q6,  d20, xC6S2     // (ip[2] * C6) << 16
+    vmull.s16       q7,  d21, xC6S2
+    vshrn.s32       d4,  q2,  #16
+    vshrn.s32       d5,  q3,  #16
+    vshrn.s32       d6,  q4,  #16
+    vshrn.s32       d7,  q5,  #16
+    vshrn.s32       d8,  q6,  #16       // ip[2] * C6
+    vmull.s16       q5,  d18, xC4S4     // ((B - D) * C4) << 16
+    vmull.s16       q6,  d19, xC4S4     // q5:q6 stay 32 bit here; vp3_idct_end_*_neon narrows them
+    vshrn.s32       d9,  q7,  #16
+    vadd.s16        q3,  q3,  q14       // ip[6] * C2
+    vadd.s16        q10, q1,  q2        // Ad = (A - C) * C4
+    vsub.s16        q14, q4,  q3        // H = ip[2] * C6 - ip[6] * C2
+    bx              lr
+endfunc
+// Finishes a pass begun by vp3_idct_core_neon, leaving outputs [0]..[7] in q8..q15. col: first adds r3 (broadcast) to E and F. row: also transposes the 8x8 block and sets up q1/q8 for the next core call.
+.macro VP3_IDCT_END type
+function vp3_idct_end_\type\()_neon
+.ifc \type, col
+    vdup.16         q0,  r3             // overwrites the constants in d0-d1; no multiplies follow
+    vadd.s16        q12, q12, q0        // bias on E and F reaches every output
+    vadd.s16        q8,  q8,  q0
+.endif
+
+    vshrn.s32       d2,  q5,  #16       // narrow the (B - D) * C4 products left by the core
+    vshrn.s32       d3,  q6,  #16
+    vadd.s16        q2,  q12, q15       // Gd  = E + G
+    vadd.s16        q9,  q1,  q9        // (B - D) * C4
+    vsub.s16        q12, q12, q15       // Ed  = E - G
+    vsub.s16        q3,  q8,  q10       // Fd  = F - Ad
+    vadd.s16        q10, q8,  q10       // Add = F + Ad
+    vadd.s16        q4,  q9,  q14       // Hd  = Bd + H
+    vsub.s16        q14, q9,  q14       // Bdd = Bd - H
+    vadd.s16        q8,  q2,  q11       // [0] = Gd + Cd
+    vsub.s16        q15, q2,  q11       // [7] = Gd - Cd
+    vadd.s16        q9,  q10, q4        // [1] = Add + Hd
+    vsub.s16        q10, q10, q4        // [2] = Add - Hd
+    vadd.s16        q11, q12, q13       // [3] = Ed + Dd
+    vsub.s16        q12, q12, q13       // [4] = Ed - Dd
+.ifc \type, row
+    vtrn.16         q8,  q9             // first step of the transpose below, issued early
+.endif
+    vadd.s16        q13, q3,  q14       // [5] = Fd + Bdd
+    vsub.s16        q14, q3,  q14       // [6] = Fd - Bdd
+
+.ifc \type, row
+    // 8x8 transpose
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    vtrn.32         q8,  q10
+    vtrn.32         q9,  q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vswp            d17, d24
+    vswp            d19, d26
+    vadd.s16        q1,  q8,  q12       // ip[0] + ip[4] for the next core pass (q8 and q12 are final here)
+    vswp            d21, d28
+    vsub.s16        q8,  q8,  q12       // ip[0] - ip[4] for the next core pass
+    vswp            d23, d30
+.endif
+    bx              lr
+endfunc
+.endm
+
+VP3_IDCT_END row
+VP3_IDCT_END col
+// In-place 8x8 IDCT: r0 points to 64 halfword coefficients (128-bit aligned, per the :128 qualifiers), which are replaced by the result shifted right by 4.
+function ff_vp3_idct_neon, export=1
+    mov             ip,  lr                     // the bl calls below overwrite lr
+    mov             r2,  r0                     // start_neon reads through r2 with writeback; r0 is kept for the stores
+    bl              vp3_idct_start_neon         // load + first core pass (start falls through into the core)
+    bl              vp3_idct_end_row_neon
+    mov             r3,  #8                     // bias added by end_col ahead of the final >> 4
+    bl              vp3_idct_core_neon          // second pass on the transposed data
+    bl              vp3_idct_end_col_neon
+    mov             lr,  ip
+    vpop            {d8-d15}                    // matches the vpush in vp3_idct_start_neon
+
+    vshr.s16        q8,  q8,  #4
+    vshr.s16        q9,  q9,  #4
+    vshr.s16        q10, q10, #4
+    vshr.s16        q11, q11, #4
+    vshr.s16        q12, q12, #4
+    vst1.64         {d16-d19}, [r0,:128]!
+    vshr.s16        q13, q13, #4
+    vshr.s16        q14, q14, #4
+    vst1.64         {d20-d23}, [r0,:128]!
+    vshr.s16        q15, q15, #4
+    vst1.64         {d24-d27}, [r0,:128]!
+    vst1.64         {d28-d31}, [r0,:128]!
+    bx              lr
+endfunc
+// IDCT of the 64 halfword coefficients at r2, written as 8 rows of 8 unsigned bytes to r0 with row step r1.
+function ff_vp3_idct_put_neon, export=1
+    mov             ip,  lr                     // the bl calls below overwrite lr
+    bl              vp3_idct_start_neon         // load + first core pass (start falls through into the core)
+    bl              vp3_idct_end_row_neon
+    mov             r3,  #8                     // bias added by end_col ahead of the final >> 4
+    add             r3,  r3,  #2048         // convert signed pixel to unsigned
+    bl              vp3_idct_core_neon          // second pass on the transposed data
+    bl              vp3_idct_end_col_neon
+    mov             lr,  ip
+    vpop            {d8-d15}                    // matches the vpush in vp3_idct_start_neon
+
+    vqshrun.s16     d0,  q8,  #4                // >> 4 and saturate to u8; the 2048 bias becomes +128 here
+    vqshrun.s16     d1,  q9,  #4
+    vqshrun.s16     d2,  q10, #4
+    vqshrun.s16     d3,  q11, #4
+    vst1.64         {d0}, [r0,:64], r1
+    vqshrun.s16     d4,  q12, #4
+    vst1.64         {d1}, [r0,:64], r1
+    vqshrun.s16     d5,  q13, #4
+    vst1.64         {d2}, [r0,:64], r1
+    vqshrun.s16     d6,  q14, #4
+    vst1.64         {d3}, [r0,:64], r1
+    vqshrun.s16     d7,  q15, #4
+    vst1.64         {d4}, [r0,:64], r1
+    vst1.64         {d5}, [r0,:64], r1
+    vst1.64         {d6}, [r0,:64], r1
+    vst1.64         {d7}, [r0,:64], r1
+    bx              lr
+endfunc
+// IDCT of the 64 halfword coefficients at r2, shifted right by 4 and added with saturation to the 8x8 unsigned bytes at r0 (row step r1).
+function ff_vp3_idct_add_neon, export=1
+    mov             ip,  lr                     // the bl calls below overwrite lr
+    bl              vp3_idct_start_neon         // load + first core pass (start falls through into the core)
+    bl              vp3_idct_end_row_neon
+    mov             r3,  #8                     // bias added by end_col ahead of the final >> 4
+    bl              vp3_idct_core_neon          // second pass on the transposed data
+    bl              vp3_idct_end_col_neon
+    mov             lr,  ip
+    vpop            {d8-d15}                    // matches the vpush in vp3_idct_start_neon
+    mov             r2,  r0                     // r0 advances with the loads; r2 is the store pointer
+
+    vld1.64         {d0}, [r0,:64], r1          // loads of the existing pixels are interleaved with the arithmetic
+    vshr.s16        q8,  q8,  #4
+    vld1.64         {d1}, [r0,:64], r1
+    vshr.s16        q9,  q9,  #4
+    vld1.64         {d2}, [r0,:64], r1
+    vaddw.u8        q8,  q8,  d0                // residual + existing pixel, still 16 bit
+    vld1.64         {d3}, [r0,:64], r1
+    vaddw.u8        q9,  q9,  d1
+    vld1.64         {d4}, [r0,:64], r1
+    vshr.s16        q10, q10, #4
+    vld1.64         {d5}, [r0,:64], r1
+    vshr.s16        q11, q11, #4
+    vld1.64         {d6}, [r0,:64], r1
+    vqmovun.s16     d0,  q8                     // saturate to u8; d0 is reused for the output row
+    vld1.64         {d7}, [r0,:64], r1
+    vqmovun.s16     d1,  q9
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vshr.s16        q12, q12, #4
+    vshr.s16        q13, q13, #4
+    vqmovun.s16     d2,  q10
+    vqmovun.s16     d3,  q11
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vshr.s16        q14, q14, #4
+    vshr.s16        q15, q15, #4
+    vst1.64         {d0}, [r2,:64], r1
+    vqmovun.s16     d4,  q12
+    vst1.64         {d1}, [r2,:64], r1
+    vqmovun.s16     d5,  q13
+    vst1.64         {d2}, [r2,:64], r1
+    vaddw.u8        q14, q14, d6
+    vst1.64         {d3}, [r2,:64], r1
+    vaddw.u8        q15, q15, d7
+    vst1.64         {d4}, [r2,:64], r1
+    vqmovun.s16     d6,  q14
+    vst1.64         {d5}, [r2,:64], r1
+    vqmovun.s16     d7,  q15
+    vst1.64         {d6}, [r2,:64], r1
+    vst1.64         {d7}, [r2,:64], r1
+    bx              lr
+endfunc
+// DC-only shortcut: scales the halfword at r2 by C4 twice, rounds it by >> 4 and adds the result with saturation to all 8x8 unsigned bytes at r0 (row step r1).
+function ff_vp3_idct_dc_add_neon, export=1
+    ldrsh           r2,  [r2]                   // signed 16-bit DC value
+    movw            r3,  #46341                 // same value as the fourth entry of vp3_idct_constants
+    mul             r2,  r3,  r2                // dc * 46341
+    smulwt          r2,  r3,  r2                // (46341 * top halfword of r2) >> 16
+    mov             r3,  r0                     // r0 advances with the loads; r3 is the store pointer
+    vdup.16         q15, r2
+    vrshr.s16       q15, q15, #4                // (dc + 8) >> 4 in every lane
+
+    vld1.8          {d0}, [r0,:64], r1
+    vld1.8          {d1}, [r0,:64], r1
+    vld1.8          {d2}, [r0,:64], r1
+    vaddw.u8        q8,  q15, d0                // dc + pixel row, 16 bit
+    vld1.8          {d3}, [r0,:64], r1
+    vaddw.u8        q9,  q15, d1
+    vld1.8          {d4}, [r0,:64], r1
+    vaddw.u8        q10, q15, d2
+    vld1.8          {d5}, [r0,:64], r1
+    vaddw.u8        q11, q15, d3
+    vld1.8          {d6}, [r0,:64], r1
+    vaddw.u8        q12, q15, d4
+    vld1.8          {d7}, [r0,:64], r1
+    vaddw.u8        q13, q15, d5
+    vqmovun.s16     d0,  q8                     // saturate to u8; d0-d7 are reused for the output rows
+    vaddw.u8        q14, q15, d6
+    vqmovun.s16     d1,  q9
+    vaddw.u8        q15, q15, d7                // last use of the dc vector, so q15 can take the result
+    vqmovun.s16     d2,  q10
+    vst1.8          {d0}, [r3,:64], r1
+    vqmovun.s16     d3,  q11
+    vst1.8          {d1}, [r3,:64], r1
+    vqmovun.s16     d4,  q12
+    vst1.8          {d2}, [r3,:64], r1
+    vqmovun.s16     d5,  q13
+    vst1.8          {d3}, [r3,:64], r1
+    vqmovun.s16     d6,  q14
+    vst1.8          {d4}, [r3,:64], r1
+    vqmovun.s16     d7,  q15
+    vst1.8          {d5}, [r3,:64], r1
+    vst1.8          {d6}, [r3,:64], r1
+    vst1.8          {d7}, [r3,:64], r1
+    bx              lr
+endfunc
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/avcodec.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/avcodec.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,407 @@
+#ifndef AVCODEC_AVCODEC_H
+#define AVCODEC_AVCODEC_H
+
+#include <errno.h>
+#include <stdint.h>
+#include "config.h"
+
+#include "libavutil/mem.h"
+
+#define MAX_SPS_COUNT 32
+#define MAX_PPS_COUNT 256
+
+// CABAC may be predefined by the includer; the fallback below needs a variable named h in scope at every use.
+#ifndef CABAC
+#define CABAC h->pps.cabac // NOTE(review): expansion is not parenthesized
+#endif
+
+#define EXTENDED_SAR          255
+
+#define MB_TYPE_REF0       MB_TYPE_ACPRED //dirty but it fits in 16 bit; MB_TYPE_ACPRED is defined further down in this header
+#define MB_TYPE_8x8DCT     0x01000000 // bit 24
+#define IS_REF0(a)         ((a) & MB_TYPE_REF0) // yields the masked bit, not 0/1
+#define IS_8x8DCT(a)       ((a) & MB_TYPE_8x8DCT) // yields the masked bit, not 0/1
+
+#define LIST_NOT_USED -1 // NOTE(review): negative constant without parentheses
+#define PART_NOT_AVAILABLE -2 // NOTE(review): negative constant without parentheses
+
+/* element type used by the DCT code */
+typedef short DCTELEM;
+
+/**
+* Required number of additionally allocated bytes at the end of the input bitstream for decoding.
+* This is mainly needed because some optimized bitstream readers read
+* 32 or 64 bits at once and could read past the end.<br>
+* Note: If the first 23 bits of the additional bytes are not 0, then damaged
+* MPEG bitstreams could cause an overread and a segfault.
+*/
+#define FF_INPUT_BUFFER_PADDING_SIZE 8
+
+enum AVColorPrimaries{ // explicit values are non-contiguous (0 and 3 are not used here)
+    AVCOL_PRI_BT709      =1, ///< also ITU-R BT1361 / IEC 61966-2-4 / SMPTE RP177 Annex B
+    AVCOL_PRI_UNSPECIFIED=2,
+    AVCOL_PRI_BT470M     =4,
+    AVCOL_PRI_BT470BG    =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM
+    AVCOL_PRI_SMPTE170M  =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC
+    AVCOL_PRI_SMPTE240M  =7, ///< functionally identical to above
+    AVCOL_PRI_FILM       =8,
+    AVCOL_PRI_NB           , ///< Not part of ABI; evaluates to 9, one past the last value
+};
+
+enum AVColorTransferCharacteristic{ // explicit values are non-contiguous (0 and 3 are not used here)
+    AVCOL_TRC_BT709      =1, ///< also ITU-R BT1361
+    AVCOL_TRC_UNSPECIFIED=2,
+    AVCOL_TRC_GAMMA22    =4, ///< also ITU-R BT470M / ITU-R BT1700 625 PAL & SECAM
+    AVCOL_TRC_GAMMA28    =5, ///< also ITU-R BT470BG
+    AVCOL_TRC_NB           , ///< Not part of ABI; evaluates to 6, one past the last value
+};
+
+enum AVColorSpace{ // explicit values are non-contiguous (3 is not used here)
+    AVCOL_SPC_RGB        =0,
+    AVCOL_SPC_BT709      =1, ///< also ITU-R BT1361 / IEC 61966-2-4 xvYCC709 / SMPTE RP177 Annex B
+    AVCOL_SPC_UNSPECIFIED=2,
+    AVCOL_SPC_FCC        =4,
+    AVCOL_SPC_BT470BG    =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM / IEC 61966-2-4 xvYCC601
+    AVCOL_SPC_SMPTE170M  =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC / functionally identical to above
+    AVCOL_SPC_SMPTE240M  =7,
+    AVCOL_SPC_NB           , ///< Not part of ABI; evaluates to 8, one past the last value
+};
+
+enum AVColorRange{ // values are contiguous, 0..2
+    AVCOL_RANGE_UNSPECIFIED=0,
+    AVCOL_RANGE_MPEG       =1, ///< the normal 219*2^(n-8) "MPEG" YUV ranges
+    AVCOL_RANGE_JPEG       =2, ///< the normal     2^n-1   "JPEG" YUV ranges
+    AVCOL_RANGE_NB           , ///< Not part of ABI; evaluates to 3, the number of ranges
+};
+
+#define MAX_MMCO_COUNT 66
+/**
+* Memory management control operation opcode.
+*/
+typedef enum MMCOOpcode{
+    MMCO_END=0,             // 0
+    MMCO_SHORT2UNUSED,      // 1
+    MMCO_LONG2UNUSED,       // 2
+    MMCO_SHORT2LONG,        // 3
+    MMCO_SET_MAX_LONG,      // 4
+    MMCO_RESET,             // 5
+    MMCO_LONG,              // 6
+} MMCOOpcode;
+
+/* NAL unit types; values 14..18 have no enumerator here */
+enum {
+    NAL_SLICE=1,            // 1
+    NAL_DPA,                // 2
+    NAL_DPB,                // 3
+    NAL_DPC,                // 4
+    NAL_IDR_SLICE,          // 5
+    NAL_SEI,                // 6
+    NAL_SPS,                // 7
+    NAL_PPS,                // 8
+    NAL_AUD,                // 9
+    NAL_END_SEQUENCE,       // 10
+    NAL_END_STREAM,         // 11
+    NAL_FILLER_DATA,        // 12
+    NAL_SPS_EXT,            // 13
+    NAL_AUXILIARY_SLICE=19
+};
+
+/**
+* SEI message types
+*/
+typedef enum { // NOTE(review): SEI_BUFFERING_PERIOD lacks the SEI_TYPE_ prefix used by the other enumerators
+    SEI_BUFFERING_PERIOD             =  0, ///< buffering period (H.264, D.1.1)
+    SEI_TYPE_PIC_TIMING              =  1, ///< picture timing
+    SEI_TYPE_USER_DATA_UNREGISTERED  =  5, ///< unregistered user data
+    SEI_TYPE_RECOVERY_POINT          =  6  ///< recovery point (frame # to decoder sync)
+} SEI_Type;
+
+/**
+* pic_struct in picture timing SEI message
+*/
+typedef enum { // values are explicit and contiguous, 0..8
+    SEI_PIC_STRUCT_FRAME             = 0, ///<  0: %frame
+    SEI_PIC_STRUCT_TOP_FIELD         = 1, ///<  1: top field
+    SEI_PIC_STRUCT_BOTTOM_FIELD      = 2, ///<  2: bottom field
+    SEI_PIC_STRUCT_TOP_BOTTOM        = 3, ///<  3: top field, bottom field, in that order
+    SEI_PIC_STRUCT_BOTTOM_TOP        = 4, ///<  4: bottom field, top field, in that order
+    SEI_PIC_STRUCT_TOP_BOTTOM_TOP    = 5, ///<  5: top field, bottom field, top field repeated, in that order
+    SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM = 6, ///<  6: bottom field, top field, bottom field repeated, in that order
+    SEI_PIC_STRUCT_FRAME_DOUBLING    = 7, ///<  7: %frame doubling
+    SEI_PIC_STRUCT_FRAME_TRIPLING    = 8  ///<  8: %frame tripling
+} SEI_PicStructType;
+
+#define FF_MAX_B_FRAMES 16
+
+
+//The following defines may change, don't expect compatibility if you use them.
+#define MB_TYPE_INTRA4x4   0x0001
+#define MB_TYPE_INTRA16x16 0x0002 //FIXME H.264-specific
+#define MB_TYPE_INTRA_PCM  0x0004 //FIXME H.264-specific
+#define MB_TYPE_16x16      0x0008
+#define MB_TYPE_16x8       0x0010
+#define MB_TYPE_8x16       0x0020
+#define MB_TYPE_8x8        0x0040
+#define MB_TYPE_INTERLACED 0x0080
+#define MB_TYPE_DIRECT2    0x0100 //FIXME
+#define MB_TYPE_ACPRED     0x0200 // also reused as MB_TYPE_REF0 earlier in this header
+#define MB_TYPE_GMC        0x0400
+#define MB_TYPE_SKIP       0x0800
+#define MB_TYPE_P0L0       0x1000 // bits 12..15 are laid out as P0L0, P1L0, P0L1, P1L1; IS_DIR and USES_LIST rely on that order
+#define MB_TYPE_P1L0       0x2000
+#define MB_TYPE_P0L1       0x4000
+#define MB_TYPE_P1L1       0x8000
+#define MB_TYPE_L0         (MB_TYPE_P0L0 | MB_TYPE_P1L0) // 0x3000
+#define MB_TYPE_L1         (MB_TYPE_P0L1 | MB_TYPE_P1L1) // 0xC000
+#define MB_TYPE_L0L1       (MB_TYPE_L0   | MB_TYPE_L1)   // 0xF000
+#define MB_TYPE_QUANT      0x00010000 // bit 16
+#define MB_TYPE_CBP        0x00020000 // bit 17
+//Note bits 24-31 are reserved for codec specific use (h264 ref0, mpeg1 0mv, ...); in this header MB_TYPE_8x8DCT uses bit 24, while MB_TYPE_REF0 aliases bit 9
+
+#define FF_BUFFER_TYPE_INTERNAL 1
+#define FF_BUFFER_TYPE_USER     2 ///< direct rendering buffers (image is (de)allocated by user)
+#define FF_BUFFER_TYPE_SHARED   4 ///< Buffer from somewhere else; don't deallocate image (data/base), all other tables are not shared.
+#define FF_BUFFER_TYPE_COPY     8 ///< Just a (modified) copy of some other buffer, don't deallocate anything.
+
+// Picture/slice type codes, contiguous 1..7.
+#define FF_I_TYPE  1 ///< Intra
+#define FF_P_TYPE  2 ///< Predicted
+#define FF_B_TYPE  3 ///< Bi-dir predicted
+#define FF_S_TYPE  4 ///< S(GMC)-VOP MPEG4
+#define FF_SI_TYPE 5 ///< Switching Intra
+#define FF_SP_TYPE 6 ///< Switching Predicted
+#define FF_BI_TYPE 7 // NOTE(review): the only type code without a description here
+
+#define MB_TYPE_INTRA MB_TYPE_INTRA4x4 //default mb_type if there is just one type
+#define IS_INTRA4x4(a)   ((a)&MB_TYPE_INTRA4x4) // every IS_* test yields the masked bits, not a normalized 0/1
+#define IS_INTRA16x16(a) ((a)&MB_TYPE_INTRA16x16)
+#define IS_PCM(a)        ((a)&MB_TYPE_INTRA_PCM) // same test as IS_INTRA_PCM
+#define IS_INTRA(a)      ((a)&7) // 7 == MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM
+#define IS_INTER(a)      ((a)&(MB_TYPE_16x16|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8))
+#define IS_SKIP(a)       ((a)&MB_TYPE_SKIP)
+#define IS_INTRA_PCM(a)  ((a)&MB_TYPE_INTRA_PCM) // same test as IS_PCM
+#define IS_INTERLACED(a) ((a)&MB_TYPE_INTERLACED)
+#define IS_DIRECT(a)     ((a)&MB_TYPE_DIRECT2)
+#define IS_GMC(a)        ((a)&MB_TYPE_GMC)
+#define IS_16X16(a)      ((a)&MB_TYPE_16x16)
+#define IS_16X8(a)       ((a)&MB_TYPE_16x8)
+#define IS_8X16(a)       ((a)&MB_TYPE_8x16)
+#define IS_8X8(a)        ((a)&MB_TYPE_8x8)
+#define IS_SUB_8X8(a)    ((a)&MB_TYPE_16x16) //note reused
+#define IS_SUB_8X4(a)    ((a)&MB_TYPE_16x8)  //note reused
+#define IS_SUB_4X8(a)    ((a)&MB_TYPE_8x16)  //note reused
+#define IS_SUB_4X4(a)    ((a)&MB_TYPE_8x8)   //note reused
+#define IS_ACPRED(a)     ((a)&MB_TYPE_ACPRED)
+#define IS_QUANT(a)      ((a)&MB_TYPE_QUANT)
+#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list)))) // tests bit 12 + part + 2*list
+#define USES_LIST(a, list) ((a) & ((MB_TYPE_P0L0|MB_TYPE_P1L0)<<(2*(list)))) ///< does this mb use listX, note does not work if subMBs
+#define HAS_CBP(a)        ((a)&MB_TYPE_CBP)
+
+
+#define FF_MM_FORCE    0x80000000 /* Force usage of selected flags (OR) */
+    /* lower 16 bits - CPU features */
+#define FF_MM_MMX      0x0001 ///< standard MMX
+#define FF_MM_3DNOW    0x0004 ///< AMD 3DNOW
+#define FF_MM_MMX2     0x0002 ///< SSE integer functions or AMD MMX ext
+#define FF_MM_SSE      0x0008 ///< SSE functions
+#define FF_MM_SSE2     0x0010 ///< PIV SSE2 functions
+#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt
+#define FF_MM_SSE3     0x0040 ///< Prescott SSE3 functions
+#define FF_MM_SSSE3    0x0080 ///< Conroe SSSE3 functions
+#define FF_MM_SSE4     0x0100 ///< Penryn SSE4.1 functions
+#define FF_MM_SSE42    0x0200 ///< Nehalem SSE4.2 functions
+#define FF_MM_IWMMXT   0x0100 ///< XScale IWMMXT (same bit value as FF_MM_SSE4)
+#define FF_MM_ALTIVEC  0x0001 ///< standard AltiVec (same bit value as FF_MM_MMX)
+
+
+/**
+* Sequence parameter set
+*/
+typedef struct SPS{
+// NOTE(review): log2_max_frame_num documents a "+ 4" offset but log2_max_poc_lsb does not - confirm which value the parser stores.
+    int profile_idc;
+    int level_idc;
+    int chroma_format_idc;
+    int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
+    int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
+    int poc_type;                      ///< pic_order_cnt_type
+    int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
+    int delta_pic_order_always_zero_flag;
+    int offset_for_non_ref_pic;
+    int offset_for_top_to_bottom_field;
+    int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
+    int ref_frame_count;               ///< num_ref_frames
+    int gaps_in_frame_num_allowed_flag;
+    int mb_width;                      ///< pic_width_in_mbs_minus1 + 1
+    int mb_height;                     ///< pic_height_in_map_units_minus1 + 1
+    int frame_mbs_only_flag;
+    int mb_aff;                        ///< mb_adaptive_frame_field_flag
+    int direct_8x8_inference_flag;
+    int crop;                   ///< frame_cropping_flag
+    unsigned int crop_left;            ///< frame_cropping_rect_left_offset
+    unsigned int crop_right;           ///< frame_cropping_rect_right_offset
+    unsigned int crop_top;             ///< frame_cropping_rect_top_offset
+    unsigned int crop_bottom;          ///< frame_cropping_rect_bottom_offset
+    int vui_parameters_present_flag;
+    int num,den; // presumably the sample aspect ratio as numerator/denominator - TODO confirm against the VUI parser
+
+    int video_signal_type_present_flag;
+    int full_range;
+    int colour_description_present_flag;
+    enum AVColorPrimaries color_primaries;
+    enum AVColorTransferCharacteristic color_trc;
+    enum AVColorSpace colorspace;
+    int timing_info_present_flag;
+    uint32_t num_units_in_tick;
+    uint32_t time_scale;
+    int fixed_frame_rate_flag;
+    short offset_for_ref_frame[256]; //FIXME allocate dynamically?
+    int bitstream_restriction_flag;
+    int num_reorder_frames;
+    int scaling_matrix_present;
+    uint8_t scaling_matrix4[6][16]; // six lists of 16 entries
+    uint8_t scaling_matrix8[2][64]; // two lists of 64 entries
+    int nal_hrd_parameters_present_flag;
+    int vcl_hrd_parameters_present_flag;
+    int pic_struct_present_flag;
+    int time_offset_length;
+    int cpb_cnt;                       ///< See H.264 E.1.2
+    int initial_cpb_removal_delay_length; ///< initial_cpb_removal_delay_length_minus1 +1
+    int cpb_removal_delay_length;      ///< cpb_removal_delay_length_minus1 + 1
+    int dpb_output_delay_length;       ///< dpb_output_delay_length_minus1 + 1
+    int bit_depth_luma;                ///< bit_depth_luma_minus8 + 8
+    int bit_depth_chroma;              ///< bit_depth_chroma_minus8 + 8
+    int residual_color_transform_flag; ///< residual_colour_transform_flag
+}SPS;
+
+/**
+* Picture parameter set
+*/
+typedef struct PPS{
+    unsigned int sps_id; // presumably selects one of up to MAX_SPS_COUNT sequence parameter sets - TODO confirm
+    int cabac;                  ///< entropy_coding_mode_flag
+    int pic_order_present;      ///< pic_order_present_flag
+    int slice_group_count;      ///< num_slice_groups_minus1 + 1
+    int mb_slice_group_map_type;
+    unsigned int ref_count[2];  ///< num_ref_idx_l0/1_active_minus1 + 1
+    int weighted_pred;          ///< weighted_pred_flag
+    int weighted_bipred_idc;
+    int init_qp;                ///< pic_init_qp_minus26 + 26
+    int init_qs;                ///< pic_init_qs_minus26 + 26
+    int chroma_qp_index_offset[2];
+    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
+    int constrained_intra_pred; ///< constrained_intra_pred_flag
+    int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
+    int transform_8x8_mode;     ///< transform_8x8_mode_flag
+    uint8_t scaling_matrix4[6][16]; // same dimensions as SPS::scaling_matrix4
+    uint8_t scaling_matrix8[2][64]; // same dimensions as SPS::scaling_matrix8
+    uint8_t chroma_qp_table[2][64];  ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
+    int chroma_qp_diff;
+}PPS;
+
+typedef struct TopBorder{ // 128 bytes of uint8_t in total: 32 unfiltered + 96 top_borders
+    uint8_t unfiltered_y[16]; // presumably one 16-pixel luma line - TODO confirm against the users
+    uint8_t unfiltered_cb[8];
+    uint8_t unfiltered_cr[8];
+
+    uint8_t top_borders_y[16*4]; // 64 bytes
+    uint8_t top_borders_cb[8*2]; // 16 bytes
+    uint8_t top_borders_cr[8*2]; // 16 bytes
+}TopBorder;
+
+typedef struct LeftBorder{ // each array is one element longer than its TopBorder unfiltered_* counterpart
+    uint8_t unfiltered_y[17]; // presumably 16 pixels plus one corner pixel - TODO confirm against the users
+    uint8_t unfiltered_cb[9];
+    uint8_t unfiltered_cr[9];
+}LeftBorder;
+
+typedef struct H264Mb {
+    //variables copied in after cabac decoding
+    int16_t mb_x, mb_y;
+    int32_t mb_type; // signed here, whereas DecodedPicture::mb_type points to uint32_t
+
+    uint16_t cbp;                                               // coded block pattern, idct, deblock
+    int8_t qscale_mb_xy;                                        // qp, deblock
+    int8_t qscale_left_mb_xy; //not required
+    int8_t qscale_top_mb_xy;
+
+    DECLARE_ALIGNED(8, uint16_t, sub_mb_type[4]);
+    DECLARE_ALIGNED(8, uint8_t, non_zero_count[24]);            //idct deblock
+    DECLARE_ALIGNED(16, int16_t, mb[16*24]);                    //coeffs, idct
+
+    union{ // the two anonymous structs share storage; anonymous members need C11 or a compiler extension
+        struct {
+        DECLARE_ALIGNED(8, int8_t, ref_index[2][4]);            //mc, deblock
+        DECLARE_ALIGNED(16, int16_t, mvd[2][16][2]);            //mc, deblock
+        };
+        struct {
+        DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode[16]);     //intra, deblock
+        int8_t chroma_pred_mode;                                //intra
+        int8_t intra16x16_pred_mode;                            //intra, deblock
+        };
+    };
+
+#if OMPSS // these members exist only when OMPSS evaluates to non-zero, so the struct layout depends on it
+    DECLARE_ALIGNED(8, uint8_t, top_border[16+ 2*8]);
+    DECLARE_ALIGNED(8, uint8_t, top_border_next[8]);
+    DECLARE_ALIGNED(8, uint8_t, left_border[17+2*9]);
+    int8_t intra4x4_pred_mode_left[4];
+#endif
+
+} H264Mb;
+
+typedef struct RawFrame {
+    uint8_t *data;
+    int size; // NOTE(review): relation between size and data_size is not documented here - confirm which is payload and which is capacity
+    unsigned int data_size;
+    int64_t pos;                            ///< byte position in stream, -1 if unknown
+    int state;
+} RawFrame;
+
+typedef struct PictureInfo{
+    int ref_poc[2][16];      ///< h264 POCs of the frames used as reference
+    int ref_count[2];        ///< number of entries in ref_poc
+    int poc;                    ///< h264 frame POC
+    int frame_num;              ///< h264 frame_num (raw frame_num from slice header)
+    int pic_id;
+    int long_ref;
+    int cpn;                    ///< coded picture number
+    int slice_type_nos;
+//     int key_frame;
+//     int mmco_reset;             ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET.
+
+    int reference;  //Set to 4 for delayed, non-reference frames. 1-3 for reference. FIXME
+
+}PictureInfo;
+
+typedef struct DecodedPicture{
+    int16_t (*motion_val[2])[2]; // two pointers to arrays of int16_t pairs
+    int16_t (*motion_val_base[2])[2];
+
+    /**
+    * motion reference frame index
+    * the order in which these are stored can depend on the codec.
+    * - encoding: Set by user.
+    * - decoding: Set by libavcodec.
+    */
+    int8_t *ref_index[2];
+    uint32_t *mb_type;          //mb_type_base + mb_width + 2
+    uint32_t *mb_type_base;
+
+    int8_t *intra4x4_pred_mode;
+    int8_t *non_zero_count;
+
+    uint8_t *data[3]; //point to first pixel in the frame
+    int linesize[3];
+    uint8_t *base[3]; //base of picture planes
+
+    int cpn;                ///< coded picture number
+    int poc;                    ///< h264 frame POC
+    int reference;  // 0 -> free, 1 -> needs to be displayed, 2 -> needed for reference, 3 -> 1 && 2
+    int key_frame;
+    int mmco_reset;             ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET.
+
+} DecodedPicture;
+
+
+#endif /* AVCODEC_AVCODEC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cabac.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cabac.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,242 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Context Adaptive Binary Arithmetic Coder.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+//#include "get_bits.h"
+#include "cabac.h"
+
+static const uint8_t lps_range[64][4]= {
+{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205},
+{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166},
+{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135},
+{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110},
+{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89},
+{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72},
+{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59},
+{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48},
+{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39},
+{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31},
+{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25},
+{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21},
+{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17},
+{ 10, 12, 14, 16}, {  9, 11, 13, 15}, {  9, 11, 12, 14}, {  8, 10, 12, 14},
+{  8,  9, 11, 13}, {  7,  9, 11, 12}, {  7,  9, 10, 12}, {  7,  8, 10, 11},
+{  6,  8,  9, 11}, {  6,  7,  9, 10}, {  6,  7,  8,  9}, {  2,  2,  2,  2},
+};
+
+uint8_t ff_h264_mlps_state[4*64];
+uint8_t ff_h264_lps_range[4*2*64];
+uint8_t ff_h264_lps_state[2*64];
+uint8_t ff_h264_mps_state[2*64];
+
+static const uint8_t mps_state[64]= {
+  1, 2, 3, 4, 5, 6, 7, 8,
+  9,10,11,12,13,14,15,16,
+ 17,18,19,20,21,22,23,24,
+ 25,26,27,28,29,30,31,32,
+ 33,34,35,36,37,38,39,40,
+ 41,42,43,44,45,46,47,48,
+ 49,50,51,52,53,54,55,56,
+ 57,58,59,60,61,62,62,63,
+};
+
+static const uint8_t lps_state[64]= {
+  0, 0, 1, 2, 2, 4, 4, 5,
+  6, 7, 8, 9, 9,11,11,12,
+ 13,13,15,15,16,16,18,18,
+ 19,19,21,21,22,22,23,24,
+ 24,25,26,26,27,27,28,29,
+ 29,30,30,30,31,32,32,33,
+ 33,33,34,34,35,35,35,36,
+ 36,36,37,37,37,38,38,63,
+};
+
+const uint8_t ff_h264_norm_shift[512]= {
+ 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5,
+ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+};
+
+/**
+ * Point the decoder at buf and preload the first input bytes into c->low.
+ * @param buf_size size of buf in bytes (bytestream_end is set to buf + buf_size)
+ */
+void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
+    c->bytestream_start=
+    c->bytestream= buf;
+    c->bytestream_end= buf + buf_size;
+
+#if CABAC_BITS == 16
+    c->low =  (*c->bytestream++)<<18;
+    c->low+=  (*c->bytestream++)<<10;
+#else
+    c->low =  (*c->bytestream++)<<10;
+#endif
+    c->low+= ((*c->bytestream++)<<2) + 2; // 3 bytes consumed in total (2 if CABAC_BITS != 16), without checking buf_size
+    c->range= 0x1FE;
+}
+
+void ff_init_cabac_states(void){ // build the runtime lookup tables from lps_range, mps_state and lps_state
+    int i, j;
+
+    for(i=0; i<64; i++){
+        for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save
+            ff_h264_lps_range[j*2*64+2*i+0]= // every entry is stored twice so the table can be
+            ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j]; // indexed with the raw state byte (2*i + low bit)
+        }
+
+        ff_h264_mlps_state[128+2*i+0]= // MPS transitions: upper half of the merged table and
+        ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0; // the stand-alone table; the low bit is carried over
+        ff_h264_mlps_state[128+2*i+1]=
+        ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1;
+
+        if( i ){
+#ifdef BRANCHLESS_CABAC_DECODER
+            ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0; // LPS transitions live below index 128,
+            ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1; // mirrored so that ~s selects them
+        }else{
+            ff_h264_mlps_state[128-2*i-1]= 1; // i == 0: the two entries swap the low bit
+            ff_h264_mlps_state[128-2*i-2]= 0;
+#else
+            ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0;
+            ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1;
+        }else{
+            ff_h264_lps_state[2*i+0]= 1; // i == 0: the two entries swap the low bit
+            ff_h264_lps_state[2*i+1]= 0;
+#endif
+        }
+    }
+}
+
+#ifdef TEST
+#define SIZE 10240
+#define START_TIMER
+#define STOP_TIMER(...)
+#define av_log(...)
+// #include "libavutil/lfg.h"
+#include "avcodec.h"
+#include "cabac.h"
+
+int main(void){ // self-test driver, only compiled when TEST is defined
+    CABACContext c;
+    uint8_t b[9*SIZE] = {0}; // the encoder half below is disabled, so give the decoder defined input
+    uint8_t r[9*SIZE] = {0}; // expected bins; zeroed for the same reason
+    int i;
+    uint8_t state[10]= {0};
+//    AVLFG prng;
+
+// //     av_lfg_init(&prng, 1);
+//     ff_init_cabac_encoder(&c, b, SIZE);
+//     ff_init_cabac_states();
+//
+//     for(i=0; i<SIZE; i++){
+//         r[i] = i%7; //av_lfg_get(&prng) % 7;
+//     }
+//
+//     for(i=0; i<SIZE; i++){
+// START_TIMER
+//         put_cabac_bypass(&c, r[i]&1);
+// STOP_TIMER("put_cabac_bypass")
+//     }
+//
+//     for(i=0; i<SIZE; i++){
+// START_TIMER
+//         put_cabac(&c, state, r[i]&1);
+// STOP_TIMER("put_cabac")
+//     }
+//
+//     for(i=0; i<SIZE; i++){
+// START_TIMER
+//         put_cabac_u(&c, state, r[i], 6, 3, i&1);
+// STOP_TIMER("put_cabac_u")
+//     }
+//
+//     for(i=0; i<SIZE; i++){
+// START_TIMER
+//         put_cabac_ueg(&c, state, r[i], 3, 0, 1, 2);
+// STOP_TIMER("put_cabac_ueg")
+//     }
+//
+//     put_cabac_terminate(&c, 1);
+    ff_init_cabac_states(); // get_cabac() reads tables that only this call fills in
+    ff_init_cabac_decoder(&c, b, SIZE);
+
+    memset(state, 0, sizeof(state));
+
+    for(i=0; i<SIZE; i++){
+START_TIMER
+        if( (r[i]&1) != get_cabac_bypass(&c) )
+            av_log(NULL, AV_LOG_ERROR, "CABAC bypass failure at %d\n", i);
+STOP_TIMER("get_cabac_bypass")
+    }
+
+    for(i=0; i<SIZE; i++){
+START_TIMER
+        if( (r[i]&1) != get_cabac(&c, state) )
+            av_log(NULL, AV_LOG_ERROR, "CABAC failure at %d\n", i);
+STOP_TIMER("get_cabac")
+    }
+#if 0
+    for(i=0; i<SIZE; i++){
+START_TIMER
+        if( r[i] != get_cabac_u(&c, state, (i&1) ? 6 : 7, 3, i&1) )
+            av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i);
+STOP_TIMER("get_cabac_u")
+    }
+
+    for(i=0; i<SIZE; i++){
+START_TIMER
+        if( r[i] != get_cabac_ueg(&c, state, 3, 0, 1, 2))
+            av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i);
+STOP_TIMER("get_cabac_ueg")
+    }
+#endif
+    if(!get_cabac_terminate(&c))
+        av_log(NULL, AV_LOG_ERROR, "where's the Terminator?\n");
+
+    return 0;
+}
+
+#endif /* TEST */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cabac.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cabac.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,206 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Context Adaptive Binary Arithmetic Coder.
+ */
+
+#ifndef AVCODEC_CABAC_H
+#define AVCODEC_CABAC_H
+
+//#undef NDEBUG
+#include <assert.h>
+#include "libavutil/x86_cpu.h"
+#include "libavutil/attributes.h"
+
+#define CABAC_BITS 16
+#define CABAC_MASK ((1<<CABAC_BITS)-1)
+#define BRANCHLESS_CABAC_DECODER 1
+
+typedef struct CABACContext{
+    int low;                 ///< offset register; compared against range<<(CABAC_BITS+1) by the decoders
+    int range;               ///< current interval width; 0x1FE right after ff_init_cabac_decoder()
+    int outstanding_count;   ///< not touched by any decoder function in this header
+#ifdef STRICT_LIMITS
+    int symCount;
+#endif
+    const uint8_t *bytestream_start; ///< first byte of the input buffer
+    const uint8_t *bytestream;       ///< next input byte to be consumed by refill()/refill2()
+    const uint8_t *bytestream_end;   ///< buf + buf_size as passed to ff_init_cabac_decoder()
+    uint8_t  cabac_state[460];       ///< NOTE(review): presumably the per-context states passed to get_cabac() -- confirm
+}CABACContext;
+
+extern uint8_t ff_h264_mlps_state[4*64];
+extern uint8_t ff_h264_lps_range[4*2*64];  ///< rangeTabLPS
+extern uint8_t ff_h264_mps_state[2*64];     ///< transIdxMPS
+extern uint8_t ff_h264_lps_state[2*64];     ///< transIdxLPS
+extern const uint8_t ff_h264_norm_shift[512];
+
+void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
+void ff_init_cabac_states(void);
+
+static void refill(CABACContext *c){ // feed CABAC_BITS fresh input bits into low; no check against bytestream_end
+#if CABAC_BITS == 16
+        c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); // two bytes, big-endian, placed one bit up
+#else
+        c->low+= c->bytestream[0]<<1;
+#endif
+    c->low -= CABAC_MASK; // == -(1<<CABAC_BITS) + 1; NOTE(review): looks like it moves the "bits exhausted" marker back to bit 0 -- confirm
+    c->bytestream+= CABAC_BITS/8;
+}
+
+static void refill2(CABACContext *c){ // like refill(), but the new bits are shifted left by a computed amount i
+    int i, x;
+
+    x= c->low ^ (c->low-1); // mask covering the lowest set bit of low and every bit below it
+    i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; // shift amount derived from the position of that bit
+
+    x= -CABAC_MASK; // same constant refill() subtracts, folded into the refill word
+
+#if CABAC_BITS == 16
+        x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
+#else
+        x+= c->bytestream[0]<<1;
+#endif
+
+    c->low += x<<i;
+    c->bytestream+= CABAC_BITS/8; // no check against bytestream_end
+}
+
+static inline void renorm_cabac_decoder(CABACContext *c){
+    /* double range and low until range is back at 0x100 or above */
+    for(; c->range < 0x100; ){
+        c->range *= 2;
+        c->low   *= 2;
+        if((c->low & CABAC_MASK) == 0) refill(c); /* low CABAC_BITS bits are empty */
+    }
+}
+
+static inline void renorm_cabac_decoder_once(CABACContext *c){
+    /* Branch-free single renormalisation step: shift by one bit only if range < 0x100. */
+    int shift= (uint32_t)(c->range - 0x100)>>31; // sign bit of (range - 0x100): 1 when range < 0x100, else 0
+    c->range<<= shift;
+    c->low  <<= shift;
+
+    if(!(c->low & CABAC_MASK)) // low CABAC_BITS bits are empty
+        refill(c);
+}
+
+static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
+    /* Decode one context-coded bin: *state is updated in place and the decoded bin (0 or 1) is returned. */
+    int s = *state; // the low bit of the state byte is the value returned on the MPS path
+    int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; // bits 6-7 of range pick the 128-entry sub-table, s the entry
+    int bit, lps_mask av_unused;
+
+    c->range -= RangeLPS;
+#ifndef BRANCHLESS_CABAC_DECODER
+    if(c->low < (c->range<<(CABAC_BITS+1))){ // MPS path
+        bit= s&1;
+        *state= ff_h264_mps_state[s];
+        renorm_cabac_decoder_once(c);
+    }else{ // LPS path
+        bit= ff_h264_norm_shift[RangeLPS]; // bit temporarily holds the renormalisation shift
+        c->low -= (c->range<<(CABAC_BITS+1));
+        *state= ff_h264_lps_state[s];
+        c->range = RangeLPS<<bit;
+        c->low <<= bit;
+        bit= (s&1)^1; // the LPS is the opposite of the state's low bit
+
+        if(!(c->low & CABAC_MASK)){
+            refill2(c);
+        }
+    }
+#else /* BRANCHLESS_CABAC_DECODER */
+    lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31; // all ones when low exceeds the scaled range (LPS), else 0; relies on arithmetic >>
+
+    c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; // applied only on the LPS path
+    c->range += (RangeLPS - c->range) & lps_mask; // range becomes RangeLPS on the LPS path
+
+    s^=lps_mask; // LPS path: s becomes ~s, a negative index into the lower half of the merged table
+    *state= (ff_h264_mlps_state+128)[s];
+    bit= s&1; // after the flip this is already the inverted bit on the LPS path
+
+    lps_mask= ff_h264_norm_shift[c->range]; // variable reused as the renormalisation shift count
+    c->range<<= lps_mask;
+    c->low  <<= lps_mask;
+    if(!(c->low & CABAC_MASK))
+        refill2(c);
+#endif /* BRANCHLESS_CABAC_DECODER */
+
+    return bit;
+}
+
+static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){ // av_noinline wrapper around get_cabac_inline()
+    return get_cabac_inline(c, state);
+}
+
+static int av_unused get_cabac(CABACContext *c, uint8_t * const state){ // decode one context-coded bin; forwards to get_cabac_inline()
+    return get_cabac_inline(c, state);
+}
+
+static int av_unused get_cabac_bypass(CABACContext *c){
+    int scaled_range, bin;
+
+    /* advance by one bit: double low and refill once its low CABAC_BITS bits are empty */
+    c->low *= 2;
+    if((c->low & CABAC_MASK) == 0)
+        refill(c);
+
+    /* the bin is 1 exactly when low has reached the scaled range */
+    scaled_range= c->range<<(CABAC_BITS+1);
+    bin= c->low >= scaled_range;
+    if(bin)
+        c->low -= scaled_range;
+
+    return bin;
+}
+
+static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ // decode one bypass bin; returns val if it is 1, -val if it is 0
+    int range, mask;
+    c->low += c->low;
+
+    if(!(c->low & CABAC_MASK))
+        refill(c);
+
+    range= c->range<<(CABAC_BITS+1);
+    c->low -= range;
+    mask= c->low >> 31; // all ones when low went negative (bin 0), else 0; relies on arithmetic >>
+    range &= mask;
+    c->low += range; // undo the subtraction when the bin was 0
+    return (val^mask)-mask; // branch-free conditional negation
+}
+
+/**
+ * Decode a terminate bin.
+ * @return the number of bytes read or 0 if no end
+ */
+static int av_unused get_cabac_terminate(CABACContext *c){
+    c->range -= 2;
+    if(c->low < c->range<<(CABAC_BITS+1)){ // not the end: renormalise and keep decoding
+        renorm_cabac_decoder_once(c);
+        return 0;
+    }else{
+        return c->bytestream - c->bytestream_start; // bytes consumed so far, counting bytes already loaded into low
+    }
+}
+
+#endif /* AVCODEC_CABAC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/cabac_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/cabac_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,140 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Context Adaptive Binary Arithmetic Coder.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+//#include "get_bits.h"
+#include "cabac_spu.h"
+#define av_log(...)
+
+int bytecount =0;
+static const uint8_t lps_range[64][4]= {
+{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205},
+{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166},
+{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135},
+{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110},
+{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89},
+{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72},
+{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59},
+{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48},
+{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39},
+{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31},
+{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25},
+{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21},
+{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17},
+{ 10, 12, 14, 16}, {  9, 11, 13, 15}, {  9, 11, 12, 14}, {  8, 10, 12, 14},
+{  8,  9, 11, 13}, {  7,  9, 11, 12}, {  7,  9, 10, 12}, {  7,  8, 10, 11},
+{  6,  8,  9, 11}, {  6,  7,  9, 10}, {  6,  7,  8,  9}, {  2,  2,  2,  2},
+};
+
+uint8_t ff_h264_mlps_state[4*64];
+uint8_t ff_h264_lps_range[4*2*64];
+uint8_t ff_h264_lps_state[2*64];
+uint8_t ff_h264_mps_state[2*64];
+
+static const uint8_t mps_state[64]= {
+  1, 2, 3, 4, 5, 6, 7, 8,
+  9,10,11,12,13,14,15,16,
+ 17,18,19,20,21,22,23,24,
+ 25,26,27,28,29,30,31,32,
+ 33,34,35,36,37,38,39,40,
+ 41,42,43,44,45,46,47,48,
+ 49,50,51,52,53,54,55,56,
+ 57,58,59,60,61,62,62,63,
+};
+
+static const uint8_t lps_state[64]= {
+  0, 0, 1, 2, 2, 4, 4, 5,
+  6, 7, 8, 9, 9,11,11,12,
+ 13,13,15,15,16,16,18,18,
+ 19,19,21,21,22,22,23,24,
+ 24,25,26,26,27,27,28,29,
+ 29,30,30,30,31,32,32,33,
+ 33,33,34,34,35,35,35,36,
+ 36,36,37,37,37,38,38,63,
+};
+
+const uint8_t ff_h264_norm_shift[512]= {
+ 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5,
+ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+};
+
+/**
+ * Fill the runtime CABAC lookup tables from the constant tables above:
+ * ff_h264_lps_range, ff_h264_mps_state, and either ff_h264_mlps_state
+ * (BRANCHLESS_CABAC_DECODER defined) or ff_h264_lps_state (otherwise).
+ */
+void ff_init_cabac_states(void){
+    int i, j;
+
+    for(i=0; i<64; i++){
+        for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save
+            ff_h264_lps_range[j*2*64+2*i+0]= // every entry is stored twice so the table can be
+            ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j]; // indexed with the raw state byte (2*i + low bit)
+        }
+
+        ff_h264_mlps_state[128+2*i+0]= // MPS transitions: upper half of the merged table and
+        ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0; // the stand-alone table; the low bit is carried over
+        ff_h264_mlps_state[128+2*i+1]=
+        ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1;
+
+        if( i ){
+#ifdef BRANCHLESS_CABAC_DECODER
+            ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0; // LPS transitions live below index 128,
+            ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1; // mirrored so that ~s selects them
+        }else{
+            ff_h264_mlps_state[128-2*i-1]= 1; // i == 0: the two entries swap the low bit
+            ff_h264_mlps_state[128-2*i-2]= 0;
+#else
+            ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0;
+            ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1;
+        }else{
+            ff_h264_lps_state[2*i+0]= 1; // i == 0: the two entries swap the low bit
+            ff_h264_lps_state[2*i+1]= 0;
+#endif
+        }
+    }
+}
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/cabac_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/cabac_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,233 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Context Adaptive Binary Arithmetic Coder.
+ */
+
+#ifndef AVCODEC_CABAC_H
+#define AVCODEC_CABAC_H
+
+//#undef NDEBUG
+#include <assert.h>
+#include "h264_dma.h"
+#include "libavutil/x86_cpu.h"
+#include "libavutil/attributes.h"
+
+#define CABAC_BITS 16
+#define CABAC_MASK ((1<<CABAC_BITS)-1)
+#define BRANCHLESS_CABAC_DECODER 1
+
+typedef struct CABACContext{
+    int low;                 ///< offset register; compared against range<<(CABAC_BITS+1) by the decoders
+    int range;               ///< current interval width
+    int outstanding_count;   ///< not touched by any decoder function in this header
+#ifdef STRICT_LIMITS
+    int symCount;
+#endif
+	const uint8_t *bytestream_ea_start;
+    const uint8_t *bytestream_ea; ///< source address of the next spu_dma_get() issued by dma_cabac()
+	const uint8_t *bytestream_ea_end;
+	int slot;
+	int bufsize; ///< bytes of input that dma_cabac() still has to fetch
+
+	uint8_t *bytestream_start;
+    uint8_t *bytestream;     ///< next input byte to be consumed by refill()/refill2()
+    uint8_t *bytestream_end; ///< end of the valid data in the local buffer; dma_cabac() refetches when bytestream reaches it
+    uint8_t  cabac_state[460]; ///< NOTE(review): presumably the per-context states passed to get_cabac() -- confirm
+}CABACContext;
+
+extern uint8_t ff_h264_mlps_state[4*64];
+extern uint8_t ff_h264_lps_range[4*2*64];  ///< rangeTabLPS
+extern uint8_t ff_h264_mps_state[2*64];     ///< transIdxMPS
+extern uint8_t ff_h264_lps_state[2*64];     ///< transIdxLPS
+extern const uint8_t ff_h264_norm_shift[512];
+
+void ff_init_cabac_states(void);
+
+extern DECLARE_ALIGNED(128,uint8_t, bytestream_ls[4096]);
+extern int bytecount;
+static inline void dma_cabac(CABACContext *c){ // run before every refill: fetch the next input chunk once the local buffer is used up
+	bytecount++;
+	if (c->bytestream == c->bytestream_end){
+		if (c->bufsize>0){
+			int size = (c->bufsize > sizeof(bytestream_ls)) ?  sizeof(bytestream_ls) : c->bufsize; // bytes to consume, capped at the local buffer size
+			int align = size &0xF;
+			int dma_size = size + (align? 16-align : 0); // transfer length rounded up to a multiple of 16
+
+			spu_dma_get(bytestream_ls, (unsigned) c->bytestream_ea, dma_size, ED_raw);
+			wait_dma_id(ED_raw); // NOTE(review): presumably blocks until the ED_raw transfer has completed -- confirm
+			c->bytestream = bytestream_ls;
+			c->bytestream_end = &bytestream_ls[size];
+			c->bytestream_ea += dma_size; // advances by the padded length, while bufsize drops by the unpadded size
+			c->bufsize -= size;
+		} // bufsize <= 0: nothing is fetched and the read pointers stay as they are
+		bytecount =0;
+	}else if((unsigned)c->bytestream > (unsigned)c->bytestream_end +2){ // read pointer is more than 2 bytes past the buffer end
+		//fprintf(stderr, "Read beyond end of frame %d\n", c->bufsize);
+		bytecount =0;
+	}
+}
+
+static void refill(CABACContext *c){ // feed CABAC_BITS fresh input bits into low
+	dma_cabac(c); // fetch the next chunk first if the local buffer is exhausted
+
+	c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); // two bytes, big-endian, placed one bit up
+
+    c->low -= CABAC_MASK; // == -(1<<CABAC_BITS) + 1; NOTE(review): looks like it moves the "bits exhausted" marker back to bit 0 -- confirm
+    c->bytestream+= CABAC_BITS/8;
+}
+
+static void refill2(CABACContext *c){ // like refill(), but the new bits are shifted left by a computed amount i
+    int i, x;
+
+	dma_cabac(c); // fetch the next chunk first if the local buffer is exhausted
+
+    x= c->low ^ (c->low-1); // mask covering the lowest set bit of low and every bit below it
+    i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; // shift amount derived from the position of that bit
+
+    x= -CABAC_MASK; // same constant refill() subtracts, folded into the refill word
+
+	x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
+
+    c->low += x<<i;
+    c->bytestream+= CABAC_BITS/8;
+}
+
+static inline void renorm_cabac_decoder(CABACContext *c){
+    /* double range and low until range is back at 0x100 or above */
+    for(; c->range < 0x100; ){
+        c->range *= 2;
+        c->low   *= 2;
+        if((c->low & CABAC_MASK) == 0) refill(c); /* low CABAC_BITS bits are empty */
+    }
+}
+
+static inline void renorm_cabac_decoder_once(CABACContext *c){
+    /* Branch-free single renormalisation step: shift by one bit only if range < 0x100. */
+    int shift= (uint32_t)(c->range - 0x100)>>31; // sign bit of (range - 0x100): 1 when range < 0x100, else 0
+    c->range<<= shift;
+    c->low  <<= shift;
+
+    if(!(c->low & CABAC_MASK)) // low CABAC_BITS bits are empty
+        refill(c);
+}
+
+static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
+    /* Decode one context-coded bin: *state is updated in place and the decoded bin (0 or 1) is returned. */
+    int s = *state; // the low bit of the state byte is the value returned on the MPS path
+    int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; // bits 6-7 of range pick the 128-entry sub-table, s the entry
+    int bit, lps_mask av_unused;
+
+    c->range -= RangeLPS;
+#ifndef BRANCHLESS_CABAC_DECODER
+    if(c->low < (c->range<<(CABAC_BITS+1))){ // MPS path
+        bit= s&1;
+        *state= ff_h264_mps_state[s];
+        renorm_cabac_decoder_once(c);
+    }else{ // LPS path
+        bit= ff_h264_norm_shift[RangeLPS]; // bit temporarily holds the renormalisation shift
+        c->low -= (c->range<<(CABAC_BITS+1));
+        *state= ff_h264_lps_state[s];
+        c->range = RangeLPS<<bit;
+        c->low <<= bit;
+        bit= (s&1)^1; // the LPS is the opposite of the state's low bit
+
+        if(!(c->low & CABAC_MASK)){
+            refill2(c);
+        }
+    }
+#else /* BRANCHLESS_CABAC_DECODER */
+    lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31; // all ones when low exceeds the scaled range (LPS), else 0; relies on arithmetic >>
+
+    c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; // applied only on the LPS path
+    c->range += (RangeLPS - c->range) & lps_mask; // range becomes RangeLPS on the LPS path
+
+    s^=lps_mask; // LPS path: s becomes ~s, a negative index into the lower half of the merged table
+    *state= (ff_h264_mlps_state+128)[s];
+    bit= s&1; // after the flip this is already the inverted bit on the LPS path
+
+    lps_mask= ff_h264_norm_shift[c->range]; // variable reused as the renormalisation shift count
+    c->range<<= lps_mask;
+    c->low  <<= lps_mask;
+    if(!(c->low & CABAC_MASK))
+        refill2(c);
+#endif /* BRANCHLESS_CABAC_DECODER */
+
+    return bit;
+}
+
+static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){ // av_noinline wrapper around get_cabac_inline()
+    return get_cabac_inline(c, state);
+}
+
+static int av_unused get_cabac(CABACContext *c, uint8_t * const state){ // decode one context-coded bin; forwards to get_cabac_inline()
+    return get_cabac_inline(c, state);
+}
+
+static int av_unused get_cabac_bypass(CABACContext *c){
+    int scaled_range, bin;
+
+    /* advance by one bit: double low and refill once its low CABAC_BITS bits are empty */
+    c->low *= 2;
+    if((c->low & CABAC_MASK) == 0)
+        refill(c);
+
+    /* the bin is 1 exactly when low has reached the scaled range */
+    scaled_range= c->range<<(CABAC_BITS+1);
+    bin= c->low >= scaled_range;
+    if(bin)
+        c->low -= scaled_range;
+
+    return bin;
+}
+
+static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ // decode one bypass bin; returns val if it is 1, -val if it is 0
+    int range, mask;
+    c->low += c->low;
+
+    if(!(c->low & CABAC_MASK))
+        refill(c);
+
+    range= c->range<<(CABAC_BITS+1);
+    c->low -= range;
+    mask= c->low >> 31; // all ones when low went negative (bin 0), else 0; relies on arithmetic >>
+    range &= mask;
+    c->low += range; // undo the subtraction when the bin was 0
+    return (val^mask)-mask; // branch-free conditional negation
+}
+
+/**
+ * Decode a terminate bin.
+ * @return the number of bytes read or 0 if no end
+ */
+static int av_unused get_cabac_terminate(CABACContext *c){
+    c->range -= 2;
+    if(c->low < c->range<<(CABAC_BITS+1)){ // not the end: renormalise and keep decoding
+        renorm_cabac_decoder_once(c);
+        return 0;
+    }else{
+        return c->bytestream - c->bytestream_start; // distance between the current read pointer and bytestream_start
+    }
+}
+
+#endif /* AVCODEC_CABAC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/dsputil_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/dsputil_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1147 @@
+/*
+ * Copyright (c) 2009 TUDelft 
+ * 
+ * Cell Parallel SPU - 2DWave Macroblock Decoding. 
+ */
+
+/**
+ * @file libavcodec/cell/dsputil_spu.c
+ * Cell Parallel SPU - 2DWave Macroblock Decoding
+ * @author C C Chi <c.c.chi@student.tudelft.nl>
+ * 
+ * SIMD SPU kernels 
+ * H.264/AVC motion compensation
+ * @author Mauricio Alvarez <alvarez@ac.upc.edu>
+ * @author Albert Paradis <apar7632@hotmail.com>
+ */ 
+
+
+#include "dsputil_spu.h"
+#include "h264_idct_spu.h"
+#include "h264_deblock_spu.h"
+#include "types_spu.h"
+#include "libavutil/intreadwrite.h"
+
+#include <stdio.h>
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <assert.h>
+
+//Luma interpolation: store operators bound to OP_U8_SPU before each include of the luma template
+#define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s /* put: d = s; dst is only referenced. NOTE(review): expands to two unwrapped statements, so it is only safe as a full statement inside braces */
+#define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s) /* avg: d = spu_avg(dst, s) */
+/* put_ variants: bind OP_U8_SPU and the PREFIX_* names, include the luma template (presumably it defines the named kernels), then undefine everything. */
+#define OP_U8_SPU                          PUT_OP_U8_SPU
+#define PREFIX_h264_qpel16_h_lowpass_spu   put_h264_qpel16_h_lowpass_spu
+#define PREFIX_h264_qpel16_v_lowpass_spu   put_h264_qpel16_v_lowpass_spu
+#define PREFIX_h264_qpel16_hv_lowpass_spu  put_h264_qpel16_hv_lowpass_spu
+#define PREFIX_h264_qpel8_h_lowpass_spu    put_h264_qpel8_h_lowpass_spu
+#define PREFIX_h264_qpel8_v_lowpass_spu    put_h264_qpel8_v_lowpass_spu
+#define PREFIX_h264_qpel8_hv_lowpass_spu   put_h264_qpel8_hv_lowpass_spu
+#define PREFIX_h264_qpel4_h_lowpass_spu    put_h264_qpel4_h_lowpass_spu
+#define PREFIX_h264_qpel4_v_lowpass_spu    put_h264_qpel4_v_lowpass_spu
+#define PREFIX_h264_qpel4_hv_lowpass_spu   put_h264_qpel4_hv_lowpass_spu
+#include "h264_luma_template_spu.c"
+#undef OP_U8_SPU                          
+#undef PREFIX_h264_qpel16_h_lowpass_spu
+#undef PREFIX_h264_qpel16_v_lowpass_spu
+#undef PREFIX_h264_qpel16_hv_lowpass_spu
+#undef PREFIX_h264_qpel8_h_lowpass_spu
+#undef PREFIX_h264_qpel8_v_lowpass_spu
+#undef PREFIX_h264_qpel8_hv_lowpass_spu
+#undef PREFIX_h264_qpel4_h_lowpass_spu
+#undef PREFIX_h264_qpel4_v_lowpass_spu
+#undef PREFIX_h264_qpel4_hv_lowpass_spu
+/* avg_ variants: same template included a second time, now with OP_U8_SPU bound to AVG_OP_U8_SPU and avg_* names. */
+#define OP_U8_SPU                          AVG_OP_U8_SPU
+#define PREFIX_h264_qpel16_h_lowpass_spu   avg_h264_qpel16_h_lowpass_spu
+#define PREFIX_h264_qpel16_v_lowpass_spu   avg_h264_qpel16_v_lowpass_spu
+#define PREFIX_h264_qpel16_hv_lowpass_spu  avg_h264_qpel16_hv_lowpass_spu
+#define PREFIX_h264_qpel8_h_lowpass_spu    avg_h264_qpel8_h_lowpass_spu
+#define PREFIX_h264_qpel8_v_lowpass_spu    avg_h264_qpel8_v_lowpass_spu
+#define PREFIX_h264_qpel8_hv_lowpass_spu   avg_h264_qpel8_hv_lowpass_spu
+#define PREFIX_h264_qpel4_h_lowpass_spu    avg_h264_qpel4_h_lowpass_spu
+#define PREFIX_h264_qpel4_v_lowpass_spu    avg_h264_qpel4_v_lowpass_spu
+#define PREFIX_h264_qpel4_hv_lowpass_spu   avg_h264_qpel4_hv_lowpass_spu
+#include "h264_luma_template_spu.c"
+#undef OP_U8_SPU                          
+#undef PREFIX_h264_qpel16_h_lowpass_spu
+#undef PREFIX_h264_qpel16_v_lowpass_spu
+#undef PREFIX_h264_qpel16_hv_lowpass_spu
+#undef PREFIX_h264_qpel8_h_lowpass_spu
+#undef PREFIX_h264_qpel8_v_lowpass_spu
+#undef PREFIX_h264_qpel8_hv_lowpass_spu
+#undef PREFIX_h264_qpel4_h_lowpass_spu
+#undef PREFIX_h264_qpel4_v_lowpass_spu
+#undef PREFIX_h264_qpel4_hv_lowpass_spu
+
+#define H264_MC(OPNAME, SIZE, CODETYPE) /* emits 16 static mcXY functions taking (dst, src, dst_stride, h); mc00 forwards to OPNAME##pixels##SIZE##_##CODETYPE with source stride STRIDE_Y */ \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\
+}\
+/* mc10: l2-combine src with the horizontal lowpass result (half, 16 bytes per row) */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){ \
+    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
+}\
+/* mc20: horizontal lowpass written directly to dst */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
+}\
+/* mc30: like mc10, but the lowpass result is combined with src+1 */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\
+}\
+/* mc01: l2-combine src with the vertical lowpass result */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
+}\
+/* mc02: vertical lowpass written directly to dst */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
+}\
+/* mc03: like mc01, but the lowpass result is combined with src+STRIDE_Y */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\
+}\
+/* mc11: horizontal lowpass of src combined with vertical lowpass of src (both temporaries use a 16-byte row pitch) */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
+    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
+}\
+/* mc31: horizontal lowpass of src combined with vertical lowpass of src+1 */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
+    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
+}\
+/* mc13: horizontal lowpass of src+STRIDE_Y combined with vertical lowpass of src */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
+    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
+}\
+/* mc33: horizontal lowpass of src+STRIDE_Y combined with vertical lowpass of src+1 */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
+    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
+}\
+/* mc22: hv lowpass written directly to dst; tmp is an int16_t scratch buffer handed to the kernel */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
+    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, h);\
+}\
+/* mc21: horizontal lowpass of src combined with hv lowpass of src */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
+    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
+    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
+}\
+/* mc23: horizontal lowpass of src+STRIDE_Y combined with hv lowpass of src */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
+    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
+    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
+}\
+/* mc12: vertical lowpass of src combined with hv lowpass of src */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
+    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
+    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
+}\
+/* mc32: vertical lowpass of src+1 combined with hv lowpass of src */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
+    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
+    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
+    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
+}\
+
+
+/***************************/
+/* put/avg pixel functions */
+/***************************/
+
+static void put_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,
+                                    const uint8_t * src2, int dst_stride,
+                                    int src_stride1, int h)
+{
+  /* For h rows of 16 bytes: dst row = spu_avg(src1 row, src2 row).
+   * src2 advances 16 bytes per row; src1 and dst advance by their strides. */
+  const int head_shift = (unsigned int) src1 & 15;  /* sampled once, before the loop */
+  const int tail_shift = head_shift - 16;
+  int row = 0;
+
+  while (row < h){
+      /* src1 may be unaligned: fetch two quadwords and splice them */
+      const vuint8_t lo_qw = *(vuint8_t *)(src1);
+      const vuint8_t hi_qw = *(vuint8_t *)(src1+16);
+      const vuint8_t row_a = spu_or(spu_slqwbyte(lo_qw, head_shift),
+                                    spu_rlmaskqwbyte(hi_qw, tail_shift));
+
+      /* src2 and dst are accessed with plain quadword loads/stores */
+      const vuint8_t row_b = *(vuint8_t *)(src2);
+      *(vuint8_t *)dst = spu_avg(row_a, row_b);
+
+      /* step to the next row */
+      dst  += dst_stride;
+      src1 += src_stride1;
+      src2 += 16;
+      row++;
+  }
+}
+
+static void avg_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,
+                                    const uint8_t * src2, int dst_stride,
+                                    int src_stride1, int h)
+{
+  /* For h rows of 16 bytes: dst row = spu_avg(spu_avg(src1 row, src2 row), old dst row).
+   * src2 advances 16 bytes per row; src1 and dst advance by their strides. */
+  const int head_shift = (unsigned int) src1 & 15;  /* sampled once, before the loop */
+  const int tail_shift = head_shift - 16;
+  int row = 0;
+
+  while (row < h){
+      /* src1 may be unaligned: fetch two quadwords and splice them */
+      const vuint8_t lo_qw = *(vuint8_t *)(src1);
+      const vuint8_t hi_qw = *(vuint8_t *)(src1+16);
+      const vuint8_t row_a = spu_or(spu_slqwbyte(lo_qw, head_shift),
+                                    spu_rlmaskqwbyte(hi_qw, tail_shift));
+
+      /* first average the two sources, then blend with what dst already holds */
+      const vuint8_t row_b = *(vuint8_t *)(src2);
+      const vuint8_t mean  = spu_avg(row_a, row_b);
+      *(vuint8_t *)dst = spu_avg(mean, *(vuint8_t *)dst);
+
+      dst  += dst_stride;
+      src1 += src_stride1;
+      src2 += 16;
+      row++;
+  }
+}
+
+// Store h rows of 16 source bytes to dst, handling four rows per loop pass.
+void put_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
+{
+    // The src misalignment is sampled once, so src_stride must keep it constant
+    // (the original note requires line_size % 16 == 0). Rows are consumed in
+    // groups of four, so a full group is processed even if h is not a multiple of 4.
+    // dst is written with plain quadword stores.
+    const int lshift = (unsigned int) src & 15;
+    const int rshift = lshift - 16;
+
+    // byte offsets of rows 1..3 and of the next group of four
+    const int s1 = src_stride,      s2 = src_stride << 1;
+    const int s3 = s1 + s2,         s4 = src_stride << 2;
+    const int d1 = dst_stride,      d2 = dst_stride << 1;
+    const int d3 = d2 + d1,         d4 = dst_stride << 2;
+    int row;
+
+    for(row = 0; row < h; row += 4) {
+      // two quadwords per row straddle the unaligned source
+      const vuint8_t a_lo = *(vuint8_t *)(src);
+      const vuint8_t a_hi = *(vuint8_t *)(src + 16);
+      const vuint8_t b_lo = *(vuint8_t *)(src + s1);
+      const vuint8_t b_hi = *(vuint8_t *)(src + 16 + s1);
+      const vuint8_t c_lo = *(vuint8_t *)(src + s2);
+      const vuint8_t c_hi = *(vuint8_t *)(src + 16 + s2);
+      const vuint8_t d_lo = *(vuint8_t *)(src + s3);
+      const vuint8_t d_hi = *(vuint8_t *)(src + 16 + s3);
+
+      // splice each pair and store the resulting quadword
+      *(vuint8_t *) dst       = spu_or(spu_slqwbyte(a_lo, lshift), spu_rlmaskqwbyte(a_hi, rshift));
+      *(vuint8_t *)(dst + d1) = spu_or(spu_slqwbyte(b_lo, lshift), spu_rlmaskqwbyte(b_hi, rshift));
+      *(vuint8_t *)(dst + d2) = spu_or(spu_slqwbyte(c_lo, lshift), spu_rlmaskqwbyte(c_hi, rshift));
+      *(vuint8_t *)(dst + d3) = spu_or(spu_slqwbyte(d_lo, lshift), spu_rlmaskqwbyte(d_hi, rshift));
+
+      src += s4;
+      dst += d4;
+    }
+}
+
+// Blend h rows of 16 source bytes into dst with spu_avg, handling four rows per loop pass.
+void avg_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
+{
+    // The src misalignment is sampled once, so src_stride must keep it constant
+    // (the original note requires line_size % 16 == 0). Rows are consumed in
+    // groups of four, so a full group is processed even if h is not a multiple of 4.
+    // dst is read and written with plain quadword accesses.
+    const int lshift = (unsigned int) src & 15;
+    const int rshift = lshift - 16;
+
+    // byte offsets of rows 1..3 and of the next group of four
+    const int s1 = src_stride,      s2 = src_stride << 1;
+    const int s3 = s1 + s2,         s4 = src_stride << 2;
+    const int d1 = dst_stride,      d2 = dst_stride << 1;
+    const int d3 = d2 + d1,         d4 = dst_stride << 2;
+    int row;
+
+    for(row = 0; row < h; row += 4) {
+      // two quadwords per row straddle the unaligned source
+      const vuint8_t a_lo = *(vuint8_t *)(src);
+      const vuint8_t a_hi = *(vuint8_t *)(src + 16);
+      const vuint8_t b_lo = *(vuint8_t *)(src + s1);
+      const vuint8_t b_hi = *(vuint8_t *)(src + 16 + s1);
+      const vuint8_t c_lo = *(vuint8_t *)(src + s2);
+      const vuint8_t c_hi = *(vuint8_t *)(src + 16 + s2);
+      const vuint8_t d_lo = *(vuint8_t *)(src + s3);
+      const vuint8_t d_hi = *(vuint8_t *)(src + 16 + s3);
+
+      // splice each pair, average it with the current dst quadword and store;
+      // each row's dst is read immediately before it is overwritten
+      *(vuint8_t *) dst       = spu_avg(spu_or(spu_slqwbyte(a_lo, lshift), spu_rlmaskqwbyte(a_hi, rshift)), *(vuint8_t *) dst);
+      *(vuint8_t *)(dst + d1) = spu_avg(spu_or(spu_slqwbyte(b_lo, lshift), spu_rlmaskqwbyte(b_hi, rshift)), *(vuint8_t *)(dst + d1));
+      *(vuint8_t *)(dst + d2) = spu_avg(spu_or(spu_slqwbyte(c_lo, lshift), spu_rlmaskqwbyte(c_hi, rshift)), *(vuint8_t *)(dst + d2));
+      *(vuint8_t *)(dst + d3) = spu_avg(spu_or(spu_slqwbyte(d_lo, lshift), spu_rlmaskqwbyte(d_hi, rshift)), *(vuint8_t *)(dst + d3));
+
+      src += s4;
+      dst += d4;
+    }
+}
+
+void put_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
+                        int dst_stride, int src_stride1, int h)
+{
+  /* For each of h rows: average a row of src1 with a row of src2 and merge the first
+   * eight averaged bytes into one half of the destination quadword; the other half
+   * of that quadword is written back unchanged. */
+  const int head_shift = (unsigned int) src1 & 15;  /* sampled once, before the loop */
+  const int tail_shift = head_shift - 16;
+  const int dst_offset = (unsigned int) dst & 15;
+  int row;
+
+  /* Shuffle patterns: indices 0x00-0x0F keep bytes of the old dst quadword,
+   * indices 0x10-0x17 insert the first eight bytes of the averaged row. */
+  const vuint8_t into_low_half  = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t into_high_half = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
+
+  /* 8-wide dst blocks sit at offset 0 or 8 in their quadword (per the original note);
+   * any non-zero offset selects the high-half pattern. */
+  vuint8_t merge = into_high_half;
+  if (dst_offset == 0){
+    merge = into_low_half;
+  }
+
+  for (row = 0; row < h; row++){
+      /* src1 may be unaligned: fetch two quadwords and splice them */
+      const vuint8_t lo_qw = *(vuint8_t *)(src1);
+      const vuint8_t hi_qw = *(vuint8_t *)(src1+16);
+      const vuint8_t row_a = spu_or(spu_slqwbyte(lo_qw, head_shift),
+                                    spu_rlmaskqwbyte(hi_qw, tail_shift));
+
+      /* src2 rows are 16 bytes apart and read with a plain quadword load */
+      const vuint8_t row_b = *(vuint8_t *)(src2);
+      const vuint8_t mean  = spu_avg(row_a, row_b);
+
+      /* read-modify-write of the destination quadword */
+      const vuint8_t old_qw = *(vuint8_t *)dst;
+      *(vuint8_t *)dst = spu_shuffle(old_qw, mean, merge);
+
+      src1 += src_stride1;
+      src2 += 16;
+      dst  += dst_stride;
+  }
+}
+
+void avg_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
+                        int dst_stride, int src_stride1, int h)
+{
+  /* For each of h rows: average a row of src1 with a row of src2, merge the first
+   * eight averaged bytes into one half of the destination quadword, then average
+   * that merged quadword with the old destination quadword. */
+  const int head_shift = (unsigned int) src1 & 15;  /* sampled once, before the loop */
+  const int tail_shift = head_shift - 16;
+  const int dst_offset = (unsigned int) dst & 15;
+  int row;
+
+  /* Shuffle patterns: indices 0x00-0x0F keep bytes of the old dst quadword,
+   * indices 0x10-0x17 insert the first eight bytes of the averaged row. */
+  const vuint8_t into_low_half  = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t into_high_half = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
+
+  /* 8-wide dst blocks sit at offset 0 or 8 in their quadword (per the original note);
+   * any non-zero offset selects the high-half pattern. */
+  vuint8_t merge = into_high_half;
+  if (dst_offset == 0){
+    merge = into_low_half;
+  }
+
+  for (row = 0; row < h; row++){
+      /* src1 may be unaligned: fetch two quadwords and splice them */
+      const vuint8_t lo_qw = *(vuint8_t *)(src1);
+      const vuint8_t hi_qw = *(vuint8_t *)(src1+16);
+      const vuint8_t row_a = spu_or(spu_slqwbyte(lo_qw, head_shift),
+                                    spu_rlmaskqwbyte(hi_qw, tail_shift));
+
+      /* src2 rows are 16 bytes apart and read with a plain quadword load */
+      const vuint8_t row_b = *(vuint8_t *)(src2);
+      const vuint8_t mean  = spu_avg(row_a, row_b);
+
+      /* Merge into the old quadword, then blend with it: lanes kept from old_qw are
+       * averaged with themselves, which should leave them unchanged. */
+      const vuint8_t old_qw = *(vuint8_t *)dst;
+      const vuint8_t merged = spu_shuffle(old_qw, mean, merge);
+      *(vuint8_t *)dst = spu_avg(old_qw, merged);
+
+      src1 += src_stride1;
+      src2 += 16;
+      dst  += dst_stride;
+  }
+}
+
+// Store 8 source bytes per row into one half of each destination quadword.
+void put_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
+{
+    // The src and dst misalignments are sampled once, so the strides must keep them
+    // constant (the original note requires line_size % 16 == 0). Rows are consumed in
+    // groups of four, so a full group is processed even if h is not a multiple of 4.
+    const int lshift = (unsigned int) src & 15;
+    const int rshift = lshift - 16;
+    const int dst_offset = (unsigned int) dst & 15;
+
+    // Shuffle patterns: indices 0x00-0x0F keep bytes of the old dst quadword,
+    // indices 0x10-0x17 insert the first eight bytes of the spliced source row.
+    const vuint8_t into_low_half  = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+    const vuint8_t into_high_half = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
+
+    // 8-wide dst blocks sit at offset 0 or 8 in their quadword (per the original note);
+    // any non-zero offset selects the high-half pattern.
+    vuint8_t merge = into_high_half;
+    if(dst_offset == 0){
+      merge = into_low_half;
+    }
+
+    // byte offsets of rows 1..3 and of the next group of four
+    const int s1 = src_stride,      s2 = src_stride << 1;
+    const int s3 = s1 + s2,         s4 = src_stride << 2;
+    const int d1 = dst_stride,      d2 = dst_stride << 1;
+    const int d3 = d2 + d1,         d4 = dst_stride << 2;
+    int row;
+
+    for(row = 0; row < h; row += 4) {
+      // two quadwords per row straddle the unaligned source
+      const vuint8_t a_lo = *(vuint8_t *)(src);
+      const vuint8_t a_hi = *(vuint8_t *)(src + 16);
+      const vuint8_t b_lo = *(vuint8_t *)(src + s1);
+      const vuint8_t b_hi = *(vuint8_t *)(src + 16 + s1);
+      const vuint8_t c_lo = *(vuint8_t *)(src + s2);
+      const vuint8_t c_hi = *(vuint8_t *)(src + 16 + s2);
+      const vuint8_t d_lo = *(vuint8_t *)(src + s3);
+      const vuint8_t d_hi = *(vuint8_t *)(src + 16 + s3);
+
+      // read each destination quadword and merge the spliced row into it;
+      // nothing is stored until all four rows have been built
+      const vuint8_t old0 = *(vuint8_t *) dst;
+      const vuint8_t new0 = spu_shuffle(old0, spu_or(spu_slqwbyte(a_lo, lshift), spu_rlmaskqwbyte(a_hi, rshift)), merge);
+      const vuint8_t old1 = *(vuint8_t *)(dst + d1);
+      const vuint8_t new1 = spu_shuffle(old1, spu_or(spu_slqwbyte(b_lo, lshift), spu_rlmaskqwbyte(b_hi, rshift)), merge);
+      const vuint8_t old2 = *(vuint8_t *)(dst + d2);
+      const vuint8_t new2 = spu_shuffle(old2, spu_or(spu_slqwbyte(c_lo, lshift), spu_rlmaskqwbyte(c_hi, rshift)), merge);
+      const vuint8_t old3 = *(vuint8_t *)(dst + d3);
+      const vuint8_t new3 = spu_shuffle(old3, spu_or(spu_slqwbyte(d_lo, lshift), spu_rlmaskqwbyte(d_hi, rshift)), merge);
+
+      // write the four merged quadwords back
+      *(vuint8_t *) dst       = new0;
+      *(vuint8_t *)(dst + d1) = new1;
+      *(vuint8_t *)(dst + d2) = new2;
+      *(vuint8_t *)(dst + d3) = new3;
+
+      src += s4;
+      dst += d4;
+    }
+}
+
+// Blend 8 source bytes per row into one half of each destination quadword with spu_avg.
+void avg_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
+{
+    // The src and dst misalignments are sampled once, so the strides must keep them
+    // constant (the original note requires line_size % 16 == 0). Rows are consumed in
+    // groups of four, so a full group is processed even if h is not a multiple of 4.
+    const int lshift = (unsigned int) src & 15;
+    const int rshift = lshift - 16;
+    const int dst_offset = (unsigned int) dst & 15;
+
+    // Shuffle patterns: indices 0x00-0x0F keep bytes of the old dst quadword,
+    // indices 0x10-0x17 insert the first eight bytes of the spliced source row.
+    const vuint8_t into_low_half  = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+    const vuint8_t into_high_half = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
+
+    // 8-wide dst blocks sit at offset 0 or 8 in their quadword (per the original note);
+    // any non-zero offset selects the high-half pattern.
+    vuint8_t merge = into_high_half;
+    if(dst_offset == 0){
+      merge = into_low_half;
+    }
+
+    // byte offsets of rows 1..3 and of the next group of four
+    const int s1 = src_stride,      s2 = src_stride << 1;
+    const int s3 = s1 + s2,         s4 = src_stride << 2;
+    const int d1 = dst_stride,      d2 = dst_stride << 1;
+    const int d3 = d2 + d1,         d4 = dst_stride << 2;
+    int row;
+
+    for(row = 0; row < h; row += 4) {
+      // two quadwords per row straddle the unaligned source
+      const vuint8_t a_lo = *(vuint8_t *)(src);
+      const vuint8_t a_hi = *(vuint8_t *)(src + 16);
+      const vuint8_t b_lo = *(vuint8_t *)(src + s1);
+      const vuint8_t b_hi = *(vuint8_t *)(src + 16 + s1);
+      const vuint8_t c_lo = *(vuint8_t *)(src + s2);
+      const vuint8_t c_hi = *(vuint8_t *)(src + 16 + s2);
+      const vuint8_t d_lo = *(vuint8_t *)(src + s3);
+      const vuint8_t d_hi = *(vuint8_t *)(src + 16 + s3);
+
+      // per row: read dst, merge the spliced source into it, then average the merged
+      // quadword with the old one; nothing is stored until all four rows are built
+      const vuint8_t old0 = *(vuint8_t *) dst;
+      const vuint8_t mix0 = spu_shuffle(old0, spu_or(spu_slqwbyte(a_lo, lshift), spu_rlmaskqwbyte(a_hi, rshift)), merge);
+      const vuint8_t new0 = spu_avg(old0, mix0);
+
+      const vuint8_t old1 = *(vuint8_t *)(dst + d1);
+      const vuint8_t mix1 = spu_shuffle(old1, spu_or(spu_slqwbyte(b_lo, lshift), spu_rlmaskqwbyte(b_hi, rshift)), merge);
+      const vuint8_t new1 = spu_avg(old1, mix1);
+
+      const vuint8_t old2 = *(vuint8_t *)(dst + d2);
+      const vuint8_t mix2 = spu_shuffle(old2, spu_or(spu_slqwbyte(c_lo, lshift), spu_rlmaskqwbyte(c_hi, rshift)), merge);
+      const vuint8_t new2 = spu_avg(old2, mix2);
+
+      const vuint8_t old3 = *(vuint8_t *)(dst + d3);
+      const vuint8_t mix3 = spu_shuffle(old3, spu_or(spu_slqwbyte(d_lo, lshift), spu_rlmaskqwbyte(d_hi, rshift)), merge);
+      const vuint8_t new3 = spu_avg(old3, mix3);
+
+      // write the four blended quadwords back
+      *(vuint8_t *) dst       = new0;
+      *(vuint8_t *)(dst + d1) = new1;
+      *(vuint8_t *)(dst + d2) = new2;
+      *(vuint8_t *)(dst + d3) = new3;
+
+      src += s4;
+      dst += d4;
+    }
+}
+
+void put_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
+                        int dst_stride, int src_stride1, int h)
+{
+  /* For each of h rows: average a row of src1 with a row of src2 and merge the first
+   * four averaged bytes into one 4-byte slot of the destination quadword. */
+  const int head_shift = (unsigned int) src1 & 15;  /* sampled once, before the loop */
+  const int tail_shift = head_shift - 16;
+  const int dst_offset = (unsigned int) dst & 15;
+  int row;
+
+  /* Shuffle patterns: indices 0x00-0x0F keep bytes of the old dst quadword,
+   * indices 0x10-0x13 insert the first four bytes of the averaged row. */
+  const vuint8_t at_byte0  = {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t at_byte4  = {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t at_byte8  = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t at_byte12 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
+
+  /* 4-wide dst blocks sit at offset 0, 4, 8 or 12 (per the original note);
+   * for any other offset the all-zero pattern is left in place. */
+  vuint8_t merge = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  if (dst_offset == 0){
+    merge = at_byte0;
+  } else if (dst_offset == 4){
+    merge = at_byte4;
+  } else if (dst_offset == 8){
+    merge = at_byte8;
+  } else if (dst_offset == 12){
+    merge = at_byte12;
+  }
+
+  for (row = 0; row < h; row++){
+      /* src1 may be unaligned: fetch two quadwords and splice them */
+      const vuint8_t lo_qw = *(vuint8_t *)(src1);
+      const vuint8_t hi_qw = *(vuint8_t *)(src1+16);
+      const vuint8_t row_a = spu_or(spu_slqwbyte(lo_qw, head_shift),
+                                    spu_rlmaskqwbyte(hi_qw, tail_shift));
+
+      /* src2 rows are 16 bytes apart and read with a plain quadword load */
+      const vuint8_t row_b = *(vuint8_t *)(src2);
+      const vuint8_t mean  = spu_avg(row_a, row_b);
+
+      /* read-modify-write of the destination quadword */
+      const vuint8_t old_qw = *(vuint8_t *)dst;
+      *(vuint8_t *)dst = spu_shuffle(old_qw, mean, merge);
+      src1 += src_stride1;
+      src2 += 16;
+      dst  += dst_stride;
+  }
+}
+
+void avg_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
+                        int dst_stride, int src_stride1, int h)
+{
+  /* For each of h rows: average a row of src1 with a row of src2, merge the first four
+   * averaged bytes into the destination quadword, then average that with the old one. */
+  const int head_shift = (unsigned int) src1 & 15;  /* sampled once, before the loop */
+  const int tail_shift = head_shift - 16;
+  const int dst_offset = (unsigned int) dst & 15;
+  int row;
+
+  /* Shuffle patterns: indices 0x00-0x0F keep bytes of the old dst quadword,
+   * indices 0x10-0x13 insert the first four bytes of the averaged row. */
+  const vuint8_t at_byte0  = {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t at_byte4  = {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t at_byte8  = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t at_byte12 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
+
+  /* 4-wide dst blocks sit at offset 0, 4, 8 or 12 (per the original note);
+   * for any other offset the all-zero pattern is left in place. */
+  vuint8_t merge = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  if (dst_offset == 0){
+    merge = at_byte0;
+  } else if (dst_offset == 4){
+    merge = at_byte4;
+  } else if (dst_offset == 8){
+    merge = at_byte8;
+  } else if (dst_offset == 12){
+    merge = at_byte12;
+  }
+
+  for (row = 0; row < h; row++){
+      /* src1 may be unaligned: fetch two quadwords and splice them */
+      const vuint8_t lo_qw = *(vuint8_t *)(src1);
+      const vuint8_t hi_qw = *(vuint8_t *)(src1+16);
+      const vuint8_t row_a = spu_or(spu_slqwbyte(lo_qw, head_shift),
+                                    spu_rlmaskqwbyte(hi_qw, tail_shift));
+
+      /* src2 rows are 16 bytes apart and read with a plain quadword load */
+      const vuint8_t row_b = *(vuint8_t *)(src2);
+      const vuint8_t mean  = spu_avg(row_a, row_b);
+
+      /* Merge into the old quadword, then blend with it: lanes kept from old_qw are
+       * averaged with themselves, which should leave them unchanged. */
+      const vuint8_t old_qw = *(vuint8_t *)dst;
+      const vuint8_t merged = spu_shuffle(old_qw, mean, merge);
+      *(vuint8_t *)dst = spu_avg(old_qw, merged);
+      src1 += src_stride1;
+      src2 += 16;
+      dst  += dst_stride;
+  }
+}
+
+// Store 4 source bytes per row into one 4-byte slot of each destination quadword.
+void put_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
+{
+    // The src and dst misalignments are sampled once, so the strides must keep them constant
+    // (the original note requires line_size % 16 == 0). Rows are consumed in groups of four.
+    const int lshift = (unsigned int) src & 15;
+    const int rshift = lshift - 16;
+    const int dst_offset = (unsigned int) dst & 15;
+
+    // Shuffle patterns: indices 0x00-0x0F keep bytes of the old dst quadword,
+    // indices 0x10-0x13 insert the first four bytes of the spliced source row.
+    const vuint8_t at_byte0  = {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+    const vuint8_t at_byte4  = {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+    const vuint8_t at_byte8  = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
+    const vuint8_t at_byte12 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
+
+    // 4-wide dst blocks sit at offset 0, 4, 8 or 12 (per the original note);
+    // for any other offset the all-zero pattern is left in place.
+    vuint8_t merge = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    if(dst_offset == 0){
+      merge = at_byte0;
+    } else if(dst_offset == 4){
+      merge = at_byte4;
+    } else if(dst_offset == 8){
+      merge = at_byte8;
+    } else if(dst_offset == 12){
+      merge = at_byte12;
+    }
+
+    // byte offsets of rows 1..3 and of the next group of four
+    const int s1 = src_stride,      s2 = src_stride << 1;
+    const int s3 = s1 + s2,         s4 = src_stride << 2;
+    const int d1 = dst_stride,      d2 = dst_stride << 1;
+    const int d3 = d2 + d1,         d4 = dst_stride << 2;
+    int row;
+
+    for(row = 0; row < h; row += 4) {
+      // two quadwords per row straddle the unaligned source
+      const vuint8_t a_lo = *(vuint8_t *)(src);
+      const vuint8_t a_hi = *(vuint8_t *)(src + 16);
+      const vuint8_t b_lo = *(vuint8_t *)(src + s1);
+      const vuint8_t b_hi = *(vuint8_t *)(src + 16 + s1);
+      const vuint8_t c_lo = *(vuint8_t *)(src + s2);
+      const vuint8_t c_hi = *(vuint8_t *)(src + 16 + s2);
+      const vuint8_t d_lo = *(vuint8_t *)(src + s3);
+      const vuint8_t d_hi = *(vuint8_t *)(src + 16 + s3);
+
+      // read each destination quadword and merge the spliced row into it;
+      // nothing is stored until all four rows have been built
+      const vuint8_t old0 = *(vuint8_t *) dst;
+      const vuint8_t new0 = spu_shuffle(old0, spu_or(spu_slqwbyte(a_lo, lshift), spu_rlmaskqwbyte(a_hi, rshift)), merge);
+      const vuint8_t old1 = *(vuint8_t *)(dst + d1);
+      const vuint8_t new1 = spu_shuffle(old1, spu_or(spu_slqwbyte(b_lo, lshift), spu_rlmaskqwbyte(b_hi, rshift)), merge);
+      const vuint8_t old2 = *(vuint8_t *)(dst + d2);
+      const vuint8_t new2 = spu_shuffle(old2, spu_or(spu_slqwbyte(c_lo, lshift), spu_rlmaskqwbyte(c_hi, rshift)), merge);
+      const vuint8_t old3 = *(vuint8_t *)(dst + d3);
+      const vuint8_t new3 = spu_shuffle(old3, spu_or(spu_slqwbyte(d_lo, lshift), spu_rlmaskqwbyte(d_hi, rshift)), merge);
+
+      *(vuint8_t *) dst       = new0;
+      *(vuint8_t *)(dst + d1) = new1;
+      *(vuint8_t *)(dst + d2) = new2;
+      *(vuint8_t *)(dst + d3) = new3;
+
+      src += s4;
+      dst += d4;
+    }
+}
+
+// Blend 4 source bytes per row into one 4-byte slot of each destination quadword with spu_avg.
+void avg_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
+{
+    // The src and dst misalignments are sampled once, so the strides must keep them constant
+    // (the original note requires line_size % 16 == 0). Rows are consumed in groups of four.
+    const int lshift = (unsigned int) src & 15;
+    const int rshift = lshift - 16;
+    const int dst_offset = (unsigned int) dst & 15;
+
+    // Shuffle patterns: indices 0x00-0x0F keep bytes of the old dst quadword,
+    // indices 0x10-0x13 insert the first four bytes of the spliced source row.
+    const vuint8_t at_byte0  = {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+    const vuint8_t at_byte4  = {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+    const vuint8_t at_byte8  = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
+    const vuint8_t at_byte12 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
+
+    // 4-wide dst blocks sit at offset 0, 4, 8 or 12 (per the original note);
+    // for any other offset the all-zero pattern is left in place.
+    vuint8_t merge = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    if(dst_offset == 0){
+      merge = at_byte0;
+    } else if(dst_offset == 4){
+      merge = at_byte4;
+    } else if(dst_offset == 8){
+      merge = at_byte8;
+    } else if(dst_offset == 12){
+      merge = at_byte12;
+    }
+
+    // byte offsets of rows 1..3 and of the next group of four
+    const int s1 = src_stride,      s2 = src_stride << 1;
+    const int s3 = s1 + s2,         s4 = src_stride << 2;
+    const int d1 = dst_stride,      d2 = dst_stride << 1;
+    const int d3 = d2 + d1,         d4 = dst_stride << 2;
+    int row;
+
+    for(row = 0; row < h; row += 4) {
+      // two quadwords per row straddle the unaligned source
+      const vuint8_t a_lo = *(vuint8_t *)(src);
+      const vuint8_t a_hi = *(vuint8_t *)(src + 16);
+      const vuint8_t b_lo = *(vuint8_t *)(src + s1);
+      const vuint8_t b_hi = *(vuint8_t *)(src + 16 + s1);
+      const vuint8_t c_lo = *(vuint8_t *)(src + s2);
+      const vuint8_t c_hi = *(vuint8_t *)(src + 16 + s2);
+      const vuint8_t d_lo = *(vuint8_t *)(src + s3);
+      const vuint8_t d_hi = *(vuint8_t *)(src + 16 + s3);
+
+      // per row: read dst, merge the spliced source into it, then average the merged
+      // quadword with the old one; nothing is stored until all four rows are built
+      const vuint8_t old0 = *(vuint8_t *) dst;
+      const vuint8_t mix0 = spu_shuffle(old0, spu_or(spu_slqwbyte(a_lo, lshift), spu_rlmaskqwbyte(a_hi, rshift)), merge);
+      const vuint8_t new0 = spu_avg(old0, mix0);
+
+      const vuint8_t old1 = *(vuint8_t *)(dst + d1);
+      const vuint8_t mix1 = spu_shuffle(old1, spu_or(spu_slqwbyte(b_lo, lshift), spu_rlmaskqwbyte(b_hi, rshift)), merge);
+      const vuint8_t new1 = spu_avg(old1, mix1);
+
+      const vuint8_t old2 = *(vuint8_t *)(dst + d2);
+      const vuint8_t mix2 = spu_shuffle(old2, spu_or(spu_slqwbyte(c_lo, lshift), spu_rlmaskqwbyte(c_hi, rshift)), merge);
+      const vuint8_t new2 = spu_avg(old2, mix2);
+
+      const vuint8_t old3 = *(vuint8_t *)(dst + d3);
+      const vuint8_t mix3 = spu_shuffle(old3, spu_or(spu_slqwbyte(d_lo, lshift), spu_rlmaskqwbyte(d_hi, rshift)), merge);
+      const vuint8_t new3 = spu_avg(old3, mix3);
+
+      *(vuint8_t *) dst       = new0;
+      *(vuint8_t *)(dst + d1) = new1;
+      *(vuint8_t *)(dst + d2) = new2;
+      *(vuint8_t *)(dst + d3) = new3;
+
+      src += s4;
+      dst += d4;
+    }
+}
+
+/* Instantiate all interpolation modes of the H.264 motion compensation stage for luma (put_ and avg_, block sizes 16, 8 and 4) */
+  H264_MC(put_, 16, spu)
+  H264_MC(put_, 8, spu)
+  H264_MC(put_, 4, spu)
+
+  H264_MC(avg_, 16, spu)
+  H264_MC(avg_, 8, spu)
+  H264_MC(avg_, 4, spu)
+
+
+// Chroma interpolation: the template below is included twice, once per OP_U8_SPU operation (PUT_OP_U8_SPU, then AVG_OP_U8_SPU).
+// The PREFIX_* macros carry the put_/avg_ function names that dsputil_h264_init_cell stores in the chroma tables.
+#define OP_U8_SPU                          PUT_OP_U8_SPU
+#define PREFIX_h264_chroma_mc8_spu         put_h264_chroma_mc8_spu
+#define PREFIX_h264_chroma_mc4_spu         put_h264_chroma_mc4_spu
+#define PREFIX_h264_chroma_mc2_spu         put_h264_chroma_mc2_spu
+#include "h264_chroma_template_spu.c"
+#undef OP_U8_SPU
+#undef PREFIX_h264_chroma_mc8_spu
+#undef PREFIX_h264_chroma_mc4_spu
+#undef PREFIX_h264_chroma_mc2_spu
+
+#define OP_U8_SPU                          AVG_OP_U8_SPU
+#define PREFIX_h264_chroma_mc8_spu         avg_h264_chroma_mc8_spu
+#define PREFIX_h264_chroma_mc4_spu         avg_h264_chroma_mc4_spu
+#define PREFIX_h264_chroma_mc2_spu         avg_h264_chroma_mc2_spu
+#include "h264_chroma_template_spu.c"
+#undef OP_U8_SPU
+#undef PREFIX_h264_chroma_mc8_spu
+#undef PREFIX_h264_chroma_mc4_spu
+#undef PREFIX_h264_chroma_mc2_spu
+
+// Weight and biweight functions (plain C). H264_WEIGHT(W,H) defines weight_h264_pixelsWxH_c, which rescales dst in place via op_scale1,
+// and biweight_h264_pixelsWxH_c, which blends src into dst via op_scale2; "if(W==N) continue" ends a row after N pixels.
+#define op_scale1(x)  dst[x] = av_clip_uint8( (dst[x]*weight + offset) >> log2_denom )
+#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
+#define H264_WEIGHT(W,H) \
+static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
+    int y; \
+    offset <<= log2_denom; \
+    if(log2_denom) offset += 1<<(log2_denom-1); /* rounding term for the >> log2_denom in op_scale1 */ \
+    for(y=0; y<H; y++, dst += stride){ \
+        op_scale1(0); \
+        op_scale1(1); \
+        if(W==2) continue; \
+        op_scale1(2); \
+        op_scale1(3); \
+        if(W==4) continue; \
+        op_scale1(4); \
+        op_scale1(5); \
+        op_scale1(6); \
+        op_scale1(7); \
+        if(W==8) continue; \
+        op_scale1(8); \
+        op_scale1(9); \
+        op_scale1(10); \
+        op_scale1(11); \
+        op_scale1(12); \
+        op_scale1(13); \
+        op_scale1(14); \
+        op_scale1(15); \
+    } \
+} \
+static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, int weights, int offset){ \
+    int y; \
+    offset = ((offset + 1) | 1) << log2_denom; \
+    for(y=0; y<H; y++, dst += dst_stride, src += src_stride){ \
+        op_scale2(0); \
+        op_scale2(1); \
+        if(W==2) continue; \
+        op_scale2(2); \
+        op_scale2(3); \
+        if(W==4) continue; \
+        op_scale2(4); \
+        op_scale2(5); \
+        op_scale2(6); \
+        op_scale2(7); \
+        if(W==8) continue; \
+        op_scale2(8); \
+        op_scale2(9); \
+        op_scale2(10); \
+        op_scale2(11); \
+        op_scale2(12); \
+        op_scale2(13); \
+        op_scale2(14); \
+        op_scale2(15); \
+    } \
+}
+/* One instantiation per block size stored in the weight/biweight tables by dsputil_h264_init_cell */
+H264_WEIGHT(16,16)
+H264_WEIGHT(16,8)
+H264_WEIGHT(8,16)
+H264_WEIGHT(8,8)
+H264_WEIGHT(8,4)
+H264_WEIGHT(4,8)
+H264_WEIGHT(4,4)
+H264_WEIGHT(4,2)
+H264_WEIGHT(2,4)
+H264_WEIGHT(2,2)
+
+#undef op_scale1
+#undef op_scale2
+#undef H264_WEIGHT
+
+/////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Luma edge filter with per-group clipping: the edge is handled as four groups
+ * of four lines; group g is skipped when tc0[g] < 0, otherwise tc0[g] bounds
+ * the p1/q1 corrections and (plus one per corrected side) the p0/q0 correction.
+ */
+static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
+{
+    int group;
+    for (group = 0; group < 4; group++) {
+        const int tc_base = tc0[group];
+        int line;
+        if (tc_base < 0) {
+            pix += 4 * ystride;
+            continue;
+        }
+        for (line = 0; line < 4; line++, pix += ystride) {
+            const int p2 = pix[-3 * xstride], p1 = pix[-2 * xstride], p0 = pix[-1 * xstride];
+            const int q0 = pix[0],            q1 = pix[ 1 * xstride], q2 = pix[ 2 * xstride];
+            const int mid = (p0 + q0 + 1) >> 1;
+            int tc = tc_base;
+            int delta;
+
+            /* leave the line alone unless all three activity checks pass */
+            if (FFABS(p0 - q0) >= alpha || FFABS(p1 - p0) >= beta || FFABS(q1 - q0) >= beta)
+                continue;
+
+            if (FFABS(p2 - p0) < beta) {
+                pix[-2 * xstride] = p1 + av_clip(((p2 + mid) >> 1) - p1, -tc_base, tc_base);    /* p1' */
+                tc++;
+            }
+            if (FFABS(q2 - q0) < beta) {
+                pix[xstride] = q1 + av_clip(((q2 + mid) >> 1) - q1, -tc_base, tc_base);         /* q1' */
+                tc++;
+            }
+            delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
+            pix[-xstride] = av_clip_uint8(p0 + delta);    /* p0' */
+            pix[0]        = av_clip_uint8(q0 - delta);    /* q0' */
+        }
+    }
+}
+static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); /* xstride=stride, ystride=1: p/q samples lie in neighbouring rows, successive lines are adjacent bytes */
+}
+static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); /* xstride=1, ystride=stride: p/q samples lie in the same row, successive lines are one row apart */
+}
+
+/*
+ * Intra luma edge filter over the 16 lines of one edge (no tc clipping).
+ *
+ * A line is left untouched unless the alpha/beta activity checks pass. When
+ * the step across the edge is also below (alpha >> 2) + 2, a side whose
+ * outer sample is close to its edge sample (difference < beta) gets the
+ * strong filter that rewrites three samples; in every other case only the
+ * sample next to the edge is smoothed. All results are computed from the
+ * sample values read at the top of the iteration.
+ */
+static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
+{
+    int line;
+    for (line = 0; line < 16; line++, pix += ystride) {
+        const int p2 = pix[-3 * xstride], p1 = pix[-2 * xstride], p0 = pix[-1 * xstride];
+        const int q0 = pix[ 0 * xstride], q1 = pix[ 1 * xstride], q2 = pix[ 2 * xstride];
+        int small_step;
+
+        if (FFABS(p0 - q0) >= alpha ||
+            FFABS(p1 - p0) >= beta  ||
+            FFABS(q1 - q0) >= beta)
+            continue;
+
+        small_step = FFABS(p0 - q0) < ((alpha >> 2) + 2);
+
+        /* p side */
+        if (small_step && FFABS(p2 - p0) < beta) {
+            /* the strong filter needs one more sample on this side */
+            const int p3 = pix[-4 * xstride];
+            pix[-1 * xstride] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;    /* p0' */
+            pix[-2 * xstride] = (p2 + p1 + p0 + q0 + 2) >> 2;                      /* p1' */
+            pix[-3 * xstride] = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3;         /* p2' */
+        } else {
+            pix[-1 * xstride] = (2 * p1 + p0 + q1 + 2) >> 2;                       /* p0' */
+        }
+
+        /* q side */
+        if (small_step && FFABS(q2 - q0) < beta) {
+            /* the strong filter needs one more sample on this side */
+            const int q3 = pix[3 * xstride];
+            pix[0 * xstride] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;      /* q0' */
+            pix[1 * xstride] = (p0 + q0 + q1 + q2 + 2) >> 2;                       /* q1' */
+            pix[2 * xstride] = (2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3;          /* q2' */
+        } else {
+            pix[0 * xstride] = (2 * q1 + q0 + p1 + 2) >> 2;                        /* q0' */
+        }
+    }
+}
+static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta); /* xstride=stride, ystride=1: p/q samples lie in neighbouring rows, successive lines are adjacent bytes */
+}
+static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta); /* xstride=1, ystride=stride: p/q samples lie in the same row, successive lines are one row apart */
+}
+
+/*
+ * Chroma edge filter with per-group clipping: four groups of two lines each;
+ * group g clips its p0/q0 correction to +/-tc0[g] and is skipped when tc0[g] <= 0.
+ */
+static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
+{
+    int group;
+    for (group = 0; group < 4; group++) {
+        const int tc = tc0[group];
+        int line;
+        if (tc <= 0) {
+            pix += 2 * ystride;
+            continue;
+        }
+        for (line = 0; line < 2; line++, pix += ystride) {
+            const int p1 = pix[-2 * xstride], p0 = pix[-1 * xstride];
+            const int q0 = pix[0],            q1 = pix[ 1 * xstride];
+            int delta;
+
+            if (FFABS(p0 - q0) >= alpha || FFABS(p1 - p0) >= beta || FFABS(q1 - q0) >= beta)
+                continue;
+
+            delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
+            pix[-xstride] = av_clip_uint8(p0 + delta);    /* p0' */
+            pix[0]        = av_clip_uint8(q0 - delta);    /* q0' */
+        }
+    }
+}
+static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0); /* xstride=stride, ystride=1: p/q samples lie in neighbouring rows, successive lines are adjacent bytes */
+}
+static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0); /* xstride=1, ystride=stride: p/q samples lie in the same row, successive lines are one row apart */
+}
+
+/*
+ * Intra chroma edge filter: for each of the 8 lines along the edge, smooth
+ * p0 and q0 when the alpha/beta activity checks pass. No tc clipping is used.
+ */
+static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
+{
+    int line;
+    for (line = 0; line < 8; line++, pix += ystride) {
+        const int p1 = pix[-2 * xstride], p0 = pix[-1 * xstride];
+        const int q0 = pix[0],            q1 = pix[ 1 * xstride];
+
+        if (FFABS(p0 - q0) >= alpha || FFABS(p1 - p0) >= beta || FFABS(q1 - q0) >= beta)
+            continue;
+
+        /* 3-tap smoothing with double weight on the outer sample (p1 resp. q1) */
+        pix[-xstride] = (2 * p1 + p0 + q1 + 2) >> 2;   /* p0' */
+        pix[0]        = (2 * q1 + q0 + p1 + 2) >> 2;   /* q0' */
+    }
+}
+static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); /* xstride=stride, ystride=1: p/q samples lie in neighbouring rows, successive lines are adjacent bytes */
+}
+static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); /* xstride=1, ystride=stride: p/q samples lie in the same row, successive lines are one row apart */
+}
+
+
+void dsputil_h264_init_cell(DSPContext_spu* c) {
+    /* Fill every function pointer of *c; the loop filters and the weight/biweight entries are the plain C (_c) versions. */
+	c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
+    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
+    c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
+    c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
+    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
+    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
+    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
+    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
+
+    c->h264_idct_add[0] = h264_idct8_add_spu;
+    c->h264_idct_add[1] = h264_idct4_add_spu;
+
+    /* chroma tables: index 0/1/2 = mc8/mc4/mc2 */
+    c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_spu;
+    c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_spu;
+    c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_spu;
+    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_spu;
+    c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_spu;
+    c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_spu;
+    /* weight/biweight tables: index order 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4, 4x2, 2x4, 2x2 */
+    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
+    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
+    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
+    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
+    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
+    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
+    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
+    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
+    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
+    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
+    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
+    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
+    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
+    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
+    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
+    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
+    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
+    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
+    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
+    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
+
+    /* dspfunc fills one row of a qpel table: entry X + 4*Y receives PFX<NUM>_mcXY_spu */
+#define dspfunc(PFX, IDX, NUM) \
+    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_spu; \
+    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_spu; \
+    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_spu; \
+    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_spu; \
+    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_spu; \
+    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_spu; \
+    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_spu; \
+    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_spu; \
+    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_spu; \
+    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_spu; \
+    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_spu; \
+    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_spu; \
+    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_spu; \
+    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_spu; \
+    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_spu; \
+    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_spu
+
+    dspfunc(put_h264_qpel, 0, 16);
+    dspfunc(put_h264_qpel, 1, 8);
+    dspfunc(put_h264_qpel, 2, 4);
+
+    dspfunc(avg_h264_qpel, 0, 16);
+    dspfunc(avg_h264_qpel, 1, 8);
+    dspfunc(avg_h264_qpel, 2, 4);
+
+#undef dspfunc
+
+
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/dsputil_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/dsputil_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,34 @@
+#ifndef DSPUTIL_CELL_H
+#define DSPUTIL_CELL_H
+
+#include "types_spu.h"
+
+typedef struct DSPContext_spu {
+	/* Function-pointer table for the H.264 routines; every member is assigned by dsputil_h264_init_cell(). */
+	void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
+    void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
+    /* v/h_loop_filter_luma_intra: align 16 */
+    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
+    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
+    void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
+    void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
+    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
+    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
+	/* [row 0/1/2 = 16/8/4 variants][entry X + 4*Y = the mcXY function], as filled by dsputil_h264_init_cell() */
+	qpel_mc_func put_h264_qpel_pixels_tab[3][16];
+	qpel_mc_func avg_h264_qpel_pixels_tab[3][16];
+	/* index 0/1/2 = mc8/mc4/mc2 */
+	h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
+	h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
+	/* index 0 = idct8, index 1 = idct4 */
+	h264_idct_func h264_idct_add[2];
+	/* index order: 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4, 4x2, 2x4, 2x2 */
+	h264_weight_func weight_h264_pixels_tab[10];
+	h264_biweight_func biweight_h264_pixels_tab[10];
+
+} DSPContext_spu;
+
+/* Assigns every function pointer and table entry of *c; c must point to a writable DSPContext_spu. */
+void dsputil_h264_init_cell(DSPContext_spu* c);
+ 
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_cabac_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_cabac_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,2633 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 cabac decoding.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+#define CELL_SPE
+#include <limits.h>
+#include <stdlib.h>
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+#include "libavcodec/avcodec.h"
+#include "h264_deblock_spu.h"
+#include "h264_pred_spu.h"
+#include "h264_direct_spu.h"
+#include "h264_tables.h"
+#include "mathops_spu.h"
+//#include "libavcodec/h264_data.h"
+#include "cabac_spu.h"
+#include "rectangle_spu.h"
+#include "libavutil/log.h"
+
+//#undef NDEBUG
+#include <assert.h>
+#define INT_BIT (sizeof(int) * 8)
+/* Descriptor of one intra macroblock type, see i_mb_type_info (the CABAC context init tables follow further below) */
+typedef struct IMbInfo{
+    uint16_t type;
+    uint8_t pred_mode;
+    uint8_t cbp;
+} IMbInfo;
+
+extern int bytecount;
+
+static const IMbInfo i_mb_type_info[26]={ /* columns: type, pred_mode, cbp */
+{MB_TYPE_INTRA4x4  , -1, -1}, /* -1 is stored as 255 because pred_mode and cbp are uint8_t */
+{MB_TYPE_INTRA16x16,  2,  0},
+{MB_TYPE_INTRA16x16,  1,  0},
+{MB_TYPE_INTRA16x16,  0,  0},
+{MB_TYPE_INTRA16x16,  3,  0},
+{MB_TYPE_INTRA16x16,  2,  16},
+{MB_TYPE_INTRA16x16,  1,  16},
+{MB_TYPE_INTRA16x16,  0,  16},
+{MB_TYPE_INTRA16x16,  3,  16},
+{MB_TYPE_INTRA16x16,  2,  32},
+{MB_TYPE_INTRA16x16,  1,  32},
+{MB_TYPE_INTRA16x16,  0,  32},
+{MB_TYPE_INTRA16x16,  3,  32},
+{MB_TYPE_INTRA16x16,  2,  15+0},
+{MB_TYPE_INTRA16x16,  1,  15+0},
+{MB_TYPE_INTRA16x16,  0,  15+0},
+{MB_TYPE_INTRA16x16,  3,  15+0},
+{MB_TYPE_INTRA16x16,  2,  15+16},
+{MB_TYPE_INTRA16x16,  1,  15+16},
+{MB_TYPE_INTRA16x16,  0,  15+16},
+{MB_TYPE_INTRA16x16,  3,  15+16},
+{MB_TYPE_INTRA16x16,  2,  15+32},
+{MB_TYPE_INTRA16x16,  1,  15+32},
+{MB_TYPE_INTRA16x16,  0,  15+32},
+{MB_TYPE_INTRA16x16,  3,  15+32},
+{MB_TYPE_INTRA_PCM , -1, -1}, /* stored as 255, 255 as well */
+};
+
+typedef struct PMbInfo{ /* entry type of the p_/b_ (sub) macroblock type tables below */
+    uint16_t type; /* OR of MB_TYPE_* flags; only the low 16 bits are kept */
+    uint8_t partition_count;
+} PMbInfo;
+
+static const PMbInfo p_mb_type_info[5]={ /* columns: MB_TYPE_* flags, partition_count */
+{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0, 4},
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4}, /* NOTE(review): confirm MB_TYPE_REF0 fits the 16-bit type field */
+};
+
+static const PMbInfo p_sub_mb_type_info[4]={ /* columns: MB_TYPE_* flags, partition_count */
+{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
+{MB_TYPE_16x8 |MB_TYPE_P0L0             , 2},
+{MB_TYPE_8x16 |MB_TYPE_P0L0             , 2},
+{MB_TYPE_8x8  |MB_TYPE_P0L0             , 4},
+};
+
+static const PMbInfo b_mb_type_info[23]={ /* columns: MB_TYPE_* flags, partition_count */
+{MB_TYPE_DIRECT2|MB_TYPE_L0L1                                      , 1, },
+{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
+{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
+{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
+{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
+{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
+};
+
+static const PMbInfo b_sub_mb_type_info[13]={ /* columns: MB_TYPE_* flags, partition_count */
+{MB_TYPE_DIRECT2                                                   , 1, },
+{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
+{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
+{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
+{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x8  |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 4, },
+{MB_TYPE_8x8               |MB_TYPE_P0L1             |MB_TYPE_P1L1, 4, },
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
+};
+
+static const int8_t cabac_context_init_I[460][2] = /* one value pair per context index 0..459; the range comments below give the indices */
+{
+    /* 0 - 10 */
+    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
+    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
+    { -6,  53 }, { -1, 54 },  {  7,  51 },
+
+    /* 11 - 23 unused for I */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },
+
+    /* 24 - 39 */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+
+    /* 40 - 53 */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 54 - 59 */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 60 - 69 */
+    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+    { 13, 41 },  { 3, 62 },
+
+    /* 70 -> 87 */
+    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
+    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
+    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
+    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
+    { -12, 115 },{ -16, 122 },
+
+    /* 88 -> 104 */
+    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
+    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
+    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
+    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
+    { -22, 125 },
+
+    /* 105 -> 135 */
+    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
+    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
+    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
+    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
+    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
+    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
+    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
+    { 14, 62 },  { -13, 108 },{ -15, 100 },
+
+    /* 136 -> 165 */
+    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
+    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
+    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
+    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
+    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
+    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
+    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
+    { 0, 62 },   { 12, 72 },
+
+    /* 166 -> 196 */
+    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
+    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
+    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
+    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
+    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
+    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
+    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
+    { 0, 89 },   { 26, -19 }, { 22, -17 },
+
+    /* 197 -> 226 */
+    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
+    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
+    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
+    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
+    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
+    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
+    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
+    { 12, 68 },  { 2, 97 },
+
+    /* 227 -> 251 */
+    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
+    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
+    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
+    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
+    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
+    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
+    { -4, 65 },
+
+    /* 252 -> 275 */
+    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
+    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
+    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
+    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
+    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
+    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
+
+    /* 276 a bit special (not used, bypass is used instead) */
+    { 0, 0 },
+
+    /* 277 -> 307 */
+    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
+    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
+    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
+    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
+    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
+    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
+    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
+    { 9, 64 },   { -12, 104 },{ -11, 97 },
+
+    /* 308 -> 337 */
+    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
+    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
+    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
+    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
+    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
+    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
+    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
+    { 5, 64 },   { 12, 70 },
+
+    /* 338 -> 368 */
+    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
+    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
+    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
+    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
+    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
+    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
+    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
+    { -12, 109 },{ 36, -35 }, { 36, -34 },
+
+    /* 369 -> 398 */
+    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
+    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
+    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
+    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
+    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
+    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
+    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
+    { 29, 39 },  { 19, 66 },
+
+    /* 399 -> 435 */
+    {  31,  21 }, {  31,  31 }, {  25,  50 },
+    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
+    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
+    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
+    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
+    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
+    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
+    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
+    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
+    {   0,  68 }, {  -9,  92 },
+
+    /* 436 -> 459 */
+    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
+    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
+    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
+    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
+    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
+};
+
+static const int8_t cabac_context_init_PB[3][460][2] =
+{
+    /* i_cabac_init_idc == 0 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
+        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
+        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
+        {  17,  50 },
+
+        /* 24 - 39 */
+        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
+        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
+        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
+        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
+
+        /* 40 - 53 */
+        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
+        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
+        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
+        {  -3,  81 }, {   0,  88 },
+
+        /* 54 - 59 */
+        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
+        {  -7,  72 }, {   1,  58 },
+
+        /* 60 - 69 */
+        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
+        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
+        {  13,  41 }, {   3,  62 },
+
+        /* 70 - 87 */
+        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
+        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
+        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
+        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
+        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
+        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
+        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
+        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
+        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
+
+        /* 105 -> 165 */
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
+        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
+        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
+        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
+        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
+        {   9,  69 },
+
+        /* 166 - 226 */
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
+        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
+        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
+        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
+        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
+        {  -9, 108 },
+
+        /* 227 - 275 */
+        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
+        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
+        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
+        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
+        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
+        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
+        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
+        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
+        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
+        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
+        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
+        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
+        {  -8,  85 },
+
+        /* 276 a bit special (not used, bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
+        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
+        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
+        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
+        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
+        {  26,  43 },
+
+        /* 338 - 398 */
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
+        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
+        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
+        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
+        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
+        {  11,  86 },
+
+        /* 399 - 435 */
+        {  12,  40 }, {  11,  51 }, {  14,  59 },
+        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
+        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
+        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
+        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
+        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
+        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
+        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
+        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
+        {  -8,  66 }, {  -8,  76 },
+
+        /* 436 - 459 */
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
+        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
+        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
+    },
+
+    /* i_cabac_init_idc == 1 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
+        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
+        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
+        {  10,  54 },
+
+        /* 24 - 39 */
+        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
+        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
+        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
+        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
+
+        /* 40 - 53 */
+        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
+        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
+        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
+        {  -7,  86 },{  -5,  95 },
+
+        /* 54 - 59 */
+        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
+        {  -5,  72 },{   0,  61 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
+        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
+        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
+        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
+        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
+        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
+        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
+        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
+        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
+
+        /* 105 -> 165 */
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
+        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
+        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
+        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
+        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
+        {   0,  89 },
+
+        /* 166 - 226 */
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
+        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
+        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
+        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
+        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
+        { -10, 116 },
+
+        /* 227 - 275 */
+        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
+        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
+        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
+        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
+        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
+        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
+        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
+        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
+        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
+        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
+        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
+        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
+        {  -4,  78 },
+
+        /* 276 a bit special (not used, bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
+        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
+        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
+        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
+        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
+        {  18,  50 },
+
+        /* 338 - 398 */
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
+        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
+        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
+        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
+        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
+        {  11,  83 },
+
+        /* 399 - 435 */
+        {  25,  32 }, {  21,  49 }, {  21,  54 },
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
+        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
+        {  -4,  67 }, {  -7,  82 },
+
+        /* 436 - 459 */
+        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
+        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
+        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
+        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+    },
+
+    /* i_cabac_init_idc == 2 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
+        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
+        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
+        {  14,  57 },
+
+        /* 24 - 39 */
+        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
+        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
+        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
+        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
+
+        /* 40 - 53 */
+        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
+        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
+        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
+        {  -3,  90 },{  -1,  101 },
+
+        /* 54 - 59 */
+        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
+        {  -7,  50 },{   1,  60 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
+        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
+        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
+        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
+        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
+        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
+        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
+        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
+        {   3,  68 }, {  -8,  71 }, { -13,  98 },
+
+        /* 105 -> 165 */
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
+        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
+        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
+        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
+        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
+        { -22, 127 },
+
+        /* 166 - 226 */
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
+        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
+        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
+        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
+        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
+        { -24, 127 },
+
+        /* 227 - 275 */
+        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
+        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
+        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
+        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
+        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
+        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
+        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
+        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
+        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
+        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
+        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
+        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
+        { -10,  87 },
+
+        /* 276 a bit special (not used, bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
+        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
+        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
+        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
+        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
+        {  25,  42 },
+
+        /* 338 - 398 */
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
+        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
+        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
+        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
+        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
+        {  25,  61 },
+
+        /* 399 - 435 */
+        {  21,  33 }, {  19,  50 }, {  17,  61 },
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
+        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
+        {  -6,  68 }, { -10,  79 },
+
+        /* 436 - 459 */
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+    }
+};
+
+static const uint8_t left_block_options[4][16] = { /* left-neighbour lookups: [0..3] feed pred-mode/cbp/mv/mvd indexing, [8..15] index non_zero_count[] */
+    { 0, 1, 2, 3,   7, 10, 8, 11,   7+0*8, 7+1*8, 7+2*8, 7+3*8,   2+0*8, 2+3*8, 2+1*8, 2+2*8 },
+    { 2, 2, 3, 3,   8, 11, 8, 11,   7+2*8, 7+2*8, 7+3*8, 7+3*8,   2+1*8, 2+2*8, 2+1*8, 2+2*8 },
+    { 0, 0, 1, 1,   7, 10, 7, 10,   7+0*8, 7+0*8, 7+1*8, 7+1*8,   2+0*8, 2+3*8, 2+0*8, 2+3*8 },
+    { 0, 2, 0, 2,   7, 10, 7, 10,   7+0*8, 7+2*8, 7+0*8, 7+2*8,   2+0*8, 2+3*8, 2+0*8, 2+3*8 }
+};
+
+void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c) {
+    /* I-type slices use cabac_context_init_I; others pick a PB table by cabac_init_idc. */
+    const int8_t (*init)[2] = (s->slice_type_nos == FF_I_TYPE)
+                            ? cabac_context_init_I
+                            : cabac_context_init_PB[s->cabac_init_idc];
+    int ctx;
+
+    /* calculate the pre-state of each of the 460 contexts from the slice qscale */
+    for (ctx = 0; ctx < 460; ctx++) {
+        const int m = init[ctx][0];
+        const int n = init[ctx][1];
+        int state = 2 * (((m * s->qscale) >> 4) + n) - 127;
+        state ^= state >> 31;           /* negative -> ~state; relies on arithmetic shift of a 32-bit int */
+        if (state > 124)
+            state = 124 + (state & 1);  /* clamp to 124/125, keeping the low bit */
+        c->cabac_state[ctx] = state;
+    }
+}
+
+/* Store m->top_type and m->left_type for the current macroblock from hc's mb_type arrays. */
+static void fill_decode_neighbors(H264Cabac_spu *hc, EDSlice_spu *s){
+    H264Mb *m = s->m;
+    const int mb_x = m->mb_x;
+
+    /* NOTE(review): for mb_x == 0 this indexes mb_type[-1]; presumably the array is padded -- confirm. */
+    m->top_type  = hc->mb_type_top[mb_x];
+    m->left_type = hc->mb_type[mb_x-1];
+}
+
+static void fill_decode_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){
+    H264Mb *m = s->m;
+    int topleft_xy, top_xy, topright_xy, left_xy;
+    int topleft_type, top_type, topright_type, left_type;
+    const uint8_t * left_block= left_block_options[0];
+	const int mb_x = m->mb_x;
+	const int mb_y = m->mb_y;
+	const int b_stride = hc->b_stride;
+    int i;
+
+    topleft_type = hc->mb_type_top[mb_x-1] ;
+    top_type     = m->top_type      ;
+	topright_type= hc->mb_type_top[mb_x+1] ;
+    left_type    = m->left_type     ;
+	
+	if (s->slice_type_nos == FF_B_TYPE){
+		get_list = get_list_buf;
+		for(int i=0; i<2; i++){
+			get_dma_list(hc->list1_motion_val[i], s->list1.motion_val[i][4*mb_x + 4*mb_y*b_stride], 16, 4, b_stride*2*sizeof(int16_t), ED_get_mv, 0);
+		}
+		if (hc->blocking) wait_dma_id(ED_get_mv);
+	}
+	
+    if(!IS_SKIP(mb_type)){
+        if(IS_INTRA(mb_type)){
+            int type_mask= s->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
+            m->topleft_samples_available=
+            m->top_samples_available=
+            m->left_samples_available= 0xFFFF;
+            m->topright_samples_available= 0xEEEA;
+
+            if(!(top_type & type_mask)){
+                m->topleft_samples_available= 0xB3FF;
+                m->top_samples_available= 0x33FF;
+                m->topright_samples_available= 0x26EA;
+            }
+            if(!(left_type & type_mask)){
+                m->topleft_samples_available&= 0xDF5F;
+                m->left_samples_available&= 0x5F5F;
+            }
+
+            if(!(topleft_type & type_mask))
+                m->topleft_samples_available&= 0x7FFF;
+
+            if(!(topright_type & type_mask))
+                m->topright_samples_available&= 0xFBFF;
+
+            if(IS_INTRA4x4(mb_type)){
+                if(IS_INTRA4x4(top_type)){
+                    AV_COPY32(m->intra4x4_pred_mode_cache+4+8*0, &hc->intra4x4_pred_mode_top[8*mb_x]);
+                }else{
+                    m->intra4x4_pred_mode_cache[4+8*0]=
+                    m->intra4x4_pred_mode_cache[5+8*0]=
+                    m->intra4x4_pred_mode_cache[6+8*0]=
+                    m->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask);
+                }
+                for(i=0; i<2; i++){
+                    if(IS_INTRA4x4(left_type)){
+                        int8_t *mode= &hc->intra4x4_pred_mode[8*(mb_x-1)];
+                        m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= mode[6-left_block[0+2*i]];
+                        m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= mode[6-left_block[1+2*i]];
+                    }else{
+                        m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
+                        m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= 2 - 3*!(left_type & type_mask);
+                    }
+                }
+            }
+        }
+        if(top_type){
+			AV_COPY32(&m->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]);
+            m->non_zero_count_cache[1+8*0]= hc->non_zero_count_top[mb_x][1+1*8];
+			m->non_zero_count_cache[2+8*0]= hc->non_zero_count_top[mb_x][2+1*8];
+			m->non_zero_count_cache[1+8*3]= hc->non_zero_count_top[mb_x][1+2*8];
+			m->non_zero_count_cache[2+8*3]= hc->non_zero_count_top[mb_x][2+2*8];
+        }else {
+            m->non_zero_count_cache[1+8*0]=
+            m->non_zero_count_cache[2+8*0]=
+            m->non_zero_count_cache[1+8*3]=
+            m->non_zero_count_cache[2+8*3]=
+            AV_WN32A(&m->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 0 : 0x40404040);
+        }
+
+        for (i=0; i<2; i++) {
+            if(left_type){
+                m->non_zero_count_cache[3+8*1 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+0+2*i]];
+				m->non_zero_count_cache[3+8*2 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+1+2*i]];
+				m->non_zero_count_cache[0+8*1 +   8*i]= hc->non_zero_count[mb_x-1][left_block[8+4+2*i]];
+				m->non_zero_count_cache[0+8*4 +   8*i]= hc->non_zero_count[mb_x-1][left_block[8+5+2*i]];
+            }else{
+                    m->non_zero_count_cache[3+8*1 + 2*8*i]=
+                    m->non_zero_count_cache[3+8*2 + 2*8*i]=
+                    m->non_zero_count_cache[0+8*1 +   8*i]=
+                    m->non_zero_count_cache[0+8*4 +   8*i]= !IS_INTRA(mb_type) ? 0 : 64;
+            }
+        }
+
+
+		// top_cbp
+		if(top_type) {
+			hc->top_cbp = hc->cbp_top[mb_x];
+		} else {
+			hc->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
+		}
+		// left_cbp
+		if (left_type) {
+			hc->left_cbp = (hc->cbp[mb_x-1] & 0x1f0)
+			|  ((hc->cbp[mb_x-1]>>(left_block[0]&(~1)))&2)
+			| (((hc->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2);
+		} else {
+			hc->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
+		}
+    }
+
+    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
+        int list;
+
+        m->ref_cache[0][scan8[5 ]+1] = m->ref_cache[0][scan8[7 ]+1] = m->ref_cache[0][scan8[13]+1] =
+        m->ref_cache[1][scan8[5 ]+1] = m->ref_cache[1][scan8[7 ]+1] = m->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
+
+        for(list=0; list<s->list_count; list++){
+            if(!USES_LIST(mb_type, list)){
+                continue;
+            }
+            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
+
+            if(USES_LIST(top_type, list)){
+                const int b_xy= 4*mb_x + 3*hc->b_stride;
+                AV_COPY128(m->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]);
+                    m->ref_cache[list][scan8[0] + 0 - 1*8]=
+                    m->ref_cache[list][scan8[0] + 1 - 1*8]= hc->ref_index_top[list][4*mb_x + 2];
+                    m->ref_cache[list][scan8[0] + 2 - 1*8]=
+					m->ref_cache[list][scan8[0] + 3 - 1*8]= hc->ref_index_top[list][4*mb_x + 3];
+            }else{
+                AV_ZERO128(m->mv_cache[list][scan8[0] + 0 - 1*8]);
+                AV_WN32A(&m->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
+            }
+
+            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
+                for(i=0; i<2; i++){
+                    int cache_idx = scan8[0] - 1 + i*2*8;
+                    if(USES_LIST(left_type, list)){
+                        const int b_xy= 4*(mb_x-1) + 3;
+                        const int b8_x= 4*(mb_x-1) + 1;
+                        AV_COPY32(m->mv_cache[list][cache_idx  ], hc->motion_val[list][b_xy + hc->b_stride*left_block[0+i*2]]);
+                        AV_COPY32(m->mv_cache[list][cache_idx+8], hc->motion_val[list][b_xy + hc->b_stride*left_block[1+i*2]]);
+                        m->ref_cache[list][cache_idx  ]= hc->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
+                        m->ref_cache[list][cache_idx+8]= hc->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
+                    }else{
+                        AV_ZERO32(m->mv_cache [list][cache_idx  ]);
+                        AV_ZERO32(m->mv_cache [list][cache_idx+8]);
+                        m->ref_cache[list][cache_idx  ]=
+                        m->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
+                    }
+                }
+            }else{
+                if(USES_LIST(left_type, list)){
+					const int b_x = 4*(mb_x-1) + 3;
+                    const int b8_x= 4*(mb_x-1) + 1;
+                    AV_COPY32(m->mv_cache[list][scan8[0] - 1], hc->motion_val[list][b_x + hc->b_stride*left_block[0]]);
+                    m->ref_cache[list][scan8[0] - 1]= hc->ref_index[list][b8_x + (left_block[0]&~1)];
+                }else{
+                    AV_ZERO32(m->mv_cache [list][scan8[0] - 1]);
+                    m->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+                }
+            }
+
+            if(USES_LIST(topright_type, list)){
+                const int b_xy= 4*(mb_x+1) + 3*hc->b_stride;
+                AV_COPY32(m->mv_cache[list][scan8[0] + 4 - 1*8], hc->motion_val_top[list][b_xy]);
+                m->ref_cache[list][scan8[0] + 4 - 1*8]= hc->ref_index_top[list][4*(mb_x+1) + 2];
+            }else{
+                AV_ZERO32(m->mv_cache [list][scan8[0] + 4 - 1*8]);
+                m->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+            }
+            if(m->ref_cache[list][scan8[0] + 4 - 1*8] < 0){
+                int topleft_partition= -1;
+                if(USES_LIST(topleft_type, list)){
+                    const int b_xy = 4*(mb_x-1) + 3 + hc->b_stride + (topleft_partition & 2*hc->b_stride);
+                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
+                    AV_COPY32(m->mv_cache[list][scan8[0] - 1 - 1*8], hc->motion_val_top[list][b_xy]);
+                    m->ref_cache[list][scan8[0] - 1 - 1*8]= hc->ref_index_top[list][b8_x];
+                }else{
+                    AV_ZERO32(m->mv_cache[list][scan8[0] - 1 - 1*8]);
+                    m->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+                }
+            }
+
+            if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)))
+                continue;
+
+            if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) {
+                m->ref_cache[list][scan8[4 ]] =
+                m->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
+                AV_ZERO32(m->mv_cache [list][scan8[4 ]]);
+                AV_ZERO32(m->mv_cache [list][scan8[12]]);
+
+
+				/* XXX beurk, Load mvd */
+				if(USES_LIST(top_type, list)){
+// 					const int b_xy= hc->mb2br_top_xy;
+					AV_COPY64(hc->mvd_cache[list][scan8[0] + 0 - 1*8], hc->mvd_top[list][8*mb_x + 0]);
+				}else{
+					AV_ZERO64(hc->mvd_cache[list][scan8[0] + 0 - 1*8]);
+				}
+				if(USES_LIST(left_type, list)){
+// 					const int b_xy= hc->mb2br_left_xy + 6;
+					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 0*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[0]]);
+					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 1*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[1]]);
+				}else{
+					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 0*8]);
+					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 1*8]);
+				}
+				if(USES_LIST(left_type, list)){
+// 					const int b_xy= hc->mb2br_left_xy + 6;
+					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 2*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[2]]);
+					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 3*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[3]]);
+				}else{
+					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 2*8]);
+					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 3*8]);
+				}
+				AV_ZERO16(hc->mvd_cache [list][scan8[4 ]]);
+				AV_ZERO16(hc->mvd_cache [list][scan8[12]]);
+				if(s->slice_type_nos == FF_B_TYPE){
+					fill_rectangle(&hc->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);
+
+					if(IS_DIRECT(top_type)){
+						AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1));
+					}else if(IS_8X8(top_type)){
+						int b8_x = 4*mb_x;
+						hc->direct_cache[scan8[0] + 0 - 1*8]= hc->direct_top[b8_x + 2];
+						hc->direct_cache[scan8[0] + 2 - 1*8]= hc->direct_top[b8_x + 3];
+					}else{
+						AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1));
+					}
+
+					if(IS_DIRECT(left_type))
+						hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1;
+					else if(IS_8X8(left_type))
+						hc->direct_cache[scan8[0] - 1 + 0*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)];
+					else
+						hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1;
+
+					if(IS_DIRECT(left_type))
+						hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1;
+					else if(IS_8X8(left_type))
+						hc->direct_cache[scan8[0] - 1 + 2*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)];
+					else
+						hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1;
+				}
+            }
+        }
+    }
+    hc->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type);
+
+	if (s->slice_type_nos == FF_B_TYPE){
+		wait_dma_id(ED_get_mv);
+	}
+}
+
+/* Reports (0/1) whether the references or motion vectors cached at b_idx and bn_idx differ. */
+static int check_mv(H264Cabac_spu *hc, EDSlice_spu *s, long b_idx, long bn_idx, int mvy_limit){
+	int differ = hc->ref_cache[0][b_idx] != hc->ref_cache[0][bn_idx];
+
+	/* same list 0 reference, and it is not -1: |dx| >= 4 or |dy| >= mvy_limit counts as different */
+	if(!differ && hc->ref_cache[0][b_idx]!=-1){
+		const int dx = hc->mv_cache[0][b_idx][0] - hc->mv_cache[0][bn_idx][0];
+		const int dy = hc->mv_cache[0][b_idx][1] - hc->mv_cache[0][bn_idx][1];
+		differ = ((unsigned)(dx + 3) >= 7U) | (FFABS(dy) >= mvy_limit);
+	}
+	if(s->list_count!=2)
+		return differ;
+
+	if(!differ){
+		const int dx = hc->mv_cache[1][b_idx][0] - hc->mv_cache[1][bn_idx][0];
+		const int dy = hc->mv_cache[1][b_idx][1] - hc->mv_cache[1][bn_idx][1];
+		differ = (hc->ref_cache[1][b_idx] != hc->ref_cache[1][bn_idx]) |
+		         ((unsigned)(dx + 3) >= 7U) | (FFABS(dy) >= mvy_limit);
+		if(!differ)
+			return 0;
+	}
+	/* the straight comparison failed, so retry with list 0 and list 1 of the neighbour swapped */
+	if(hc->ref_cache[0][b_idx] != hc->ref_cache[1][bn_idx] || hc->ref_cache[1][b_idx] != hc->ref_cache[0][bn_idx])
+		return 1;
+	return ((unsigned)(hc->mv_cache[0][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) |
+	       (FFABS(hc->mv_cache[0][b_idx][1] - hc->mv_cache[1][bn_idx][1]) >= mvy_limit) |
+	       ((unsigned)(hc->mv_cache[1][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) |
+	       (FFABS(hc->mv_cache[1][b_idx][1] - hc->mv_cache[0][bn_idx][1]) >= mvy_limit);
+}
+
+/**
+ * Computes the deblocking boundary strengths (bS) of the current macroblock for one direction.
+ * Stores the edge count in m->edges[dir] and the strengths in m->bS[dir][edge][0..3].
+ * @param mvy_limit threshold passed on to check_mv() for vertical motion vector differences
+ * @param dir 0 pairs each block with its left neighbour (b_idx-1), 1 with its top one (b_idx-8)
+ */
+static void calc_bS_values(H264Cabac_spu *hc, EDSlice_spu *s, int mvy_limit, int dir) {
+	H264Mb *m = s->m;
+	int mb_type = m->mb_type;
+	int edge;
+	const int mbm_type = dir == 0 ? m->left_type : m->top_type;
+	// how often to recheck mv-based bS when iterating between edges
+	static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1},
+	{0,3,1,1,3,3,3,3}};
+	const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7];
+	const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4;
+	// how often to recheck mv-based bS when iterating along each edge
+	const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
+
+	m->edges[dir]= edges;
+
+	/* edge 0 lies on the macroblock border; it is only evaluated when the neighbour type is non-zero */
+	if(mbm_type){
+		int16_t* bS=m->bS[dir][0];
+		if( IS_INTRA(mb_type|mbm_type)) {
+			AV_WN64A(bS, 0x0004000400040004ULL);
+		} else {
+			int i;
+			int mv_done;
+			if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
+				int b_idx= 8 + 4;
+				int bn_idx= b_idx - (dir ? 8:1);
+
+				bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
+				mv_done = 1;
+			}
+			else
+				mv_done = 0;
+
+			for( i = 0; i < 4; i++ ) {
+				int x = dir == 0 ? 0 : i;
+				int y = dir == 0 ? i    : 0;
+				int b_idx= 8 + 4 + x + 8*y;
+				int bn_idx= b_idx - (dir ? 8:1);
+
+				if( hc->non_zero_count_cache[b_idx] |
+					hc->non_zero_count_cache[bn_idx] ) {
+					bS[i] = 2;
+				}
+				else if(!mv_done)
+				{
+					bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
+				}
+			}
+		}
+	}
+
+	/* Calculate bS of the inner edges; entries of edges skipped for the 8x8 transform are left untouched */
+	for( edge = 1; edge < edges; edge++ ) {
+		int16_t* bS=m->bS[dir][edge];
+		if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
+			continue;
+
+		if( IS_INTRA(mb_type)) {
+			AV_WN64A(bS, 0x0003000300030003ULL);
+		} else {
+			int i;
+			int mv_done;
+			if( edge & mask_edge ) {
+				AV_ZERO64(bS);
+				mv_done = 1;
+			}
+			else if( mask_par0 ) {
+				int b_idx= 8 + 4 + edge * (dir ? 8:1);
+				int bn_idx= b_idx - (dir ? 8:1);
+
+				bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
+				mv_done = 1;
+			}
+			else
+				mv_done = 0;
+
+			for( i = 0; i < 4; i++ ) {
+				int x = dir == 0 ? edge : i;
+				int y = dir == 0 ? i    : edge;
+				int b_idx= 8 + 4 + x + 8*y;
+				int bn_idx= b_idx - (dir ? 8:1);
+
+				if( hc->non_zero_count_cache[b_idx] |
+					hc->non_zero_count_cache[bn_idx] ) {
+					bS[i] = 2;
+				}
+				else if(!mv_done)
+				{
+					bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
+				}
+			}
+		}
+	}
+}
+
+/**
+* Fills the caches that calc_bS_values() and check_mv() read for the current macroblock and runs calc_bS_values() for both directions.
+* @return zero if the loop filter can be skipped, 1 otherwise
+*/
+static int fill_filter_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){
+    H264Mb *m = s->m;
+	const int mb_x = m->mb_x;
+    const int mb_y = m->mb_y;
+    int top_type, left_type;
+    int qp, top_qp, left_qp;
+    int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice
+    /* cb/cr take dequant table 1/2 for intra macroblocks and 4/5 otherwise */
+    m->dequant4_coeff_y  = hc->dequant4_coeff[0][s->qscale][0];
+    m->dequant4_coeff_cb = hc->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][s->chroma_qp[0]][0];
+    m->dequant4_coeff_cr = hc->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][s->chroma_qp[1]][0];
+    /* NOTE(review): index mb_x-1 is -1 when mb_x is 0 - presumably the hc arrays are padded; confirm */
+    m->qscale_mb_xy = qp = hc->qscale[mb_x];
+    m->qscale_left_mb_xy = left_qp = hc->qscale[mb_x-1];
+    m->qscale_top_mb_xy = top_qp = hc->qscale_top[mb_x];
+
+    //for sufficiently low qp, filtering wouldn't do anything
+    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
+	if(qp <= qp_thresh
+		&& (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh)
+		&& ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){
+		m->deblock_mb = 0;
+		return 0;
+	}
+    /* the filter will run: record the neighbour types that calc_bS_values() looks at */
+
+    m->deblock_mb = 1;
+
+	top_type     = hc->mb_type_top[mb_x] ;
+	left_type    = hc->mb_type[mb_x -1];
+
+    m->top_type     = top_type ;
+    m->left_type    = left_type;
+
+    if(IS_INTRA(mb_type)){
+        calc_bS_values(hc, s, 4, 0);
+        calc_bS_values(hc, s, 4, 1);
+        return 1;
+    }
+    /* not intra: rebuild the nnz, ref and mv caches that calc_bS_values()/check_mv() compare */
+    AV_COPY64(&hc->non_zero_count_cache[0+8*1], &hc->non_zero_count[mb_x][ 0]);
+    AV_COPY64(&hc->non_zero_count_cache[0+8*2], &hc->non_zero_count[mb_x][ 8]);
+    AV_COPY32(&hc->non_zero_count_cache[0+8*5], &hc->non_zero_count[mb_x][16]);
+    AV_COPY32(&hc->non_zero_count_cache[4+8*3], &hc->non_zero_count[mb_x][20]);
+    AV_COPY64(&hc->non_zero_count_cache[0+8*4], &hc->non_zero_count[mb_x][24]);
+
+    m->cbp= hc->cbp[mb_x];
+
+    {
+        int list;
+        for(list=0; list<s->list_count; list++){
+            int8_t *ref;
+            int y, b_stride;
+            int16_t (*mv_dst)[2];
+            int16_t (*mv_src)[2];
+
+            if(!USES_LIST(mb_type, list)){
+                fill_rectangle( hc->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
+                AV_WN32A(&hc->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+                AV_WN32A(&hc->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+                AV_WN32A(&hc->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+                AV_WN32A(&hc->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+                continue;
+            }
+            /* each ref_index entry is translated through ref2frm; the 0x0101 multiply duplicates each byte */
+            ref = &hc->ref_index[list][4*mb_x];
+            {
+                int (*ref2frm)[64] =(void *) (s->ref2frm[0] +  2);
+                AV_WN32A(&hc->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
+                AV_WN32A(&hc->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
+                ref += 2;
+                AV_WN32A(&hc->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
+                AV_WN32A(&hc->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
+            }
+            b_stride = hc->b_stride;
+            mv_dst   = &hc->mv_cache[list][scan8[0]];
+            mv_src   = &hc->motion_val[list][4*mb_x];
+            for(y=0; y<4; y++){
+                AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride);
+            }
+
+        }
+    }
+
+    /*
+    0 . T T. T T T T
+    1 L . .L . . . .
+    2 L . .L . . . .
+    3 . T TL . . . .
+    4 L . .L . . . .
+    5 L . .. . . . .
+    */
+    //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
+    if(top_type){
+        AV_COPY32(&hc->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]);
+    }
+
+    if(left_type){
+        hc->non_zero_count_cache[3+8*1]= hc->non_zero_count[mb_x-1][7+0*8];
+		hc->non_zero_count_cache[3+8*2]= hc->non_zero_count[mb_x-1][7+1*8];
+		hc->non_zero_count_cache[3+8*3]= hc->non_zero_count[mb_x-1][7+2*8];
+		hc->non_zero_count_cache[3+8*4]= hc->non_zero_count[mb_x-1][7+3*8];
+    }
+
+    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
+        int list;
+        for(list=0; list<s->list_count; list++){
+            if(USES_LIST(top_type, list)){
+                const int b_xy= 4*mb_x + 3*hc->b_stride;
+                const int b8_x= 4*mb_x + 2;
+                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
+                AV_COPY128(hc->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]);
+                hc->ref_cache[list][scan8[0] + 0 - 1*8]=
+                hc->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 0]];
+                hc->ref_cache[list][scan8[0] + 2 - 1*8]=
+                hc->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 1]];
+            }else{
+                AV_ZERO128(hc->mv_cache[list][scan8[0] + 0 - 1*8]);
+                AV_WN32A(&hc->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+            }
+
+            if(USES_LIST(left_type, list)){
+				const int b_x = 4*(mb_x-1) + 3;
+                const int b8_x= 4*(mb_x-1) + 1;
+                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
+                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 0 ], hc->motion_val[list][b_x + hc->b_stride*0]);
+                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 8 ], hc->motion_val[list][b_x + hc->b_stride*1]);
+                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +16 ], hc->motion_val[list][b_x + hc->b_stride*2]);
+                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +24 ], hc->motion_val[list][b_x + hc->b_stride*3]);
+                hc->ref_cache[list][scan8[0] - 1 + 0 ]=
+                hc->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*0]];
+                hc->ref_cache[list][scan8[0] - 1 +16 ]=
+                hc->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*1]];
+            }else{
+                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 0 ]);
+                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 8 ]);
+                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +16 ]);
+                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +24 ]);
+                hc->ref_cache[list][scan8[0] - 1 + 0  ]=
+                hc->ref_cache[list][scan8[0] - 1 + 8  ]=
+                hc->ref_cache[list][scan8[0] - 1 + 16 ]=
+                hc->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED;
+            }
+        }
+    }
+    calc_bS_values(hc, s, 4, 0);
+    calc_bS_values(hc, s, 4, 1);
+    return 1;
+}
+
+
+/**
+ * Checks the cached intra4x4 modes that border unavailable top/left samples. Each mode is looked
+ * up in a table: a negative entry is an error, a positive entry replaces the mode, 0 keeps it.
+ * @return 0 on success, -1 (after printing a message) if a mode cannot be used
+ */
+static int check_intra4x4_pred_mode(EDSlice_spu *s){
+    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
+    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
+    static const int mask[4]={0x8000,0x2000,0x80,0x20};
+    H264Mb *m = s->m;
+    int n;
+    if(!(m->top_samples_available&0x8000)){
+        for(n=0; n<4; n++){
+            const int pos = scan8[0] + n;
+            const int status= top[ m->intra4x4_pred_mode_cache[pos] ];
+            if(status<0){
+                fprintf(stderr, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y);
+                return -1;
+            }
+            if(status) m->intra4x4_pred_mode_cache[pos]= status;
+        }
+    }
+    if((m->left_samples_available&0x8888)==0x8888)
+        return 0;
+    for(n=0; n<4; n++){
+        const int pos = scan8[0] + 8*n;
+        int status;
+        if(m->left_samples_available&mask[n]) continue;
+        status= left[ m->intra4x4_pred_mode_cache[pos] ];
+        if(status<0){
+            fprintf(stderr, "left block unavailable for requested intra4x4 mode %d at %d %d, %x\n", status, m->mb_x, m->mb_y, m->left_samples_available);
+            return -1;
+        }
+        if(status) m->intra4x4_pred_mode_cache[pos]= status;
+    }
+    return 0;
+}
+
+/**
+ * Checks an intra prediction mode against the available top/left samples and, where the tables say so, replaces it.
+ * @return the mode to use, or -1 (after printing a message) if it is out of range or cannot be predicted
+ */
+static int check_intra_pred_mode(EDSlice_spu *s, int mode){
+    H264Mb *m = s->m;
+    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
+    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
+    /* the unsigned compare also rejects negative modes, which would index before the tables */
+    if((unsigned)mode > 6U) {
+        fprintf(stderr, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y);
+        return -1;
+    }
+    if(!(m->top_samples_available&0x8000)){
+        mode= top[ mode ];
+        if(mode<0){
+            fprintf(stderr, "top block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y);
+            return -1;
+        }
+    }
+
+    if((m->left_samples_available&0x8080) != 0x8080){
+        mode= left[ mode ];
+        if(m->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
+            mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(m->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
+        }
+        if(mode<0){
+            fprintf(stderr, "left block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y);
+            return -1;
+        }
+    }
+    return mode;
+}
+
+/**
+ * Predicts the intra4x4 mode of block n from the modes cached to its left and above it.
+ * @return the smaller of the two neighbouring modes, or DC_PRED when that minimum is negative
+ */
+static inline int pred_intra_mode(EDSlice_spu *s, int n){
+    H264Mb *m = s->m;
+    const int pos = scan8[n];
+    const int mode_a = m->intra4x4_pred_mode_cache[pos - 1];
+    const int mode_b = m->intra4x4_pred_mode_cache[pos - 8];
+    const int pred = FFMIN(mode_a, mode_b);
+
+    return pred < 0 ? DC_PRED : pred;
+}
+
+/* Saves row 4 (4 entries) and column 7 of rows 3..1 of the intra4x4 mode cache into hc->intra4x4_pred_mode. */
+static void write_back_intra_pred_mode(H264Cabac_spu *hc, EDSlice_spu *s){
+    H264Mb *m = s->m;
+    int8_t *dst= &hc->intra4x4_pred_mode[8*m->mb_x];
+    int row;
+
+    AV_COPY32(dst, m->intra4x4_pred_mode_cache + 4 + 8*4);
+    for(row=3; row>=1; row--)
+        dst[7 - row]= m->intra4x4_pred_mode_cache[7 + 8*row];
+}
+
+/* Copies the cached non-zero counts of this macroblock (cache rows 1, 2, 5, 3 and 4) into hc->non_zero_count. */
+static inline void write_back_non_zero_count(H264Cabac_spu *hc, EDSlice_spu *s){
+    H264Mb *cur = s->m;
+    const int col = cur->mb_x;
+    AV_COPY64(&hc->non_zero_count[col][ 0], &cur->non_zero_count_cache[0+8*1]);
+    AV_COPY64(&hc->non_zero_count[col][ 8], &cur->non_zero_count_cache[0+8*2]);
+    AV_COPY32(&hc->non_zero_count[col][16], &cur->non_zero_count_cache[0+8*5]);
+    AV_COPY32(&hc->non_zero_count[col][20], &cur->non_zero_count_cache[4+8*3]);
+    AV_COPY64(&hc->non_zero_count[col][24], &cur->non_zero_count_cache[0+8*4]);
+}
+/* Stores the current macroblock's cached motion data into hc->motion_val, hc->mvd, hc->ref_index and (B slices) hc->direct. */
+static inline void write_back_motion(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){
+    H264Mb *m = s->m;
+	const int mb_x = m->mb_x;
+    int b_stride = hc->b_stride;
+    const int b_x = 4*m->mb_x; //try mb2b(8)_xy
+    const int b8_x= 4*m->mb_x;
+    int list;
+    /* list 0 unused: fill its 2x2 ref_index entries with LIST_NOT_USED */
+    if(!USES_LIST(mb_type, 0))
+        fill_rectangle(&hc->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
+
+    for(list=0; list<s->list_count; list++){
+        int y;
+        int16_t (*mv_dst)[2];
+        int16_t (*mv_src)[2];
+
+        if(!USES_LIST(mb_type, list))
+            continue;
+        /* four rows of four motion vectors each */
+        mv_dst   = &hc->motion_val[list][b_x];
+        mv_src   = &m->mv_cache[list][scan8[0]];
+        for(y=0; y<4; y++){
+            AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
+        }
+        {
+            /* only cache row 3 and column 3 of rows 0..2 of the mvd cache are kept (zeroed for skipped macroblocks) */
+            uint8_t (*mvd_dst)[2] = (void *) hc->mvd[list][8*mb_x];
+            uint8_t (*mvd_src)[2] = &hc->mvd_cache[list][scan8[0]];
+            if(IS_SKIP(mb_type))
+                AV_ZERO128(mvd_dst);
+            else{
+				AV_COPY64(mvd_dst, mvd_src + 8*3);
+                AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0);
+                AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1);
+                AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2);
+            }
+        }
+
+        {
+            /* one reference per block 0, 4, 8 and 12 */
+            int8_t *ref_index = &hc->ref_index[list][b8_x];
+            ref_index[0+0*2]= m->ref_cache[list][scan8[0]];
+            ref_index[1+0*2]= m->ref_cache[list][scan8[4]];
+            ref_index[0+1*2]= m->ref_cache[list][scan8[8]];
+            ref_index[1+1*2]= m->ref_cache[list][scan8[12]];
+        }
+    }
+
+    if(s->slice_type_nos == FF_B_TYPE){
+        if(IS_8X8(mb_type)){
+            /* direct[0] is not written here */
+            uint8_t *direct = &hc->direct[4*mb_x];
+            direct[1] = m->sub_mb_type[1]>>1;
+            direct[2] = m->sub_mb_type[2]>>1;
+            direct[3] = m->sub_mb_type[3]>>1;
+        }
+    }
+}
+
+/* Returns 1 when none of the four sub_mb_type entries carries a flag that excludes the 8x8 transform, else 0. */
+static inline int get_dct8x8_allowed(EDSlice_spu *s){
+    uint64_t forbidden = MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8;
+    if(!s->direct_8x8_inference_flag)
+        forbidden |= MB_TYPE_DIRECT2;
+    return !(AV_RN64A(s->m->sub_mb_type) & (forbidden*0x0001000100010001ULL));
+}
+
+/**
+ * Picks the diagonal candidate for cache slot i: slot i-8+part_width, or slot i-8-1 when the
+ * former's reference is PART_NOT_AVAILABLE. Stores its vector in *C and returns its reference.
+ */
+static inline int fetch_diagonal_mv(EDSlice_spu *s, const int16_t **C, int i, int list, int part_width){
+    H264Mb *m = s->m;
+    int pos = i - 8 + part_width;
+    if(m->ref_cache[list][pos] == PART_NOT_AVAILABLE)
+        pos = i - 8 - 1;
+    *C= m->mv_cache[list][pos];
+    return m->ref_cache[list][pos];
+}
+
+/**
+ * gets the predicted MV.
+ * A single neighbour (left A, top B, diagonal C) whose reference equals ref is used directly;
+ * with no match and only the left neighbour available A is used; otherwise the median is taken.
+ * @param n the block index
+ * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
+ * @param list the reference list whose caches are read
+ * @param ref the reference index the neighbours are matched against
+ * @param mx the x component of the predicted motion vector
+ * @param my the y component of the predicted motion vector
+ */
+static inline void pred_motion(EDSlice_spu *s, int n, int part_width, int list, int ref, int * const mx, int * const my){
+    H264Mb *m = s->m;
+    const int pos = scan8[n];
+    const int ref_a = m->ref_cache[list][ pos - 1 ];
+    const int ref_b = m->ref_cache[list][ pos - 8 ];
+    const int16_t * const mv_a = m->mv_cache[list][ pos - 1 ];
+    const int16_t * const mv_b = m->mv_cache[list][ pos - 8 ];
+    const int16_t * mv_c;
+    int ref_c, hits;
+
+    assert(part_width==1 || part_width==2 || part_width==4);
+
+/* mv_cache
+  B . . A T T T T
+  U . . L . . , .
+  U . . L . . . .
+  U . . L . . , .
+  . . . L . . . .
+*/
+
+    ref_c= fetch_diagonal_mv(s, &mv_c, pos, list, part_width);
+    hits= (ref_c==ref) + (ref_b==ref) + (ref_a==ref);
+
+    if(hits == 1){
+        const int16_t * only;
+
+        if(ref_a==ref)      only= mv_a;
+        else if(ref_b==ref) only= mv_b;
+        else                only= mv_c;
+        *mx= only[0];
+        *my= only[1];
+        return;
+    }
+    /* nothing matches and the left neighbour is the only one that exists */
+    if(hits == 0 && ref_b == PART_NOT_AVAILABLE && ref_c == PART_NOT_AVAILABLE && ref_a != PART_NOT_AVAILABLE){
+        *mx= mv_a[0];
+        *my= mv_a[1];
+        return;
+    }
+
+    /* two or more matches (most common), or no match at all: component-wise median */
+    *mx= mid_pred(mv_a[0], mv_b[0], mv_c[0]);
+    *my= mid_pred(mv_a[1], mv_b[1], mv_c[1]);
+}
+
+/**
+ * gets the directionally predicted 16x8 MV.
+ * For n==0 the slot above block 0 is preferred, otherwise the slot left of block 8;
+ * if that slot's reference differs from ref, pred_motion() is used instead.
+ * @param n the block index
+ * @param list the reference list whose caches are read
+ * @param ref the reference index the preferred neighbour has to match
+ * @param mx the x component of the predicted motion vector
+ * @param my the y component of the predicted motion vector
+ */
+static inline void pred_16x8_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){
+    H264Mb *m = s->m;
+    int pos;
+
+    if(n==0){
+        pos= scan8[0] - 8;
+    }else{
+        pos= scan8[8] - 1;
+    }
+
+    if(m->ref_cache[list][pos] == ref){
+        const int16_t * const mv= m->mv_cache[list][pos];
+
+        *mx= mv[0];
+        *my= mv[1];
+        return;
+    }
+
+    //RARE
+    pred_motion(s, n, 4, list, ref, mx, my);
+}
+
+/**
+ * gets the directionally predicted 8x16 MV.
+ * For n==0 the slot left of block 0 is preferred, otherwise the diagonal candidate of
+ * block 4; if its reference differs from ref, pred_motion() is used instead.
+ * @param n the block index
+ * @param list the reference list whose caches are read
+ * @param ref the reference index the preferred candidate has to match
+ * @param mx the x component of the predicted motion vector
+ * @param my the y component of the predicted motion vector
+ */
+static inline void pred_8x16_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){
+    H264Mb *m = s->m;
+    const int16_t * cand;
+    int cand_ref;
+
+    if(n==0){
+        cand    = m->mv_cache [list][ scan8[0] - 1 ];
+        cand_ref= m->ref_cache[list][ scan8[0] - 1 ];
+    }else{
+        cand_ref= fetch_diagonal_mv(s, &cand, scan8[4], list, 2);
+    }
+
+    /* the preferred candidate wins outright when it uses the requested reference */
+    if(cand_ref == ref){
+        *mx= cand[0];
+        *my= cand[1];
+        return;
+    }
+
+    //RARE
+    pred_motion(s, n, 2, list, ref, mx, my);
+}
+
+/**
+ * Predicts the motion vector of a P-skip macroblock: zero when the top or left neighbour is
+ * PART_NOT_AVAILABLE or has ref 0 with an all-zero vector, otherwise pred_motion() for ref 0.
+ */
+static inline void pred_pskip_motion(EDSlice_spu *s, int * const mx, int * const my){
+    H264Mb *m = s->m;
+    const int top_ref = m->ref_cache[0][ scan8[0] - 8 ];
+    const int left_ref= m->ref_cache[0][ scan8[0] - 1 ];
+    const int predict = top_ref != PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE
+                     && ( top_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 8 ]))
+                     && (left_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 1 ]));
+
+    if(predict)
+        pred_motion(s, 0, 4, 0, 0, mx, my);
+    else
+        *mx = *my = 0;
+}
+
+/**
+ * Decodes a P_SKIP or B_SKIP macroblock: zeroes its coefficient counts, derives the motion, stores it and fills the filter caches.
+ */
+static void decode_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s){
+    H264Mb *m = s->m;
+	const int mb_x = m->mb_x;    
+    int mb_type=0;
+    /* a skipped macroblock has no coefficients: clear the stored counts and cache rows 1..5 */
+    memset(hc->non_zero_count[mb_x], 0, 32);
+    memset(m->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
+
+    if( s->slice_type_nos == FF_B_TYPE )
+    {
+        // just for fill_caches. pred_direct_motion will set the real mb_type
+        mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
+		fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ...
+
+        ff_h264_pred_direct_motion(hc, s, &mb_type);
+        mb_type|= MB_TYPE_SKIP;
+    }
+    else
+    {
+        int mx, my;
+        mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
+
+        fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ...
+        pred_pskip_motion(s, &mx, &my);
+        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
+        fill_rectangle(  m->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
+    }
+    /* publish the result for later macroblocks, then prepare the deblocking data */
+    write_back_motion(hc, s, mb_type);
+	hc->mb_type[mb_x]= mb_type;
+    m->mb_type = mb_type;
+    hc->qscale[mb_x]= s->qscale;
+    fill_filter_caches(hc, s, mb_type);
+}
+/* Decodes an intra macroblock type index: 0 = I4x4, 25 = PCM, 1..24 = I16x16 variants; contexts start at cabac_state[ctx_base]. */
+static int decode_cabac_intra_mb_type(EDSlice_spu *s, CABACContext *c, int ctx_base, int intra_slice) {
+    H264Mb *m =s->m;
+    uint8_t *state= &c->cabac_state[ctx_base];
+    int mb_type;
+    /* only with intra_slice set does the first bin's context depend on the neighbours (I16x16 or PCM) */
+    if(intra_slice){
+        int ctx=0;
+        if( m->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
+            ctx++;
+        if( m->top_type     & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
+            ctx++;
+        if( get_cabac_noinline( c, &state[ctx] ) == 0 )
+            return 0;   /* I4x4 */
+        state += 2;
+    }else{
+        if( get_cabac_noinline( c, state ) == 0 )
+            return 0;   /* I4x4 */
+    }
+
+    if( get_cabac_terminate( c ) )
+        return 25;  /* PCM */
+    /* the remaining bins add 12, 4 or 8, 2 and 1 to the base index 1 */
+    mb_type = 1; /* I16x16 */
+    mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */
+    if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */
+        mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] );
+    mb_type += 2 * get_cabac_noinline(c, &state[3+intra_slice] );
+    mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] );
+    return mb_type;
+}
+
+/**
+ * Decodes the skip flag. The context starts at 11, gains one for each non-skipped left/top
+ * neighbour (only consulted when mb_x>0 / mb_y>0) and another 13 in B slices.
+ */
+static int decode_cabac_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s, H264Mb *m, CABACContext *c) {
+    const int mb_x = m->mb_x;
+    const int left_coded = m->mb_x>0 && !IS_SKIP( hc->mb_type[mb_x-1] );
+    const int top_coded  = m->mb_y>0 && !IS_SKIP( hc->mb_type_top[mb_x] );
+    int ctx = 11 + left_coded + top_coded;
+    if( s->slice_type_nos == FF_B_TYPE )
+        ctx += 13;
+    return get_cabac_noinline(c, &c->cabac_state[ctx] );
+}
+
+/* Decodes one intra4x4 mode: pred_mode itself, or a 3-bit remainder (LSB first) that skips over pred_mode. */
+static int decode_cabac_mb_intra4x4_pred_mode( CABACContext *c, int pred_mode ) {
+    int rem = 0, bit;
+
+    if( get_cabac(c, &c->cabac_state[68] ) )
+        return pred_mode;
+
+    for( bit = 0; bit < 3; bit++ )
+        rem += get_cabac(c, &c->cabac_state[69] ) << bit;
+
+    return rem + ( rem >= pred_mode );
+}
+
+/**
+ * Decodes the intra chroma prediction mode (0..3).
+ * The first bin's context counts the neighbours with a non-zero type whose stored chroma mode
+ * is non-zero; the remaining bins form a truncated unary code on context 64+3.
+ */
+static int decode_cabac_mb_chroma_pre_mode(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) {
+    H264Mb *m = s->m;
+    const int mb_x = m->mb_x;
+    int ctx = 64, mode;
+
+    /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */
+    if( m->left_type && hc->chroma_pred_mode[mb_x-1] != 0 )
+        ctx++;
+    if( m->top_type     && hc->chroma_pred_mode_top[mb_x] != 0 )
+        ctx++;
+    if( get_cabac_noinline(c, &c->cabac_state[ctx] ) == 0 )
+        return 0;
+
+    for( mode = 1; mode < 3; mode++ )
+        if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 )
+            break;
+    return mode;
+}
+/* Decodes the four luma coded-block-pattern bits (returned in bits 0..3) using contexts 73..76. */
+static int decode_cabac_mb_cbp_luma(H264Cabac_spu *hc, CABACContext *c) {
+    int cbp_b, cbp_a, ctx, cbp = 0;
+    /* cbp_a is the left neighbour's pattern, cbp_b the top neighbour's */
+    cbp_a = hc->left_cbp;
+    cbp_b = hc->top_cbp;
+    /* each context is built from two cbp bits: of the neighbours or of the bits decoded just before */
+    ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
+    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]);
+    ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
+    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1;
+    ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
+    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 2;
+    ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
+    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3;
+    return cbp;
+}
+/**
+ * Decodes the chroma part of the coded block pattern.
+ * Both bins pick their context from bits 4-5 of the left and top cbp values held in hc.
+ * @return 0, 1 or 2
+ */
+static int decode_cabac_mb_cbp_chroma(H264Cabac_spu *hc, CABACContext *c) {
+    const int cbp_a = (hc->left_cbp>>4)&0x03;
+    const int cbp_b = (hc-> top_cbp>>4)&0x03;
+    int ctx;
+
+    /* the first bin separates 0 from 1/2 */
+    ctx = (cbp_a > 0) + 2*(cbp_b > 0);
+    if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 )
+        return 0;
+
+    ctx = 4 + (cbp_a == 2) + 2*(cbp_b == 2);
+    return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] );
+}
+
+/* Decodes a P sub-macroblock type index from up to three bins (contexts 21, 22, 23). */
+static int decode_cabac_p_mb_sub_type( CABACContext *c) {
+    if( get_cabac(c, &c->cabac_state[21] ) )
+        return 0;   /* 8x8 */
+
+    if( get_cabac(c, &c->cabac_state[22] ) )
+        return get_cabac(c, &c->cabac_state[23] ) ? 2 /* 4x8 */ : 3 /* 4x4 */;
+    return 1;       /* 8x4 */
+}
+static int decode_cabac_b_mb_sub_type(CABACContext *c) { /* decodes a B sub-macroblock type index in 0..12 (contexts 36..39) */
+    int type;
+    if( !get_cabac(c, &c->cabac_state[36] ) )
+        return 0;   /* B_Direct_8x8 */
+    if( !get_cabac(c, &c->cabac_state[37] ) )
+        return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
+    type = 3; /* the remaining indices are 3..10, built from up to three more bins */
+    if( get_cabac(c, &c->cabac_state[38] ) ) {
+        if( get_cabac(c, &c->cabac_state[39] ) )
+            return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
+        type += 4;
+    }
+    type += 2*get_cabac(c, &c->cabac_state[39] );
+    type +=   get_cabac(c, &c->cabac_state[39] );
+    return type;
+}
+
+/**
+ * Decodes a reference index for block n of the given list (unary code, contexts from 54).
+ * @return the index, or -1 (after printing a message) once it reaches 32
+ */
+static int decode_cabac_mb_ref(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, int list, int n ) {
+    H264Mb *m = s->m;
+    const int pos = scan8[n];
+    int left_set = m->ref_cache[list][pos - 1] > 0;
+    int top_set  = m->ref_cache[list][pos - 8] > 0;
+    int ctx, ref;
+
+    /* in B slices a neighbour whose direct_cache entry has the direct bit set does not count */
+    if( s->slice_type_nos == FF_B_TYPE) {
+        left_set = left_set && !(hc->direct_cache[pos - 1]&(MB_TYPE_DIRECT2>>1));
+        top_set  = top_set  && !(hc->direct_cache[pos - 8]&(MB_TYPE_DIRECT2>>1));
+    }
+    ctx = left_set + 2*top_set;
+
+    for( ref = 0; get_cabac(c, &c->cabac_state[54+ctx] ); ) {
+        ref++;
+        ctx = (ctx>>2)+4;
+        if(ref >= 32 /*h->ref_list[list]*/){
+            fprintf(stderr, "refcount %d\n", ref);
+            return -1;
+        }
+    }
+
+    return ref;
+}
+/* Decodes one motion vector difference component; *mvda receives its magnitude (capped at 70), INT_MIN is returned on overflow. */
+static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) {
+    int mvd;
+    /* first bin: a zero difference ends the decoding */
+    if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){
+//    i.e. context ctxbase+(amvd>2)+(amvd>32), computed without branches (assumes INT_BIT is the width of int and >> is arithmetic)
+        *mvda= 0;
+        return 0;
+    }
+
+    mvd= 1;
+    ctxbase+= 3;
+    /* unary prefix of up to 8 further bins; the context advances only for the first three */
+    while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) {
+        if( mvd < 4 )
+            ctxbase++;
+        mvd++;
+    }
+
+    if( mvd >= 9 ) {
+        /* escape: bypass-coded suffix whose length starts at k = 3 and grows with every leading 1 bit */
+        int k = 3;
+        while( get_cabac_bypass(c ) ) {
+            mvd += 1 << k;
+            k++;
+            if(k>24){
+                fprintf(stderr, "overflow in decode_cabac_mb_mvd\n");
+                return INT_MIN;
+            }
+        }
+        while( k-- ) {
+            mvd += get_cabac_bypass(c )<<k;
+        }
+        *mvda=mvd < 70 ? mvd : 70;
+    }else
+        *mvda=mvd;
+    return get_cabac_bypass_sign(c, -mvd ); /* presumably yields +mvd or -mvd depending on a bypass bin - confirm */
+}
+
+#define DECODE_CABAC_MB_MVD( hc, c, list,  n )\
+{   /* adds the decoded MVD to the caller's mx/my and stores the magnitudes in mpx/mpy */\
+    int amvd0 = (hc)->mvd_cache[(list)][scan8[(n)] - 1][0] + (hc)->mvd_cache[(list)][scan8[(n)] - 8][0];\
+    int amvd1 = (hc)->mvd_cache[(list)][scan8[(n)] - 1][1] + (hc)->mvd_cache[(list)][scan8[(n)] - 8][1];\
+    int mvdx = decode_cabac_mb_mvd( (c), 40, amvd0, &mpx );\
+    int mvdy = decode_cabac_mb_mvd( (c), 47, amvd1, &mpy );\
+    if( mvdx == INT_MIN || mvdy == INT_MIN ) return -1; /* error sentinel: fail the macroblock */\
+    mx += mvdx;\
+    my += mvdy;\
+}
+
+/* coded_block_flag context: 4*cat, +1 if the left neighbour is non-zero, +2 if the top one is.
+ * DC blocks consult the neighbours' cbp bits, other blocks the non-zero-count cache. */
+static av_always_inline int get_cabac_cbf_ctx(H264Cabac_spu *hc, EDSlice_spu *s, int cat, int idx, int is_dc ) {
+    H264Mb *m = s->m;
+    int left_nz, top_nz;
+    int ctx = 4 * cat;
+
+    if( !is_dc ) {
+        assert(cat == 1 || cat == 2 || cat == 4);
+        left_nz = m->non_zero_count_cache[scan8[idx] - 1];
+        top_nz  = m->non_zero_count_cache[scan8[idx] - 8];
+    } else if( cat != 0 ) {
+        /* one cbp bit per DC block, starting at bit 6 */
+        left_nz = (hc->left_cbp>>(6+idx))&0x01;
+        top_nz  = (hc-> top_cbp>>(6+idx))&0x01;
+    } else {
+        /* cat 0 DC is tracked in cbp bit 8 */
+        left_nz = hc->left_cbp&0x100;
+        top_nz  = hc-> top_cbp&0x100;
+    }
+
+    if( left_nz > 0 )
+        ctx += 1;
+    if( top_nz > 0 )
+        ctx += 2;
+    return ctx;
+}
+
+ uint8_t last_coeff_flag_offset_8x8[63] = { /* last-flag context offset per 8x8 scan position; NOTE(review): the only table here that is not static const - confirm this is intentional */
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
+};
+
+static const int significant_coeff_flag_offset[2][6] = { /* cabac_state index of the first significance context, [row][cat]; only row 0 is read in this file */
+    { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
+    { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
+};
+static const int last_coeff_flag_offset[2][6] = { /* cabac_state index of the first last-coefficient context, [row][cat]; only row 0 is read in this file */
+    { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
+    { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
+};
+static const int coeff_abs_level_m1_offset[6] = { /* cabac_state index of the first level context, per cat */
+    227+0, 227+10, 227+20, 227+30, 227+39, 426
+};
+static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { /* significance context offset per 8x8 scan position; row 0 is the one used for cat 5 */
+    { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
+    4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
+    7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
+    12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
+    { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
+    6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
+    9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
+    9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
+};
+/* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
+ * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
+ * The table below maps node ctx => cabac ctx for level=1. */
+static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
+/* map node ctx => cabac ctx for level>1 */
+static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
+static const uint8_t coeff_abs_level_transition[2][8] = { /* next node ctx, indexed [level>1][current node ctx] */
+    /* update node ctx after decoding a level=1 */
+    { 1, 2, 3, 3, 4, 5, 6, 7 },
+    /* update node ctx after decoding a level>1 */
+    { 4, 4, 4, 4, 5, 6, 7, 7 }
+};
+
+/* Decode the CABAC residual of one block into block[]: coded_block_flag (not read for
+ * non-DC cat 5), significance map, then levels from the last coefficient backwards. Non-DC
+ * levels are scaled as (level*qmul[j] + 32) >> 6; updates non_zero_count_cache / hc->cbp. */
+static av_always_inline void decode_cabac_residual_internal(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
+    H264Mb *m = s->m;
+    const int mb_x = m->mb_x;
+    int index[64];
+
+    int av_unused last;
+    int coeff_count = 0;
+    int node_ctx = 0;
+
+    uint8_t *significant_coeff_ctx_base;
+    uint8_t *last_coeff_ctx_base;
+    uint8_t *abs_level_m1_ctx_base;
+
+    /* read coded block flag */
+    if( is_dc || cat != 5 ) {
+        if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( hc, s, cat, n, is_dc ) ] ) == 0 ) {
+            if( !is_dc )
+                m->non_zero_count_cache[scan8[n]] = 0;
+            return;
+        }
+    }
+
+    significant_coeff_ctx_base = c->cabac_state + significant_coeff_flag_offset[0][cat];
+    last_coeff_ctx_base        = c->cabac_state + last_coeff_flag_offset[0][cat];
+    abs_level_m1_ctx_base      = c->cabac_state + coeff_abs_level_m1_offset[cat];
+
+    if( !is_dc && cat == 5 ) {
+#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
+        for(last= 0; last < coefs; last++) { \
+            uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
+            if( get_cabac( c, sig_ctx )) { \
+                uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
+                index[coeff_count++] = last; \
+                if( get_cabac( c, last_ctx ) ) { \
+                    last= max_coeff; \
+                    break; \
+                } \
+            } \
+        }\
+        if( last == max_coeff -1 ) {\
+            index[coeff_count++] = last;\
+        }
+
+        const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0];
+        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
+    } else {
+        DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
+    }
+    assert(coeff_count > 0);
+
+    if( is_dc ) {
+        if( cat == 0 )
+            hc->cbp[mb_x] |= 0x100;
+        else
+            hc->cbp[mb_x] |= 0x40 << n;
+    } else {
+        if( cat == 5 )
+            fill_rectangle(&m->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
+        else {
+            assert( cat == 1 || cat == 2 || cat == 4 );
+            m->non_zero_count_cache[scan8[n]] = coeff_count;
+        }
+    }
+
+    do {
+        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
+        int j= scantable[index[--coeff_count]];
+
+        if( get_cabac( c, ctx ) == 0 ) {
+            node_ctx = coeff_abs_level_transition[0][node_ctx];
+            if( is_dc ) {
+                block[j] = get_cabac_bypass_sign( c, -1);
+            }else{
+                block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6;
+            }
+        } else {
+            int coeff_abs = 2;
+            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
+            node_ctx = coeff_abs_level_transition[1][node_ctx];
+
+            while( coeff_abs < 15 && get_cabac( c, ctx ) ) {
+                coeff_abs++;
+            }
+
+            if( coeff_abs >= 15 ) {
+                /* bypass-coded escape; the unary prefix is capped at 29 so that a corrupt
+                 * stream cannot push coeff_abs past 2^30 + 13 and overflow int */
+                int prefix = 0;
+                while( prefix < 29 && get_cabac_bypass( c ) )
+                    prefix++;
+                coeff_abs=1;
+                while( prefix-- ) {
+                    coeff_abs += coeff_abs + get_cabac_bypass( c );
+                }
+                coeff_abs+= 14;
+            }
+
+            if( is_dc ) {
+                block[j] = get_cabac_bypass_sign( c, -coeff_abs );
+            }else{
+                block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6;
+            }
+        }
+    } while( coeff_count );
+
+}
+
+static void decode_cabac_residual_dc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { /* DC variant: is_dc = 1, no qmul, levels are stored unscaled */
+    decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, NULL, max_coeff, 1);
+}
+
+static void decode_cabac_residual_nondc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { /* non-DC variant: is_dc = 0, levels are scaled with qmul */
+    decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, qmul, max_coeff, 0);
+}
+
+/**
+ * Decodes one CABAC-coded macroblock (type, prediction/motion data, cbp, qscale delta, residual).
+ * @return 0 if OK, -1 if an error is noticed
+ */
+int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) {
+    H264Mb *m = s->m;
+    int mb_x = m->mb_x;
+    int mb_type, partition_count, cbp = 0;
+    int dct8x8_allowed= s->pps.transform_8x8_mode;
+
+    fill_decode_neighbors(hc, s);
+    memset(m->mb, 0 , sizeof(m->mb));
+
+    if( s->slice_type_nos != FF_I_TYPE ) {
+        int skip;
+        /* a skipped mb needs the aff flag from the following mb */
+        skip = decode_cabac_mb_skip( hc, s, m, c);
+
+        /* read skip flags */
+        if( skip ) {
+            decode_mb_skip(hc, s);
+            hc->cbp[mb_x] = m->cbp = 0;
+            hc->chroma_pred_mode[mb_x] = 0;
+            s->last_qscale_diff = 0;
+            return 0;
+        }
+    }
+
+    if( s->slice_type_nos == FF_B_TYPE ) {
+        int ctx = 0;
+
+        if( !IS_DIRECT( m->left_type-1 ) )
+            ctx++;
+        if( !IS_DIRECT( m->top_type-1 ) )
+            ctx++;
+
+        if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){
+            mb_type= 0; /* B_Direct_16x16 */
+        }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) {
+            mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */
+        }else{
+            int bits;
+            bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3;
+            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2;
+            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1;
+            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] );
+            if( bits < 8 ){
+                mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
+            }else if( bits == 13 ){
+                mb_type= decode_cabac_intra_mb_type(s, c, 32, 0);
+                goto decode_intra_mb;
+            }else if( bits == 14 ){
+                mb_type= 11; /* B_L1_L0_8x16 */
+            }else if( bits == 15 ){
+                mb_type= 22; /* B_8x8 */
+            }else{
+                bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] );
+                mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
+            }
+        }
+            partition_count= b_mb_type_info[mb_type].partition_count;
+            mb_type=         b_mb_type_info[mb_type].type;
+    } else if( s->slice_type_nos == FF_P_TYPE ) {
+        if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) {
+            /* P-type */
+            if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) {
+                /* P_L0_D16x16, P_8x8 */
+                mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] );
+            } else {
+                /* P_L0_D8x16, P_L0_D16x8 */
+                mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] );
+            }
+            partition_count= p_mb_type_info[mb_type].partition_count;
+            mb_type=         p_mb_type_info[mb_type].type;
+        } else {
+            mb_type= decode_cabac_intra_mb_type(s, c, 17, 0);
+            goto decode_intra_mb;
+        }
+    } else {
+        mb_type= decode_cabac_intra_mb_type(s ,c, 3, 1);
+        if(s->slice_type == FF_SI_TYPE && mb_type)
+            mb_type--;
+        assert(s->slice_type_nos == FF_I_TYPE);
+decode_intra_mb:
+        partition_count = 0;
+        cbp= i_mb_type_info[mb_type].cbp;
+        m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
+        mb_type= i_mb_type_info[mb_type].type;
+    }
+
+    if(IS_INTRA_PCM(mb_type)) {
+        uint8_t *ptr;
+        // We assume these blocks are very rare so we do not optimize it.
+        // FIXME The two following lines get the bitstream position in the cabac
+        // decode, I think it should be done by a function in cabac.h (or cabac.c).
+        ptr=c->bytestream;
+        if(c->low&0x1) ptr--;
+        if(CABAC_BITS==16){
+            if(c->low&0x1FF) ptr--;
+        }
+        if ((unsigned) (ptr + 384) >= (unsigned) c->bytestream_end){
+            fprintf(stderr, "Intra PCM mb crossed bytestream buffer\n Known issue.");
+        } /* NOTE(review): this only warns; the copies below still read 384 bytes from ptr */
+
+        // The pixels are stored in the same order as levels in h->mb array.
+        memcpy(m->mb, ptr, 256); ptr+=256;
+        memcpy(m->mb+128, ptr, 128); ptr+=128;
+
+        c->bytestream = ptr;
+        #if CABAC_BITS == 16
+        c->low =  (*c->bytestream++)<<18;
+        c->low+=  (*c->bytestream++)<<10;
+        #else
+        c->low =  (*c->bytestream++)<<10;
+        #endif
+        c->low+= ((*c->bytestream++)<<2) + 2;
+        c->range= 0x1FE;
+
+        // All blocks are present
+        hc->cbp[mb_x] = 0x1ef;
+        hc->chroma_pred_mode[mb_x] = 0;
+        // In deblocking, the quantizer is 0
+        hc->qscale[mb_x]= 0;
+        // All coeffs are present
+        memset(hc->non_zero_count[mb_x], 16, 32);
+        hc->mb_type[mb_x]= m->mb_type = mb_type;
+        s->last_qscale_diff = 0;
+        fill_filter_caches(hc, s, mb_type);
+        return 0;
+    }
+    fill_decode_caches(hc, s, mb_type);
+
+    if( IS_INTRA( mb_type ) ) {
+        int i, pred_mode;
+        if( IS_INTRA4x4( mb_type ) ) {
+            if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] ) ) {
+                mb_type |= MB_TYPE_8x8DCT;
+                for( i = 0; i < 16; i+=4 ) {
+                    int pred = pred_intra_mode( s, i );
+                    int mode = decode_cabac_mb_intra4x4_pred_mode(c, pred );
+                    fill_rectangle( &m->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
+                }
+            } else {
+                for( i = 0; i < 16; i++ ) {
+                    int pred = pred_intra_mode( s, i );
+                    m->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode(c, pred );
+                }
+            }
+            write_back_intra_pred_mode(hc, s);
+            if( check_intra4x4_pred_mode(s) < 0 ) return -1;
+        } else {
+            m->intra16x16_pred_mode= check_intra_pred_mode(s, m->intra16x16_pred_mode );
+            if( m->intra16x16_pred_mode < 0 ) return -1;
+        }
+
+        hc->chroma_pred_mode[mb_x] =
+        pred_mode                        = decode_cabac_mb_chroma_pre_mode( hc, s, c );
+
+        pred_mode= check_intra_pred_mode( s, pred_mode );
+        if( pred_mode < 0 ) return -1;
+        m->chroma_pred_mode= pred_mode;
+
+    } else if( partition_count == 4 ) {
+        int i, j, sub_partition_count[4], list, ref[2][4];
+
+        if( s->slice_type_nos == FF_B_TYPE ) {
+            for( i = 0; i < 4; i++ ) {
+                m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c );
+                sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
+                m->sub_mb_type[i]=      b_sub_mb_type_info[ m->sub_mb_type[i] ].type;
+            }
+            if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] |
+                          m->sub_mb_type[2] | m->sub_mb_type[3]) ) {
+                ff_h264_pred_direct_motion(hc, s, &mb_type);
+                m->ref_cache[0][scan8[4]] =
+                m->ref_cache[1][scan8[4]] =
+                m->ref_cache[0][scan8[12]] =
+                m->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
+                    for( i = 0; i < 4; i++ )
+                        fill_rectangle( &hc->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 );
+            }
+        } else {
+            for( i = 0; i < 4; i++ ) {
+                m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c );
+                sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
+                m->sub_mb_type[i]=      p_sub_mb_type_info[ m->sub_mb_type[i] ].type;
+            }
+        }
+
+        for( list = 0; list < s->list_count; list++ ) {
+            for( i = 0; i < 4; i++ ) {
+                if(IS_DIRECT(m->sub_mb_type[i])) continue;
+                if(IS_DIR(m->sub_mb_type[i], 0, list)){
+                    if( s->ref_count[list] > 1 ){
+                        ref[list][i] = decode_cabac_mb_ref(hc, s, c, list, 4*i );
+                        if((unsigned)ref[list][i] >= (unsigned)s->ref_count[list]){ /* unsigned compare also rejects the -1 error value */
+                            fprintf(stderr, "Reference %d >= %d\n", ref[list][i], s->ref_count[list]);
+                            return -1;
+                        }
+                    }else
+                        ref[list][i] = 0;
+                } else {
+                    ref[list][i] = -1;
+                }
+                                                    m->ref_cache[list][ scan8[4*i]+1 ]=
+                m->ref_cache[list][ scan8[4*i]+8 ]=m->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
+            }
+        }
+
+        if(dct8x8_allowed)
+            dct8x8_allowed = get_dct8x8_allowed(s);
+
+        for(list=0; list<s->list_count; list++){
+            for(i=0; i<4; i++){
+                m->ref_cache[list][ scan8[4*i]   ]=m->ref_cache[list][ scan8[4*i]+1 ];
+                if(IS_DIRECT(m->sub_mb_type[i])){
+                    fill_rectangle(hc->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2);
+                    continue;
+                }
+
+                if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){
+                    const int sub_mb_type= m->sub_mb_type[i];
+                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
+                    for(j=0; j<sub_partition_count[i]; j++){
+                        int mpx, mpy;
+                        int mx, my;
+                        const int index= 4*i + block_width*j;
+                        int16_t (* mv_cache)[2]= &m->mv_cache[list][ scan8[index]];
+                        uint8_t (* mvd_cache)[2]= &hc->mvd_cache[list][ scan8[index]];
+                        pred_motion(s, index, block_width, list, m->ref_cache[list][ scan8[index] ], &mx, &my);
+                        DECODE_CABAC_MB_MVD( hc, c, list, index)
+
+                        if(IS_SUB_8X8(sub_mb_type)){
+                            mv_cache[ 1 ][0]=
+                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
+                            mv_cache[ 1 ][1]=
+                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
+
+                            mvd_cache[ 1 ][0]=
+                            mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx;
+                            mvd_cache[ 1 ][1]=
+                            mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy;
+                        }else if(IS_SUB_8X4(sub_mb_type)){
+                            mv_cache[ 1 ][0]= mx;
+                            mv_cache[ 1 ][1]= my;
+
+                            mvd_cache[ 1 ][0]=  mpx;
+                            mvd_cache[ 1 ][1]= mpy;
+                        }else if(IS_SUB_4X8(sub_mb_type)){
+                            mv_cache[ 8 ][0]= mx;
+                            mv_cache[ 8 ][1]= my;
+
+                            mvd_cache[ 8 ][0]= mpx;
+                            mvd_cache[ 8 ][1]= mpy;
+                        }
+                        mv_cache[ 0 ][0]= mx;
+                        mv_cache[ 0 ][1]= my;
+
+                        mvd_cache[ 0 ][0]= mpx;
+                        mvd_cache[ 0 ][1]= mpy;
+                    }
+                }else{
+                    fill_rectangle(m->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4);
+                    fill_rectangle(hc->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2);
+                }
+            }
+        }
+    } else if( IS_DIRECT(mb_type) ) {
+        ff_h264_pred_direct_motion(hc, s, &mb_type);
+        fill_rectangle(hc->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2);
+        fill_rectangle(hc->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2);
+        dct8x8_allowed &= s->direct_8x8_inference_flag;
+    } else {
+        int list, i;
+        if(IS_16X16(mb_type)){
+            for(list=0; list<s->list_count; list++){
+                if(IS_DIR(mb_type, 0, list)){
+                    int ref;
+                    if(s->ref_count[list] > 1){
+                        ref= decode_cabac_mb_ref(hc, s, c, list, 0);
+                        if((unsigned)ref >= (unsigned)s->ref_count[list]){ /* also rejects -1 */
+                            fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]);
+                            return -1;
+                        }
+                    }else
+                        ref=0;
+                    fill_rectangle(&m->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
+                }
+            }
+            for(list=0; list<s->list_count; list++){
+                if(IS_DIR(mb_type, 0, list)){
+                    int mx,my,mpx,mpy;
+                    pred_motion(s, 0, 4, list, m->ref_cache[list][ scan8[0] ], &mx, &my);
+                    DECODE_CABAC_MB_MVD( hc, c, list, 0)
+
+                    fill_rectangle(hc->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2);
+                    fill_rectangle(m->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
+                }
+
+            }
+        }
+        else if(IS_16X8(mb_type)){
+            for(list=0; list<s->list_count; list++){
+                    for(i=0; i<2; i++){
+                        if(IS_DIR(mb_type, i, list)){
+                            int ref;
+                            if(s->ref_count[list] > 1){
+                                ref= decode_cabac_mb_ref(hc, s, c, list, 8*i );
+                                if((unsigned)ref >= (unsigned)s->ref_count[list]){ /* also rejects -1 */
+                                    fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]);
+                                    return -1;
+                                }
+                            }else
+                                ref=0;
+                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
+                        }else
+                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
+                    }
+            }
+            for(list=0; list<s->list_count; list++){
+                for(i=0; i<2; i++){
+                    if(IS_DIR(mb_type, i, list)){
+                        int mx,my,mpx,mpy;
+                        pred_16x8_motion(s, 8*i, list, m->ref_cache[list][scan8[0] + 16*i], &mx, &my);
+                        DECODE_CABAC_MB_MVD( hc, c, list, 8*i)
+
+                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2);
+                        fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
+                    }else{
+                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2);
+                        fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
+                    }
+                }
+            }
+        }else{
+            assert(IS_8X16(mb_type));
+            for(list=0; list<s->list_count; list++){
+                    for(i=0; i<2; i++){
+                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
+                            int ref;
+                            if(s->ref_count[list] > 1){
+                                ref= decode_cabac_mb_ref(hc, s, c, list, 4*i );
+                                if((unsigned)ref >= (unsigned)s->ref_count[list]){ /* also rejects -1 */
+                                    fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]);
+                                    return -1;
+                                }
+                            }else
+                                ref=0;
+                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
+                        }else
+                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
+                    }
+            }
+            for(list=0; list<s->list_count; list++){
+                for(i=0; i<2; i++){
+                    if(IS_DIR(mb_type, i, list)){
+                        int mx,my,mpx,mpy;
+                        pred_8x16_motion( s, i*4, list, m->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
+                        DECODE_CABAC_MB_MVD( hc, c, list, 4*i)
+
+                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2);
+                        fill_rectangle(m->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
+                    }else{
+                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2);
+                        fill_rectangle(m-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
+                    }
+                }
+            }
+        }
+    }
+
+    if( IS_INTER( mb_type ) ) {
+        hc->chroma_pred_mode[mb_x] = 0;
+        write_back_motion( hc, s, mb_type );
+    }
+
+    if( !IS_INTRA16x16( mb_type ) ) {
+        cbp  = decode_cabac_mb_cbp_luma( hc, c);
+        cbp |= decode_cabac_mb_cbp_chroma( hc, c ) << 4;
+    }
+
+    hc->cbp[mb_x] = m->cbp = cbp;
+    if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
+        mb_type |= MB_TYPE_8x8DCT * get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] );
+    }
+
+    if( cbp || IS_INTRA16x16( mb_type ) ) {
+        const uint8_t *scan, *scan8x8, *dc_scan;
+        const uint32_t *qmul;
+
+        if (s->transform_bypass && s->qscale){
+            scan8x8= ff_zigzag_direct;
+            scan= zigzag_scan;
+        }else{
+            scan8x8= hc->zigzag_scan8x8;
+            scan= hc->zigzag_scan;
+        }
+        dc_scan= luma_dc_zigzag_scan;
+
+        // decode_cabac_mb_dqp
+        if(get_cabac_noinline(c, &c->cabac_state[60 + (s->last_qscale_diff != 0)])){
+            int val = 1;
+            int ctx= 2;
+
+            while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) {
+                ctx= 3;
+                val++;
+                if(val > 102){ //prevent infinite loop
+                    fprintf(stderr, "cabac decode of qscale diff failed at %d %d (%d)\n", m->mb_x, m->mb_y, val);
+                    return -1;
+                }
+            }
+
+            if( val&0x01 )
+                val=   (val + 1)>>1 ;
+            else
+                val= -((val + 1)>>1);
+            s->last_qscale_diff = val;
+            s->qscale += val;
+            if(((unsigned)s->qscale) > 51){
+                if(s->qscale<0) s->qscale+= 52;
+                else            s->qscale-= 52;
+            }
+            s->chroma_qp[0] = s->pps.chroma_qp_table[0][s->qscale];
+            s->chroma_qp[1] = s->pps.chroma_qp_table[1][s->qscale];
+        }else
+            s->last_qscale_diff=0;
+
+        if( IS_INTRA16x16( mb_type ) ) {
+            int i;
+            decode_cabac_residual_dc( hc, s, c, m->mb, 0, 0, dc_scan, 16);
+
+            if( cbp&15 ) {
+                qmul = hc->dequant4_coeff[0][s->qscale];
+                for( i = 0; i < 16; i++ ) {
+                    decode_cabac_residual_nondc( hc, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15);
+                }
+            } else {
+                fill_rectangle(&m->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
+            }
+        } else {
+            int i8x8, i4x4;
+            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
+                if( cbp & (1<<i8x8) ) {
+                    if( IS_8x8DCT(mb_type) ) {
+                        decode_cabac_residual_nondc(hc, s, c, m->mb + 64*i8x8, 5, 4*i8x8,
+                            scan8x8, hc->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
+                    } else {
+                        qmul = hc->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
+                        for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
+                            const int index = 4*i8x8 + i4x4;
+//START_TIMER
+                            decode_cabac_residual_nondc(hc, s, c, m->mb + 16*index, 2, index, scan, qmul, 16);
+//STOP_TIMER("decode_residual")
+                        }
+                    }
+                } else {
+                    uint8_t * const nnz= &m->non_zero_count_cache[ scan8[4*i8x8] ];
+                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+                }
+            }
+        }
+
+        if( cbp&0x30 ){
+            int i;
+            for( i = 0; i < 2; i++ ) {
+                decode_cabac_residual_dc(hc, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4);
+            }
+        }
+
+        if( cbp&0x20 ) {
+            int i, j;
+            for( i = 0; i < 2; i++ ) {
+                qmul = hc->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 0:3)][s->chroma_qp[i]];
+                for( j = 0; j < 4; j++ ) {
+                    const int index = 16 + 4 * i + j;
+                    decode_cabac_residual_nondc( hc, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15);
+                }
+            }
+        } else {
+            uint8_t * const nnz= &m->non_zero_count_cache[0];
+            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        }
+    } else {
+        uint8_t * const nnz= &m->non_zero_count_cache[0];
+        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
+        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        s->last_qscale_diff = 0;
+    }
+    hc->mb_type[mb_x]= m->mb_type = mb_type;
+    hc->qscale[mb_x]= s->qscale;
+    write_back_non_zero_count(hc, s);
+    fill_filter_caches(hc, s, mb_type);
+
+    return 0;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_cabac_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_cabac_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,17 @@
+#ifndef H264_CABAC_H
+#define H264_CABAC_H
+
+#define CELL_SPE
+#include "libavcodec/avcodec.h"
+#include "h264_types_spu.h"
+#include "cabac_spu.h"
+
+
+/**
+ * decodes a CABAC coded macroblock
+ * @return 0 if OK, -1 if an error is noticed
+ */
+int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c);
+void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c);
+
+#endif /* H264_CABAC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_chroma_template_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_chroma_template_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,355 @@
+/* 8-byte-wide H.264 chroma bilinear interpolation over h rows:
+ * out = (A*s[0] + B*s[1] + C*s[STRIDE_C] + D*s[STRIDE_C+1] + 32) >> 6, with A..D = iABCD1..4 below.
+ * Source rows are STRIDE_C bytes apart (not a parameter); h+1 source rows are read in total. */
+static void PREFIX_h264_chroma_mc8_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) {
+  register int i;
+  const int16_t i32ss= 32;
+  const int16_t imax = 255;
+  const int16_t iABCD1 = ((8 - x) * (8 - y));
+  const int16_t iABCD2 = ((x) * (8 - y));
+  const int16_t iABCD3 = ((8 - x) * (y));
+  const int16_t iABCD4 = ((x) * (y));
+  const vsint16_t vA = spu_splats(iABCD1);
+  const vsint16_t vB = spu_splats(iABCD2);
+  const vsint16_t vC = spu_splats(iABCD3);
+  const vsint16_t vD = spu_splats(iABCD4);
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v32ss = spu_splats(i32ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+  // Byte offsets of src and dst inside their 16-byte quadwords.
+  const int shift_src =(unsigned int) src & 15;
+  const int shift_dst =(unsigned int) dst & 15;
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
+  const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
+  vuint8_t dstmask;
+  // dstmask0 puts the 8 result bytes into bytes 0-7 of the dst quadword; any other offset is treated as 8 (bytes 8-15).
+  if(shift_dst==0)
+    dstmask=dstmask0;
+  else
+    dstmask=dstmask8;
+  // Unaligned load of source row 0: fetch the quadwords at src and src+16, then shift/OR them together.
+  vuint8_t vsrc0uc1;
+  vuint8_t vsrc0uc2;
+  vuint8_t vsrc0uc;
+  vuint8_t vsrc1uc;
+  vsrc0uc1 = *(vuint8_t *)(src);
+  vsrc0uc2 = *(vuint8_t *)(src+16);
+  vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16));
+  vsrc1uc = spu_slqwbyte(vsrc0uc, 1);
+  // mergeh widens the first 8 bytes to 16-bit lanes (pattern byte 0x80 produces a zero byte).
+  vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh);
+  vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh);
+
+  for (i = 0 ; i < h ; i++) {
+    // Row below the current one, loaded and widened the same way (vsrc3uc is vsrc2uc shifted by one byte).
+    vuint8_t vsrc2uc1;
+    vuint8_t vsrc2uc2;
+    vuint8_t vsrc2uc;
+    vuint8_t vsrc3uc;
+    vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C);
+    vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16);
+    vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16));
+    vsrc3uc = spu_slqwbyte(vsrc2uc, 1);
+
+    vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh);
+    vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh);
+
+    vsint16_t psum;
+    // spu_mule/spu_mulo multiply the even/odd 16-bit lanes; mez re-interleaves bytes 2-3 of every 32-bit product.
+    vsint32_t psum1 = spu_mule(vsrc0ssH, vA);
+    vsint32_t psum2 = spu_mulo(vsrc0ssH, vA);
+    psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+
+    psum1 = spu_mule(vsrc1ssH, vB);
+    psum2 = spu_mulo(vsrc1ssH, vB);
+    vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+    psum = spu_add(psum3, psum);
+
+    psum1 = spu_mule(vsrc2ssH, vC);
+    psum2 = spu_mulo(vsrc2ssH, vC);
+    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+    psum = spu_add(psum3, psum);
+
+    psum1 = spu_mule(vsrc3ssH, vD);
+    psum2 = spu_mulo(vsrc3ssH, vD);
+    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+    psum = spu_add(psum3, psum);
+    // Rounding: add 32, then shift right by 6.
+    psum = spu_add(v32ss, psum);
+    psum = spu_rlmask(psum, -6);
+
+    //Saturation from 0 to 255
+    sat = spu_cmpgt(psum,(vsint16_t)vzero);
+    psum = spu_and(psum,(vsint16_t)sat);
+    sat = spu_cmpgt(psum,vmax);
+    psum = spu_sel(psum,vmax,sat);
+    // packsu gathers the odd-numbered bytes of psum into bytes 0-7 of ppsum; bytes 8-15 come from vzero.
+    const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu);
+
+    const vuint8_t dst1 = *(vuint8_t *)dst;
+    // Insert the results into the existing dst quadword; OP_U8_SPU is presumably supplied by the including file (put/avg) - confirm.
+    const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask);
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, dsum, dst1);
+
+    *(vuint8_t *)dst=fsum;
+
+    vsrc0ssH = vsrc2ssH;
+    vsrc1ssH = vsrc3ssH;
+
+    dst += dst_stride;
+    // the source advances by the fixed STRIDE_C, not by a stride parameter
+	src += STRIDE_C;
+  }
+}
+
+/* 4-byte-wide H.264 chroma bilinear interpolation over h rows:
+ * out = (A*s[0] + B*s[1] + C*s[STRIDE_C] + D*s[STRIDE_C+1] + 32) >> 6, with A..D = iABCD1..4 below.
+ * Source rows are STRIDE_C bytes apart (not a parameter); h+1 source rows are read in total. */
+static void PREFIX_h264_chroma_mc4_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) {
+  register int i;
+  const int16_t i32ss= 32;
+  const int16_t imax = 255;
+  const int16_t iABCD1 = ((8 - x) * (8 - y));
+  const int16_t iABCD2 = ((x) * (8 - y));
+  const int16_t iABCD3 = ((8 - x) * (y));
+  const int16_t iABCD4 = ((x) * (y));
+  const vsint16_t vA = spu_splats(iABCD1);
+  const vsint16_t vB = spu_splats(iABCD2);
+  const vsint16_t vC = spu_splats(iABCD3);
+  const vsint16_t vD = spu_splats(iABCD4);
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v32ss = spu_splats(i32ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
+  // Byte offsets of src and dst inside their 16-byte quadwords.
+  const int shift_src = (unsigned int) src & 15;
+  const int shift_dst = (unsigned int) dst & 15;
+  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
+  // dstmaskN puts the 4 result bytes into bytes N..N+3 of the dst quadword; other offsets keep the all-zero mask (no default case).
+  switch(shift_dst){
+    case 0:  dstmask = dstmask0;
+             break;
+    case 4:  dstmask = dstmask4;
+             break;
+    case 8:  dstmask = dstmask8;
+             break;
+    case 12: dstmask = dstmask12;
+             break;
+  }
+  // Unaligned load of source row 0: fetch the quadwords at src and src+16, then shift/OR them together.
+  vuint8_t vsrc0uc1;
+  vuint8_t vsrc0uc2;
+  vuint8_t vsrc0uc;
+  vuint8_t vsrc1uc;
+  vsrc0uc1 = *(vuint8_t *)(src);
+  vsrc0uc2 = *(vuint8_t *)(src+16);
+  vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16));
+  vsrc1uc = spu_slqwbyte(vsrc0uc, 1);
+  // mergeh widens the first 8 bytes to 16-bit lanes (pattern byte 0x80 produces a zero byte).
+  vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh);
+  vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh);
+
+  for (i = 0 ; i < h ; i++) {
+    // Row below the current one, loaded and widened the same way (vsrc3uc is vsrc2uc shifted by one byte).
+    vuint8_t vsrc2uc1;
+    vuint8_t vsrc2uc2;
+    vuint8_t vsrc2uc;
+    vuint8_t vsrc3uc;
+    vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C);
+    vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16);
+    vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16));
+    vsrc3uc = spu_slqwbyte(vsrc2uc, 1);
+
+    vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh);
+    vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh);
+
+    vsint16_t psum;
+    // spu_mule/spu_mulo multiply the even/odd 16-bit lanes; mez re-interleaves bytes 2-3 of every 32-bit product.
+    vsint32_t psum1 = spu_mule(vsrc0ssH, vA);
+    vsint32_t psum2 = spu_mulo(vsrc0ssH, vA);
+    psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+
+    psum1 = spu_mule(vsrc1ssH, vB);
+    psum2 = spu_mulo(vsrc1ssH, vB);
+    vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+    psum = spu_add(psum3, psum);
+
+    psum1 = spu_mule(vsrc2ssH, vC);
+    psum2 = spu_mulo(vsrc2ssH, vC);
+    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+    psum = spu_add(psum3, psum);
+
+    psum1 = spu_mule(vsrc3ssH, vD);
+    psum2 = spu_mulo(vsrc3ssH, vD);
+    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+    psum = spu_add(psum3, psum);
+    // Rounding: add 32, then shift right by 6.
+    psum = spu_add(v32ss, psum);
+    psum = spu_rlmask(psum, -6);
+
+    //Saturation from 0 to 255
+    sat = spu_cmpgt(psum,(vsint16_t)vzero);
+    psum = spu_and(psum,(vsint16_t)sat);
+    sat = spu_cmpgt(psum,vmax);
+    psum = spu_sel(psum,vmax,sat);
+    // packsu gathers the odd-numbered bytes of psum; only bytes 0-3 of ppsum are selected by the dstmaskN patterns.
+    const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu);
+
+    const vuint8_t dst1 = *(vuint8_t *)dst;
+    // Insert the results into the existing dst quadword; OP_U8_SPU is presumably supplied by the including file (put/avg) - confirm.
+    const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask);
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, dsum, dst1);
+
+    *(vuint8_t *)dst=fsum;
+
+    vsrc0ssH = vsrc2ssH;
+    vsrc1ssH = vsrc3ssH;
+
+    dst += dst_stride;
+    src += STRIDE_C;
+  }
+}
+
+/* 2-byte-wide H.264 chroma bilinear interpolation over h rows:
+ * out = (A*s[0] + B*s[1] + C*s[STRIDE_C] + D*s[STRIDE_C+1] + 32) >> 6, with A..D = iABCD1..4 below.
+ * Source rows are STRIDE_C bytes apart (not a parameter); h+1 source rows are read in total. */
+static void PREFIX_h264_chroma_mc2_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) {
+  register int i;
+  const int16_t i32ss= 32;
+  const int16_t imax = 255;
+  const int16_t iABCD1 = ((8 - x) * (8 - y));
+  const int16_t iABCD2 = ((x) * (8 - y));
+  const int16_t iABCD3 = ((8 - x) * (y));
+  const int16_t iABCD4 = ((x) * (y));
+  const vsint16_t vA = spu_splats(iABCD1);
+  const vsint16_t vB = spu_splats(iABCD2);
+  const vsint16_t vC = spu_splats(iABCD3);
+  const vsint16_t vD = spu_splats(iABCD4);
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v32ss = spu_splats(i32ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
+  // Byte offsets of src and dst inside their 16-byte quadwords.
+  const int shift_src = (unsigned int) src & 15;
+  const int shift_dst = (unsigned int) dst & 15;
+  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  const vuint8_t dstmask0=  {0x10,0x11,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask2=  {0x00,0x01,0x10,0x11,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask6=  {0x00,0x01,0x02,0x03,0x04,0x05,0x10,0x11,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask10= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x10,0x11,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x0E,0x0F};
+  const vuint8_t dstmask14= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x10,0x11};
+  // dstmaskN puts the 2 result bytes into bytes N..N+1 of the dst quadword; odd offsets keep the all-zero mask (no default case).
+  switch(shift_dst){
+    case 0:  dstmask = dstmask0;
+             break;
+    case 2:  dstmask = dstmask2;
+             break;
+    case 4:  dstmask = dstmask4;
+             break;
+    case 6:  dstmask = dstmask6;
+             break;
+    case 8:  dstmask = dstmask8;
+             break;
+    case 10: dstmask = dstmask10;
+             break;
+    case 12: dstmask = dstmask12;
+             break;
+    case 14: dstmask = dstmask14;
+             break;
+  }
+  // Unaligned load of source row 0: fetch the quadwords at src and src+16, then shift/OR them together.
+  vuint8_t vsrc0uc1;
+  vuint8_t vsrc0uc2;
+  vuint8_t vsrc0uc;
+  vuint8_t vsrc1uc;
+  vsrc0uc1 = *(vuint8_t *)(src);
+  vsrc0uc2 = *(vuint8_t *)(src+16);
+  vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16));
+  vsrc1uc = spu_slqwbyte(vsrc0uc, 1);
+  // mergeh widens the first 8 bytes to 16-bit lanes (pattern byte 0x80 produces a zero byte).
+  vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh);
+  vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh);
+
+  for (i = 0 ; i < h ; i++) {
+    // Row below the current one, loaded and widened the same way (vsrc3uc is vsrc2uc shifted by one byte).
+    vuint8_t vsrc2uc1;
+    vuint8_t vsrc2uc2;
+    vuint8_t vsrc2uc;
+    vuint8_t vsrc3uc;
+    vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C);
+    vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16);
+    vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16));
+    vsrc3uc = spu_slqwbyte(vsrc2uc, 1);
+
+    vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh);
+    vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh);
+
+    vsint16_t psum;
+    // spu_mule/spu_mulo multiply the even/odd 16-bit lanes; mez re-interleaves bytes 2-3 of every 32-bit product.
+    vsint32_t psum1 = spu_mule(vsrc0ssH, vA);
+    vsint32_t psum2 = spu_mulo(vsrc0ssH, vA);
+    psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+
+    psum1 = spu_mule(vsrc1ssH, vB);
+    psum2 = spu_mulo(vsrc1ssH, vB);
+    vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+    psum = spu_add(psum3, psum);
+
+    psum1 = spu_mule(vsrc2ssH, vC);
+    psum2 = spu_mulo(vsrc2ssH, vC);
+    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+    psum = spu_add(psum3, psum);
+
+    psum1 = spu_mule(vsrc3ssH, vD);
+    psum2 = spu_mulo(vsrc3ssH, vD);
+    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
+    psum = spu_add(psum3, psum);
+    // Rounding: add 32, then shift right by 6.
+    psum = spu_add(v32ss, psum);
+    psum = spu_rlmask(psum, -6);
+
+    //Saturation from 0 to 255
+    sat = spu_cmpgt(psum,(vsint16_t)vzero);
+    psum = spu_and(psum,(vsint16_t)sat);
+    sat = spu_cmpgt(psum,vmax);
+    psum = spu_sel(psum,vmax,sat);
+    // packsu gathers the odd-numbered bytes of psum; only bytes 0-1 of ppsum are selected by the dstmaskN patterns.
+    const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu);
+
+    const vuint8_t dst1 = *(vuint8_t *)dst;
+    // Insert the results into the existing dst quadword; OP_U8_SPU is presumably supplied by the including file (put/avg) - confirm.
+    const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask);
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, dsum, dst1);
+
+    *(vuint8_t *)dst=fsum;
+
+    vsrc0ssH = vsrc2ssH;
+    vsrc1ssH = vsrc3ssH;
+
+    dst += dst_stride;
+    src += STRIDE_C;
+  }
+}
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_deblock_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_deblock_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2009 TUDelft 
+ * 
+ * Cell Parallel SPU - 2DWave Macroblock Decoding. 
+ */
+
+/**
+ * @file libavcodec/cell/h264_deblock_spu.c
+ * Cell Parallel SPU - 2DWave Macroblock Decoding
+ * @author C C Chi <c.c.chi@student.tudelft.nl>
+ * 
+ * SIMD kernels 
+ * H.264/AVC motion compensation
+ * @author Mauricio Alvarez <alvarez@ac.upc.edu>
+ * @author Albert Paradis <apar7632@hotmail.com>
+ */ 
+
+#include "h264_deblock_spu.h"
+#include "h264_decode_mb_spu.h"
+
+extern int print_debug;
+
+/* Deblock one luma edge with the h264_h_loop_filter_luma* kernels (used for dir == 0 edges):
+ * alpha/beta/tc0 come from the lookup tables; bS[0] >= 4 selects the intra kernel. */
+static void filter_mb_edgev( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
+    H264slice *slice = h->s;
+    const int idx_a = qp + slice->slice_alpha_c0_offset;
+    const int alpha = alpha_table[idx_a];
+    const int beta  = beta_table[qp + slice->slice_beta_offset];
+    int8_t tc[4];
+    int k;
+    if (alpha == 0 || beta == 0) return;
+    if (bS[0] >= 4) {
+        h->dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
+        return;
+    }
+    for (k = 0; k < 4; k++) {
+        tc[k] = tc0_table[idx_a][bS[k]];
+    }
+    h->dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
+}
+
+/* Deblock one chroma edge with the h264_h_loop_filter_chroma* kernels (used for dir == 0 edges):
+ * same alpha/beta lookup as for luma, but every tc0 entry is incremented by one. */
+static void filter_mb_edgecv( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
+    H264slice *slice = h->s;
+    const int idx_a = qp + slice->slice_alpha_c0_offset;
+    const int alpha = alpha_table[idx_a];
+    const int beta  = beta_table[qp + slice->slice_beta_offset];
+    int8_t tc[4];
+    int k;
+
+    if (alpha == 0 || beta == 0) return;
+    if (bS[0] >= 4) {
+        h->dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
+        return;
+    }
+    for (k = 0; k < 4; k++) {
+        tc[k] = tc0_table[idx_a][bS[k]]+1;
+    }
+    h->dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
+}
+
+/* Deblock one luma edge with the h264_v_loop_filter_luma* kernels (used for dir == 1 edges):
+ * alpha/beta/tc0 come from the lookup tables; bS[0] >= 4 selects the intra kernel. */
+static void filter_mb_edgeh( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
+    H264slice *slice = h->s;
+    const int idx_a = qp + slice->slice_alpha_c0_offset;
+    const int alpha = alpha_table[idx_a];
+    const int beta  = beta_table[qp + slice->slice_beta_offset];
+    int8_t tc[4];
+    int k;
+
+    if (alpha == 0 || beta == 0) return;
+    if (bS[0] >= 4) {
+        h->dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
+        return;
+    }
+    for (k = 0; k < 4; k++) {
+        tc[k] = tc0_table[idx_a][bS[k]];
+    }
+    h->dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
+}
+
+/* Deblock one chroma edge with the h264_v_loop_filter_chroma* kernels (used for dir == 1 edges):
+ * same alpha/beta lookup as for luma, but every tc0 entry is incremented by one. */
+static void filter_mb_edgech( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
+    H264slice *slice = h->s;
+    const int idx_a = qp + slice->slice_alpha_c0_offset;
+    const int alpha = alpha_table[idx_a];
+    const int beta  = beta_table[qp + slice->slice_beta_offset];
+    int8_t tc[4];
+    int k;
+
+    if (alpha == 0 || beta == 0) return;
+    if (bS[0] >= 4) {
+        h->dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
+        return;
+    }
+    for (k = 0; k < 4; k++) {
+        tc[k] = tc0_table[idx_a][bS[k]]+1;
+    }
+    h->dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
+}
+
+/**
+ * Deblock every edge of the current macroblock in one direction.
+ * dir == 0 uses the edgev/edgecv helpers (column offsets into the planes), dir == 1 the edgeh/edgech
+ * helpers (row offsets). Boundary strengths come precomputed from mb->bS[dir][edge]. Edge 0 borders the
+ * left/top neighbour: it is skipped when mb->left_type / mb->top_type is 0 and uses the averaged QP of both MBs.
+ */
+static void filter_mb_dir(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int dir) {
+    H264Mb *mb = h->mb;
+    H264slice *s = h->s;
+    const int qp_xy= mb->qscale_mb_xy;
+    const int qp_dir = dir == 0 ? mb->qscale_left_mb_xy : mb->qscale_top_mb_xy;
+    const int mbm_type = dir == 0 ? mb->left_type : mb->top_type;
+    const int mb_type = mb->mb_type;
+    int edge;
+    const int edges = mb->edges[dir];
+//     int start;//= h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
+// 
+//     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
+//                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
+//     // how often to recheck mv-based bS when iterating between edges
+//     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
+//                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
+//     // how often to recheck mv-based bS when iterating along each edge
+//     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
+
+// 	if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0))
+// 		start =1;
+// 	else
+// 		start =0;
+// 
+//     /* Calculate bS */
+//     for( edge = start; edge < edges; edge++ ) {
+// 		const int mbn_type = edge > 0 ? mb_type : mbm_type;
+// 		const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm;
+//         int (*ref2frmn)[64] = ref2frm;//edge > 0 ? ref2frm : ref2frmm;
+//         int16_t bS[4];
+//         int qp;
+// 
+//         if( (edge&1) && IS_8x8DCT(mb_type) )
+//             continue;
+// 
+//         if( IS_INTRA(mb_type) ||
+//             IS_INTRA(mbn_type) ) {
+//             int value;
+// 
+//             if (edge == 0) {
+//                 value = 4;
+//             } else {
+//                 value = 3;
+//             }
+//             bS[0] = bS[1] = bS[2] = bS[3] = value;
+//         } else {
+//             int i, l;
+//             int mv_done;
+// 
+//             if( edge & mask_edge ) {
+// 
+//                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
+//                 mv_done = 1;
+//             }
+//             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
+//                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
+//                 int bn_idx= b_idx - (dir ? 8:1);
+//                 int v = 0;
+// 
+// 				for( l = 0; !v && l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) {
+//                     v |= ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] ||
+//                          FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
+//                          FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit;
+//                 }
+//                 bS[0] = bS[1] = bS[2] = bS[3] = v;
+// 
+//                 mv_done = 1;
+//             }
+//             else
+//                 mv_done = 0;
+// 
+// 			for( i = 0; i < 4; i++ ) {
+//                 int x = dir == 0 ? edge : i;
+//                 int y = dir == 0 ? i    : edge;
+//                 int b_idx= 8 + 4 + x + 8*y;
+//                 int bn_idx= b_idx - (dir ? 8:1);
+// 
+//                 if( mb->non_zero_count_cache[b_idx] |
+//                     mb->non_zero_count_cache[bn_idx] ) {
+//                     bS[i] = 2;
+//                 }
+//                 else if(!mv_done)
+//                 {
+//                     bS[i] = 0;
+//                     for( l = 0; l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) {
+//                         if( ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] ||
+//                             FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
+//                             FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
+//                             bS[i] = 1;
+//                             break;
+//                         }
+//                     }
+//                 }
+//             }
+// 
+//             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
+//                 continue;
+//         }
+// 		qp = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1;
+
+    if(mbm_type){
+        int16_t* bS=mb->bS[dir][0];
+        /* Filter edge */
+        // Do not use s->qscale as luma quantizer because it has not the same
+        // value in IPCM macroblocks.
+        if(bS[0]+bS[1]+bS[2]+bS[3]){
+            int qp = ( qp_xy + qp_dir + 1 ) >> 1;
+            // Cb and Cr each get their own averaged chroma QP (tables 0 and 1), matching the inner edges below.
+            int qp_cb = ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp(s, 0, qp_dir) + 1 ) >> 1;
+            int qp_cr = ( get_chroma_qp(s, 1, qp_xy) + get_chroma_qp(s, 1, qp_dir) + 1 ) >> 1;
+            if( dir == 0 ) {
+                filter_mb_edgev(h, &img_y[0], linesize, bS, qp);
+                filter_mb_edgecv(h, &img_cb[0], uvlinesize, bS, qp_cb);
+                filter_mb_edgecv(h, &img_cr[0], uvlinesize, bS, qp_cr);
+            } else {
+                filter_mb_edgeh(h, &img_y[0], linesize, bS, qp);
+                filter_mb_edgech(h, &img_cb[0], uvlinesize, bS, qp_cb);
+                filter_mb_edgech(h, &img_cr[0], uvlinesize, bS, qp_cr);
+            }
+        }
+    }
+
+    for( edge = 1; edge < edges; edge++ ) {
+        int16_t* bS=mb->bS[dir][edge];
+        int qp = qp_xy;
+
+        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
+            continue;
+
+        /* Filter edge */
+        // Do not use s->qscale as luma quantizer because it has not the same
+        // value in IPCM macroblocks.
+        if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
+            continue;
+
+        if( dir == 0 ) {
+            filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
+            if( (edge&1) == 0 ) {
+                filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) );
+                filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) );
+            }
+        } else {
+            filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
+            if( (edge&1) == 0 ) {
+                filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) );
+                filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) );
+            }
+        }
+    }
+}
+
+void filter_mb( H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
+    int dir; /* direction 0 is filtered first, then direction 1 */
+    for (dir = 0; dir < 2; dir++) { filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, dir); }
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_deblock_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_deblock_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,80 @@
+#ifndef H264_FILTER_SPU_H
+#define H264_FILTER_SPU_H
+
+#include "types_spu.h"
+#include "h264_decode_mb_spu.h"
+
+#define FFABS(a)           ((a) >= 0 ? (a) : (-(a))) /* absolute value; a is evaluated twice, so pass no side effects */
+
+void filter_mb(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
+
+/* Deblocking filter (p153): alpha threshold, laid out as 52 zeros, the 52 table values, then 52 copies of 255; the edge filters index it with qp + slice_alpha_c0_offset (offset presumably pre-biased by 52 - confirm). */
+static const uint8_t alpha_table[52*3] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+    80, 90,101,113,127,144,162,182,203,226,
+   255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+};
+
+static const uint8_t beta_table[52*3] = { /* beta threshold: 52 zeros, the 52 table values, then 52 copies of 18; indexed with qp + slice_beta_offset */
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+    18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+};
+
+static const uint8_t tc0_table[52*3][4] = { /* tc0 per [alpha index][bS]; NOTE(review): the -1 in column 0 is stored as 255 in uint8_t and only becomes -1 again when the edge filters copy it into int8_t tc[] */
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
+    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
+    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
+    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
+    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
+    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
+    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+};
+
+static inline int get_chroma_qp(H264slice *s, int t, int qscale){
+    return s->chroma_qp_table[t][qscale]; /* chroma QP for luma QP qscale from the slice's table t (the deblocker passes 0 or 1) */
+}
+
+#endif 
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_decode_mb_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_decode_mb_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2009 TUDelft 
+ * 
+ * Cell Parallel SPU - 2DWave Macroblock Decoding.
+ */
+
+/**
+ * @file libavcodec/cell/h264_decode_mb_spu.c
+ * Cell Parallel SPU - 2DWave Macroblock Decoding
+ * @author C C Chi <c.c.chi@student.tudelft.nl>
+ * 
+ * SIMD kernels 
+ * H.264/AVC motion compensation
+ * @author Mauricio Alvarez <alvarez@ac.upc.edu>
+ * @author Albert Paradis <apar7632@hotmail.com>
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <spu_intrinsics.h>
+//#include "dsputil_cell.h"
+#include "types_spu.h"
+#include "h264_tables.h"
+#include "h264_dma.h"
+#include "h264_mc_spu.h"
+#include "h264_intra_spu.h"
+#include "h264_decode_mb_spu.h"
+#include "h264_deblock_spu.h"
+
+//border buffers
+DECLARE_ALIGNED_16(TopBorder, top_ls[240]); // one entry per macroblock column, indexed by mb_x
+LeftBorder left_ls; // unfiltered samples left of the current MB, written by backup_mb_border()
+
+//mb line buffer - statically allocated for up to 1920 width video
+DECLARE_ALIGNED_16(uint8_t, dest_y_ls[2*16*20]); // MB is decoded at offset 16 + 4*stride_y; presumably stride 32, i.e. 2 MBs wide and 4+16 rows - TODO confirm
+DECLARE_ALIGNED_16(uint8_t, dest_cb_ls[2*8*10]);
+DECLARE_ALIGNED_16(uint8_t, dest_cr_ls[2*8*10]);
+
+//dma transfer buffer: rows of 4 MB slots (64 luma / 32 chroma bytes); 32 (16) edge rows plus up to 20 (10) picture rows
+DECLARE_ALIGNED_16(uint8_t, dma_y_ls [64*(32+20)]); //EDGE_WIDTH = 32
+DECLARE_ALIGNED_16(uint8_t, dma_cb_ls[32*(16+10)]);
+DECLARE_ALIGNED_16(uint8_t, dma_cr_ls[32*(16+10)]);
+//right-edge overflow buffer: 32 (16 chroma) bytes per row, sent by send_pic_data() for the last MB column when pos>1
+DECLARE_ALIGNED_16(uint8_t, extra_edge_y [32*(32+20)]); //EDGE_WIDTH = 32
+DECLARE_ALIGNED_16(uint8_t, extra_edge_cr[16*(16+10)]);
+DECLARE_ALIGNED_16(uint8_t, extra_edge_cb[16*(16+10)]);
+
+
+// For intra mode: save the unfiltered right column and bottom row of the current MB into left_ls / top_ls[mb_x].
+/// for now do the extra copy before dma, but it's better to skip this and do the dma right away
+static void backup_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
+	H264Mb* mb= h->mb;
+	// h is only used to reach the current macroblock (for its mb_x)
+    int i;
+	uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y;
+	uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb;
+	uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr;
+	
+	uint8_t* left_border_y = left_ls.unfiltered_y;
+	uint8_t* left_border_cb = left_ls.unfiltered_cb;
+	uint8_t* left_border_cr = left_ls.unfiltered_cr;
+	// step back one row so that MB row r is addressed as index r+1, matching the left border layout
+    src_y  -=   linesize;
+    src_cb -= uvlinesize;
+    src_cr -= uvlinesize;
+
+    // Entry 0 of the left border is the corner sample: the last sample of the row still
+    // stored in top_ls[mb_x]. It has to be read before that row is overwritten below.
+    left_border_y[0] = top_border_y[15];
+    for(i=1; i<17; i++){
+        left_border_y[i] = src_y[15+i*  linesize]; // rightmost luma column, MB rows 0..15
+    }
+
+   *(qword*)(top_border_y)= *(qword*)(src_y +  16*linesize); // bottom luma row (16 bytes) copied as one quadword
+
+    left_border_cb[0] = top_border_cb[7];
+    left_border_cr[0] = top_border_cr[7];
+    for(i=1; i<9; i++){
+        left_border_cb[i] = src_cb[7+i*uvlinesize]; // rightmost chroma column, rows 0..7
+        left_border_cr[i] = src_cr[7+i*uvlinesize];
+    }
+    *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); // bottom chroma row (8 bytes)
+    *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize);
+}
+
+static void xchg_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
+	H264Mb* mb= h->mb;
+	H264slice* s = h->s;
+	// Exchanges the samples left of / above the MB in the line buffer with the saved unfiltered borders (left_ls, top_ls).
+	int temp8, i;
+	uint64_t temp64;
+	int deblock_left;
+	int deblock_top;
+	// borders of this MB column, plus the next column's border for the samples above-right
+	uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y;	
+	uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb;
+	uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr;
+	uint8_t* top_border_y_next = top_ls[mb->mb_x +1].unfiltered_y;
+	
+	uint8_t* left_border_y = left_ls.unfiltered_y;
+	uint8_t* left_border_cb = left_ls.unfiltered_cb;
+	uint8_t* left_border_cr = left_ls.unfiltered_cr;
+	
+	deblock_left = (mb->mb_x > 0); // set when the MB is not in the first column
+	deblock_top =  (mb->mb_y > 0); // set when the MB is not in the first row
+	// move to the sample diagonally above-left of the MB (row -1, column -1)
+	src_y  -= (  linesize + 1);
+	src_cb -= (uvlinesize + 1);
+	src_cr -= (uvlinesize + 1);
+	// XCHG: b always receives the old a; a receives the old b only when xchg is non-zero. Not #undef'd after use.
+	#define XCHG(a,b,t,xchg)\
+	t= a;\
+	if(xchg)\
+		a= b;\
+	b= t;
+	
+	if(deblock_left){
+		for(i = !deblock_top; i<16; i++){ // entry 0 is the above-left corner; skipped in the first MB row
+			XCHG(left_border_y[i], src_y [i*  linesize], temp8, xchg);
+		}
+		XCHG(left_border_y[i], src_y [i*  linesize], temp8, 1); // i==16 here: the last entry is always swapped both ways
+		
+		for(i = !deblock_top; i<8; i++){
+			XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg);
+			XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg);
+		}
+		XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1); // i==8 here
+		XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1);
+	}
+	
+	if(deblock_top){
+		XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg); // row above the MB, columns 0..7
+		XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1); // columns 8..15
+		if(mb->mb_x+1 < s->mb_width){
+			XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1); // 8 samples above-right, from the next column's border
+		}
+		XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1);
+		XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1);
+	}
+}
+
+void copy_top_borders(int mb_x, uint8_t *dst_y, uint8_t *dst_cb, uint8_t *dst_cr, int stride_y, int stride_c){			
+	qword *qsrc_y = (qword *) (top_ls[mb_x].top_borders_y);
+	dst_y-= 4*stride_y;
+	// Copies the border rows stored in top_ls[mb_x] into the rows directly above dst_*: 4 luma rows of 16 bytes ...
+	*((qword *) (dst_y + 0*stride_y)) = *qsrc_y++;
+	*((qword *) (dst_y + 1*stride_y)) = *qsrc_y++;
+	*((qword *) (dst_y + 2*stride_y)) = *qsrc_y++;
+	*((qword *) (dst_y + 3*stride_y)) = *qsrc_y++;
+
+	dst_cb-=2*stride_c;	// ... and 2 rows of 8 bytes for each chroma plane
+	uint64_t *dsrc_cb = (uint64_t *) (top_ls[mb_x].top_borders_cb);
+	*((uint64_t *) (dst_cb + 0*stride_c)) = *dsrc_cb++; 
+	*((uint64_t *) (dst_cb + 1*stride_c)) = *dsrc_cb++;
+
+	dst_cr-=2*stride_c;	
+	uint64_t *dsrc_cr = (uint64_t *) (top_ls[mb_x].top_borders_cr);
+	*((uint64_t *) (dst_cr + 0*stride_c)) = *dsrc_cr++;
+	*((uint64_t *) (dst_cr + 1*stride_c)) = *dsrc_cr++;
+}
+
+static void send_top_borders(H264Context_spu *h, int mb_x, uint8_t* dest_y, uint8_t* dest_cb, uint8_t* dest_cr, int stride_y, int stride_c){
+	H264spe *spe= &h->spe;
+	//fill borders (unfiltered borders already filled in backup_mb_border), then DMA the whole top_ls[mb_x] entry to the target SPE
+	dest_y+= 12*stride_y; // last 4 luma rows of the MB (rows 12..15)
+	qword *qtop_y = (qword *) top_ls[mb_x].top_borders_y;	
+	for(int i=0; i<4; i++){
+		qword *qdest_y = (qword *) dest_y;
+		*qtop_y++ = *qdest_y;		
+		dest_y+=stride_y;
+	}
+	dest_cb+= 6*stride_c; // last 2 chroma rows of the MB (rows 6..7)
+	dest_cr+= 6*stride_c;
+	uint64_t *dtop_cb = (uint64_t *) top_ls[mb_x].top_borders_cb;
+	uint64_t *dtop_cr = (uint64_t *) top_ls[mb_x].top_borders_cr;
+	for(int i=0; i<2; i++){
+		uint64_t *ddest_cb = (uint64_t *) dest_cb;
+		uint64_t *ddest_cr = (uint64_t *) dest_cr;
+		
+		*dtop_cb++  = *ddest_cb;
+		*dtop_cr++  = *ddest_cr;
+		
+		dest_cb+=stride_c;
+		dest_cr+=stride_c;
+	}
+	uint8_t* top_border_tgt = spe->tgt_spe + (unsigned) &top_ls[mb_x]; // same local address, offset by tgt_spe (presumably the target SPE's local-store base - TODO confirm)
+	spu_dma_put(&top_ls[mb_x], (unsigned) top_border_tgt, sizeof(TopBorder), MBD_put);
+}
+
+static void extend_edges_left(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c){ // fill the left edge of each dma row
+	for (int i=0; i<lines; i++){
+		memset(dma_y, dma_y[32], 32); // bytes 0..31 of the row take the value of byte 32
+		dma_y+=64; // luma rows are 64 bytes apart
+	}
+	// chroma: 16-byte edge filled from byte 16, rows 32 bytes apart
+	for (int i=0; i<lines_c; i++){
+		memset(dma_cb, dma_cb[16], 16);
+		memset(dma_cr, dma_cr[16], 16);
+		dma_cb+=32; dma_cr+=32;
+	}
+}
+
+static void extend_edges_right(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c, int slots){
+	// Fills `slots` MB-wide slots starting at dma_* in each row with the sample just before dma_* in that row.
+	for (int i=0; i<lines; i++){
+		memset(dma_y, dma_y[-1], slots*16); // 16 luma bytes per slot
+		dma_y+=64;
+	}
+	// chroma: 8 bytes per slot, rows 32 bytes apart
+	for (int i=0; i<lines_c; i++){
+		memset(dma_cb, dma_cb[-1], slots*8);
+		memset(dma_cr, dma_cr[-1], slots*8);
+		dma_cb+=32; dma_cr+=32;
+	}
+}
+
+static void extend_edges_top(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr ){ // replicate one slot of the row at dma_* upwards
+	qword *qborder_y = (qword *) dma_y;
+	for (int i=1; i<=32; i++){ // 32 luma rows above, 64 bytes apart
+		qword *qdma_y = (qword *) (dma_y - i*64);
+		*qdma_y = *qborder_y; // 16 bytes per row
+	}
+	// chroma: 8 bytes per row into the 16 rows above, 32 bytes apart
+	uint64_t *dborder_cb = (uint64_t *) dma_cb;
+	uint64_t *dborder_cr = (uint64_t *) dma_cr;
+	for (int i=1; i<=16; i++){
+		uint64_t *ddma_cb = (uint64_t *) (dma_cb - i*32);
+		uint64_t *ddma_cr = (uint64_t *) (dma_cr - i*32);
+		*ddma_cb = *dborder_cb;
+		*ddma_cr = *dborder_cr;
+	}
+}
+
+static void extend_edges_bottom(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr){ // replicate one slot of the row at dma_* downwards
+	qword *qborder_y = (qword *) dma_y;
+	for (int i=1; i<=32; i++){ // 32 luma rows below, 64 bytes apart
+		qword *qdma_y = (qword *) (dma_y + i*64);
+		*qdma_y = *qborder_y; // 16 bytes per row
+	}
+	// chroma: 8 bytes per row into the 16 rows below, 32 bytes apart
+	uint64_t *dborder_cb = (uint64_t *) dma_cb;
+	uint64_t *dborder_cr = (uint64_t *) dma_cr;
+	for (int i=1; i<=16; i++){
+		uint64_t *ddma_cb = (uint64_t *) (dma_cb + i*32);
+		uint64_t *ddma_cr = (uint64_t *) (dma_cr + i*32);
+		*ddma_cb = *dborder_cb;
+		*ddma_cr = *dborder_cr;
+	}
+}
+
+static void extend_extra_edge_right(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr, uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr, int lines, int lines_c){
+	// Fills each row of the extra edge buffers with the sample just before dma_* in the matching dma row.
+	for (int i=0; i<lines; i++){
+		memset(extra_y, dma_y[-1], 32); // a full 32-byte extra luma row
+		dma_y+=64; extra_y+=32;
+	}
+	// chroma: 16-byte extra rows, dma rows 32 bytes apart
+	for (int i=0; i<lines_c; i++){
+		memset(extra_cb, dma_cb[-1], 16);
+		memset(extra_cr, dma_cr[-1], 16);
+		dma_cb+=32; dma_cr+=32;
+		extra_cb+=16; extra_cr+=16;
+	}
+}
+
+static void extend_extra_edge_top(uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr){
+	qword *qborder_y = (qword *) extra_y;
+	qword *qborder_y2 = (qword *) (extra_y+16);
+	// Replicates the extra-buffer row at extra_* upwards: a 32-byte luma row (two quadwords) into the 32 rows above.
+	for (int i=1; i<=32; i++){
+		qword *qextra_y = (qword *) (extra_y-i*32);
+		*qextra_y = *qborder_y;
+		*(qextra_y+1) = *qborder_y2;
+	}
+	// chroma: one 16-byte row into the 16 rows above
+	qword *qborder_cb = (qword *) extra_cb;
+	qword *qborder_cr = (qword *) extra_cr;
+	for (int i=1; i<=16; i++){
+		qword *qextra_cb = (qword *) (extra_cb - i*16);
+		qword *qextra_cr = (qword *) (extra_cr - i*16);
+		*qextra_cb = *qborder_cb;
+		*qextra_cr = *qborder_cr;
+	}
+}
+
+static void extend_extra_edge_bottom(uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr){
+	qword *qborder_y = (qword *) extra_y;
+	qword *qborder_y2 = (qword *) (extra_y+16);
+	// Replicates the extra-buffer row at extra_* downwards: a 32-byte luma row (two quadwords) into the 32 rows below.
+	for (int i=1; i<=32; i++){
+		qword *qextra_y = (qword *) (extra_y+i*32);
+		*qextra_y = *qborder_y;
+		*(qextra_y+1) = *qborder_y2;
+	}
+	// chroma: one 16-byte row into the 16 rows below
+	qword *qborder_cb = (qword *) extra_cb;
+	qword *qborder_cr = (qword *) extra_cr;
+	for (int i=1; i<=16; i++){
+		qword *qextra_cb = (qword *) (extra_cb + i*16);
+		qword *qextra_cr = (qword *) (extra_cr + i*16);
+		*qextra_cb = *qborder_cb;
+		*qextra_cr = *qborder_cr;
+	}
+}
+
+static void extend_edges(H264Context_spu *h, int mb_x, int mb_y){
+	H264slice *s = h->s;
+	// Pads the dma transfer buffers with replicated samples when MB (mb_x, mb_y) lies on a picture border.
+	uint8_t *dma_y; 
+	uint8_t *dma_cb; 
+	uint8_t *dma_cr;
+	// extra_* take the part of the right edge that does not fit behind the last slot of the dma buffer
+	uint8_t *extra_y  = extra_edge_y;
+	uint8_t *extra_cb = extra_edge_cb;
+	uint8_t *extra_cr = extra_edge_cr;
+	// slot of this MB in the 4-slot dma row (same formula as in copy_data_and_send); horizontal edges first
+	int pos = (mb_x+2) %4;
+	if (mb_x == 0){
+		if (mb_y ==0){
+			extend_edges_left(&dma_y_ls[32*64], &dma_cb_ls[16*32], &dma_cr_ls[16*32], 12, 6); // first MB row starts below the 32 (16) top edge rows
+		}else if (mb_y == s->mb_height -1){
+			extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 20, 10);
+		}else {
+			extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 16, 8);
+		}
+	}else if (mb_x == s->mb_width-1){
+		dma_y  = &dma_y_ls [(pos+1)*16]; // first slot to the right of the last MB
+		dma_cb = &dma_cb_ls[(pos+1)*8];
+		dma_cr = &dma_cr_ls[(pos+1)*8];
+		if (mb_y ==0){
+			dma_y   += 32*64;
+			dma_cb  += 16*32;
+			dma_cr  += 16*32;
+			extra_y = extra_edge_y  + 32*32;
+			extra_cb= extra_edge_cb + 16*16;
+			extra_cr= extra_edge_cr + 16*16;
+			// pos==2: one slot left in the dma row plus the extra buffer; pos==3: extra buffer only; otherwise two slots in the dma row
+			if (pos==2){
+				extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 1);
+				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6);
+			}else if (pos==3){
+				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6);
+			}else{
+				extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 2);
+			}
+		}else if (mb_y == s->mb_height -1){
+			if (pos==2){
+				extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 1);
+				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10);
+			}else if (pos==3){
+				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10);
+			}else{
+				extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 2);
+			}				
+		}else {
+			if (pos==2){
+				extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1);
+				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8);
+			}else if (pos==3){
+				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8);
+			}else{
+				extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 2); // two slots (32 bytes of edge), as for the first and last MB row
+			}
+		}
+	}
+	// vertical edges: replicate the first / last picture row of every slot written above
+	if (mb_y == 0){
+		dma_y  = &dma_y_ls [32*64];
+		dma_cb = &dma_cb_ls[16*32];
+		dma_cr = &dma_cr_ls[16*32];
+		extra_y = extra_edge_y  + 32*32;
+		extra_cb= extra_edge_cb + 16*16;
+		extra_cr= extra_edge_cr + 16*16;
+		// first picture row is row 32 (16 chroma) of the buffers
+		if (mb_x ==0){
+			extend_edges_top (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8);
+			extend_edges_top (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8);
+			extend_edges_top (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8);
+		}else if (mb_x == s->mb_width -1){
+			if (pos==2){
+				extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
+				extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
+				extend_extra_edge_top(extra_y, extra_cb, extra_cr);
+			}else if (pos == 3){
+				extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
+				extend_extra_edge_top(extra_y, extra_cb, extra_cr);
+			}else{
+				extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
+				extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
+				extend_edges_top (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8);
+			}			
+		}else {
+			extend_edges_top (dma_y + pos*16, dma_cb + pos*8, dma_cr + pos*8);
+		}
+	}else if (mb_y == s->mb_height -1){
+		dma_y  = &dma_y_ls [19*64];
+		dma_cb = &dma_cb_ls[9*32];
+		dma_cr = &dma_cr_ls[9*32];
+		extra_y = extra_edge_y  + 19*32;
+		extra_cb= extra_edge_cb + 9*16;
+		extra_cr= extra_edge_cr + 9*16;
+		// last picture row is row 19 (9 chroma) of the 20 (10) rows copied for the last MB row
+		if (mb_x ==0){
+			extend_edges_bottom (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8);
+			extend_edges_bottom (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8);
+			extend_edges_bottom (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8);
+		}else if (mb_x == s->mb_width -1){
+			if (pos==2){
+				extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
+				extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
+				extend_extra_edge_bottom(extra_y, extra_cb, extra_cr);
+			}else if (pos == 3){
+				extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
+				extend_extra_edge_bottom(extra_y, extra_cb, extra_cr);
+			}else{				
+				extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
+				extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
+				extend_edges_bottom (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8);
+			}
+		}else {
+			extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
+		}
+	}
+}
+
+static void send_pic_data(H264Context_spu *h, int mb_x, int mb_y, int pos, int stride_y, int stride_c){
+	H264slice *s = h->s;
+	int lines, lines_c;
+	int linesize = s->linesize;
+	int uvlinesize = s->uvlinesize;
+	// Queues DMA lists that write the dma buffers (and, if needed, the extra edge buffers) to the picture at s->dst_*.
+	uint8_t* dst_y  = s->dst_y + (mb_x-pos)*16 + (mb_y*16)*linesize; // picture address matching slot 0 of the dma row
+	uint8_t* dst_cb = s->dst_cb +(mb_x-pos)*8 + (mb_y*8)*uvlinesize;
+	uint8_t* dst_cr = s->dst_cr +(mb_x-pos)*8 + (mb_y*8)*uvlinesize;
+
+	if (mb_y == 0){
+		dst_y -= 32 *linesize; // first MB row: start at the top of the 32 (16 chroma) edge rows
+		dst_cb-= 16 *uvlinesize;
+		dst_cr-= 16 *uvlinesize;
+	}else {
+		dst_y -= 4 *linesize; // other rows: the buffer starts 4 (2 chroma) rows above the MB row
+		dst_cb-= 2 *uvlinesize;
+		dst_cr-= 2 *uvlinesize;
+	}
+	// row counts: picture rows held in the buffer plus the 32 (16) edge rows for the first and last MB row
+	if (mb_y == 0){
+		lines = 12+32; lines_c=6+16;
+	}else if (mb_y == s->mb_height-1){
+		lines = 20+32; lines_c=10+16;
+	}else{
+		lines = 16; lines_c=8;
+	}
+	// put_list / put_list_buf are declared outside this file; the list is restarted for every transfer
+	put_list = put_list_buf;
+	put_dma_list(dma_y_ls, dst_y, stride_y, lines, linesize, MBD_pic);
+	put_dma_list(dma_cb_ls, dst_cb, stride_c, lines_c, uvlinesize, MBD_pic);
+	put_dma_list(dma_cr_ls, dst_cr, stride_c, lines_c, uvlinesize, MBD_pic);
+
+	if (mb_x == s->mb_width-1 && pos>1){		
+		put_dma_list(extra_edge_y, dst_y+64, 32, lines, linesize, MBD_pic); // extra edge goes directly right of the 64-byte dma row
+		put_dma_list(extra_edge_cb, dst_cb+32, 16, lines_c, uvlinesize, MBD_pic);
+		put_dma_list(extra_edge_cr, dst_cr+32, 16, lines_c, uvlinesize, MBD_pic);
+   	}
+}
+
+void copy_data_and_send(H264Context_spu *h, int mb_x, int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
+	H264slice *s = h->s;
+	int lines, lines_c;
+	int pos = (mb_x+2)%4; //4 slots in our 64 byte wide transfer buffer. Offset 2 for edge emulation
+	uint8_t *dma_y = &dma_y_ls[pos*16];
+	uint8_t *dma_cb = &dma_cb_ls[pos*8];
+	uint8_t *dma_cr = &dma_cr_ls[pos*8];
+	// Copies the MB at dest_* into its slot of the dma buffers, pads picture borders and sends the buffers once a row of slots is complete.
+	if (mb_y == 0){
+		dma_y += 32*64; // first MB row: leave room for the 32 (16 chroma) top edge rows
+		dma_cb+= 16*32;
+		dma_cr+= 16*32;
+	}else{		
+		dest_y -= 4*stride_y; // other rows: also take the 4 (2 chroma) rows above the MB
+		dest_cb-= 2*stride_c;
+		dest_cr-= 2*stride_c;		
+	}
+	// the bottom 4 luma (2 chroma) rows of an MB are held back and copied together with the MB row below
+	if (mb_y == 0){
+		lines = 12; lines_c=6;
+	}else if (mb_y == s->mb_height-1){
+		lines = 20; lines_c=10;
+	}else{
+		lines = 16; lines_c=8;
+	}
+	// luma: one quadword (16 bytes) per row
+	for(int i=0; i<lines; i++){
+		qword *qdest_y = (qword *) dest_y;
+		qword *qdma_y  = (qword *) dma_y;
+		*qdma_y = *qdest_y;
+		dma_y +=64;
+		dest_y+=stride_y;
+	}
+	// chroma: 8 bytes per row and plane
+	for(int i=0; i<lines_c; i++){
+		uint64_t *ddest_cb  = (uint64_t *) dest_cb;
+		uint64_t *ddest_cr  = (uint64_t *) dest_cr;
+		uint64_t *ddma_cb   = (uint64_t *) dma_cb;
+		uint64_t *ddma_cr   = (uint64_t *) dma_cr;
+		*ddma_cb = *ddest_cb;
+		*ddma_cr = *ddest_cr;
+		dma_cb +=32;
+		dma_cr +=32;
+		dest_cb+=stride_c;
+		dest_cr+=stride_c;
+	}
+
+	extend_edges(h, mb_x, mb_y);
+
+	//send when dma buf is full
+	if (pos==3){
+		send_pic_data(h, mb_x, mb_y, pos, 64, 32);
+	} else if (mb_x == s->mb_width-1){ // last MB of the row: send even though the buffer is only partly filled
+		send_pic_data(h, mb_x, mb_y, pos, 64, 32);
+	}
+}
+
+static void shift_left(int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
+	int lines, lines_c;
+	if (mb_y > 0){
+		lines  =20; // include the 4 (2 chroma) rows above the MB
+		lines_c=10;
+		dest_y  -= 4*stride_y;
+		dest_cb -= 2*stride_c;
+		dest_cr -= 2*stride_c;
+	}else {
+		lines  =16;
+		lines_c= 8;		
+	}		
+	// Copies the MB at dest_* one MB width to the left in the line buffer (16 luma / 8 chroma bytes per row).
+	for (int i=0; i<lines; i++){
+		qword *left_y  = (qword *) (dest_y -16);
+		qword *qdest_y = (qword *) dest_y;
+		*left_y = *qdest_y;
+		dest_y += stride_y;
+	}
+	// chroma planes, 8 bytes per row
+	for (int i=0; i<lines_c; i++){
+		uint64_t *left_cb  = (uint64_t *) (dest_cb -8);
+		uint64_t *left_cr  = (uint64_t *) (dest_cr -8);
+		uint64_t *ddest_cb = (uint64_t *) dest_cb;
+		uint64_t *ddest_cr = (uint64_t *) dest_cr;
+		*left_cb = *ddest_cb;
+		*left_cr = *ddest_cr;
+		dest_cb += stride_c;
+		dest_cr += stride_c;
+	}
+}
+
+void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c){
+	H264slice *s = h->s;
+	H264Mb *mb = h->mb;
+    const int mb_x= mb->mb_x;
+    const int mb_y= mb->mb_y;    
+    const int mb_type= mb->mb_type;
+	// Reconstructs macroblock h->mb in the local-store line buffers, then saves/sends its borders and queues its pixels for DMA.
+	uint8_t *dest_y, *dest_cb, *dest_cr;	//ls ptrs (abstracts the fact it is operating in a ls buffer)
+
+    int i;
+  // local IDCT pointers chosen per MB below; they have the same names as the pointers declared in h264_decode_mb_spu.h
+    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+	// the MB is decoded one MB width right of the buffer start, below 4 (2 chroma) rows kept for the top border
+	dest_y  = dest_y_ls + 16 + 4*stride_y;
+	dest_cb = dest_cb_ls + 8 + 2*stride_c;
+	dest_cr = dest_cr_ls + 8 + 2*stride_c;
+	
+	if(IS_8x8DCT(mb_type)){
+		idct_dc_add = ff_idct8_dc_add;
+		idct_add = h->dsp.h264_idct_add[0];
+	}
+	else{
+		idct_dc_add = ff_idct_dc_add;
+		idct_add = h->dsp.h264_idct_add[1];
+	}
+	// rows above the MB come from top_ls; nothing is copied for the first MB row
+	if (mb_y>0){
+		copy_top_borders(mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c);
+	}
+
+	if(IS_INTRA(mb_type)){
+		xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 1); // put the saved unfiltered neighbours in place for prediction
+
+		h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cb, stride_c);
+		h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cr, stride_c);
+
+		if(IS_INTRA4x4(mb_type)){
+			if(IS_8x8DCT(mb_type)){
+				// four 8x8 blocks: block index advances by 4
+				for(i=0; i<16; i+=4){
+					uint8_t * const ptr= dest_y + block_offset[i];
+					const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ];
+					const int nnz = mb->non_zero_count_cache[ scan8[i] ];
+					h->hpc.pred8x8l[ dir ](ptr, (mb->topleft_samples_available<<i)&0x8000,
+												(mb->topright_samples_available<<i)&0x4000, stride_y);
+
+					if(nnz){
+						if(nnz == 1 && mb->mb[i*16])
+							idct_dc_add(ptr, mb->mb + i*16, stride_y);
+						else{
+							idct_add   (ptr, mb->mb + i*16, stride_y);
+						}
+					}
+				}
+			}else{
+				for(i=0; i<16; i++){
+					uint8_t * const ptr= dest_y + block_offset[i];
+					const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ];
+
+					uint8_t *topright;
+					int nnz, tr;
+					if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
+						const int topright_avail= (mb->topright_samples_available<<i)&0x8000;
+						if(!topright_avail){
+							tr= ptr[3 - stride_y]*0x01010101; // no top-right samples: repeat the last sample of the row above 4 times
+							topright= (uint8_t*) &tr;
+						}else
+							topright= ptr + 4 - stride_y;
+					}else
+						topright= NULL;
+
+					h->hpc.pred4x4[ dir ](ptr, topright, stride_y);
+					nnz = mb->non_zero_count_cache[ scan8[i] ];
+					if(nnz){
+						if(nnz == 1 && mb->mb[i*16])
+							idct_dc_add(ptr, mb->mb + i*16, stride_y);
+						else
+							idct_add   (ptr, mb->mb + i*16, stride_y);
+					}
+				}
+			}
+
+		}else{
+			h->hpc.pred16x16[ mb->intra16x16_pred_mode ](dest_y , stride_y);
+			h264_luma_dc_dequant_idct_c(mb->mb, mb->dequant4_coeff_y);
+		}
+		xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 0); // second pass with xchg=0, after prediction
+
+	}else {
+		hl_motion(h, dest_y, dest_cb, dest_cr, stride_y, stride_c);
+	}
+	// luma residual for everything except intra4x4, whose residual was added block by block above
+	if(!IS_INTRA4x4(mb_type)){
+		if(IS_INTRA16x16(mb_type)){
+			for(i=0; i<16; i++){
+				if(mb->non_zero_count_cache[ scan8[i] ])
+					idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
+				else if(mb->mb[i*16])
+					idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
+			}
+		}else if(mb->cbp&15){
+			const int incr = IS_8x8DCT(mb_type) ? 4 : 1;
+			for(i=0; i<16; i+=incr){
+				int nnz = mb->non_zero_count_cache[ scan8[i] ];
+				if(nnz){
+					if(nnz==1 && mb->mb[i*16])
+						idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
+					else
+						idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
+				}
+			}
+		}
+	}
+	// chroma residual, done when either of cbp bits 4-5 is set
+	if(mb->cbp&0x30){
+		uint8_t *dest[2] = {dest_cb, dest_cr};
+		chroma_dc_dequant_idct_c(mb->mb + 16*16, mb->dequant4_coeff_cb);
+		chroma_dc_dequant_idct_c(mb->mb + 16*16+4*16, mb->dequant4_coeff_cr);
+
+		idct_add = h->dsp.h264_idct_add[1];
+		idct_dc_add = ff_idct_dc_add;
+		for(i=16; i<16+8; i++){ // blocks 16..19 go to dest[0] (cb), 20..23 to dest[1] (cr)
+			if(mb->non_zero_count_cache[ scan8[i] ])
+				idct_add   (dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c);
+			else if(mb->mb[i*16])
+				idct_dc_add(dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c);
+		}
+	}
+
+	// save unfiltered borders
+	backup_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c);
+	if (mb->deblock_mb){
+		filter_mb( h, dest_y, dest_cb, dest_cr, stride_y, stride_c);
+	}
+	// top borders are sent for the previous MB (one slot to the left), and also for this MB when it is the last of the row
+	if (mb_y < s->mb_height-1){
+		if(mb_x>0){
+			send_top_borders(h, mb_x-1, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c);
+		}
+		if (mb_x == s->mb_width-1){
+			send_top_borders(h, mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c);
+		}
+	}
+	update_tgt_spe_dep(h, 0);
+	// blocking mode waits for the picture DMA right after issuing it; otherwise the wait happens before the dma buffers are written again
+	if (h->blocking){
+		if (mb_x>0){			
+			copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c);
+			wait_dma_id(MBD_pic);
+		}
+		if (mb_x == s->mb_width-1){			
+			copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c);
+			wait_dma_id(MBD_pic);
+		}
+		
+	}else{
+		if (mb_x>0){
+			wait_dma_id(MBD_pic);
+			copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c);
+		}
+		if (mb_x == s->mb_width-1){
+			wait_dma_id(MBD_pic);
+			copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c);
+		}
+	}
+	// NOTE(review): mb_x < s->mb_width looks always true for a valid mb_x - confirm whether mb_width-1 was intended
+	if (mb_x < s->mb_width)
+		shift_left(mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c);
+	
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_decode_mb_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_decode_mb_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2009 TUDelft 
+ * 
+ * Cell Parallel SPU - 2DWave Macroblock Decoding. 
+ */
+
+/**
+ * @file libavcodec/cell/h264_decode_mb_spu.h
+ * Cell Parallel SPU - 2DWave Macroblock Decoding
+ * @author C C Chi <c.c.chi@student.tudelft.nl>
+ * 
+ * SIMD kernels 
+ * H.264/AVC motion compensation
+ * @author Mauricio Alvarez <alvarez@ac.upc.edu>
+ * @author Albert Paradis <apar7632@hotmail.com>
+ */ 
+
+#ifndef H264_DECODE_MB_SPU_H
+#define H264_DECODE_MB_SPU_H
+
+#define CELL_SPE
+#include "libavcodec/avcodec.h"
+#include "types_spu.h"
+#include "h264_types_spu.h"
+#include "h264_mc_spu.h"
+#include "h264_dma.h"
+#include "dsputil_spu.h"
+#include "h264_intra_spu.h"
+
+/**
+ * H264Context_spu: SPU-side decoder context (parameter buffers, function tables and buffer indices).
+ */
+typedef struct H264Context_spu{
+	DECLARE_ALIGNED_16(H264spe, spe);		// contains simple type parameters that do not change
+    DECLARE_ALIGNED_16(H264Mb, mb_buf[3]);			// contains simple type parameters that change per macroblock
+    DECLARE_ALIGNED_16(H264slice, slice_buf[2]);	// contains simple type parameters that change per slice
+	
+	DSPContext_spu dsp;  // struct that contains pointers to mc interpolation functions
+	H264PredContext_spu hpc;  // struct that contains pointers to intra prediction functions
+
+	H264slice *s;	// slice parameters in use (presumably one of slice_buf[] - TODO confirm)
+	int sl_idx;
+	int frames;
+	//mc arg buffer
+	H264mc mc_buf[2];
+	H264mc *mc;		//mc ptr to current decoded mb
+	int mc_idx;
+	int n_mc;		//next mb_id to mc
+	int mb_proc;
+	int mb_total;
+	int curr_line;
+	
+	H264Mb* mb;		//mb ptr to current decoded mb
+	int mb_id;		//next mb_id to dma
+	int mb_dec; 	//mb_buf index - decoded mb
+	int mb_mc;		//mb_buf index - prebuffer motion data
+	int mb_dma;		//mb_buf index - target for dma mb data
+	int next_mb_idx;
+/*// for deblocking filter
+    int edges[2];
+    int start[2]; 
+    int bS[2][4][4];				// dir, edge, bS;
+    int qp[2][4];					// dir, edge;
+    int chroma_qp[2][2][4];			// cb/cr, dir, edge;	
+*/
+	int blocking; 	// non-zero: hl_decode_mb_internal() waits for each picture DMA right after issuing it
+}H264Context_spu;
+
+void print_output(H264Context_spu* h, const char* msg);
+void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c); // decodes macroblock h->mb; strides are those of the local line buffers
+void update_tgt_spe_dep(H264Context_spu *h, int end);
+
+// IDCT functions. NOTE(review): the two pointers below are objects defined in every file that includes this header; hl_decode_mb_internal() uses locals of the same name.
+void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+
+void ff_idct_dc_add(uint8_t *dst, DCTELEM *block, int stride);
+void ff_idct8_dc_add(uint8_t *dst, DCTELEM *block, int stride);
+
+void ff_cropTbl_init(void); // takes no arguments; declared as a prototype so that calls are checked
+void add_pixels8_c(uint8_t *pixels, DCTELEM *block, int line_size);
+void add_pixels4_c(uint8_t *pixels, DCTELEM *block, int line_size);
+void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul);
+void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul);
+// Filter functions
+//void calculate_bS_qp(H264Context_spu *h);
+
+// Motion compensation function
+void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc);
+void calc_mc_params(H264Mb *mb, H264mc *mc);
+void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c);
+
+
+// Function to get traces
+void trace_event_SPU(int event, int id);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_direct_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_direct_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,332 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 direct mb/block decoding.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+#define CELL_SPE
+#include "libavcodec/avcodec.h"
+#include "dsputil_spu.h"
+#include "h264_tables.h"
+#include "h264_types_spu.h"
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "mathops_spu.h"
+#include "rectangle_spu.h"
+
+//#undef NDEBUG
+#include <assert.h>
+static void pred_spatial_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ /* Spatial direct prediction: fills m->ref_cache / m->mv_cache for the direct partitions and updates *mb_type (and m->sub_mb_type). */
+    H264Mb *m = s->m;
+    int b4_stride = hc->b_stride; /* only referenced by the commented-out indexing further down */
+	const int mb_x = m->mb_x;    
+    int mb_type_col[2];
+    const int16_t (*l1mv0)[2], (*l1mv1)[2];
+    const int8_t *l1ref0, *l1ref1;
+    const int is_b8x8 = IS_8X8(*mb_type);
+    unsigned int sub_mb_type= MB_TYPE_L0L1;
+    int i8, i4;
+    int ref[2];
+    int mv[2]; /* one packed x/y pair per list (pack16to32 / AV_RN32A below) */
+    int list;
+
+    //assert(h->ref_list[1][0].reference&3);
+
+#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
+
+    /* ref = min(neighbors); the (unsigned) casts make negative refs compare as large values, so the result is negative only if all three are */
+    for(list=0; list<2; list++){
+        int left_ref = m->ref_cache[list][scan8[0] - 1];
+        int top_ref  = m->ref_cache[list][scan8[0] - 8];
+        int refc = m->ref_cache[list][scan8[0] - 8 + 4];
+        const int16_t *C= m->mv_cache[list][ scan8[0] - 8 + 4];
+        if(refc == PART_NOT_AVAILABLE){ /* fall back to the cache entry at scan8[0] - 8 - 1 */
+            refc = m->ref_cache[list][scan8[0] - 8 - 1];
+            C    = m-> mv_cache[list][scan8[0] - 8 - 1];
+        }
+        ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc);
+        if(ref[list] >= 0){
+            //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
+            const int16_t * const A= m->mv_cache[list][ scan8[0] - 1 ];
+            const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ];
+
+            int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
+            if(match_count > 1){ //most common
+                mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]),
+                                     mid_pred(A[1], B[1], C[1]) );
+            }else {
+                assert(match_count==1);
+                if(left_ref==ref[list]){
+                    mv[list]= AV_RN32A(A);
+                }else if(top_ref==ref[list]){
+                    mv[list]= AV_RN32A(B);
+                }else{
+                    mv[list]= AV_RN32A(C);
+                }
+            }
+        }else{
+            int mask= ~(MB_TYPE_L0 << (2*list)); /* clears this list's bit from the mb / sub-mb types */
+            mv[list] = 0;
+            ref[list] = -1;
+            if(!is_b8x8)
+                *mb_type &= mask;
+            sub_mb_type &= mask;
+        }
+    }
+    /* neither list produced a non-negative ref: use ref 0 in both lists and mark both lists as used */
+    if(ref[0] < 0 && ref[1] < 0){
+        ref[0] = ref[1] = 0;
+        if(!is_b8x8)
+            *mb_type |= MB_TYPE_L0L1;
+        sub_mb_type |= MB_TYPE_L0L1;
+    }
+    /* not 8x8 and both packed mvs are zero: fill the whole 4x4 cache area with the refs and zero mvs, then return */
+    if(!(is_b8x8|mv[0]|mv[1])){
+        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
+        fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
+        fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
+        fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+        *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
+        return;
+    }
+    /* both mb_type_col entries are read from hc->list1_mb_type[mb_x] */
+    mb_type_col[0] =
+    mb_type_col[1] = hc->list1_mb_type[mb_x];
+
+    sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
+        *mb_type   |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */
+    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
+        *mb_type   |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
+    }else{
+        if(!s->direct_8x8_inference_flag){
+            /* FIXME save sub mb types from previous frames (or derive from MVs)
+            * so we know exactly what block size to use */
+            sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */
+        }
+        *mb_type   |= MB_TYPE_8x8;
+    }
+    /* unlike the commented-out form, the mv pointers are not offset by mb_x; mv rows are indexed with a fixed stride of 4 below */
+//     l1mv0  = (void *) &hc->list1_motion_val[0][4*mb_x];
+//     l1mv1  = (void *) &hc->list1_motion_val[1][4*mb_x];
+	l1mv0  = (void *) hc->list1_motion_val[0];
+    l1mv1  = (void *) hc->list1_motion_val[1];
+    l1ref0 = &hc->list1_ref_index [0][4*mb_x];
+    l1ref1 = &hc->list1_ref_index [1][4*mb_x];
+//     if(!b8_stride){
+//         if(m->mb_y&1){
+//             l1ref0 += 2;
+//             l1ref1 += 2;
+//             l1mv0  +=  2*b4_stride;
+//             l1mv1  +=  2*b4_stride;
+//         }
+//     }
+
+    if(IS_16X16(*mb_type)){
+        int a,b;
+
+        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
+        fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
+        if(!IS_INTRA(mb_type_col[0]) && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
+            || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
+            ))){
+            a=b=0; /* a list keeps its predicted mv only when its ref is > 0 */
+            if(ref[0] > 0)
+                a= mv[0];
+            if(ref[1] > 0)
+                b= mv[1];
+        }else{
+            a= mv[0];
+            b= mv[1];
+        }
+        fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
+        fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
+    }else{
+        int n=0; /* counts 4x4 blocks whose list1 mv components are all within +-1 */
+        for(i8=0; i8<4; i8++){
+            const int x8 = i8&1;
+            const int y8 = i8>>1;
+
+            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
+                continue;
+            m->sub_mb_type[i8] = sub_mb_type;
+
+            fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4);
+            fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4);
+            fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
+            fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
+
+            /* col_zero_flag */
+            if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 ))
+                ){
+                const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1;
+                if(IS_SUB_8X8(sub_mb_type)){
+//                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
+					const int16_t *mv_col = l1mv[x8*3 + y8*3*4];
+                    if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
+                        if(ref[0] == 0)
+                            fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
+                        if(ref[1] == 0)
+                            fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
+                        n+=4;
+                    }
+                }else{
+                    int k=0;
+                    for(i4=0; i4<4; i4++){
+                        //const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
+						const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4];
+                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
+                            if(ref[0] == 0)
+                                AV_ZERO32(m->mv_cache[0][scan8[i8*4+i4]]);
+                            if(ref[1] == 0)
+                                AV_ZERO32(m->mv_cache[1][scan8[i8*4+i4]]);
+                            k++;
+                        }
+                    }
+                    if(!(k&3)) /* k is 0 or 4: all four 4x4 blocks of this 8x8 were treated alike */
+                        m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8;
+                    n+=k;
+                }
+            }
+        }
+        if(!is_b8x8 && !(n&15)){ /* n is 0 or 16 */
+            *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
+        }
+    }
+}
+
+static void pred_temp_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ /* Temporal direct prediction: list-0 mvs are the list1 mvs scaled by s->dist_scale_factor, list-1 mvs are the difference, ref_cache[1] is set to 0. */
+    H264Mb *m = s->m;
+	const int mb_x = m->mb_x;
+    int b4_stride = hc->b_stride;    
+    int mb_type_col[2];
+    const int16_t (*l1mv0)[2], (*l1mv1)[2];
+    const int8_t *l1ref0, *l1ref1;
+    const int is_b8x8 = IS_8X8(*mb_type);
+    unsigned int sub_mb_type;
+    int i8, i4;
+    const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]};
+    const int *dist_scale_factor = s->dist_scale_factor;
+    /* both mb_type_col entries are read from hc->list1_mb_type[mb_x] */
+    mb_type_col[0] =
+    mb_type_col[1] = hc->list1_mb_type[mb_x];
+
+    sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
+        *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
+    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
+        *mb_type   |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
+    }else{
+        if(!s->direct_8x8_inference_flag){
+            /* FIXME save sub mb types from previous frames (or derive from MVs)
+            * so we know exactly what block size to use */
+            sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
+        }
+        *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
+    }
+    /* unlike the commented-out form, the mv pointers are not offset by mb_x; mv rows are indexed with a fixed stride of 4 below */
+//     l1mv0  = (void *) &hc->list1_motion_val[0][4*mb_x];
+//     l1mv1  = (void *) &hc->list1_motion_val[1][4*mb_x];
+	l1mv0  = (void *) hc->list1_motion_val[0];
+    l1mv1  = (void *) hc->list1_motion_val[1];
+    l1ref0 = &hc->list1_ref_index [0][4*mb_x];
+    l1ref1 = &hc->list1_ref_index [1][4*mb_x];
+
+    /* one-to-one mv scaling */
+    if(IS_16X16(*mb_type)){
+        int ref, mv0, mv1;
+
+        fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
+        if(IS_INTRA(mb_type_col[0])){
+            ref=mv0=mv1=0;
+        }else{
+            const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
+            : map_col_to_list0[1][l1ref1[0]];
+            const int scale = dist_scale_factor[ref0];
+            const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
+            int mv_l0[2];
+            mv_l0[0] = (scale * mv_col[0] + 128) >> 8; /* scale with rounding, then drop 8 fractional bits */
+            mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
+            ref= ref0;
+            mv0= pack16to32(mv_l0[0],mv_l0[1]);
+            mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
+        }
+        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
+        fill_rectangle(&m-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
+        fill_rectangle(&m-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
+    }else{
+        for(i8=0; i8<4; i8++){
+            const int x8 = i8&1;
+            const int y8 = i8>>1;
+            int ref0, scale;
+            const int16_t (*l1mv)[2]= l1mv0;
+
+            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
+                continue;
+            m->sub_mb_type[i8] = sub_mb_type;
+            fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
+            if(IS_INTRA(mb_type_col[0])){
+                fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
+                fill_rectangle(&m-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
+                fill_rectangle(&m-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
+                continue;
+            }
+            /* l1ref0[i8] >= 0 maps through map_col_to_list0[0]; otherwise l1ref1[i8] maps through map_col_to_list0[1] and the mvs come from l1mv1 */
+            ref0 = l1ref0[i8];
+            if(ref0 >= 0)
+                ref0 = map_col_to_list0[0][ref0 ];
+            else{
+                ref0 = map_col_to_list0[1][l1ref1[i8]];
+                l1mv= l1mv1;
+            }
+            scale = dist_scale_factor[ref0];
+
+            fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
+            if(IS_SUB_8X8(sub_mb_type)){
+//                 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
+				const int16_t *mv_col = l1mv[x8*3 + y8*3*4];
+                int mx = (scale * mv_col[0] + 128) >> 8;
+                int my = (scale * mv_col[1] + 128) >> 8;
+                fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
+                fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
+            }else
+            for(i4=0; i4<4; i4++){
+//                 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
+				const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4];
+                int16_t *mv_l0 = m->mv_cache[0][scan8[i8*4+i4]];
+                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
+                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
+                AV_WN32A(m->mv_cache[1][scan8[i8*4+i4]],
+                    pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]));
+            }
+        }
+    }
+}
+
+/* Direct-mode dispatcher: spatial prediction when s->direct_spatial_mv_pred is set, temporal otherwise. */
+void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){
+    if(!s->direct_spatial_mv_pred)
+        pred_temp_direct_motion(hc, s, mb_type);
+    else
+        pred_spatial_direct_motion(hc, s, mb_type);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_direct_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_direct_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,8 @@
+#ifndef H264_DIRECT_H
+#define H264_DIRECT_H
+
+#include "h264_types_spu.h"
+
+void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_dma.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_dma.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,74 @@
+#include <spu_mfcio.h>
+#include "h264_dma.h"
+
+DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]);
+dma_list_elem_t* put_list;
+
+DECLARE_ALIGNED_16(dma_list_elem_t, get_list_buf[16*(4+5 + 2*3)]);
+dma_list_elem_t* get_list;
+
+inline void spu_dma_get(void *ls, unsigned ea, int size, int tag){	// thin wrapper: forwards to mfc_get with the last two arguments fixed to 0
+	mfc_get(ls, ea, size, tag, 0, 0);
+}
+
+inline void spu_dma_put(void *ls, unsigned ea, int size, int tag){	// thin wrapper: forwards to mfc_put with the last two arguments fixed to 0
+	mfc_put(ls, ea, size, tag, 0, 0);
+}
+
+inline void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag){	// like spu_dma_put but issues mfc_putb (the barrier form of put)
+	mfc_putb(ls, ea, size, tag, 0, 0);
+}
+
+// Waits for the DMA transfers issued with the given tag id to finish
+inline void wait_dma_id(int id){
+	spu_writech(MFC_WrTagMask, 1<< id);	// tag mask with only bit `id` set
+	(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);	// status value is deliberately discarded
+}
+
+// Fetch a block from main memory into dst: a DMA list of h elements, w bytes each, stride bytes apart
+void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier)
+{
+    dma_list_elem_t* const elems = get_list;
+    unsigned int list_bytes;
+    unsigned int row_ea;
+    unsigned int row;
+
+    /* Reserve h entries: the shared get_list cursor now points past them. */
+    get_list = elems + h;
+
+    /* One element per row: transfer size w, low address word stepping by stride. */
+    row_ea = (uint32_t) mfc_ea2l(ea);
+    for ( row=0; row<h; row++ ){
+        elems[row].size.all32 = w;
+        elems[row].ea_low = row_ea;
+        row_ea += stride;
+    }
+    /* The list length is passed in bytes; a non-zero barrier selects mfc_getlb. */
+    list_bytes = h*sizeof(dma_list_elem_t);
+    if (barrier)
+        mfc_getlb(dst, (unsigned)ea, elems, list_bytes, tag, 0, 0);
+    else
+        mfc_getl(dst, (unsigned)ea, elems, list_bytes, tag, 0, 0);
+}
+
+
+void put_dma_list(void *src, void* ea, unsigned int size, unsigned int h, unsigned int stride, unsigned int tag){
+    /* Store src to main memory through a DMA list of h elements, each `size` bytes,
+     * whose low address words start at ea and advance by stride. */
+    dma_list_elem_t* const elems = put_list;
+    unsigned int list_bytes;
+    unsigned int row_ea;
+    unsigned int row;
+
+    /* Reserve h entries: the shared put_list cursor now points past them. */
+    put_list = elems + h;
+    row_ea = (uint32_t) mfc_ea2l(ea);
+    for ( row=0; row<h; row++ ) {
+        elems[row].size.all32 = size;
+        elems[row].ea_low = row_ea;
+        row_ea += stride;
+    }
+
+    list_bytes = h*sizeof(dma_list_elem_t);	/* list length is passed in bytes */
+    mfc_putl(src, (unsigned) ea, elems, list_bytes, tag, 0, 0);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_dma.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_dma.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,59 @@
+#ifndef H264_DMA_H
+#define H264_DMA_H
+
+#include "libavutil/mem.h"
+
+typedef struct dma_list_elem {	// one DMA-list entry, as filled in by get_dma_list() / put_dma_list()
+	union {
+		unsigned int all32;	// whole size word; the list builders store the per-element byte count here
+		struct {
+		unsigned int stall    : 1;
+		unsigned int reserved : 15;
+		unsigned int nbytes   : 16;
+		} bits;	// the same word viewed as 1 + 15 + 16 bit fields
+	} size;
+	uint64_t ea_low : 32;	// 32-bit field; the list builders store the low effective-address word here
+}dma_list_elem_t;
+
+extern DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]);
+extern dma_list_elem_t* put_list;
+
+extern DECLARE_ALIGNED_16(dma_list_elem_t, get_list_buf[16*(4+5 + 2*3)]);
+extern dma_list_elem_t* get_list;
+
+enum{	// MBD_* identifiers, numbered consecutively: MBD_slice = 1 ... MBD_mc_buf2 = 8
+	MBD_slice=1,
+	MBD_buf1,
+	MBD_buf2,
+	MBD_buf3,
+	MBD_put,
+	MBD_pic,
+	MBD_mc_buf1,
+	MBD_mc_buf2
+};
+
+enum{	// ED_* identifiers, numbered consecutively: ED_spe = 1 ... ED_putmb1 = 9
+	ED_spe=1,
+	ED_slice,
+	ED_raw,
+	ED_get,
+	ED_get2,
+	ED_get_mv,
+	ED_put,
+	ED_putmb0,
+	ED_putmb1,
+};
+
+// Functions to get/put a block from/to main memory
+void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier);
+void put_dma_list(void *src, void* ea, unsigned int size, unsigned int h, unsigned int stride, unsigned int tag);
+
+// Single DMA transfers addressed by a 32-bit effective address
+void spu_dma_get(void *ls, unsigned ea, int size, int tag);
+void spu_dma_put(void *ls, unsigned ea, int size, int tag);
+void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag);
+
+// Waits for the DMA transfers issued with a specific tag id to finish
+void wait_dma_id(int id);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_filter_spu_vec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_filter_spu_vec.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2009 TUDelft 
+ * 
+ * Cell Parallel SPU - 2DWave Macroblock Decoding. 
+ */
+
+/**
+ * @file libavcodec/cell/h264_filter_spu_vec.c
+ * Cell Parallel SPU - 2DWave Macroblock Decoding
+ * @author C C Chi <c.c.chi@student.tudelft.nl>
+ * 
+ * SIMD kernels 
+ * H.264/AVC motion compensation
+ * @author Mauricio Alvarez <alvarez@ac.upc.edu>
+ * @author Albert Paradis <apar7632@hotmail.com>
+ */ 
+
+
+#include <stdio.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+
+#include "h264_filter_spu.h"
+#include "h264_decode_mb_spu.h"
+// To use scan8 table
+#include "h264_mc_spu.h"
+
+
+int get_chroma_qp(H264Context_spu *h, int t, int qscale){	// returns h->slice.chroma_qp_table[t][qscale]; t and qscale are not range-checked here
+    return h->slice.chroma_qp_table[t][qscale];
+}
+
+/* Clamp a to [amin, amax]; the lower bound is tested first, so it wins if amin > amax. */
+static inline int clip(int a, int amin, int amax){
+    if (a < amin)
+        return amin;
+    if (a > amax)
+        return amax;
+    return a;
+}
+
+/* Per-lane clamp of a to [amin, amax], built from two compare masks and two selects. */
+static inline vsint16_t clip_altivec(vsint16_t a, vsint16_t amin, vsint16_t amax){
+    const vector unsigned short below = spu_cmpgt(amin, a);
+    const vector unsigned short above = spu_cmpgt(a, amax);
+    const vsint16_t floored = spu_sel(a, amin, below);
+    return spu_sel(floored, amax, above);
+}
+
+/* Per-lane clamp of the signed 16-bit lanes of a to the range [0, 255]. */
+static inline vsint16_t clip_uint8_altivec(vsint16_t a){
+    const vsint16_t zero = {0, 0, 0, 0, 0, 0, 0, 0};
+    const vsint16_t top  = {255,255,255,255,255,255,255,255};
+    const vector unsigned short below = spu_cmpgt(zero, a);
+    const vector unsigned short above = spu_cmpgt(a, top);
+    const vsint16_t floored = spu_sel(a, zero, below);
+    return spu_sel(floored, top, above);
+}
+
+static  inline void h264_loop_filter_chroma(vsint16_t *pix, int alpha, int beta, int8_t *tc0){
+    /* Chroma edge filter on one vector per sample row/column: p1,p0 | q0,q1 = pix[-2],pix[-1] | pix[0],pix[1]; only pix[-1] and pix[0] are written. */
+    short a = (short) tc0[0];
+    short b = (short) tc0[1];
+    short c = (short) tc0[2];
+    short d = (short) tc0[3];
+    const vsint16_t vec_tc0 = {a,a,b,b,c,c,d,d};
+    const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0};
+    vector unsigned short mask_B0;
+
+    mask_B0 = spu_cmpgt(vec_v0, vec_tc0); /* lanes whose tc0 is negative keep their original samples */
+
+    const vsint16_t p0 = pix[-1];
+    const vsint16_t p1 = pix[-2];
+    const vsint16_t q0 = pix[0];
+    const vsint16_t q1 = pix[1];
+
+    const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha};
+    const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta};
+    const vsint16_t v_2 = {2,2,2,2,2,2,2,2};
+    const vuint16_t v_3 = {3,3,3,3,3,3,3,3};
+    const vsint16_t v_4 = {4,4,4,4,4,4,4,4};
+
+    vsint16_t rp0;
+    vsint16_t rq0;
+    vsint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0;
+    vector unsigned short mask_B1, mask_tmp;
+    vsint16_t i_delta;
+
+    abs_p0mq0 = (vector signed short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
+    abs_p1mp0 = (vector signed short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
+    abs_q1mq0 = (vector signed short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
+    /* mask_B1: alpha > |p0-q0| && beta > |p1-p0| && beta > |q1-q0| */
+    mask_B1  = spu_cmpgt(v_alpha, abs_p0mq0);
+    mask_tmp = spu_cmpgt(v_beta, abs_p1mp0);
+    mask_B1  = spu_and(mask_B1, mask_tmp);
+    mask_tmp = spu_cmpgt( v_beta, abs_q1mq0);
+    mask_B1  = spu_and(mask_B1, mask_tmp);
+
+    /* i_delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc0, tc0) */
+    i_delta = clip_altivec(spu_rlmaska(spu_add(spu_sl(spu_sub(q0,p0 ), (vuint16_t)v_2), spu_add(spu_sub(p1,q1),v_4)), (vsint16_t)-v_3), -vec_tc0, vec_tc0);
+
+    rp0 = clip_uint8_altivec( spu_add(p0,i_delta));
+    rq0 = clip_uint8_altivec( spu_sub(q0,i_delta));
+    /* filtered value where mask_B1 is set, original sample where mask_B0 is set */
+    pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0);
+    pix[0]  = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0);
+}
+
+static void h264_v_loop_filter_luma_c(vsint16_t *pix, int alpha, int beta, int8_t *tc0, int inc_low2high){
+    /* Luma edge filter on p2,p1,p0 | q0,q1,q2 = pix[-3..-1] | pix[0..2]; writes pix[-2..1]. Uses tc0[inc_low2high] for lanes 0-3 and tc0[inc_low2high+1] for lanes 4-7. */
+    short a = (short) tc0[0 + inc_low2high];
+    short b = (short) tc0[1 + inc_low2high];
+    const vsint16_t vec_tc0 = {a,a,a,a,b,b,b,b};
+    const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0};
+    vector unsigned short mask_B0;
+
+    mask_B0 = spu_cmpgt(vec_v0, vec_tc0); /* lanes whose tc0 is negative keep their original samples */
+    const vsint16_t p0 = pix[-1];
+    const vsint16_t p1 = pix[-2];
+    const vsint16_t p2 = pix[-3];
+    const vsint16_t q0 = pix[0];
+    const vsint16_t q1 = pix[1];
+    const vsint16_t q2 = pix[2];
+
+    const vuint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha};
+    const vuint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta};
+
+    const vuint16_t v_1 = {1,1,1,1,1,1,1,1};
+    const vuint16_t v_2 = {2,2,2,2,2,2,2,2};
+    const vuint16_t v_3 = {3,3,3,3,3,3,3,3};
+    const vsint16_t v_4 = {4,4,4,4,4,4,4,4};
+
+    vsint16_t rp0, rp1;
+    vsint16_t rq0, rq1;
+    vsint16_t tc0_B2P, tc0_B2Q, rtc0;
+    vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0;
+    vector unsigned short mask_B1, mask_B2P, mask_B2Q, mask_tmp;
+    vsint16_t i_delta, i_delta2;
+
+    abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
+    abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
+    abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
+    abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0);
+    abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0);
+    /* mask_B1: alpha > |p0-q0| && beta > |p1-p0| && beta > |q1-q0| */
+    mask_B1  = spu_cmpgt(v_alpha, abs_p0mq0);
+    mask_tmp = spu_cmpgt(v_beta, abs_p1mp0);
+    mask_B1  = spu_and(mask_B1, mask_tmp);
+    mask_tmp = spu_cmpgt( v_beta, abs_q1mq0);
+    mask_B1  = spu_and(mask_B1, mask_tmp);
+
+    mask_B2P = spu_cmpgt(v_beta, abs_p2mp0);
+    mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0);
+    /* p1' / q1' candidates: x1 + clip(((x2 + avg(p0,q0)) >> 1) - x1, -tc0, tc0) */
+    rp1 = spu_add(p1, clip_altivec(spu_sub(spu_rlmaska(spu_add(p2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), p1), -vec_tc0, vec_tc0 ));
+    rq1 = spu_add(q1, clip_altivec(spu_sub(spu_rlmaska(spu_add(q2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), q1), -vec_tc0, vec_tc0 ));
+    /* rtc0 = tc0 + (beta > |p2-p0|) + (beta > |q2-q0|), built with two selects */
+    tc0_B2P = spu_add(vec_tc0, (vsint16_t) v_1);
+    tc0_B2P = spu_sel(vec_tc0, tc0_B2P, mask_B2P);
+
+    tc0_B2Q = spu_add(tc0_B2P, (vsint16_t) v_1);
+    rtc0    = spu_sel(tc0_B2P, tc0_B2Q, mask_B2Q);
+    i_delta2 = spu_add(spu_sub(p1,q1),v_4);
+    i_delta = spu_sl(spu_sub(q0,p0 ), v_2);
+    i_delta = spu_add(i_delta,i_delta2 );
+    i_delta = spu_rlmaska(i_delta, (vsint16_t)-v_3);
+    i_delta = clip_altivec(i_delta, -rtc0, rtc0);
+
+    rp0 = clip_uint8_altivec( spu_add(p0,i_delta));    /* p0' */
+    rq0 = clip_uint8_altivec( spu_sub(q0,i_delta));    /* q0' */
+    /* p1/q1 change only where mask_B1 and their own B2 mask are set; mask_B0 lanes keep the originals */
+    pix[-2] = spu_sel(spu_sel(p1,spu_sel(p1,rp1,mask_B2P) ,mask_B1), p1,mask_B0);
+    pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0);
+    pix[0]  = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0);
+    pix[1]  = spu_sel(spu_sel(q1,spu_sel(q1,rq1,mask_B2Q) ,mask_B1), q1,mask_B0);
+}
+
+
+
+static inline void h264_loop_filter_chroma_intra(vsint16_t *pix, int alpha, int beta){
+    /* Chroma edge filter without tc clipping: p1,p0 | q0,q1 = pix[-2],pix[-1] | pix[0],pix[1]; only pix[-1] and pix[0] are written. */
+    const vuint16_t p0 = (vuint16_t) pix[-1];
+    const vuint16_t p1 = (vuint16_t) pix[-2];
+    const vuint16_t q0 = (vuint16_t) pix[0];
+    const vuint16_t q1 = (vuint16_t) pix[1];
+
+    const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha};
+    const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta};
+    const vuint16_t v_2 = {2,2,2,2,2,2,2,2};
+
+    vuint16_t rp0;
+    vuint16_t rq0;
+    vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0;
+    vector unsigned short mask_B0, mask_tmp;
+
+    abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
+    abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
+    abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
+    /* mask_B0: alpha > |p0-q0| && beta > |p1-p0| && beta > |q1-q0| (here mask_B0 selects the lanes to filter) */
+    mask_B0  = spu_cmpgt(v_alpha, (vsint16_t)abs_p0mq0);
+    mask_tmp = spu_cmpgt(v_beta, (vsint16_t)abs_p1mp0);
+    mask_B0  = spu_and(mask_B0, mask_tmp);
+    mask_tmp = spu_cmpgt( v_beta, (vsint16_t)abs_q1mq0);
+    mask_B0  = spu_and(mask_B0, mask_tmp);
+
+    rp0 = spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2;
+    rp0 = spu_rlmaska(rp0, (vsint16_t)-v_2);
+    rq0 = spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2;
+    rq0 = spu_rlmaska(rq0, (vsint16_t)-v_2);
+
+    pix[-1] = (vsint16_t) spu_sel(p0, rp0, mask_B0);
+    pix[0]  = (vsint16_t) spu_sel(q0, rq0, mask_B0);
+}
+int slice_alpha_c0_offset;
+int slice_beta_offset;
+/* Chroma edge: bS[0] >= 4 uses the intra filter, otherwise tc values are derived per bS entry. */
+static void filter_mb_edgecv(vsint16_t *pix, int bS[4], int qp ) {
+    const int index_a = qp + slice_alpha_c0_offset;
+    const int alpha = (alpha_table+52)[index_a];
+    const int beta  = (beta_table+52)[qp + slice_beta_offset];
+    int8_t tc[4];
+    int seg;
+    if( bS[0] >= 4 ) {
+        h264_loop_filter_chroma_intra(pix, alpha, beta);
+        return;
+    }
+    for(seg=0; seg<4; seg++)	/* bS 0 -> tc 0, else table value + 1. NOTE(review): tc0_table is indexed without the +52 bias used above -- confirm its layout */
+        tc[seg] = (bS[seg] == 0) ? 0 : tc0_table[index_a][bS[seg] - 1] + 1;
+    h264_loop_filter_chroma(pix, alpha, beta, tc);
+}
+
+static void filter_mb_edgeh(vsint16_t *pix, int bS[4], int qp, int inc_low2high ) { /* Luma edge: bS[0] < 4 delegates to h264_v_loop_filter_luma_c, otherwise the strong filter below runs inline. */
+    int i;
+    const int index_a = qp + slice_alpha_c0_offset;
+    const int alpha = (alpha_table+52)[index_a];
+    const int beta  = (beta_table+52)[qp + slice_beta_offset];
+
+    if( bS[0] < 4 ) {
+        int8_t tc[4];
+        for(i=0; i<4; i++)
+            tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1; /* -1 marks "do not filter". NOTE(review): tc0_table is indexed without the +52 bias used above -- confirm its layout */
+        h264_v_loop_filter_luma_c(pix, alpha, beta, tc, inc_low2high);
+    } else {
+        /* bS[0] >= 4: strong filter over p3..p0 | q0..q3 = pix[-4..-1] | pix[0..3]; writes pix[-3..2] */
+        const vuint16_t p0 = (vuint16_t) pix[-1];
+        const vuint16_t p1 = (vuint16_t) pix[-2];
+        const vuint16_t p2 = (vuint16_t) pix[-3];
+        const vuint16_t p3 = (vuint16_t) pix[-4];
+        const vuint16_t q0 = (vuint16_t) pix[0];
+        const vuint16_t q1 = (vuint16_t) pix[1];
+        const vuint16_t q2 = (vuint16_t) pix[2];
+        const vuint16_t q3 = (vuint16_t) pix[3];
+
+    	const vuint16_t v_alpha = {(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha};
+    	const vuint16_t v_beta = {(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta};
+    	const vuint16_t v_2 = {2,2,2,2,2,2,2,2};
+    	const vuint16_t v_3 = {3,3,3,3,3,3,3,3};
+    	const vsint16_t v_4 = {4,4,4,4,4,4,4,4};
+
+        vuint16_t rp0_B1f, rp0_B2t, rp0_B2f, rp1_B2t, rp2_B2t;
+        vuint16_t rq0_B1f, rq0_B2t, rq0_B2f, rq1_B2t, rq2_B2t;
+        vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0;
+        vuint16_t v_alpha_2 = spu_rlmaska(v_alpha, (vsint16_t)-v_2);
+        vector unsigned short mask_B0, mask_B1, mask_B2P, mask_B2Q, mask_tmp;
+
+        v_alpha_2 = spu_add(v_alpha_2, v_2); /* (alpha >> 2) + 2 */
+
+	abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
+    	abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
+    	abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
+        abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0);
+        abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0);
+        /* mask_B0 (here: lanes to filter): alpha > |p0-q0| && beta > |p1-p0| && beta > |q1-q0| */
+	mask_B0  = spu_cmpgt(v_alpha, abs_p0mq0);
+	mask_tmp = spu_cmpgt(v_beta, abs_p1mp0);
+	mask_B0  = spu_and(mask_B0, mask_tmp);
+	mask_tmp = spu_cmpgt( v_beta, abs_q1mq0);
+	mask_B0  = spu_and(mask_B0, mask_tmp);
+
+        mask_B1  = spu_cmpgt(v_alpha_2, abs_p0mq0);
+        mask_B2P = spu_cmpgt(v_beta,abs_p2mp0);
+        mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0);
+
+        rp0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p2,p1),spu_add(p1,p0)),spu_add(spu_add(p0,q0),spu_add(q0,q1))),(vuint16_t)v_4),(vsint16_t) -v_3);
+        		//( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+        rp1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p2,p1),spu_add(q0,p0)),v_2),(vsint16_t)-v_2);//( p2 + p1 + p0 + q0 + 2 ) >> 2;
+        rp2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p3,p3),spu_add(p2,p2)),spu_add(spu_add(p2,p1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3);
+        		//( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+        rq0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p1,p0),spu_add(p0,q0)),spu_add(spu_add(q0,q1),spu_add(q1,q2))),(vuint16_t)v_4),(vsint16_t)-v_3);
+
+        		//( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+        rq1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p0,q0),spu_add(q1,q2)),v_2),(vsint16_t)-v_2);//( p0 + q0 + q1 + q2 + 2 ) >> 2;
+        rq2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(q3,q3),spu_add(q2,q2)),spu_add(spu_add(q2,q1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3);
+        		//( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+        rp0_B1f =
+        rp0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2),(vsint16_t)-v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2;
+        rq0_B1f =
+        rq0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2),(vsint16_t)-v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2;
+        /* nested selects: outermost mask_B0 (filter at all), then mask_B1, then the per-side B2 mask picks the *_B2t values */
+        pix[-1] = (vsint16_t) spu_sel(p0, spu_sel(rp0_B1f, spu_sel(rp0_B2f, rp0_B2t, mask_B2P), mask_B1), mask_B0);
+        pix[-2] = (vsint16_t) spu_sel(p1, spu_sel(p1, spu_sel(p1, rp1_B2t, mask_B2P), mask_B1), mask_B0);
+        pix[-3] = (vsint16_t) spu_sel(p2, spu_sel(p2, spu_sel(p2, rp2_B2t, mask_B2P), mask_B1), mask_B0);
+        pix[0] = (vsint16_t) spu_sel(q0, spu_sel(rq0_B1f, spu_sel(rq0_B2f, rq0_B2t, mask_B2Q), mask_B1), mask_B0);
+        pix[1] = (vsint16_t) spu_sel(q1, spu_sel(q1, spu_sel(q1, rq1_B2t,mask_B2Q), mask_B1), mask_B0);
+        pix[2] = (vsint16_t) spu_sel(q2, spu_sel(q2, spu_sel(q2, rq2_B2t,mask_B2Q), mask_B1), mask_B0);
+    }
+}
+
+// Computes, for both edge directions of the macroblock in h->mb, the boundary strengths (h->bS), the averaged luma/chroma quantizers (h->qp, h->chroma_qp) and the edge range (h->start, h->edges) before the filter runs.
+void calculate_bS_qp(H264Context_spu *h){
+	H264mb* mb = &h->mb;
+	H264slice* slice = h->slice;
+    int dir;
+    const int mvy_limit = 4; // threshold for the vertical MV difference in the bS tests below
+    /* FIXME: A given frame may occupy more than one position in
+     * the reference list. So ref2frm should be populated with
+     * frame numbers, not indices. */
+    // NOTE(review): ref2frm points to int[64] rows, so ref2frm[k] below is a whole row and the != tests compare row addresses, i.e. they are true exactly when the two ref_cache indices differ - confirm this is intended.
+	int (*ref2frm)[64] = slice->ref2frm;
+	int mb_x = mb->mb_x;
+	int mb_y = mb->mb_y;
+	int mb_type =mb->mb_type;
+    /* dir : 0 -> vertical edge, 1 -> horizontal edge */
+    for( dir = 0; dir < 2; dir++ ){
+        int edge;
+		const int mbm_type = dir == 0 ? mb->mb_type_xy_n1 : mb->mb_type_top; // type of the MB on the other side of edge 0
+        const int8_t qscale_mbm = dir == 0 ? mb->qscale_mbxy_n1 : mb->qscale_mbxy_top; // its qscale
+
+        // how often to recheck mv-based bS when iterating between edges
+        const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :(mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
+        // how often to recheck mv-based bS when iterating along each edge
+        const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
+
+		h->edges[dir] = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4; // MBs flagged both 16x16 and SKIP get a single edge, all others four
+
+		if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0)) // first MB column (dir 0) or first MB row (dir 1): start at edge 1, i.e. skip edge 0
+			h->start[dir] =1;
+		else
+			h->start[dir] =0;
+
+        /* Calculate bS */
+        for( edge = h->start[dir]; edge < h->edges[dir]; edge++ ) {
+            /* mbn_xy: neighbor macroblock */
+            const int mbn_type = edge > 0 ? mb_type : mbm_type; // inner edges have the current MB on both sides
+            const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm;
+			int* bS = h->bS[dir][edge];
+
+            if( (edge&1) && IS_8x8DCT(mb_type) ){ // odd edges get bS 0 when IS_8x8DCT holds for the MB
+                bS[0] = bS[1] = bS[2] = bS[3] = 0; //extra code due to decoupling
+                continue;
+            }
+            if( IS_INTRA(mb_type) ||
+                IS_INTRA(mbn_type) ) {
+                int value; // intra on either side: bS 4 on edge 0, bS 3 on inner edges
+                if (edge == 0) {
+					value = 4;
+				} else {
+					value = 3;
+				}
+                bS[0] = bS[1] = bS[2] = bS[3] = value;
+            } else {
+                int i, l;
+                int mv_done; // 1 when the motion-based part of bS has already been decided for the whole edge
+
+                if( edge & mask_edge ) {
+					bS[0] = bS[1] = bS[2] = bS[3] = 0;
+                    mv_done = 1;
+                }
+                else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { // one motion test decides all four bS entries of this edge
+                    int b_idx= 8 + 4 + edge * (dir ? 8:1); // cache index on the current side of the edge (cache rows are 8 entries wide)
+                    int bn_idx= b_idx - (dir ? 8:1); // same position one step across the edge
+                    int v = 0;
+                    // NOTE(review): this loop tests slice_type_nos == FF_B_TYPE while the per-entry loop further down tests slice_type == B_TYPE - confirm both conditions are meant to be equivalent.
+                    for( l = 0; !v && l < 1 + (slice->slice_type_nos == FF_B_TYPE); l++ ) {
+                        v |= ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] ||
+                             FFABS(mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
+                             FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit;
+                    }
+                    bS[0] = bS[1] = bS[2] = bS[3] = v;
+
+					mv_done = 1;
+                }
+                else
+                    mv_done = 0;
+
+                for( i = 0; i < 4; i++ ) { // the four bS entries along this edge
+                    int x = dir == 0 ? edge : i;
+                    int y = dir == 0 ? i    : edge;
+                    int b_idx= 8 + 4 + x + 8*y;
+                    int bn_idx= b_idx - (dir ? 8:1);
+
+                    if( mb->non_zero_count_cache[b_idx] != 0 ||
+                        mb->non_zero_count_cache[bn_idx] != 0 ) { // non-zero count on either side of the edge overrides the motion result
+                        bS[i] = 2;
+                    }
+                    else if(!mv_done)
+                    {
+                        bS[i] = 0;
+                        for( l = 0; l < 1 + (slice->slice_type == B_TYPE); l++ ) {
+                            if( ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] ||
+                                FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
+                                FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
+                                bS[i] = 1;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if(bS[0]+bS[1]+bS[2]+bS[3] == 0) // all four entries are 0: the qp values of this edge are left as they were
+                    continue;
+            }
+
+            /* Filter edge */
+            // Do not use s->qscale as luma quantizer because it has not the same
+            // value in IPCM macroblocks.
+            h->qp[dir][edge] = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1; // rounded average of the qscale on both sides
+            h->chroma_qp[0][dir][edge] = ( mb->chroma_qp[0] + get_chroma_qp(h, 0, qscale_mbn_xy ) + 1 ) >> 1;
+
+			h->chroma_qp[1][dir][edge] = ( mb->chroma_qp[1] + get_chroma_qp(h, 1, qscale_mbn_xy ) + 1 ) >> 1;
+        }
+		slice_alpha_c0_offset=slice->slice_alpha_c0_offset; // not declared in this function (presumably file scope); re-assigned on every dir iteration although the value does not depend on dir
+		slice_beta_offset= slice->slice_beta_offset;
+    }
+}
+
+/* VEC_TRANSPOSE_8: three rounds of pairwise spu_shuffle with the merge_h / merge_l patterns (b <- a, a <- b, b <- a); used below, with mergehu16 / mergelu16, to transpose 8 vectors of 8 16-bit lanes. a0..a7 are overwritten as scratch, the result ends up in b0..b7. Arguments are expanded several times, so pass plain lvalues; the expansion has no trailing semicolon. */
+#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7,merge_h,merge_l) \
+    b0 = spu_shuffle( a0, a4, merge_h); \
+    b1 = spu_shuffle( a0, a4, merge_l ); \
+    b2 = spu_shuffle( a1, a5, merge_h ); \
+    b3 = spu_shuffle( a1, a5, merge_l ); \
+    b4 = spu_shuffle( a2, a6, merge_h ); \
+    b5 = spu_shuffle( a2, a6, merge_l ); \
+    b6 = spu_shuffle( a3, a7, merge_h ); \
+    b7 = spu_shuffle( a3, a7, merge_l ); \
+    a0 = spu_shuffle( b0, b4, merge_h ); \
+    a1 = spu_shuffle( b0, b4, merge_l ); \
+    a2 = spu_shuffle( b1, b5, merge_h ); \
+    a3 = spu_shuffle( b1, b5, merge_l ); \
+    a4 = spu_shuffle( b2, b6, merge_h ); \
+    a5 = spu_shuffle( b2, b6, merge_l); \
+    a6 = spu_shuffle( b3, b7, merge_h ); \
+    a7 = spu_shuffle( b3, b7, merge_l ); \
+    b0 = spu_shuffle( a0, a4, merge_h ); \
+    b1 = spu_shuffle( a0, a4, merge_l ); \
+    b2 = spu_shuffle( a1, a5, merge_h ); \
+    b3 = spu_shuffle( a1, a5, merge_l); \
+    b4 = spu_shuffle( a2, a6, merge_h ); \
+    b5 = spu_shuffle( a2, a6, merge_l ); \
+    b6 = spu_shuffle( a3, a7, merge_h ); \
+    b7 = spu_shuffle( a3, a7, merge_l )
+
+void filter_mb_spu(vsint16_t *img_y, vsint16_t *img_cb, vsint16_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int edges[2], int bS[2][4][4], int qp[2][4], int chroma_qp[2][2][4], int start[2]){
+    // Filters one macroblock: dir 0 edges on transposed copies, then dir 1 edges on row-major copies; pixels are widened to 16-bit lanes in local buffers and written back to img_y / img_cb / img_cr. The image pointers are vector pointers, so linesize / uvlinesize are applied in units of whole vectors. edges, start, bS, qp and chroma_qp are read per direction (presumably as filled by calculate_bS_qp - confirm).
+    int dir,x;
+    vsint16_t o_vec_img_y[(16+8)*2]; // luma in row order: 8 lanes per vector, 8 extra slots per half for neighbour data
+    vsint16_t t_vec_img_y[(16+8)*2]; // transposed luma, same layout
+    vsint16_t *vec_img_y_o = o_vec_img_y;
+    vsint16_t *vec_img_y_t = t_vec_img_y;
+
+    vsint16_t o_vec_img_cb[8+8+4];
+    vsint16_t t_vec_img_cb[8+8];
+    vsint16_t *vec_img_cb_o = &o_vec_img_cb[2]; // starts 2 slots in; after the later +10 shift, indices -2 / -1 are filled from the two rows above the MB
+    vsint16_t *vec_img_cb_t = t_vec_img_cb;
+
+    vsint16_t o_vec_img_cr[8+8+4];
+    vsint16_t t_vec_img_cr[8+8];
+    vsint16_t *vec_img_cr_o = &o_vec_img_cr[2]; // same layout as the cb buffer
+    vsint16_t *vec_img_cr_t = t_vec_img_cr;
+
+    vuint8_t *pvec_tmp;
+
+    const vuint8_t patt_high = {16,  0, 17,  1, 18,  2, 19,  3, 20,  4, 21,  5, 22,  6, 23,  7}; // bytes 0..7 of the first operand, each preceded by a byte of the second operand (v_0 below): widens to 16-bit lanes
+    const vuint8_t patt_low  = {16,  8, 17,  9, 18, 10, 19, 11, 20, 12, 21, 13, 22, 14, 23, 15}; // same for bytes 8..15 of the first operand
+    const vuint8_t patt_unpack={ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; // odd-indexed byte of every lane of both operands: narrows 2x8 lanes to 16 bytes (despite the name)
+    const vuint8_t patt_pack_hw={0,  1,  2,  3,  4,  5,  6,  7, 17, 19, 21, 23, 25, 27, 29, 31}; // keep bytes 0..7 of the first operand, bytes 8..15 = odd-indexed bytes of the second
+    const vuint8_t patt_pack_chroma_aligned={0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F,
+                                             0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; // bytes 0..7 = odd-indexed bytes of the second operand, bytes 8..15 kept from the first
+    const vuint8_t patt_pack_chroma_unaligned={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                               0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F}; // bytes 0..7 kept from the first operand, bytes 8..15 = odd-indexed bytes of the second
+    const vuint8_t v_0  	   = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+    const vuint8_t mergehu16 = {0x00,0x01,0x10,0x11,0x02,0x03,0x12,0x13,0x04,0x05,0x14,0x15,0x06,0x07,0x16,0x17}; // interleaves lanes 0..3 of both operands
+    const vuint8_t mergelu16 = {0x08,0x09,0x18,0x19,0x0A,0x0B,0x1A,0x1B,0x0C,0x0D,0x1C,0x1D,0x0E,0x0F,0x1E,0x1F}; // interleaves lanes 4..7 of both operands
+    vuint8_t store_chroma, store_chroma_n1, load_chroma, load_chroma_n1;
+    int mb_xy_n1; // how many vectors to step back to reach the chroma data of the previous MB (0 or 1)
+    const int unalign_chroma = (unsigned int) img_cb & 15; // offset of img_cb inside its 16-byte quadword
+
+    if(unalign_chroma==0){
+        load_chroma = patt_high;
+        load_chroma_n1 = patt_low;  // for load chroma mb_x-1
+        store_chroma = patt_pack_chroma_aligned;
+        store_chroma_n1 = patt_pack_chroma_unaligned;  // for store chroma mb_x-1
+        mb_xy_n1 = 1;   //  with no misalignment the previous block (one vector back) is needed for the horizontal filtering
+    }
+    else{
+        load_chroma = patt_low;
+        load_chroma_n1 = patt_high; // for load mb_x-1
+        store_chroma = patt_pack_chroma_unaligned;
+        store_chroma_n1 = patt_pack_chroma_aligned;    // for store chroma mb_x-1
+        mb_xy_n1 = 0;   //  with a misalignment (of 8) the previous block is not needed: its pixels are in the other half of the same vector
+    }
+
+    /* dir : 0 -> vertical edge, 1 -> horizontal edge */
+
+    // LOAD MB_X -1
+
+    for (x = 0; x < 16; x++){  // widen bytes 8..15 of the vector just before each luma row into 8 lanes
+        vec_img_y_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize - 1], v_0 , patt_low);
+    }
+
+    for (x = 0; x < 8; x++){  // widen the 8 chroma bytes belonging to the previous MB into 8 lanes
+	vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cb[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1);
+	vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cr[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1);
+    }
+
+    VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16);
+
+    vec_img_y_t  = &vec_img_y_t[8]; // advance past the previous-MB vectors: index 0 now addresses the current MB, negative indices the data loaded above
+    vec_img_y_o  = &vec_img_y_o[8];
+    vec_img_cb_t = &vec_img_cb_t[8];
+    vec_img_cb_o = &vec_img_cb_o[10]; // 8 previous-MB vectors plus the 2 slots used later for the rows above
+    vec_img_cr_t = &vec_img_cr_t[8];
+    vec_img_cr_o = &vec_img_cr_o[10];
+
+    //LOAD CURRENT MB
+    for (x = 0; x < 16; x++){  // widen each 16-byte luma row into two 8-lane vectors: bytes 0..7 at [x], bytes 8..15 at [x+24]
+        pvec_tmp  	  = (vuint8_t *) &img_y[x*linesize];
+	vec_img_y_o[x]    = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_high);
+	vec_img_y_o[x+24] = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_low);
+    }
+
+    for (x = 0; x < 8; x++){  // widen the 8 chroma bytes of each row of the current MB into 8 lanes
+	vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma);
+	vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma);
+    }
+
+    //TRANSPOSE MATRIX
+
+    VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31], vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39], vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16);
+
+    //PROCESS dir 0 on the transposed buffers
+    dir = 0;
+    {
+        int edge;
+        for( edge = start[dir]; edge < edges[dir]; edge++ ) {
+            if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0) // skip edges whose four bS entries are all 0
+            {
+            	filter_mb_edgeh( &vec_img_y_t[4*edge   ], bS[dir][edge], qp[dir][edge],0);//low
+            	filter_mb_edgeh( &vec_img_y_t[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high
+
+                if( (edge&1) == 0 ) { // chroma is filtered on even edges only, at half the luma position
+                    filter_mb_edgecv( &vec_img_cb_t[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] );
+                    filter_mb_edgecv( &vec_img_cr_t[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] );
+                }
+            }
+        }
+    }
+
+    //SAVE MB_X -1 RESULTS (transpose the previous-MB vectors back to row order, then store them)
+
+    VEC_TRANSPOSE_8(vec_img_y_t[-8], vec_img_y_t[-7], vec_img_y_t[-6], vec_img_y_t[-5], vec_img_y_t[-4], vec_img_y_t[-3], vec_img_y_t[-2], vec_img_y_t[-1], vec_img_y_o[-8], vec_img_y_o[-7], vec_img_y_o[-6], vec_img_y_o[-5], vec_img_y_o[-4], vec_img_y_o[-3], vec_img_y_o[-2], vec_img_y_o[-1],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_y_t[16], vec_img_y_t[17], vec_img_y_t[18], vec_img_y_t[19], vec_img_y_t[20], vec_img_y_t[21], vec_img_y_t[22], vec_img_y_t[23], vec_img_y_o[16], vec_img_y_o[17], vec_img_y_o[18], vec_img_y_o[19], vec_img_y_o[20], vec_img_y_o[21], vec_img_y_o[22], vec_img_y_o[23],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_cb_t[ -8], vec_img_cb_t[-7], vec_img_cb_t[-6], vec_img_cb_t[-5], vec_img_cb_t[-4], vec_img_cb_t[-3], vec_img_cb_t[-2], vec_img_cb_t[-1], vec_img_cb_o[-10], vec_img_cb_o[-9], vec_img_cb_o[-8], vec_img_cb_o[-7], vec_img_cb_o[-6], vec_img_cb_o[-5], vec_img_cb_o[-4], vec_img_cb_o[-3],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_cr_t[ -8], vec_img_cr_t[-7], vec_img_cr_t[-6], vec_img_cr_t[-5], vec_img_cr_t[-4], vec_img_cr_t[-3], vec_img_cr_t[-2], vec_img_cr_t[-1], vec_img_cr_o[-10], vec_img_cr_o[-9], vec_img_cr_o[-8], vec_img_cr_o[-7], vec_img_cr_o[-6], vec_img_cr_o[-5], vec_img_cr_o[-4], vec_img_cr_o[-3],mergehu16, mergelu16);
+
+    for (x = 0; x < 8; x++){  // rows 0..7: narrow the lanes back to bytes 8..15 of the vector before each row, bytes 0..7 are kept. WARNING (original author): no check against writing outside the image memory
+    	img_y[x*linesize - 1] = spu_shuffle(img_y[x*linesize - 1], vec_img_y_o[-8+x], patt_pack_hw);
+    }
+
+    for (x = 0; x < 8; x++){  // rows 8..15, same store. WARNING (original author): no check against writing outside the image memory
+    	img_y[(x+8)*linesize - 1] = spu_shuffle(img_y[(x+8)*linesize - 1], vec_img_y_o[16+x], patt_pack_hw);
+    }
+
+    for (x = 0; x < 8; x++){  // chroma of the previous MB, merged into the existing vector. WARNING (original author): no check against writing outside the image memory
+    	img_cb[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cb[x*uvlinesize - mb_xy_n1], vec_img_cb_o[-10+x], store_chroma_n1);
+    	img_cr[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cr[x*uvlinesize - mb_xy_n1], vec_img_cr_o[-10+x], store_chroma_n1);
+    }
+
+    //TRANSPOSE MATRIX (current MB back to row order)
+
+    VEC_TRANSPOSE_8(vec_img_y_t[ 0], vec_img_y_t[ 1], vec_img_y_t[ 2], vec_img_y_t[ 3], vec_img_y_t[ 4], vec_img_y_t[ 5], vec_img_y_t[ 6], vec_img_y_t[ 7], vec_img_y_o[ 0], vec_img_y_o[ 1], vec_img_y_o[ 2], vec_img_y_o[ 3], vec_img_y_o[ 4], vec_img_y_o[ 5], vec_img_y_o[ 6], vec_img_y_o[ 7],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15], vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31], vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39], vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7], vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7],mergehu16, mergelu16);
+
+    VEC_TRANSPOSE_8(vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7], vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7],mergehu16, mergelu16);
+
+
+    //LOAD MB_Y - 1
+    for (x = -4; x < 0; x++){  // the four luma rows above the MB, widened into the slots at negative indices
+	vec_img_y_o[x]    = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_high);
+	vec_img_y_o[x+24] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_low);
+    }
+
+    for (x = -2; x < 0; x++){  // the two chroma rows above the MB
+	vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma);
+	vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma);
+    }
+
+    //PROCESS dir 1 on the row-order buffers
+    dir = 1;
+    {
+        int edge;
+        for( edge = start[dir]; edge < edges[dir]; edge++ ) {
+            if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0) // skip edges whose four bS entries are all 0
+            {
+            	filter_mb_edgeh( &vec_img_y_o[4*edge   ], bS[dir][edge], qp[dir][edge],0);//low
+            	filter_mb_edgeh( &vec_img_y_o[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high
+            	if( (edge&1) == 0 ) {
+            	    filter_mb_edgecv( &vec_img_cb_o[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] );
+                    filter_mb_edgecv( &vec_img_cr_o[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] );
+            	}
+            }
+        }
+
+        for (x = -3; x < 16; x++){  // store luma rows -3..15: both 8-lane halves narrowed into one 16-byte row. WARNING (original author): no check against writing outside the image memory
+    	    img_y[x*linesize] = spu_shuffle(vec_img_y_o[x], vec_img_y_o[x+24], patt_unpack);
+        }
+
+        for (x = -1; x < 8; x++){  // store chroma rows -1..7, merged into the existing vector. WARNING (original author): no check against writing outside the image memory
+            img_cb[x*uvlinesize] = spu_shuffle(img_cb[x*uvlinesize], vec_img_cb_o[x], store_chroma);
+            img_cr[x*uvlinesize] = spu_shuffle(img_cr[x*uvlinesize], vec_img_cr_o[x], store_chroma);
+        }
+    }
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_idct_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_idct_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2009 TUDelft 
+ * 
+ * Cell Parallel SPU - Macroblock Decoding.
+ */
+
+/**
+ * @file libavcodec/cell/h264_idct_spu.c
+ * Cell Parallel SPU - Macroblock Decoding
+ * @author C C Chi <c.c.chi@student.tudelft.nl>
+ * 
+ * SIMD kernels 
+ * H.264/AVC motion compensation
+ * @author Mauricio Alvarez <alvarez@ac.upc.edu>
+ * @author Albert Paradis <apar7632@hotmail.com>
+ */ 
+
+#include <spu_intrinsics.h>
+#include "types_spu.h"
+#include "h264_tables.h"
+#include "h264_idct_spu.h"
+#include "h264_intra_spu.h"
+
+/***********************************************************************
+ * ff_h264_idct_add_spu
+ ***********************************************************************
+ *  h264 idct 4x4 transform with SPU SIMD intrinsics
+ *  using the factorized algorithm 
+ *  Mauricio Alvarez: alvarez@ac.upc.edu
+ *  - DCTELEM* block: transformed coefficients are stored consecutively in memory,
+ *  - for the 4x4 transform the structure is like that:
+ *       || coef_00 | coef_01 || coef_02 | coef_03 ||..||coef_0F||
+ *  - Usually the DCTELEM block is declared with an alignment modifier in such a way
+ *    that the array is 128-bit (16 bytes, 8 shorts) aligned.
+ *  - The dst pointer can be unaligned, with a misalignment that is a multiple of 4.
+ ***********************************************************************/
+
+// DC-only add for a 4x4 area: each pixel plus (block[0] + 32) >> 6 is looked up through ff_cropTbl (defined elsewhere; presumably a clamp table - confirm).
+void ff_idct_dc_add(uint8_t *dst, short *block, int stride){
+    const int dc_term = (block[0] + 32) >> 6;
+    uint8_t *clip = ff_cropTbl + MAX_NEG_CROP;
+    int row, col;
+    for( row = 0; row < 4; row++, dst += stride ){
+        for( col = 0; col < 4; col++ ){
+            dst[col] = clip[ dst[col] + dc_term ];
+        }
+    }
+}
+
+void ff_idct8_dc_add(uint8_t *dst, short *block, int stride){ // DC-only add over an 8x8 area
+    const int dc_term = (block[0] + 32) >> 6;   // rounded DC contribution, the same for every pixel
+    uint8_t *clip = ff_cropTbl + MAX_NEG_CROP;  // lookup table defined elsewhere (presumably a clamp table - confirm)
+    int row, col;
+    for( row = 0; row < 8; row++, dst += stride ){
+        for( col = 0; col < 8; col++ ){
+            dst[col] = clip[ dst[col] + dc_term ];
+        }
+    }
+}
+
+// add without idct
+
+void add_pixels8_c(uint8_t *pixels, short *block, int line_size)
+{
+    /* Adds an 8x8 block of values to the picture, row by row.
+     * pixels    - top-left destination byte; rows are line_size bytes apart
+     * block     - 64 values stored row-major (8 per row)
+     * The sum is stored back into a uint8_t without saturation, so it wraps
+     * exactly like a plain += on a byte. */
+    int row, col;
+    for (row = 0; row < 8; row++) {
+        uint8_t *dst_row = pixels + row * line_size;
+        const short *src_row = block + row * 8;
+        for (col = 0; col < 8; col++) {
+            dst_row[col] += src_row[col];
+        }
+    }
+}
+
+void add_pixels4_c(uint8_t *pixels, short *block, int line_size)
+{
+    /* 4x4 add: 16 row-major values from block are added to four rows of
+     * pixels spaced line_size bytes apart; each result wraps like any
+     * uint8_t += (no saturation). */
+    int row, col;
+    for (row = 0; row < 4; row++, pixels += line_size, block += 4) {
+        for (col = 0; col < 4; col++) {
+            pixels[col] += block[col];
+        }
+    }
+}
+
+void h264_luma_dc_dequant_idct_c(short *block, int qmul){
+	/* In-place two-pass 4x4 butterfly plus scaling over the 16 entries
+	 * block[16*k], k = 0..15; all other entries of block are untouched.
+	 * Each output is (combination * qmul + 128) >> 8, stored as short. */
+	enum { STEP = 16 };	/* distance, in shorts, between two entries */
+	static const int col_base[4] = {0, 1*STEP, 4*STEP,  5*STEP};
+	static const int row_base[4] = {0, 2*STEP, 8*STEP, 10*STEP};
+	int mid[16];		/* first-pass results, four per iteration */
+	int k;
+
+	for(k = 0; k < 4; k++){
+		const short *src = block + row_base[k];
+		const int sum04  = src[STEP*0] + src[STEP*4];
+		const int diff04 = src[STEP*0] - src[STEP*4];
+		const int diff15 = src[STEP*1] - src[STEP*5];
+		const int sum15  = src[STEP*1] + src[STEP*5];
+		mid[4*k+0] = sum04  + sum15;
+		mid[4*k+1] = diff04 + diff15;
+		mid[4*k+2] = diff04 - diff15;
+		mid[4*k+3] = sum04  - sum15;
+	}
+
+	for(k = 0; k < 4; k++){
+		short *out = block + col_base[k];
+		const int sum02  = mid[4*0+k] + mid[4*2+k];
+		const int diff02 = mid[4*0+k] - mid[4*2+k];
+		const int diff13 = mid[4*1+k] - mid[4*3+k];
+		const int sum13  = mid[4*1+k] + mid[4*3+k];
+		out[STEP*0]  = ((sum02  + sum13 )*qmul + 128) >> 8;
+		out[STEP*2]  = ((diff02 + diff13)*qmul + 128) >> 8;
+		out[STEP*8]  = ((diff02 - diff13)*qmul + 128) >> 8;
+		out[STEP*10] = ((sum02  - sum13 )*qmul + 128) >> 8;
+	}
+}
+
+void chroma_dc_dequant_idct_c(short *block, int qmul){
+	/* In-place 2x2 butterfly and scaling of the four entries at
+	 * block[0], block[16], block[32] and block[48].
+	 * Each result is (combination * qmul) >> 7, stored back as short. */
+	enum { COL_STEP = 16, ROW_STEP = 16*2 };
+	const int tl = block[0];
+	const int tr = block[COL_STEP];
+	const int bl = block[ROW_STEP];
+	const int br = block[ROW_STEP + COL_STEP];
+
+	const int top_sum  = tl + tr;
+	const int top_diff = tl - tr;
+	const int bot_sum  = bl + br;
+	const int bot_diff = bl - br;
+
+	block[0]                   = ((top_sum  + bot_sum )*qmul) >> 7;
+	block[COL_STEP]            = ((top_diff + bot_diff)*qmul) >> 7;
+	block[ROW_STEP]            = ((top_sum  - bot_sum )*qmul) >> 7;
+	block[ROW_STEP + COL_STEP] = ((top_diff - bot_diff)*qmul) >> 7;
+}
+
+void h264_idct4_add_spu(uint8_t *dst, short *block, int stride) // 4x4 inverse transform of block (16 shorts), result >> 6 added to four rows of dst that are stride bytes apart
+{
+  vsint16_t __vz0, __vz1, __vz2, __vz3; // temporary storage for VEC_1D_DCT (per the original author; the macro is defined elsewhere)
+  vsint16_t va0, va1, va2, va3;
+  vsint16_t vtmp0, vtmp1, vtmp2, vtmp3;
+  vuint16_t sat; // sat, va_u8, vdst_ss, vdst, vdst_orig, vfdst, vzero, vmax, packu16 and mergehu8 are not referenced directly in this body - presumably used inside VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8; confirm before removing
+  vuint8_t va_u8;
+  vsint16_t vdst_ss;
+  vuint8_t dstperm; // shuffle pattern selecting which 4 bytes of a 16-byte destination vector are replaced
+  vuint8_t vdst, vdst_orig, vfdst;
+  const int16_t imax = 255;
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  const int shift_dst = (unsigned int) dst  & 15; // offset of dst inside its 16-byte quadword
+  const vuint8_t packu16   = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F);
+  const vuint8_t mergehu8  = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x07,0x16,0x17);
+  //for optimized matrix transpose:
+  const vuint8_t tr0 =AVV(0x00,0x01,0x08,0x09,0x10,0x11,0x18,0x19,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
+  const vuint8_t tr1 =AVV(0x02,0x03,0x0A,0x0B,0x12,0x13,0x1A,0x1B,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
+  const vuint8_t tr2 =AVV(0x04,0x05,0x0C,0x0D,0x14,0x15,0x1C,0x1D,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
+  const vuint8_t tr3 =AVV(0x06,0x07,0x0E,0x0F,0x16,0x17,0x1E,0x1F,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
+  const vuint8_t conc =AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17);
+
+  block[0] += 32;  // add 32 as a DC-level for rounding (pairs with the >> 6 below); note that this modifies the caller's block
+
+  //load matrix: two vectors of 8 coefficients each; the 8-byte rotations bring coefficients 4..7 and 12..15 to the front
+  vtmp0 = *(vsint16_t *)(block);
+  vtmp1 = spu_rlqwbyte(vtmp0,8);
+  vtmp2 = *(vsint16_t *)(block+8);
+  vtmp3 = spu_rlqwbyte(vtmp2,8);
+
+  VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
+
+  //concatenate first two rows of matrix
+  va0=spu_shuffle(va0,va1,conc);
+  //concatenate last two rows of matrix
+  va2=spu_shuffle(va2,va3,conc);
+
+  //do transpose starting from two vectors, storing as four vectors of which the second part is unused
+  vtmp0 = spu_shuffle( va0, va2, tr0);
+  vtmp1 = spu_shuffle( va0, va2, tr1);
+  vtmp2 = spu_shuffle( va0, va2, tr2);
+  vtmp3 = spu_shuffle( va0, va2, tr3);
+
+  VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
+
+  // division by 64 (shift by -6, i.e. an arithmetic shift right by 6)
+  va0 = spu_rlmaska(va0,-6);
+  va1 = spu_rlmaska(va1,-6);
+  va2 = spu_rlmaska(va2,-6);
+  va3 = spu_rlmaska(va3,-6);
+
+  switch (shift_dst){ // choose the pattern that puts 4 new bytes (0x10..0x13) at byte offset shift_dst and keeps the other 12
+    case 0: {
+      dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                              0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+    } break;
+    case 4: {
+      dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                              0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+    } break;
+    case 8: {
+      dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  	                      0x10, 0x11, 0x12, 0x13, 0x0C, 0x0D, 0x0E, 0x0F);
+    } break;
+    case 12: {
+      dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                              0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13);
+    } break;
+    default: { // any other offset falls back to the offset-0 pattern
+      dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                              0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+    } break;
+  }
+  // NOTE(review): shift_dst and dstperm come from the first row only and are reused for all four rows - presumably stride is a multiple of 16; confirm.
+  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm);
+  dst += stride;
+  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm);
+  dst += stride;
+  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm);
+  dst += stride;
+  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm);
+}
+
+void h264_idct8_add_spu(uint8_t *dst, short *block, int stride) // 8x8 inverse transform of block (64 shorts), result >> 6 added to eight rows of dst that are stride bytes apart
+{
+	vsint16_t va0, va1, va2, va3, va4, va5, va6, va7;
+	vsint16_t vza0, vza1, vza2, vza3, vza4, vza5, vza6, vza7, vzal,vzah; // vza*, vzb*, sat, va_u8, vdst_ss, vdst, vdst_orig, vfdst, vzero, vmax, packu16, mergehu8 and m1..m6 are not referenced directly in this body - presumably used inside the VEC_* macros; confirm before removing
+	vsint16_t vzb0, vzb1, vzb2, vzb3, vzb4, vzb5, vzb6, vzb7;
+	vsint16_t vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, vtmp6, vtmp7;
+	vuint16_t sat;
+	vuint8_t va_u8;
+	const int block_stride=8; // shorts per coefficient row
+	vsint16_t vdst_ss;
+	const int16_t imax = 255;
+	const vsint32_t vzero = spu_splats(0);
+	const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+	vuint8_t vdst, vdst_orig, vfdst;
+	vuint8_t dstperm; // shuffle pattern selecting which 8 bytes of a 16-byte destination vector are replaced
+	const int shift_dst = (unsigned int) dst  & 15; // offset of dst inside its 16-byte quadword
+	const vuint8_t packu16   = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F);
+	const vuint8_t mergehu8  = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17);
+	const vuint8_t m1        = AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17);
+	const vuint8_t m2        = AVV(0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F);
+	const vuint8_t m3        = AVV(0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x18,0x19,0x1A,0x1B);
+	const vuint8_t m4        = AVV(0x14,0x15,0x16,0x17,0x04,0x05,0x06,0x07,0x1C,0x1D,0x1E,0x1F,0x0C,0x0D,0x0E,0x0F);
+	const vuint8_t m5        = AVV(0x00,0x01,0x10,0x11,0x04,0x05,0x14,0x15,0x08,0x09,0x18,0x19,0x0C,0x0D,0x1C,0x1D);
+	const vuint8_t m6        = AVV(0x12,0x13,0x02,0x03,0x16,0x17,0x06,0x07,0x1A,0x1B,0x0A,0x0B,0x1E,0x1F,0x0E,0x0F);
+
+	block[0] += 32;  // add 32 as a DC-level for rounding (pairs with the >> 6 below); note that this modifies the caller's block
+
+	vtmp0 = *(vsint16_t *)(block); // one vector per coefficient row; the vector loads presumably require block to be 16-byte aligned - confirm
+	vtmp1 = *(vsint16_t *)(block + block_stride);
+	vtmp2 = *(vsint16_t *)(block + 2*block_stride);
+	vtmp3 = *(vsint16_t *)(block + 3*block_stride);
+	vtmp4 = *(vsint16_t *)(block + 4*block_stride);
+	vtmp5 = *(vsint16_t *)(block + 5*block_stride);
+	vtmp6 = *(vsint16_t *)(block + 6*block_stride);
+	vtmp7 = *(vsint16_t *)(block + 7*block_stride);
+
+	VEC_1D_DCT8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7); // first 1-D pass (macro defined elsewhere)
+	VEC_TRANSPOSE_8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7,va0,va1,va2,va3,va4,va5,va6,va7); // NOTE(review): invoked with 16 arguments, so the definition in effect here takes no merge patterns - it must come from a header of this file; confirm
+	VEC_1D_DCT8(va0, va1, va2, va3, va4, va5, va6, va7); // second 1-D pass on the transposed data
+
+	va0 = spu_rlmaska(va0,-6); // shift by -6, i.e. an arithmetic shift right by 6 (divide by 64)
+	va1 = spu_rlmaska(va1,-6);
+	va2 = spu_rlmaska(va2,-6);
+	va3 = spu_rlmaska(va3,-6);
+	va4 = spu_rlmaska(va4,-6);
+	va5 = spu_rlmaska(va5,-6);
+	va6 = spu_rlmaska(va6,-6);
+	va7 = spu_rlmaska(va7,-6);
+
+	if (shift_dst==8) // offset 8: the 8 new bytes (0x10..0x17) go to bytes 8..15; every other offset is treated like offset 0
+		dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+				   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
+	else																		    dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+			0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+	// NOTE(review): shift_dst and dstperm come from the first row only and are reused for all eight rows - presumably stride is a multiple of 16; confirm.
+	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm);
+	dst += stride;
+	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm);
+	dst += stride;
+	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm);
+	dst += stride;
+	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm);
+	dst += stride;
+	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va4,dstperm);
+	dst += stride;
+	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va5,dstperm);
+	dst += stride;
+	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va6,dstperm);
+	dst += stride;
+	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va7,dstperm);
+
+}
+
+/*
+
+void h264_idct4_add_spu(uint8_t *dst, short *block, int stride){
+    int i;
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+
+    block[0] += 32;
+
+    for(i=0; i<4; i++){
+        const int z0=  block[0 + 4*i]     +  block[2 + 4*i];
+        const int z1=  block[0 + 4*i]     -  block[2 + 4*i];
+        const int z2= (block[1 + 4*i]>>1) -  block[3 + 4*i];
+        const int z3=  block[1 + 4*i]     + (block[3 + 4*i]>>1);
+
+        block[0 + 4*i]= z0 + z3;
+        block[1 + 4*i]= z1 + z2;
+        block[2 + 4*i]= z1 - z2;
+        block[3 + 4*i]= z0 - z3;
+    }
+
+    for(i=0; i<4; i++){
+        const int z0=  block[i + 4*0]     +  block[i + 4*2];
+        const int z1=  block[i + 4*0]     -  block[i + 4*2];
+        const int z2= (block[i + 4*1]>>1) -  block[i + 4*3];
+        const int z3=  block[i + 4*1]     + (block[i + 4*3]>>1);
+
+        dst[i + 0*stride]= cm[ dst[i + 0*stride] + ((z0 + z3) >> 6) ];
+        dst[i + 1*stride]= cm[ dst[i + 1*stride] + ((z1 + z2) >> 6) ];
+        dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ];
+        dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ];
+    }
+}
+
+void h264_idct8_add_spu(uint8_t *dst, short *block, int stride){
+    int i;
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+
+    block[0] += 32;
+	
+    for( i = 0; i < 8; i++ )
+    {
+        const int a0 =  block[0+i*8] + block[4+i*8];
+        const int a2 =  block[0+i*8] - block[4+i*8];
+        const int a4 = (block[2+i*8]>>1) - block[6+i*8];
+        const int a6 = (block[6+i*8]>>1) + block[2+i*8];
+
+        const int b0 = a0 + a6;
+        const int b2 = a2 + a4;
+        const int b4 = a2 - a4;
+        const int b6 = a0 - a6;
+
+        const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1);
+        const int a3 =  block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
+        const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
+        const int a7 =  block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
+
+        const int b1 = (a7>>2) + a1;
+        const int b3 =  a3 + (a5>>2);
+        const int b5 = (a3>>2) - a5;
+        const int b7 =  a7 - (a1>>2);
+
+        block[0+i*8] = b0 + b7;
+        block[7+i*8] = b0 - b7;
+        block[1+i*8] = b2 + b5;
+        block[6+i*8] = b2 - b5;
+        block[2+i*8] = b4 + b3;
+        block[5+i*8] = b4 - b3;
+        block[3+i*8] = b6 + b1;
+        block[4+i*8] = b6 - b1;
+    }
+    for( i = 0; i < 8; i++ )
+    {
+        const int a0 =  block[i+0*8] + block[i+4*8];
+        const int a2 =  block[i+0*8] - block[i+4*8];
+        const int a4 = (block[i+2*8]>>1) - block[i+6*8];
+        const int a6 = (block[i+6*8]>>1) + block[i+2*8];
+
+        const int b0 = a0 + a6;
+        const int b2 = a2 + a4;
+        const int b4 = a2 - a4;
+        const int b6 = a0 - a6;
+
+        const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
+        const int a3 =  block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
+        const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
+        const int a7 =  block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
+
+        const int b1 = (a7>>2) + a1;
+        const int b3 =  a3 + (a5>>2);
+        const int b5 = (a3>>2) - a5;
+        const int b7 =  a7 - (a1>>2);
+			
+		dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ];
+		dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ];
+		dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ];
+		dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ];
+		dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ];
+		dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ];
+		dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ];
+		dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
+	}
+}*/
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_idct_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_idct_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,141 @@
+#ifndef H264_IDCT_SPU_H
+#define H264_IDCT_SPU_H
+
+void h264_idct4_add_spu(uint8_t *dst, short *block, int stride); /* NOTE(review): uses uint8_t, so a header defining it must be included before this one */
+void h264_idct8_add_spu(uint8_t *dst, short *block, int stride); /* NOTE(review): confirm both prototypes still match a live definition */
+
+/***********************************************************************
+ * VEC_1D_DCT
+ ***********************************************************************
+ * One 1-dimensional pass of the 4x4 H264 integer DCT inverse transform:
+ * reads vb0-vb3 and writes va0-va3.
+ * Actually source and destination are 8x4. The low elements of the
+ * source are discarded and the low elements of the destination mustn't
+ * be used. The caller must declare the temporaries __vz0-__vz3.
+ ***********************************************************************/
+#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)				\
+  /* 1st stage */								\
+  __vz0 = spu_add(vb0,vb2);		/* temp[0] = Y[0] + Y[2] 	*/	\
+  __vz1 = spu_sub(vb0,vb2);		/* temp[1] = Y[0] - Y[2] 	*/	\
+  __vz2 = spu_rlmaska(vb1,-1);							\
+  __vz2 = spu_sub(__vz2,vb3);		/* temp[2] = Y[1].1/2 - Y[3] 	*/	\
+  __vz3 = spu_rlmaska(vb3,-1);							\
+  __vz3 = spu_add(vb1,__vz3);		/* temp[3] = Y[1] + Y[3].1/2 	*/	\
+										\
+  /* 2nd stage: output */							\
+  va0 = spu_add(__vz0,__vz3);		/* x[0] = temp[0] + temp[3] 	*/	\
+  va1 = spu_add(__vz1,__vz2);		/* x[1] = temp[1] + temp[2] 	*/	\
+  va2 = spu_sub(__vz1,__vz2);		/* x[2] = temp[1] - temp[2] 	*/  	\
+  va3 = spu_sub(__vz0,__vz3)		/* x[3] = temp[0] - temp[3] 	*/	
+
+/***********************************************************************
+ * VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(p, shift, va, align_dst)
+ ***********************************************************************
+ * Loads the vuint8_t quadword at p, shifts it left by shift bytes, widens
+ * it to vsint16_t and adds it to va (va is modified); va is then clamped to
+ * [0, vmax], packed to vuint8_t, merged with the loaded quadword through the
+ * align_dst shuffle pattern and stored back to p. The caller must declare
+ * vdst_orig, vdst, vdst_ss, sat, va_u8, vfdst and provide vzero, vmax, mergehu8, packu16.
+ **********************************************************************/
+#define VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(p,shift,va,align_dst)	\
+    vdst_orig = *(vuint8_t *) (p);					\
+    vdst = spu_or(spu_slqwbyte(vdst_orig, shift),(vuint8_t) vzero);	\
+    vdst_ss = (vsint16_t) spu_shuffle((vuint8_t)vzero,vdst,mergehu8);	\
+    va = spu_add(va,vdst_ss);						\
+    sat = spu_cmpgt(va,(vsint16_t)vzero);				\
+    va = spu_and(va,(vsint16_t)sat);					\
+    sat = spu_cmpgt(va,vmax);						\
+    va = spu_sel(va,vmax,sat);						\
+    va_u8 = (vuint8_t) spu_shuffle(va,(vsint16_t) vzero,packu16);	\
+    vfdst = spu_shuffle(vdst_orig, va_u8, align_dst);			\
+    *(vuint8_t *) (p) = vfdst
+
+/***********************************************************************
+ * VEC_TRANSPOSE_8(a0..a7, b0..b7)
+ ***********************************************************************
+ * Transposes an 8x8 matrix of s16 vectors: rows a0-a7 in, result in b0-b7. a0-a7 are overwritten as scratch; shuffle patterns m1-m6 must be in scope.
+ **********************************************************************/
+#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
+    b0 = spu_shuffle( a0, a4, m1 ); \
+    b1 = spu_shuffle( a1, a5, m1 ); \
+    b2 = spu_shuffle( a2, a6, m1 ); \
+    b3 = spu_shuffle( a3, a7, m1 ); \
+    b4 = spu_shuffle( a4, a0, m2 ); \
+    b5 = spu_shuffle( a5, a1, m2 ); \
+    b6 = spu_shuffle( a6, a2, m2 ); \
+    b7 = spu_shuffle( a7, a3, m2 ); \
+    a0 = spu_shuffle( b0, b2, m3 ); \
+    a1 = spu_shuffle( b1, b3, m3 ); \
+    a2 = spu_shuffle( b2, b0, m4 ); \
+    a3 = spu_shuffle( b3, b1, m4 ); \
+    a4 = spu_shuffle( b4, b6, m3 ); \
+    a5 = spu_shuffle( b5, b7, m3 ); \
+    a6 = spu_shuffle( b6, b4, m4 ); \
+    a7 = spu_shuffle( b7, b5, m4 ); \
+    b0 = spu_shuffle( a0, a1, m5 ); \
+    b1 = spu_shuffle( a1, a0, m6 ); \
+    b2 = spu_shuffle( a2, a3, m5 ); \
+    b3 = spu_shuffle( a3, a2, m6 ); \
+    b4 = spu_shuffle( a4, a5, m5 ); \
+    b5 = spu_shuffle( a5, a4, m6 ); \
+    b6 = spu_shuffle( a6, a7, m5 ); \
+    b7 = spu_shuffle( a7, a6, m6 )
+
+/***********************************************************************
+ * VEC_1D_DCT8
+ ***********************************************************************
+ * One 1-dimensional pass of the 8x8 H264 integer DCT inverse transform, in place on vb0-vb7. The caller must declare vza0-vza7, vzb0-vzb7, vzal and vzah.
+ ***********************************************************************/
+#define VEC_1D_DCT8(vb0,vb1,vb2,vb3,vb4,vb5,vb6,vb7)						\
+  vza0 = spu_add(vb0,vb4);		/* a[0] = Y[0] + Y[4] 	*/				\
+  vza2 = spu_sub(vb0,vb4);		/* a[2] = Y[0] - Y[4]	*/				\
+  vza4 = spu_rlmaska(vb2,-1);									\
+  vza4 = spu_sub(vza4,vb6);		/* a[4] = (Y[2]>>1) - Y[6]	*/			\
+  vza6 = spu_rlmaska(vb6,-1	);								\
+  vza6 = spu_add(vb2,vza6);		/* a[6] = Y[2]    + (Y[6]>>1)	*/			\
+  												\
+  vzb0 = spu_add(vza0,vza6);		/* b[0] = a[0] + a[6]	*/				\
+  vzb2 = spu_add(vza2,vza4);		/* b[2] = a[2] + a[4]	*/				\
+  vzb4 = spu_sub(vza2,vza4);		/* b[4] = a[2] - a[4]	*/				\
+  vzb6 = spu_sub(vza0,vza6);		/* b[6] = a[0] - a[6]	*/				\
+  												\
+  vza1 = spu_rlmaska(vb7,-1);									\
+  vzal = spu_add(vza1,vb7);									\
+  vzah = spu_sub(vb5,vb3);									\
+  vza1 = spu_sub(vzah,vzal);	/* a1 = (-Y[3] + Y[5]) - (Y[7] + (Y[7]>>1))	*/		\
+  												\
+  vza3 = spu_rlmaska(vb3,-1);									\
+  vzal = spu_add(vza3,vb3);									\
+  vzah = spu_add(vb1,vb7);									\
+  vza3 = spu_sub(vzah,vzal);  	/* a3 =  (Y[1] + Y[7]) - (Y[3] + (Y[3]>>1))	*/		\
+  												\
+  vza5 = spu_rlmaska(vb5,-1);									\
+  vzal = spu_add(vza5,vb5);									\
+  vzah = spu_sub(vb7,vb1);									\
+  vza5 = spu_add(vzah,vzal);	/* a5 = (-Y[1] + Y[7]) + (Y[5] + (Y[5]>>1))	*/		\
+												\
+  vza7 = spu_rlmaska(vb1,-1);									\
+  vzal = spu_add(vza7,vb1);									\
+  vzah = spu_add(vb3,vb5);									\
+  vza7 = spu_add(vzah,vzal);	/* a7 =  (Y[3] + Y[5]) + (Y[1] + (Y[1]>>1))	*/		\
+  												\
+  vzb1 = spu_rlmaska(vza7,-2);									\
+  vzb1 = spu_add(vzb1,vza1);		/* b1 = (a7>>2) + a1	*/				\
+  vzb3 = spu_rlmaska(vza5,-2);									\
+  vzb3 = spu_add(vzb3,vza3);		/* b3 =  a3 + (a5>>2)	*/				\
+  vzb5 = spu_rlmaska(vza3,-2);									\
+  vzb5 = spu_sub(vzb5,vza5);  		/* b5 = (a3>>2) - a5	*/				\
+  vzb7 = spu_rlmaska(vza1,-2);									\
+  vzb7 = spu_sub(vza7,vzb7);		/* b7 =  a7 - (a1>>2)	*/				\
+  												\
+  vb0 = spu_add(vzb0,vzb7); 		/* src[i][0] = b0 + b7	*/				\
+  vb7 = spu_sub(vzb0,vzb7);		/* src[i][7] = b0 - b7	*/				\
+  vb1 = spu_add(vzb2,vzb5);		/* src[i][1] = b2 + b5	*/				\
+  vb6 = spu_sub(vzb2,vzb5);		/* src[i][6] = b2 - b5	*/				\
+  vb2 = spu_add(vzb4,vzb3);		/* src[i][2] = b4 + b3	*/				\
+  vb5 = spu_sub(vzb4,vzb3);		/* src[i][5] = b4 - b3	*/				\
+  vb3 = spu_add(vzb6,vzb1);		/* src[i][3] = b6 + b1	*/				\
+  vb4 = spu_sub(vzb6,vzb1);		/* src[i][4] = b6 - b1	*/
+  
+
+#endif /*H264_IDCT_SPU_H*/
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_intra_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_intra_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,802 @@
+#include "types_spu.h"
+#include "h264_tables.h"
+#include "h264_intra_spu.h"
+#include <assert.h>
+
+void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 vertical prediction: every row becomes a copy of the row above the block */
+    const uint32_t above = *(uint32_t *)(src - stride); /* the four pixels directly above, read as one 32-bit word */
+    int row;
+    (void) topright; /* not used by this mode */
+    for (row = 0; row < 4; row++) {
+        *(uint32_t *)(src + row*stride) = above;
+    }
+}
+
+void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 horizontal prediction: each row is filled with the pixel to its left */
+	(void) topright; /* not used by this mode */
+    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101U; /* unsigned multiplier: pixel*0x01010101 exceeds INT_MAX for pixels >= 128 */
+    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101U;
+    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101U;
+    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101U;
+}
+
+void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 DC prediction from the top row and the left column */
+	(void) topright; /* not used by this mode */
+    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
+                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; /* rounded mean of the 8 neighbours */
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; /* unsigned splat: dc*0x01010101 overflows a signed int for dc >= 128 */
+}
+
+void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 DC prediction from the left column only */
+	(void) topright; /* not used by this mode */
+    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; /* rounded mean of the 4 left pixels */
+
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; /* unsigned splat: dc*0x01010101 overflows a signed int for dc >= 128 */
+}
+
+void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 DC prediction from the top row only */
+	(void) topright; /* not used by this mode */
+    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; /* rounded mean of the 4 top pixels */
+
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; /* unsigned splat: dc*0x01010101 overflows a signed int for dc >= 128 */
+}
+
+void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 DC prediction with no neighbours: fill the block with 128 */
+    const uint32_t grey = 128U*0x01010101U; /* four pixels of value 128 packed in one word */
+    int row;
+    (void) topright; /* not used by this mode */
+    for (row = 3; row >= 0; row--)
+        *(uint32_t *)(src + row*stride) = grey;
+}
+
+
+#define LOAD_TOP_RIGHT_EDGE /* declares t4..t7 from topright[0..3]; needs `topright` in scope */\
+    const int t4= topright[0];\
+    const int t5= topright[1];\
+    const int t6= topright[2];\
+    const int t7= topright[3];\
+
+#define LOAD_LEFT_EDGE /* declares l0..l3: the column left of the block, rows 0..3; needs `src` and `stride` */\
+    const int l0= src[-1+0*stride];\
+    const int l1= src[-1+1*stride];\
+    const int l2= src[-1+2*stride];\
+    const int l3= src[-1+3*stride];\
+
+#define LOAD_TOP_EDGE /* declares t0..t3: the row above the block, columns 0..3; needs `src` and `stride` */\
+    const int t0= src[ 0-1*stride];\
+    const int t1= src[ 1-1*stride];\
+    const int t2= src[ 2-1*stride];\
+    const int t3= src[ 3-1*stride];\
+
+void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){	/* 4x4 diagonal down-right prediction from the left, top-left and top neighbours */
+	(void) topright; /* not used by this mode */
+    const int lt= src[-1-1*stride]; /* top-left corner pixel */
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+
+    /* every pixel on the same down-right diagonal gets the same (a + 2*b + c + 2)>>2 value */
+    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
+    src[0+2*stride]=
+    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
+    src[0+1*stride]=
+    src[1+2*stride]=
+    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
+    src[0+0*stride]=
+    src[1+1*stride]=
+    src[2+2*stride]=
+    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
+    src[1+0*stride]=
+    src[2+1*stride]=
+    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[2+0*stride]=
+    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+}
+
+void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 diagonal down-left prediction from the top and top-right neighbours */
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+    /* the left edge is not read by this mode */
+
+    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; /* pixels with the same x+y share one filtered value */
+    src[1+0*stride]=
+    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
+    src[2+0*stride]=
+    src[1+1*stride]=
+    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
+    src[3+0*stride]=
+    src[2+1*stride]=
+    src[1+2*stride]=
+    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+2*stride]=
+    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
+    src[3+2*stride]=
+    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
+    src[3+3*stride]=(t6 + 3*t7 + 2)>>2; /* last pixel: t7 is weighted three times */
+}
+
+void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 vertical-right prediction from the left, top-left and top neighbours */
+	(void) topright; /* not used by this mode */
+    const int lt= src[-1-1*stride]; /* top-left corner pixel */
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+	(void) l3; /* loaded by LOAD_LEFT_EDGE but not needed here */
+
+    src[0+0*stride]=
+    src[1+2*stride]=(lt + t0 + 1)>>1; /* two-tap averages */
+    src[1+0*stride]=
+    src[2+2*stride]=(t0 + t1 + 1)>>1;
+    src[2+0*stride]=
+    src[3+2*stride]=(t1 + t2 + 1)>>1;
+    src[3+0*stride]=(t2 + t3 + 1)>>1;
+    src[0+1*stride]=
+    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; /* three-tap (1,2,1) filtered values */
+    src[1+1*stride]=
+    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[2+1*stride]=
+    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
+    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+}
+
+void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 vertical-left prediction from the top and top-right neighbours */
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+	(void) t7; /* loaded by LOAD_TOP_RIGHT_EDGE but not needed here */
+
+    src[0+0*stride]=(t0 + t1 + 1)>>1; /* rows 0 and 2: two-tap averages */
+    src[1+0*stride]=
+    src[0+2*stride]=(t1 + t2 + 1)>>1;
+    src[2+0*stride]=
+    src[1+2*stride]=(t2 + t3 + 1)>>1;
+    src[3+0*stride]=
+    src[2+2*stride]=(t3 + t4+ 1)>>1;
+    src[3+2*stride]=(t4 + t5+ 1)>>1;
+    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; /* rows 1 and 3: three-tap (1,2,1) filtered values */
+    src[1+1*stride]=
+    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[2+1*stride]=
+    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
+    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
+}
+
+void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 horizontal-up prediction from the left column only */
+	(void) topright; /* not used by this mode */
+    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=(l0 + l1 + 1)>>1;
+    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+    src[2+0*stride]=
+    src[0+1*stride]=(l1 + l2 + 1)>>1;
+    src[3+0*stride]=
+    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
+    src[2+1*stride]=
+    src[0+2*stride]=(l2 + l3 + 1)>>1;
+    src[3+1*stride]=
+    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; /* l3 stands in for the missing pixel below it */
+    src[3+2*stride]=
+    src[1+3*stride]=
+    src[0+3*stride]=
+    src[2+2*stride]=
+    src[2+3*stride]=
+    src[3+3*stride]=l3; /* the remaining pixels are plain copies of l3 */
+}
+
+void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 horizontal-down prediction from the left, top-left and top neighbours */
+	(void) topright; /* not used by this mode */
+    const int lt= src[-1-1*stride]; /* top-left corner pixel */
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+	(void) t3; /* loaded by LOAD_TOP_EDGE but not needed here */
+
+    src[0+0*stride]=
+    src[2+1*stride]=(lt + l0 + 1)>>1;
+    src[1+0*stride]=
+    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
+    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[0+1*stride]=
+    src[2+2*stride]=(l0 + l1 + 1)>>1;
+    src[1+1*stride]=
+    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
+    src[0+2*stride]=
+    src[2+3*stride]=(l1 + l2+ 1)>>1;
+    src[1+2*stride]=
+    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+    src[0+3*stride]=(l2 + l3 + 1)>>1;
+    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
+}
+
+void ff_pred16x16_vertical_c(uint8_t *src, int stride){ /* 16x16 vertical prediction: every row becomes a copy of the row above the block */
+    int i;
+	const vuint32_t v= *((vuint32_t*)(src-stride)); /* whole row above as one vector; presumably needs src and stride vector-aligned - TODO confirm */
+    for(i=0; i<4; i++){ /* 4 iterations x 4 rows = 16 rows */
+		*((vuint32_t*) src 			 ) =v;
+		*((vuint32_t*)(src +   stride)) =v;
+		*((vuint32_t*)(src + 2*stride)) =v;
+		*((vuint32_t*)(src + 3*stride)) =v;
+		src+= 4*stride;
+    }
+	
+	/* disabled scalar variant of the same copy: const uint32_t a= ((uint32_t*)(src-stride))[0];
+	const uint32_t b= ((uint32_t*)(src-stride))[1];
+	const uint32_t c= ((uint32_t*)(src-stride))[2];
+	const uint32_t d= ((uint32_t*)(src-stride))[3];
+
+	for(i=0; i<16; i++){
+		((uint32_t*)(src+i*stride))[0]= a;
+		((uint32_t*)(src+i*stride))[1]= b;
+		((uint32_t*)(src+i*stride))[2]= c;
+		((uint32_t*)(src+i*stride))[3]= d;
+	}*/
+}
+
+void ff_pred16x16_horizontal_c(uint8_t *src, int stride){ /* 16x16 horizontal prediction: each row is filled with the pixel to its left */
+    int i;
+	
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101U; /* unsigned multiplier: pixel*0x01010101 exceeds INT_MAX for pixels >= 128 */
+    }
+}
+
+void ff_pred16x16_dc_c(uint8_t *src, int stride){ /* 16x16 DC prediction from the left column and the top row */
+    int i;
+	unsigned dc=0; /* unsigned so the splat below cannot overflow a signed int */
+    for(i=0;i<16; i++){
+        dc+= src[-1+i*stride]; /* left column */
+    }
+
+    for(i=0;i<16; i++){
+		dc+= src[i-stride]; /* top row */
+    }
+	dc= 0x01010101U*((dc + 16)>>5); /* rounded mean of the 32 neighbours, replicated into all four bytes */
+    
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= dc;
+    }
+}
+
+void ff_pred16x16_left_dc_c(uint8_t *src, int stride){ /* 16x16 DC prediction from the left column only */
+    int i;
+	
+	unsigned dc=0; /* unsigned so the splat below cannot overflow a signed int */
+    for(i=0;i<16; i++){
+        dc+= src[-1+i*stride];
+    }
+	dc= 0x01010101U*((dc + 8)>>4); /* rounded mean of the 16 left pixels, replicated into all four bytes */
+	
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= dc;
+    }
+}
+
+void ff_pred16x16_top_dc_c(uint8_t *src, int stride){ /* 16x16 DC prediction from the top row only */
+    int i;
+	unsigned dc0=0; /* unsigned so the splat below cannot overflow a signed int */
+    for(i=0;i<16; i++){
+        dc0+= src[i-stride];
+    }
+	
+	dc0= 0x01010101U*((dc0 + 8)>>4); /* rounded mean of the 16 top pixels, replicated into all four bytes */
+	
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= dc0;
+    }
+}
+
+void ff_pred16x16_128_dc_c(uint8_t *src, int stride){ /* 16x16 DC prediction with no neighbours: fill the block with 128 */
+    int i;
+	
+	/*const vuint32_t v= AVV(0x01010101U*128U, 0x01010101U*128U,0x01010101U*128U,0x01010101U*128U);
+	for(i=0; i<4; i++){
+		*((vuint32_t*) src 			  ) =v;
+		*((vuint32_t*)(src +   stride)) =v;
+		*((vuint32_t*)(src + 2*stride)) =v;
+		*((vuint32_t*)(src + 3*stride)) =v;
+		src+= 4*stride;
+	}*/
+	
+    for(i=0; i<16; i++){ /* four 32-bit stores per row; the vector variant above is disabled */
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U; /* 0x80808080: four pixels of value 128 */
+    }
+}
+
+void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){ /* 16x16 plane prediction; svq3 != 0 selects the alternative gradient scaling */
+	int i, j, k;
+	int a;
+	uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* lookup table indexed by the (possibly out-of-range) predicted value */
+	const uint8_t * const src0 = src+7-stride; /* column 7 of the row above the block */
+	const uint8_t *src1 = src+8*stride-1; /* row 8 of the column left of the block */
+	const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
+	int H = src0[1] - src0[-1]; /* horizontal gradient accumulator */
+	int V = src1[0] - src2[ 0]; /* vertical gradient accumulator */
+	for(k=2; k<=8; ++k) { /* src1 walks down and src2 walks up; differences are weighted by distance k */
+		src1 += stride; src2 -= stride;
+		H += k*(src0[k] - src0[-k]);
+		V += k*(src1[0] - src2[ 0]);
+	}
+	if(svq3){
+		H = ( 5*(H/4) ) / 16;
+		V = ( 5*(V/4) ) / 16;
+
+		/* required for 100% accuracy */
+		i = H; H = V; V = i; /* swap H and V */
+	}else{
+		H = ( 5*H+32 ) >> 6;
+		V = ( 5*V+32 ) >> 6;
+	}
+
+	a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); /* after the loop src1[0] is the left pixel of row 15 and src2[16] the top pixel of column 15 */
+	for(j=16; j>0; --j) { /* one row per iteration */
+		int b = a;
+		a += V;
+		for(i=-16; i<0; i+=4) { /* four pixels per iteration */
+		src[16+i] = cm[ (b    ) >> 5 ];
+		src[17+i] = cm[ (b+  H) >> 5 ];
+		src[18+i] = cm[ (b+2*H) >> 5 ];
+		src[19+i] = cm[ (b+3*H) >> 5 ];
+		b += 4*H;
+		}
+		src += stride;
+	}
+}
+
+void ff_pred16x16_plane_c(uint8_t *src, int stride){ /* 16x16 plane prediction: forwards to pred16x16_plane_compat_c */
+    pred16x16_plane_compat_c(src, stride, 0); /* svq3 = 0 */
+}
+
+void ff_pred8x8_vertical_c(uint8_t *src, int stride){ /* 8x8 vertical prediction: every row becomes a copy of the row above the block */
+    const uint32_t left_half  = ((uint32_t *)(src - stride))[0]; /* columns 0..3 of the row above */
+    const uint32_t right_half = ((uint32_t *)(src - stride))[1]; /* columns 4..7 of the row above */
+    int row;
+    for (row = 0; row < 8; row++) {
+        uint32_t *line = (uint32_t *)(src + row*stride);
+        line[0] = left_half;
+        line[1] = right_half;
+    }
+}
+
+void ff_pred8x8_horizontal_c(uint8_t *src, int stride){ /* 8x8 horizontal prediction: each row is filled with the pixel to its left */
+    int i;
+
+    for(i=0; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101U; /* unsigned multiplier: pixel*0x01010101 exceeds INT_MAX for pixels >= 128 */
+    }
+}
+
+void ff_pred8x8_128_dc_c(uint8_t *src, int stride){ /* 8x8 DC prediction with no neighbours: fill the block with 128 */
+    const uint32_t grey = 0x01010101U*128U; /* four pixels of value 128 packed in one word */
+    int row;
+    for (row = 0; row < 8; row++) {
+        uint32_t *line = (uint32_t *)(src + row*stride);
+        line[0] = line[1] = grey;
+    }
+}
+
+void ff_pred8x8_left_dc_c(uint8_t *src, int stride){ /* 8x8 DC prediction from the left column: one DC for rows 0..3, another for rows 4..7 */
+    int i;
+    unsigned dc0, dc2; /* unsigned so the splats below cannot overflow a signed int */
+
+    dc0=dc2=0;
+    for(i=0;i<4; i++){
+        dc0+= src[-1+i*stride]; /* left pixels of rows 0..3 */
+        dc2+= src[-1+(i+4)*stride]; /* left pixels of rows 4..7 */
+    }
+    dc0= 0x01010101U*((dc0 + 2)>>2);
+    dc2= 0x01010101U*((dc2 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= dc0;
+    }
+    for(i=4; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= dc2;
+    }
+}
+
+void ff_pred8x8_top_dc_c(uint8_t *src, int stride){ /* 8x8 DC prediction from the top row: one DC for columns 0..3, another for columns 4..7 */
+    int i;
+    unsigned dc0, dc1; /* unsigned so the splats below cannot overflow a signed int */
+
+    dc0=dc1=0;
+    for(i=0;i<4; i++){
+        dc0+= src[i-stride]; /* top pixels of columns 0..3 */
+        dc1+= src[4+i-stride]; /* top pixels of columns 4..7 */
+    }
+    dc0= 0x01010101U*((dc0 + 2)>>2);
+    dc1= 0x01010101U*((dc1 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc1;
+    }
+    for(i=4; i<8; i++){ /* rows 4..7 receive the same values as rows 0..3 */
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc1;
+    }
+}
+
+
+void ff_pred8x8_dc_c(uint8_t *src, int stride){ /* 8x8 DC prediction with a separate DC value for each 4x4 quadrant */
+    int i;
+    unsigned dc0, dc1, dc2, dc3; /* unsigned so the splats below cannot overflow a signed int */
+
+    dc0=dc1=dc2=0;
+    for(i=0;i<4; i++){
+        dc0+= src[-1+i*stride] + src[i-stride]; /* left rows 0..3 plus top columns 0..3 */
+        dc1+= src[4+i-stride]; /* top columns 4..7 */
+        dc2+= src[-1+(i+4)*stride]; /* left rows 4..7 */
+    }
+    dc3= 0x01010101U*((dc1 + dc2 + 4)>>3); /* must be computed before dc1 and dc2 are overwritten below */
+    dc0= 0x01010101U*((dc0 + 4)>>3);
+    dc1= 0x01010101U*((dc1 + 2)>>2);
+    dc2= 0x01010101U*((dc2 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc1;
+    }
+    for(i=4; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc2;
+        ((uint32_t*)(src+i*stride))[1]= dc3;
+    }
+}
+
+void ff_pred8x8_plane_c(uint8_t *src, int stride){ /* 8x8 plane prediction: fits a linear gradient to the top row and left column */
+  int j, k;
+  int a;
+  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* lookup table indexed by the (possibly out-of-range) predicted value */
+  const uint8_t * const src0 = src+3-stride; /* column 3 of the row above the block */
+  const uint8_t *src1 = src+4*stride-1; /* row 4 of the column left of the block */
+  const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
+  int H = src0[1] - src0[-1]; /* horizontal gradient accumulator */
+  int V = src1[0] - src2[ 0]; /* vertical gradient accumulator */
+  for(k=2; k<=4; ++k) { /* src1 walks down and src2 walks up; differences are weighted by distance k */
+    src1 += stride; src2 -= stride;
+    H += k*(src0[k] - src0[-k]);
+    V += k*(src1[0] - src2[ 0]);
+  }
+  H = ( 17*H+16 ) >> 5;
+  V = ( 17*V+16 ) >> 5;
+
+  a = 16*(src1[0] + src2[8]+1) - 3*(V+H); /* after the loop src1[0] is the left pixel of row 7 and src2[8] the top pixel of column 7 */
+  for(j=8; j>0; --j) { /* one row per iteration */
+    int b = a;
+    a += V;
+    src[0] = cm[ (b    ) >> 5 ];
+    src[1] = cm[ (b+  H) >> 5 ];
+    src[2] = cm[ (b+2*H) >> 5 ];
+    src[3] = cm[ (b+3*H) >> 5 ];
+    src[4] = cm[ (b+4*H) >> 5 ];
+    src[5] = cm[ (b+5*H) >> 5 ];
+    src[6] = cm[ (b+6*H) >> 5 ];
+    src[7] = cm[ (b+7*H) >> 5 ];
+    src += stride;
+  }
+}
+
+
+#define SRC(x,y) src[(x)+(y)*stride] /* pixel at column x, row y relative to src; needs `src` and `stride` in scope */
+#define PL(y) /* declares l<y>: (1,2,1)/4 filtered left-column pixel of row y */ \
+    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_LEFT /* declares l0..l7; l0 uses the top-left pixel only if has_topleft */ \
+    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
+                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
+    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
+    const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
+
+#define PT(x) /* declares t<x>: (1,2,1)/4 filtered top-row pixel of column x */ \
+    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOP /* declares t0..t7; the ends fall back to the nearest top pixel when has_topleft / has_topright is 0 */ \
+    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
+                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
+    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
+    const int t7 = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
+                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
+
+#define PTR(x) /* assigns (does not declare) t<x> with the same filter as PT */ \
+    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOPRIGHT /* declares t8..t15; without a top-right block they all take SRC(7,-1) */ \
+    int t8, t9, t10, t11, t12, t13, t14, t15; \
+    if(has_topright) { \
+        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
+        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
+    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
+
+#define PREDICT_8x8_LOAD_TOPLEFT /* declares lt: filtered top-left corner pixel */ \
+    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
+
+#define PREDICT_8x8_DC(v) /* fills 8 rows with the 32-bit word v (two stores per row); declares y and advances src */ \
+    int y; \
+    for( y = 0; y < 8; y++ ) { \
+        ((uint32_t*)src)[0] = \
+        ((uint32_t*)src)[1] = v; \
+        src += stride; \
+    }
+
+static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 luma DC prediction with no neighbours */
+{
+	(void) has_topright; /* not used by this mode */
+	(void) has_topleft; /* not used by this mode */
+    PREDICT_8x8_DC(0x80808080); /* every pixel becomes 0x80 (128) */
+}
+static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 luma DC prediction from the filtered left column */
+{
+	(void) has_topright; /* not used by this mode */
+    PREDICT_8x8_LOAD_LEFT;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101U; /* unsigned splat: a mean >= 128 would overflow a signed int */
+    PREDICT_8x8_DC(dc);
+}
+static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 luma DC prediction from the filtered top row */
+{
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101U; /* unsigned splat: a mean >= 128 would overflow a signed int */
+    PREDICT_8x8_DC(dc);
+}
+static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 luma DC prediction from the filtered left column and top row */
+{
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
+                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101U; /* unsigned splat: a mean >= 128 would overflow a signed int */
+    PREDICT_8x8_DC(dc);
+}
+static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 luma horizontal prediction: row y is filled with the filtered left pixel l<y> */
+{
+	(void) has_topright; /* not used by this mode */
+    PREDICT_8x8_LOAD_LEFT;
+#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
+               ((uint32_t*)(src+y*stride))[1] = 0x01010101U * l##y
+    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); /* unsigned multiplier in ROW: l<y> >= 128 would overflow a signed int */
+#undef ROW
+}
+static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 luma vertical prediction: every row is the filtered top row */
+{
+    int y;
+    PREDICT_8x8_LOAD_TOP;
+    src[0] = t0; /* row 0 is written pixel by pixel from the filtered top values */
+    src[1] = t1;
+    src[2] = t2;
+    src[3] = t3;
+    src[4] = t4;
+    src[5] = t5;
+    src[6] = t6;
+    src[7] = t7;
+    for( y = 1; y < 8; y++ )
+        *(uint64_t*)(src+y*stride) = *(uint64_t*)src; /* rows 1..7 copy row 0 as a single 64-bit word */
+}
+static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 luma diagonal down-left prediction from the filtered top and top-right rows */
+{
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_TOPRIGHT;
+    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; /* pixels with the same x+y share one (1,2,1)/4 filtered value */
+    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
+    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
+    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
+    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
+    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
+    SRC(7,7)= (t14 + 3*t15 + 2) >> 2; /* last pixel: t15 is weighted three times */
+}
+static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 luma diagonal down-right prediction from the filtered left, top-left and top pixels */
+{
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; /* pixels on the same down-right diagonal share one (1,2,1)/4 filtered value */
+    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
+    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+
+}
+static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 luma vertical-right prediction from the filtered left, top-left and top pixels */
+{
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+	(void) l7; /* declared by PREDICT_8x8_LOAD_LEFT but not needed here */
+    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; /* lines alternate between (1,2,1)/4 filtered values and two-tap averages */
+    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
+    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
+    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
+    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
+    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
+    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
+    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
+    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(7,0)= (t6 + t7 + 1) >> 1;
+}
+static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 intra predictor installed as pred8x8l[HOR_DOWN_PRED]: mostly left-sample based, mixing 2-tap (a+b+1)>>1 and 3-tap (a+2b+c+2)>>2 rounded averages. */
+{
+    PREDICT_8x8_LOAD_TOP; /* macro defined outside this chunk; presumably provides t0..t7 - TODO confirm */
+    PREDICT_8x8_LOAD_LEFT; /* presumably provides l0..l7 - TODO confirm */
+    PREDICT_8x8_LOAD_TOPLEFT; /* presumably provides lt - TODO confirm */
+	(void) t7; /* t7 is never read in this mode; the cast only marks it as intentionally unused */
+    SRC(0,7)= (l6 + l7 + 1) >> 1;
+    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; /* each shared value repeats at (a+2, b+1): two steps in the first index per one in the second */
+    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
+    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
+    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
+    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
+    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
+    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
+    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
+    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; /* the remaining entries use only 3-tap averages of the top samples */
+    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
+    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
+    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
+    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
+    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
+}
+static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 intra predictor installed as pred8x8l[VERT_LEFT_PRED]: built only from t0..t12, alternating 2-tap (a+b+1)>>1 and 3-tap (a+2b+c+2)>>2 rounded averages. */
+{
+    PREDICT_8x8_LOAD_TOP; /* macro defined outside this chunk; presumably provides t0..t7 - TODO confirm */
+    PREDICT_8x8_LOAD_TOPRIGHT; /* presumably provides the t8.. samples read below (up to t12) - TODO confirm */
+    SRC(0,0)= (t0 + t1 + 1) >> 1; /* even second index: 2-tap average */
+    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; /* odd second index: 3-tap average */
+    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; /* each shared value repeats at (a+1, b-2) */
+    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
+    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
+    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
+    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
+    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
+    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; /* first entry that reads a sample beyond t7 */
+    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
+    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
+    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
+    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(7,6)= (t10 + t11 + 1) >> 1;
+    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
+}
+static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride) /* 8x8 intra predictor installed as pred8x8l[HOR_UP_PRED]: built only from l0..l7; entries past the end of the left samples are filled with l7. */
+{
+	(void) has_topright; /* not referenced by the visible body; the cast marks it as intentionally unused */
+    PREDICT_8x8_LOAD_LEFT; /* macro defined outside this chunk; presumably provides l0..l7 - TODO confirm */
+    SRC(0,0)= (l0 + l1 + 1) >> 1; /* even first index: 2-tap rounded average */
+    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; /* odd first index: 3-tap (1,2,1)/4 rounded average */
+    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; /* each shared value repeats at (a+2, b-1) */
+    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
+    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
+    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
+    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
+    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
+    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; /* 3-tap average with l7 standing in for the missing next sample: l6 + 2*l7 + l7 */
+    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= /* the remaining 20 entries are a flat copy of l7 */
+    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
+    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
+    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
+}
+#undef PREDICT_8x8_LOAD_LEFT /* the helper macros used by the pred8x8l_* functions above are removed here so they do not leak into the rest of the file */
+#undef PREDICT_8x8_LOAD_TOP
+#undef PREDICT_8x8_LOAD_TOPLEFT
+#undef PREDICT_8x8_LOAD_TOPRIGHT
+#undef PREDICT_8x8_DC
+#undef PTR
+#undef PT
+#undef PL
+#undef SRC
+
+void init_pred_ptrs(H264PredContext_spu *i){
+    /* Fill the four intra-prediction dispatch tables of *i with the C predictors; every slot is written exactly once. */
+    i->pred16x16[DC_PRED8x8]      = ff_pred16x16_dc_c;
+    i->pred16x16[HOR_PRED8x8]     = ff_pred16x16_horizontal_c;
+    i->pred16x16[VERT_PRED8x8]    = ff_pred16x16_vertical_c;
+    i->pred16x16[PLANE_PRED8x8]   = ff_pred16x16_plane_c;
+    i->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_c;
+    i->pred16x16[TOP_DC_PRED8x8]  = ff_pred16x16_top_dc_c;
+    i->pred16x16[DC_128_PRED8x8]  = ff_pred16x16_128_dc_c;
+
+    i->pred8x8[DC_PRED8x8]      = ff_pred8x8_dc_c;
+    i->pred8x8[HOR_PRED8x8]     = ff_pred8x8_horizontal_c;
+    i->pred8x8[VERT_PRED8x8]    = ff_pred8x8_vertical_c;
+    i->pred8x8[PLANE_PRED8x8]   = ff_pred8x8_plane_c;
+    i->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_c;
+    i->pred8x8[TOP_DC_PRED8x8]  = ff_pred8x8_top_dc_c;
+    i->pred8x8[DC_128_PRED8x8]  = ff_pred8x8_128_dc_c;
+
+    /* The pred8x8l and pred4x4 tables share the *_PRED mode indices (0..11). */
+    i->pred8x8l[VERT_PRED]            = pred8x8l_vertical_c;
+    i->pred8x8l[HOR_PRED]             = pred8x8l_horizontal_c;
+    i->pred8x8l[DC_PRED]              = pred8x8l_dc_c;
+    i->pred8x8l[DIAG_DOWN_LEFT_PRED]  = pred8x8l_down_left_c;
+    i->pred8x8l[DIAG_DOWN_RIGHT_PRED] = pred8x8l_down_right_c;
+    i->pred8x8l[VERT_RIGHT_PRED]      = pred8x8l_vertical_right_c;
+    i->pred8x8l[HOR_DOWN_PRED]        = pred8x8l_horizontal_down_c;
+    i->pred8x8l[VERT_LEFT_PRED]       = pred8x8l_vertical_left_c;
+    i->pred8x8l[HOR_UP_PRED]          = pred8x8l_horizontal_up_c;
+    i->pred8x8l[LEFT_DC_PRED]         = pred8x8l_left_dc_c;
+    i->pred8x8l[TOP_DC_PRED]          = pred8x8l_top_dc_c;
+    i->pred8x8l[DC_128_PRED]          = pred8x8l_128_dc_c;
+
+    i->pred4x4[VERT_PRED]            = pred4x4_vertical_c;
+    i->pred4x4[HOR_PRED]             = pred4x4_horizontal_c;
+    i->pred4x4[DC_PRED]              = pred4x4_dc_c;
+    i->pred4x4[DIAG_DOWN_LEFT_PRED]  = pred4x4_down_left_c;
+    i->pred4x4[DIAG_DOWN_RIGHT_PRED] = pred4x4_down_right_c;
+    i->pred4x4[VERT_RIGHT_PRED]      = pred4x4_vertical_right_c;
+    i->pred4x4[HOR_DOWN_PRED]        = pred4x4_horizontal_down_c;
+    i->pred4x4[VERT_LEFT_PRED]       = pred4x4_vertical_left_c;
+    i->pred4x4[HOR_UP_PRED]          = pred4x4_horizontal_up_c;
+    i->pred4x4[LEFT_DC_PRED]         = pred4x4_left_dc_c;
+    i->pred4x4[TOP_DC_PRED]          = pred4x4_top_dc_c;
+    i->pred4x4[DC_128_PRED]          = pred4x4_128_dc_c;
+
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_intra_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_intra_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,48 @@
+#ifndef H264_INTRA_SPU_H /* include guard */
+#define H264_INTRA_SPU_H
+
+#define MAX_NEG_CROP       1024 /* NOTE(review): not used in this header; meaning must be taken from its users */
+
+// For Intra mode
+#define MB_TYPE_INTRA4x4   0x0001 /* macroblock-type flag bit tested by IS_INTRA4x4() */
+#define IS_INTRA(a)       ((a)&7) /* non-zero when any of the three lowest type bits is set */
+#define IS_INTRA4x4(a)    ((a)&MB_TYPE_INTRA4x4) /* non-zero when the MB_TYPE_INTRA4x4 bit is set */
+
+#define CODEC_FLAG_GRAY   0x2000 /* flag bit; NOTE(review): not used in this header */
+
+#define VERT_PRED             0 /* indices 0..8: slots of the pred4x4[] and pred8x8l[] tables, as filled by init_pred_ptrs() */
+#define HOR_PRED              1
+#define DC_PRED               2
+#define DIAG_DOWN_LEFT_PRED   3
+#define DIAG_DOWN_RIGHT_PRED  4
+#define VERT_RIGHT_PRED       5
+#define HOR_DOWN_PRED         6
+#define VERT_LEFT_PRED        7
+#define HOR_UP_PRED           8
+
+#define LEFT_DC_PRED          9 /* indices 9..11: three additional DC variants, the "+3" in the 9+3 array sizes below */
+#define TOP_DC_PRED           10
+#define DC_128_PRED           11
+
+
+#define DC_PRED8x8            0 /* indices 0..3: slots of the pred8x8[] and pred16x16[] tables; note the numbering differs from the *_PRED set above */
+#define HOR_PRED8x8           1
+#define VERT_PRED8x8          2
+#define PLANE_PRED8x8         3
+
+#define LEFT_DC_PRED8x8       4 /* indices 4..6: three additional DC variants, the "+3" in the 4+3 array sizes below */
+#define TOP_DC_PRED8x8        5
+#define DC_128_PRED8x8        6
+
+typedef struct H264PredContext_spu{ /* dispatch tables of intra-prediction functions; the intra_pred* element types are declared outside this header */
+
+  intra_pred4x4 pred4x4[9+3]; /* indexed by VERT_PRED..DC_128_PRED */
+  intra_pred16x16 pred16x16[4+3]; /* indexed by DC_PRED8x8..DC_128_PRED8x8 */
+  intra_pred8x8 pred8x8[4+3]; /* indexed by DC_PRED8x8..DC_128_PRED8x8 */
+  intra_pred8x8l pred8x8l[9+3]; /* indexed by VERT_PRED..DC_128_PRED */
+
+}H264PredContext_spu;
+
+void init_pred_ptrs(H264PredContext_spu *i); /* fills every slot of the four tables in *i */
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_luma_template_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_luma_template_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1560 @@
+static void PREFIX_h264_qpel16_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { /* Vertical 6-tap (1,-5,20,20,-5,1) filter over h rows of 16 pixels: clip_0_255((20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5), combined with dst through OP_U8_SPU. src rows are STRIDE_Y apart and may be unaligned; dst is accessed as whole quadwords. */
+  
+  register int i;
+
+  const int16_t i20ss= 20;
+  const int16_t i5ss= 5;
+  const int16_t i16ss= 16; /* rounding term added before the >> 5 */
+  const int16_t imax = 255;
+
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v20ss = spu_splats(i20ss);
+  const vsint16_t v5ss = spu_splats(i5ss);
+  const vsint16_t v16ss = spu_splats(i16ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const int shift_src =(unsigned int) src & 15; /* byte offset of src inside its 16-byte quadword */
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; /* shuffle pattern: zero-extend bytes 0..7 to halfwords (0x80 selects a zero byte) */
+  const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; /* same for bytes 8..15 */
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; /* keep the low byte of every halfword of two vectors */
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; /* interleave the low halfwords of the even/odd 32-bit products back into element order */
+
+  uint8_t *srcbis = src - (STRIDE_Y * 2); /* start two rows above the first output row */
+
+  const vuint8_t srcM2a = *(vuint8_t *)(srcbis); /* two consecutive quadword loads cover the 16 (possibly unaligned) source bytes */
+  const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); /* shift the first left by shift_src bytes, the second right by 16-shift_src bytes, and merge */
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcM1a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcP0a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcP1a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcP2a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+
+  vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); /* widen the five preloaded rows to 16-bit lanes: A = pixels 0..7, B = pixels 8..15 */
+  vsint16_t srcM2ssB = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel);
+  vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
+  vsint16_t srcM1ssB = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel);
+  vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
+  vsint16_t srcP0ssB = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel);
+  vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
+  vsint16_t srcP1ssB = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel);
+  vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
+  vsint16_t srcP2ssB = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel);
+
+  for (i = 0 ; i < h ; i++) {
+    const vuint8_t srcP3a = *(vuint8_t *)(srcbis); /* only the newest row (three below the output row) is loaded per iteration */
+    const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16);
+    const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16));
+
+    const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
+    const vsint16_t srcP3ssB = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel);
+    srcbis += STRIDE_Y;
+
+    const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA); /* symmetric tap pairs: sum1 gets weight 20, sum2 weight -5, sum3 weight 1 */
+    const vsint16_t sum1B = spu_add(srcP0ssB, srcP1ssB);
+    const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA);
+    const vsint16_t sum2B = spu_add(srcM1ssB, srcP2ssB);
+    const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA);
+    const vsint16_t sum3B = spu_add(srcM2ssB, srcP3ssB);
+
+    srcM2ssA = srcM1ssA; /* slide the six-row window down by one row for the next iteration */
+    srcM2ssB = srcM1ssB;
+    srcM1ssA = srcP0ssA;
+    srcM1ssB = srcP0ssB;
+    srcP0ssA = srcP1ssA;
+    srcP0ssB = srcP1ssB;
+    srcP1ssA = srcP2ssA;
+    srcP1ssB = srcP2ssB;
+    srcP2ssA = srcP3ssA;
+    srcP2ssB = srcP3ssB;
+
+    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); /* 16x16->32 multiplies of the even and odd lanes; mez keeps the low 16 bits of each product */
+    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
+    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
+    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
+
+    const vsint32_t pp1B1 = spu_mule(sum1B, v20ss);
+    const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss);
+    const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez);
+    const vsint16_t pp1B = spu_add(pp1B3, v16ss);
+
+    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
+    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
+
+    const vsint32_t pp2B1 = spu_mule(sum2B, v5ss);
+    const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss);
+    const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez);
+
+    const vsint16_t pp3A = spu_add(sum3A, pp1A);
+    const vsint16_t pp3B = spu_add(sum3B, pp1B);
+
+    const vsint16_t psumA = spu_sub(pp3A, pp2A); /* 20*sum1 + 16 + sum3 - 5*sum2; can be negative */
+    const vsint16_t psumB = spu_sub(pp3B, pp2B);
+
+    vsint16_t sumA = spu_rlmaska(psumA, -5); /* arithmetic >> 5: a logical shift (spu_rlmask) would turn negative sums into large positive values that saturate to 255 instead of 0 */
+    vsint16_t sumB = spu_rlmaska(psumB, -5);
+
+    //Saturation to 0 and 255
+    sat = spu_cmpgt(sumA,(vsint16_t)vzero); /* all-ones mask where the lane is > 0 */
+    sumA = spu_and(sumA,(vsint16_t)sat); /* lanes <= 0 become 0 */
+    sat = spu_cmpgt(sumA,vmax);
+    sumA = spu_sel(sumA,vmax,sat); /* lanes > 255 become 255 */
+    sat = spu_cmpgt(sumB,(vsint16_t)vzero);
+    sumB = spu_and(sumB,(vsint16_t)sat);
+    sat = spu_cmpgt(sumB,vmax);
+    sumB = spu_sel(sumB,vmax,sat);
+
+    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu); /* narrow the 16 clamped halfwords back to bytes */
+
+    /* 16x16 dest luma blocks are always aligned */
+    const vuint8_t vdst = *(vuint8_t *)dst;
+
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, sum, vdst); /* macro defined outside this chunk; presumably selects put vs. average with the existing dst row - TODO confirm */
+
+    *(vuint8_t *)dst=fsum;
+    
+    dst += dstStride; /* dstStride is assumed to be a multiple of 16, so dst stays quadword-aligned on every row */
+  }
+}
+
+static void PREFIX_h264_qpel16_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { /* Horizontal 6-tap (1,-5,20,20,-5,1) filter over h rows of 16 pixels: clip_0_255((20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5), combined with dst through OP_U8_SPU. Reads src[-2..18] of each row; src rows are STRIDE_Y apart and may be unaligned. */
+
+  register int i;
+  
+  const int16_t i20ss = 20;
+  const int16_t i5ss = 5;
+  const int16_t i16ss = 16; /* rounding term added before the >> 5 */
+  const int16_t imax = 255;
+
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v20ss = spu_splats(i20ss);
+  const vsint16_t v5ss = spu_splats(i5ss);
+  const vsint16_t v16ss = spu_splats(i16ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; /* shuffle pattern: zero-extend bytes 0..7 to halfwords (0x80 selects a zero byte) */
+  const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; /* same for bytes 8..15 */
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; /* keep the low byte of every halfword of two vectors */
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; /* interleave the low halfwords of the even/odd 32-bit products back into element order */
+
+  const int permM2 = (unsigned int) (src-2) & 15; /* byte offset, within a quadword, of each of the six shifted 16-byte windows src-2 .. src+3 */
+  const int permM1 = (unsigned int) (src-1) & 15;
+  const int permP0 = (unsigned int) (src) & 15;
+  const int permP1 = (unsigned int) (src+1) & 15;
+  const int permP2 = (unsigned int) (src+2) & 15;
+  const int permP3 = (unsigned int) (src+3) & 15;
+
+  register int align = ((((unsigned long)src) - 2) % 16); /* equals permM2; constant across rows as long as STRIDE_Y is a multiple of 16 (assumed - TODO confirm) */
+
+  for (i = 0 ; i < h ; i ++) {
+    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+    vuint8_t srcR1 = *(vuint8_t *)(src-2); /* quadword holding src-2 */
+    vuint8_t srcR2 = *(vuint8_t *)(src+14); /* the following quadword */
+
+    switch (align) { /* a window whose offset wraps to 0 is exactly one quadword; the generic two-shift merge would return the earlier quadword for it, hence the special cases */
+    default: { /* align 0..10: all six windows start inside srcR1 and end inside srcR2 */
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
+    } break;
+    case 11: { /* permP3 == 0: the src+3 window is exactly srcR2 */
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = srcR2;
+    } break;
+    case 12: { /* permP2 == 0; the src+3 window spills into a third quadword */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = srcR2;
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 13: { /* permP1 == 0 */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = srcR2;
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 14: { /* permP0 == 0: src itself is quadword-aligned */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = srcR2;
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 15: { /* permM1 == 0 */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = srcR2;
+      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    }
+
+    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); /* widen each window to 16-bit lanes: A = pixels 0..7, B = pixels 8..15 */
+    const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel);
+    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
+    const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel);
+
+    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
+    const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel);
+    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
+    const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel);
+
+    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
+    const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel);
+    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
+    const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel);
+
+    const vsint16_t sum1A = spu_add(srcP0A, srcP1A); /* symmetric tap pairs: sum1 gets weight 20, sum2 weight -5, sum3 weight 1 */
+    const vsint16_t sum1B = spu_add(srcP0B, srcP1B);
+    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
+    const vsint16_t sum2B = spu_add(srcM1B, srcP2B);
+    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
+    const vsint16_t sum3B = spu_add(srcM2B, srcP3B);
+
+    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); /* 16x16->32 multiplies of the even and odd lanes; mez keeps the low 16 bits of each product */
+    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
+    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
+    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
+
+    const vsint32_t pp1B1 = spu_mule(sum1B, v20ss);
+    const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss);
+    const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez);
+    const vsint16_t pp1B = spu_add(pp1B3, v16ss);
+
+    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
+    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
+
+    const vsint32_t pp2B1 = spu_mule(sum2B, v5ss);
+    const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss);
+    const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez);
+
+    const vsint16_t pp3A = spu_add(sum3A, pp1A);
+    const vsint16_t pp3B = spu_add(sum3B, pp1B);
+
+    const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A); /* 20*sum1 + 16 + sum3 - 5*sum2; can be negative */
+    const vsint16_t psumB = spu_sub(pp3B, (vsint16_t)pp2B);
+
+    vsint16_t sumA = spu_rlmaska(psumA, -5); /* arithmetic >> 5: a logical shift (spu_rlmask) would turn negative sums into large positive values that saturate to 255 instead of 0 */
+    vsint16_t sumB = spu_rlmaska(psumB, -5);
+
+    //Saturation to 0 and 255
+    sat = spu_cmpgt(sumA,(vsint16_t)vzero); /* all-ones mask where the lane is > 0 */
+    sumA = spu_and(sumA,(vsint16_t)sat); /* lanes <= 0 become 0 */
+    sat = spu_cmpgt(sumA,vmax);
+    sumA = spu_sel(sumA,vmax,sat); /* lanes > 255 become 255 */
+    sat = spu_cmpgt(sumB,(vsint16_t)vzero);
+    sumB = spu_and(sumB,(vsint16_t)sat);
+    sat = spu_cmpgt(sumB,vmax);
+    sumB = spu_sel(sumB,vmax,sat);
+
+    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu); /* narrow the 16 clamped halfwords back to bytes */
+
+    /* 16x16 dest luma blocks are always aligned */
+    const vuint8_t vdst = *(vuint8_t *)dst;
+
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, sum, vdst); /* macro defined outside this chunk; presumably selects put vs. average with the existing dst row - TODO confirm */
+
+    *(vuint8_t *)dst=fsum;
+    
+    src += STRIDE_Y;
+    dst += dstStride; /* dstStride is assumed to be a multiple of 16, so dst stays quadword-aligned on every row */
+   }
+}
+
+/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
+static void PREFIX_h264_qpel16_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) {
+  register int i;
+
+  const int16_t i20ss = 20;
+  const int16_t i5ss = 5;
+  const int16_t imax = 255;
+
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v20ss = spu_splats(i20ss);
+  const vsint16_t v5ss = spu_splats(i5ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
+  const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F};
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
+
+  const int permM2 = (unsigned int) (src-2) & 15;
+  const int permM1 = (unsigned int) (src-1) & 15;
+  const int permP0 = (unsigned int) (src) & 15;
+  const int permP1 = (unsigned int) (src+1) & 15;
+  const int permP2 = (unsigned int) (src+2) & 15;
+  const int permP3 = (unsigned int) (src+3) & 15;
+
+  register int align = ((((unsigned long)src) - 2) % 16);
+
+  src -= (2 * STRIDE_Y);
+
+  for (i = 0 ; i < (h+5) ; i ++) {
+    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+    vuint8_t srcR1 = *(vuint8_t *)(src-2);
+    vuint8_t srcR2 = *(vuint8_t *)(src+14);
+
+    switch (align) {
+    default: {
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
+    } break;
+    case 11: {
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = srcR2;
+    } break;
+    case 12: {
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = srcR2;
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 13: {
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = srcR2;
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 14: {
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = srcR2;
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 15: {
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = srcR2;
+      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    }
+
+    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
+    const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel);
+    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
+    const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel);
+
+    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
+    const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel);
+    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
+    const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel);
+
+    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
+    const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel);
+    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
+    const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel);
+
+    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
+    const vsint16_t sum1B = spu_add(srcP0B, srcP1B);
+    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
+    const vsint16_t sum2B = spu_add(srcM1B, srcP2B);
+    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
+    const vsint16_t sum3B = spu_add(srcM2B, srcP3B);
+
+    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
+    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
+    const vsint16_t pp1A = spu_add(pp1A3, sum3A);
+
+    const vsint32_t pp1B1 = spu_mule(sum1B, v20ss);
+    const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss);
+    const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez);
+    const vsint16_t pp1B = spu_add(pp1B3, sum3B);
+
+    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
+    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
+
+    const vsint32_t pp2B1 = spu_mule(sum2B, v5ss);
+    const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss);
+    const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez);
+
+    const vsint16_t psumA = spu_sub(pp1A, pp2A);
+    const vsint16_t psumB = spu_sub(pp1B, pp2B);
+
+    *(vsint16_t *)tmp = psumA;
+    *(vsint16_t *)(tmp+8) = psumB;
+
+    src += STRIDE_Y;
+    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
+  }
+
+  const int32_t ni10si = -10;
+  const int16_t i1ss = 1;
+  const int32_t i512si = 512;
+  const int32_t ni16si = -16;
+
+  const vsint32_t nv10si = spu_splats(ni10si);
+  const vsint16_t v1ss = spu_splats(i1ss);
+  const vsint32_t v512si = spu_splats(i512si);
+  const vsint32_t nv16si = spu_splats(ni16si);
+
+  const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F};
+  const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F};
+
+  int16_t *tmpbis = tmp - (tmpStride * (h+5));
+
+  vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis);
+  vsint16_t tmpM2ssB = *(vsint16_t *)(tmpbis+8);
+  tmpbis += tmpStride;
+  vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis);
+  vsint16_t tmpM1ssB = *(vsint16_t *)(tmpbis+8);
+  tmpbis += tmpStride;
+  vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis);
+  vsint16_t tmpP0ssB = *(vsint16_t *)(tmpbis+8);
+  tmpbis += tmpStride;
+  vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis);
+  vsint16_t tmpP1ssB = *(vsint16_t *)(tmpbis+8);
+  tmpbis += tmpStride;
+  vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis);
+  vsint16_t tmpP2ssB = *(vsint16_t *)(tmpbis+8);
+  tmpbis += tmpStride;
+
+  for (i = 0 ; i < h ; i++) {
+    const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis);
+    const vsint16_t tmpP3ssB = *(vsint16_t *)(tmpbis+8);
+    tmpbis += tmpStride;
+
+    const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA);
+    const vsint16_t sum1B = spu_add(tmpP0ssB, tmpP1ssB);
+    const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA);
+    const vsint16_t sum2B = spu_add(tmpM1ssB, tmpP2ssB);
+    const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA);
+    const vsint16_t sum3B = spu_add(tmpM2ssB, tmpP3ssB);
+
+    tmpM2ssA = tmpM1ssA;
+    tmpM2ssB = tmpM1ssB;
+    tmpM1ssA = tmpP0ssA;
+    tmpM1ssB = tmpP0ssB;
+    tmpP0ssA = tmpP1ssA;
+    tmpP0ssB = tmpP1ssB;
+    tmpP1ssA = tmpP2ssA;
+    tmpP1ssB = tmpP2ssB;
+    tmpP2ssA = tmpP3ssA;
+    tmpP2ssB = tmpP3ssB;
+
+    const vsint32_t pp1Ae = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss);
+    const vsint32_t pp1Be = spu_mule(sum1B, v20ss);
+    const vsint32_t pp1Bo = spu_mulo(sum1B, v20ss);
+
+    const vsint32_t pp2Ae = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss);
+    const vsint32_t pp2Be = spu_mule(sum2B, v5ss);
+    const vsint32_t pp2Bo = spu_mulo(sum2B, v5ss);
+
+    const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si);
+    const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss);
+    const vsint32_t pp3Be = spu_rlmask((vsint32_t)sum3B, nv16si);
+    const vsint32_t pp3Bo = spu_mulo(sum3B, v1ss);
+
+    const vsint32_t pp1cAe = spu_add(pp1Ae, v512si);
+    const vsint32_t pp1cAo = spu_add(pp1Ao, v512si);
+    const vsint32_t pp1cBe = spu_add(pp1Be, v512si);
+    const vsint32_t pp1cBo = spu_add(pp1Bo, v512si);
+
+    const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae);
+    const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao);
+    const vsint32_t pp32Be = spu_sub(pp3Be, pp2Be);
+    const vsint32_t pp32Bo = spu_sub(pp3Bo, pp2Bo);
+
+    const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae);
+    const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao);
+    const vsint32_t sumBe = spu_add(pp1cBe, pp32Be);
+    const vsint32_t sumBo = spu_add(pp1cBo, pp32Bo);
+
+    const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si);
+    const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si);
+    const vsint32_t ssumBe = spu_rlmask(sumBe, nv10si);
+    const vsint32_t ssumBo = spu_rlmask(sumBo, nv10si);
+
+    vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, ssumBe, packs);
+    vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, ssumBo, packs);
+
+    //Saturation to 0 and 255
+    sat = spu_cmpgt(ssume,(vsint16_t)vzero);
+    ssume = spu_and(ssume,(vsint16_t)sat);
+    sat = spu_cmpgt(ssume,vmax);
+    ssume = spu_sel(ssume,vmax,sat);
+    sat = spu_cmpgt(ssumo,(vsint16_t)vzero);
+    ssumo = spu_and(ssumo,(vsint16_t)sat);
+    sat = spu_cmpgt(ssumo,vmax);
+    ssumo = spu_sel(ssumo,vmax,sat);
+
+    const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu);
+
+    const vuint8_t sum = spu_shuffle(sumv, sumv, mperm);
+
+    /* 16x16 dest luma blocks are always aligned */
+    const vuint8_t vdst = *(vuint8_t *)dst;
+
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, sum, vdst);
+
+    *(vuint8_t *)dst=fsum;
+    
+    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
+
+  }
+}
+
+static void PREFIX_h264_qpel8_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
+  /* Vertical 6-tap lowpass (1,-5,20,20,-5,1; +16, >>5, clamp to 0..255) over 8 pixels per row for h rows; the result is merged into dst through OP_U8_SPU. */
+  register int i;
+
+  const int16_t i20ss= 20;
+  const int16_t i5ss= 5;
+  const int16_t i16ss= 16;
+  const int16_t imax = 255;
+
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const vsint16_t v20ss = spu_splats(i20ss);
+  const vsint16_t v5ss = spu_splats(i5ss);
+  const vsint16_t v16ss = spu_splats(i16ss);
+  const int shift_src = (unsigned int) src & 15; /* byte offset of src inside its 16-byte quadword */
+
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; /* 0x80 yields a zero byte: widens bytes 0..7 to 16-bit lanes */
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; /* low byte of every halfword of both operands */
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; /* interleaves the low halfwords of the even/odd products back into lane order */
+
+  /* 8x8 dest luma blocks are either aligned or misaligned by 8 bytes */
+  const int shift_dst = (unsigned int) dst & 15;
+  vuint8_t dstmask;
+  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; /* new pixels into bytes 0..7, keep dst bytes 8..15 */
+  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; /* keep dst bytes 0..7, new pixels into bytes 8..15 */
+
+  if(shift_dst==0){
+    dstmask = dst8mask1;
+  }
+  else{
+    dstmask = dst8mask2;
+  }
+
+  uint8_t *srcbis = src - (STRIDE_Y * 2);
+  /* Each row is read as two consecutive quadwords that are shifted and OR-ed so the row starts at byte 0 (SPU quadword loads ignore the low 4 address bits). */
+  const vuint8_t srcM2a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcM1a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcP0a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcP1a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcP2a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+
+  vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
+  vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
+  vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
+  vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
+  vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
+
+  for (i = 0 ; i < h ; i++) {
+    const vuint8_t srcP3a = *(vuint8_t *)(srcbis);
+    const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16);
+    const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16));
+
+    const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
+    srcbis += STRIDE_Y;
+
+    const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA);
+    const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA);
+    const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA);
+    /* slide the five-row window down by one row for the next iteration */
+    srcM2ssA = srcM1ssA;
+    srcM1ssA = srcP0ssA;
+    srcP0ssA = srcP1ssA;
+    srcP1ssA = srcP2ssA;
+    srcP2ssA = srcP3ssA;
+
+    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
+    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
+    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
+
+    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
+    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
+
+    const vsint16_t pp3A = spu_add(sum3A, pp1A);
+    const vsint16_t psumA = spu_sub(pp3A, pp2A);
+    vsint16_t sumA = spu_rlmaska(psumA, -5); /* arithmetic >>5: psumA can be negative, and a logical shift would turn it into a large positive value that clamps to 255 */
+
+    //Saturation to 0 and 255
+    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
+    sumA = spu_and(sumA,(vsint16_t)sat);
+    sat = spu_cmpgt(sumA,vmax);
+    sumA = spu_sel(sumA,vmax,sat);
+
+    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
+
+    const vuint8_t dst1 = *(vuint8_t *)dst;
+
+    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, dsum, dst1);
+
+    *(vuint8_t *)dst=fsum;
+    
+    dst += dstStride; 
+  }
+}
+
+static void PREFIX_h264_qpel8_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
+  /* Horizontal 6-tap lowpass (1,-5,20,20,-5,1; +16, >>5, clamp to 0..255) over 8 pixels per row for h rows; the result is merged into dst through OP_U8_SPU. */
+  register int i;
+  
+  const int16_t i20ss = 20;
+  const int16_t i5ss = 5;
+  const int16_t i16ss = 16;
+  const int16_t imax = 255;
+
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v20ss = spu_splats(i20ss);
+  const vsint16_t v5ss = spu_splats(i5ss);
+  const vsint16_t v16ss = spu_splats(i16ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; /* 0x80 yields a zero byte: widens bytes 0..7 to 16-bit lanes */
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; /* low byte of every halfword of both operands */
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; /* interleaves the low halfwords of the even/odd products back into lane order */
+
+  /* 8x8 dest luma blocks are either aligned or misaligned by 8 bytes */
+  const int shift_dst = (unsigned int) dst & 15;
+  vuint8_t dstmask;
+  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; /* new pixels into bytes 0..7, keep dst bytes 8..15 */
+  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; /* keep dst bytes 0..7, new pixels into bytes 8..15 */
+
+  if(shift_dst==0){
+    dstmask = dst8mask1;
+  }
+  else{
+    dstmask = dst8mask2;
+  }
+  /* byte offset of each of the six tap positions inside its quadword */
+  const int permM2 = (unsigned int) (src-2) & 15;
+  const int permM1 = (unsigned int) (src-1) & 15;
+  const int permP0 = (unsigned int) (src) & 15;
+  const int permP1 = (unsigned int) (src+1) & 15;
+  const int permP2 = (unsigned int) (src+2) & 15;
+  const int permP3 = (unsigned int) (src+3) & 15;
+
+  register int align = ((((unsigned long)src) - 2) % 16); /* offset of src-2 inside its quadword; selects the load pattern below */
+
+  for (i = 0 ; i < h ; i ++) {
+    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+    vuint8_t srcR1 = *(vuint8_t *)(src-2);
+    vuint8_t srcR2 = *(vuint8_t *)(src+14);
+
+    switch (align) {
+    default: { /* align <= 10: all six tap windows start inside srcR1 */
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
+    } break;
+    case 11: { /* src+3 is quadword aligned: srcP3 is srcR2 itself */
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = srcR2;
+    } break;
+    case 12: { /* src+2 is quadword aligned; later taps spill into a third quadword */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = srcR2;
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 13: { /* src+1 is quadword aligned */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = srcR2;
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 14: { /* src is quadword aligned */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = srcR2;
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 15: { /* src-1 is quadword aligned */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = srcR2;
+      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    }
+
+    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
+    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
+
+    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
+    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
+
+    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
+    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
+
+    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
+    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
+    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
+
+    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
+    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
+    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
+
+    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
+    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
+
+    const vsint16_t pp3A = spu_add(sum3A, pp1A);
+
+    const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A);
+    /* arithmetic >>5: psumA can be negative, and a logical shift would turn it into a large positive value that clamps to 255 */
+    vsint16_t sumA = spu_rlmaska(psumA, -5);
+
+    //Saturation to 0 and 255
+    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
+    sumA = spu_and(sumA,(vsint16_t)sat);
+    sat = spu_cmpgt(sumA,vmax);
+    sumA = spu_sel(sumA,vmax,sat);
+
+    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
+
+    const vuint8_t dst1 = *(vuint8_t *)dst;
+
+    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, dsum, dst1);
+
+    *(vuint8_t *)dst=fsum;
+    
+    src += STRIDE_Y;
+    dst += dstStride; /* dstStride is assumed to be a multiple of 16, so dstmask can stay hoisted out of the loop */
+   }
+}
+
+/* 8-wide 2-D 6-tap lowpass (horizontal pass into tmp, then vertical pass into dst). This code assumes stride % 16 == 0 *and* that tmp is properly aligned. */
+static void PREFIX_h264_qpel8_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) {
+  register int i;
+
+  const int16_t i20ss = 20;
+  const int16_t i5ss = 5;
+  const int16_t imax = 255;
+
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v20ss = spu_splats(i20ss);
+  const vsint16_t v5ss = spu_splats(i5ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07}; /* interleaves bytes of the 2nd operand (vzero below) with bytes 0..7 of the 1st: widens to 16-bit lanes */
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; /* low byte of every halfword of both operands */
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; /* interleaves the low halfwords of the even/odd products back into lane order */
+
+  const int permM2 = (unsigned int) (src-2) & 15;
+  const int permM1 = (unsigned int) (src-1) & 15;
+  const int permP0 = (unsigned int) (src) & 15;
+  const int permP1 = (unsigned int) (src+1) & 15;
+  const int permP2 = (unsigned int) (src+2) & 15;
+  const int permP3 = (unsigned int) (src+3) & 15;
+
+  register int align = ((((unsigned long)src) - 2) % 16); /* offset of src-2 inside its quadword; selects the load pattern below */
+
+  src -= (2 * STRIDE_Y);
+  /* Pass 1: horizontal filter over h+5 rows (2 above, 3 below the block); unrounded 16-bit sums are stored to tmp. */
+  for (i = 0 ; i < (h+5) ; i ++) {
+    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+    vuint8_t srcR1 = *(vuint8_t *)(src-2);
+    vuint8_t srcR2 = *(vuint8_t *)(src+14);
+
+    switch (align) {
+    default: { /* align <= 10: all six tap windows start inside srcR1 */
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
+    } break;
+    case 11: { /* src+3 is quadword aligned: srcP3 is srcR2 itself */
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = srcR2;
+    } break;
+    case 12: { /* src+2 is quadword aligned; later taps spill into a third quadword */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = srcR2;
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 13: { /* src+1 is quadword aligned */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = srcR2;
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 14: { /* src is quadword aligned */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = srcR2;
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 15: { /* src-1 is quadword aligned */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = srcR2;
+      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    }
+
+    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh);
+
+    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
+    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
+    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
+
+    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
+    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
+    const vsint16_t pp1A = spu_add(pp1A3, sum3A);
+
+    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
+    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
+
+    const vsint16_t psumA = spu_sub(pp1A, pp2A);
+
+    *(vsint16_t *)tmp = psumA;
+
+    src += STRIDE_Y;
+    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
+  }
+  /* Pass 2: vertical filter over the tmp rows with 32-bit accumulation (+512, >>10), then clamp to 0..255. */
+  const int32_t ni10si = -10;
+  const int16_t i1ss = 1;
+  const int32_t i512si = 512;
+  const int32_t ni16si = -16;
+
+  const vsint32_t nv10si = spu_splats(ni10si);
+  const vsint16_t v1ss = spu_splats(i1ss);
+  const vsint32_t v512si = spu_splats(i512si);
+  const vsint32_t nv16si = spu_splats(ni16si);
+
+  const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; /* re-interleaves the even and odd results into pixel order */
+  const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; /* low halfword of every word of both operands */
+
+  const int shift_dst = (unsigned int) (dst) & 15;
+  /* 8x8 dest luma blocks are either aligned or misaligned by 8 bytes */
+  vuint8_t dstmask;
+  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; /* new pixels into bytes 0..7, keep dst bytes 8..15 */
+  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; /* keep dst bytes 0..7, new pixels into bytes 8..15 */
+
+  if(shift_dst==0){
+    dstmask = dst8mask1;
+  }
+  else{
+    dstmask = dst8mask2;
+  }
+
+  int16_t *tmpbis = tmp - (tmpStride * (h+5)); /* rewind to the first row written by pass 1 */
+
+  vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+  vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+  vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+  vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+  vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+
+  for (i = 0 ; i < h ; i++) {
+    const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis);
+    tmpbis += tmpStride;
+
+    const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA);
+    const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA);
+    const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA);
+
+    tmpM2ssA = tmpM1ssA;
+    tmpM1ssA = tmpP0ssA;
+    tmpP0ssA = tmpP1ssA;
+    tmpP1ssA = tmpP2ssA;
+    tmpP2ssA = tmpP3ssA;
+
+    const vsint32_t pp1Ae = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss);
+    const vsint32_t pp2Ae = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss);
+    /* sum3A must be sign-extended to 32 bits: even lanes by an arithmetic >>16 of each word (a logical shift would add 65536 to negative sums), odd lanes by spu_mulo with 1. */
+    const vsint32_t pp3Ae = spu_rlmaska((vsint32_t)sum3A, nv16si);
+    const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss);
+
+    const vsint32_t pp1cAe = spu_add(pp1Ae, v512si);
+    const vsint32_t pp1cAo = spu_add(pp1Ao, v512si);
+
+    const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae);
+    const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao);
+
+    const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae);
+    const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao);
+    /* The logical >>10 is fine here: it differs from an arithmetic shift only in the top 10 bits, and packs keeps just the low halfword of each word. */
+    const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si);
+    const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si);
+
+    vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs);
+    vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs);
+
+    //Saturation to 0 and 255
+    sat = spu_cmpgt(ssume,(vsint16_t)vzero);
+    ssume = spu_and(ssume,(vsint16_t)sat);
+    sat = spu_cmpgt(ssume,vmax);
+    ssume = spu_sel(ssume,vmax,sat);
+    sat = spu_cmpgt(ssumo,(vsint16_t)vzero);
+    ssumo = spu_and(ssumo,(vsint16_t)sat);
+    sat = spu_cmpgt(ssumo,vmax);
+    ssumo = spu_sel(ssumo,vmax,sat);
+
+    const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu);
+
+    const vuint8_t sum = spu_shuffle(sumv, sumv, mperm);
+
+    const vuint8_t dst1 = *(vuint8_t *)dst;
+
+    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, dsum, dst1);
+
+    *(vuint8_t *)dst=fsum;
+    
+    dst += dstStride; /* dstStride is assumed to be a multiple of 16, so dstmask can stay hoisted out of the loop */
+
+  }
+}
+
+static void PREFIX_h264_qpel4_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
+  /* Vertical 6-tap lowpass (1,-5,20,20,-5,1; +16, >>5, clamp to 0..255) for h rows; 4 result pixels per row are merged into dst through OP_U8_SPU. */
+  register int i;
+
+  const int16_t i20ss= 20;
+  const int16_t i5ss= 5;
+  const int16_t i16ss= 16;
+  const int16_t imax = 255;
+
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v20ss = spu_splats(i20ss);
+  const vsint16_t v5ss = spu_splats(i5ss);
+  const vsint16_t v16ss = spu_splats(i16ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const int shift_src = (unsigned int) src & 15; /* byte offset of src inside its 16-byte quadword */
+
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; /* 0x80 yields a zero byte: widens bytes 0..7 to 16-bit lanes */
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; /* low byte of every halfword of both operands */
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; /* interleaves the low halfwords of the even/odd products back into lane order */
+
+  /* 4x4 dest luma blocks are aligned or misaligned by 4, 8 or 12 bytes */
+  const int shift_dst = (unsigned int) dst & 15;
+  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; /* stays all-zero for any other alignment: the switch below has no default */
+  const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; /* new pixels into bytes 0..3 */
+  const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; /* new pixels into bytes 4..7 */
+  const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; /* new pixels into bytes 8..11 */
+  const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; /* new pixels into bytes 12..15 */
+
+  switch(shift_dst){
+    case 0:  dstmask = dst4mask0;
+             break;
+    case 4:  dstmask = dst4mask4;
+             break;
+    case 8:  dstmask = dst4mask8;
+             break;
+    case 12: dstmask = dst4mask12;
+             break;
+  }
+
+  uint8_t *srcbis = src - (STRIDE_Y * 2);
+  /* Each row is read as two consecutive quadwords that are shifted and OR-ed so the row starts at byte 0 (SPU quadword loads ignore the low 4 address bits). */
+  const vuint8_t srcM2a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcM1a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcP0a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcP1a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+  const vuint8_t srcP2a = *(vuint8_t *)(srcbis);
+  const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16);
+  const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16));
+
+  srcbis += STRIDE_Y;
+
+  vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
+  vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
+  vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
+  vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
+  vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
+
+  for (i = 0 ; i < h ; i++) {
+    const vuint8_t srcP3a = *(vuint8_t *)(srcbis);
+    const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16);
+    const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16));
+
+    const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
+    srcbis += STRIDE_Y;
+
+    const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA);
+    const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA);
+    const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA);
+    /* slide the five-row window down by one row for the next iteration */
+    srcM2ssA = srcM1ssA;
+    srcM1ssA = srcP0ssA;
+    srcP0ssA = srcP1ssA;
+    srcP1ssA = srcP2ssA;
+    srcP2ssA = srcP3ssA;
+
+    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
+    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
+    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
+
+    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
+    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
+
+    const vsint16_t pp3A = spu_add(sum3A, pp1A);
+    const vsint16_t psumA = spu_sub(pp3A, pp2A);
+    vsint16_t sumA = spu_rlmaska(psumA, -5); /* arithmetic >>5: psumA can be negative, and a logical shift would turn it into a large positive value that clamps to 255 */
+
+    //Saturation to 0 and 255
+    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
+    sumA = spu_and(sumA,(vsint16_t)sat);
+    sat = spu_cmpgt(sumA,vmax);
+    sumA = spu_sel(sumA,vmax,sat);
+
+    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
+
+    const vuint8_t dst1 = *(vuint8_t *)dst;
+
+    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, dsum, dst1);
+
+    *(vuint8_t *)dst=fsum;
+    
+    dst += dstStride; 
+  }
+}
+
+static void PREFIX_h264_qpel4_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
+
+  register int i;
+  
+  const int16_t i20ss = 20;
+  const int16_t i5ss = 5;
+  const int16_t i16ss = 16;
+  const int16_t imax = 255;
+
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v20ss = spu_splats(i20ss);
+  const vsint16_t v5ss = spu_splats(i5ss);
+  const vsint16_t v16ss = spu_splats(i16ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+
+  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
+
+  /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/
+  const int shift_dst = (unsigned int) dst & 15;
+  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
+
+  switch(shift_dst){
+    case 0:  dstmask = dst4mask0;
+             break;
+    case 4:  dstmask = dst4mask4;
+             break;
+    case 8:  dstmask = dst4mask8;
+             break;
+    case 12: dstmask = dst4mask12;
+             break;
+  }
+
+  const int permM2 = (unsigned int) (src-2) & 15;
+  const int permM1 = (unsigned int) (src-1) & 15;
+  const int permP0 = (unsigned int) (src) & 15;
+  const int permP1 = (unsigned int) (src+1) & 15;
+  const int permP2 = (unsigned int) (src+2) & 15;
+  const int permP3 = (unsigned int) (src+3) & 15;
+
+  register int align = ((((unsigned long)src) - 2) % 16);
+
+  for (i = 0 ; i < h ; i ++) {
+    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+    vuint8_t srcR1 = *(vuint8_t *)(src-2);
+    vuint8_t srcR2 = *(vuint8_t *)(src+14);
+
+    switch (align) {
+    default: {
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
+    } break;
+    case 11: {
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = srcR2;
+    } break;
+    case 12: {
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = srcR2;
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 13: {
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = srcR2;
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 14: {
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = srcR2;
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 15: {
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = srcR2;
+      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    }
+
+    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
+    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
+
+    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
+    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
+
+    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
+    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
+
+    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
+    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
+    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
+
+    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
+    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
+    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
+
+    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
+    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
+
+    const vsint16_t pp3A = spu_add(sum3A, pp1A);
+
+    const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A);
+    
+    vsint16_t sumA = spu_rlmask(psumA, -5);
+
+    //Saturation to 0 and 255
+    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
+    sumA = spu_and(sumA,(vsint16_t)sat);
+    sat = spu_cmpgt(sumA,vmax);
+    sumA = spu_sel(sumA,vmax,sat);
+
+    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
+
+    const vuint8_t dst1 = *(vuint8_t *)dst;
+
+    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, dsum, dst1);
+
+    *(vuint8_t *)dst=fsum;
+    
+    src += STRIDE_Y;
+    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
+   }
+}
+
+static void PREFIX_h264_qpel4_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) {
+  register int i;
+  /* Two-pass 6-tap filter for a 4-pixel-wide block: h+5 rows of src are filtered horizontally into tmp (16-bit, unrounded), then six consecutive tmp rows are filtered vertically, rounded, clamped to 0..255 and merged into dst with OP_U8_SPU. */
+  const int16_t i20ss = 20;
+  const int16_t i5ss = 5;
+  const int16_t imax = 255;
+  /* v20ss / v5ss are the tap weights; vzero / vmax are the clamp bounds of the final result */
+  const vsint32_t vzero = spu_splats(0);
+  const vsint16_t v20ss = spu_splats(i20ss);
+  const vsint16_t v5ss = spu_splats(i5ss);
+  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
+  vuint16_t sat;
+  /* shuffle patterns: mergeh widens the first 8 bytes to halfwords, packsu keeps the low byte of every halfword, mez interleaves the low halfwords of the even and odd 32-bit products */
+  const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07};
+  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
+  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
+  /* byte offset inside a 16-byte quadword of each of the six taps src-2 .. src+3 */
+  const int permM2 = (unsigned int) (src-2) & 15;
+  const int permM1 = (unsigned int) (src-1) & 15;
+  const int permP0 = (unsigned int) (src) & 15;
+  const int permP1 = (unsigned int) (src+1) & 15;
+  const int permP2 = (unsigned int) (src+2) & 15;
+  const int permP3 = (unsigned int) (src+3) & 15;
+
+  register int align = ((((unsigned long)src) - 2) % 16);
+  /* start two rows above the block; STRIDE_Y is a multiple of 16, so the offsets computed above stay valid for every row */
+  src -= (2 * STRIDE_Y);
+
+  for (i = 0 ; i < (h+5) ; i ++) {
+    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+    vuint8_t srcR1 = *(vuint8_t *)(src-2); /* NOTE(review): src-2 is generally unaligned; presumably the quadword load ignores the low 4 address bits - confirm */
+    vuint8_t srcR2 = *(vuint8_t *)(src+14);
+
+    switch (align) {
+    default: { /* every tap starts inside srcR1 and may spill into srcR2 */
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
+    } break;
+    case 11: { /* src+3 is quadword aligned: P3 is exactly srcR2 */
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
+      srcP3 = srcR2;
+    } break;
+    case 12: { /* src+2 is aligned; P3 starts in srcR2 and spills into srcR3 */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
+      srcP2 = srcR2;
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 13: { /* src+1 is aligned; P2 and P3 start in srcR2 */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
+      srcP1 = srcR2;
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 14: { /* src is aligned; P1..P3 start in srcR2 */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
+      srcP0 = srcR2;
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    case 15: { /* src-1 is aligned; P0..P3 start in srcR2 */
+      vuint8_t srcR3 = *(vuint8_t *)(src+30);
+      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
+      srcM1 = srcR2;
+      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
+      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
+      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
+      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
+    } break;
+    }
+    /* zero-extend the first 8 bytes of each tap vector to 16-bit lanes */
+    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh);
+    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh);
+    /* symmetric tap pairs: inner (weight 20), middle (weight -5), outer (weight 1) */
+    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
+    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
+    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
+    /* 20*sum1 + sum3, with the 32-bit products narrowed back to 16 bits by mez */
+    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
+    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
+    const vsint16_t pp1A = spu_add(pp1A3, sum3A);
+    /* 5*sum2 */
+    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
+    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
+
+    const vsint16_t psumA = spu_sub(pp1A, pp2A);
+    /* the intermediate row is stored without rounding or shifting; the vertical pass normalises once */
+    *(vsint16_t *)tmp = psumA;
+
+    src += STRIDE_Y;
+    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
+  }
+  /* ---- second pass: vertical filter over the rows just written to tmp ---- */
+  const int32_t ni10si = -10;
+  const int16_t i1ss = 1;
+  const int32_t i512si = 512;
+  const int32_t ni16si = -16;
+
+  const vsint32_t nv10si = spu_splats(ni10si);
+  const vsint16_t v1ss = spu_splats(i1ss);
+  const vsint32_t v512si = spu_splats(i512si);
+  const vsint32_t nv16si = spu_splats(ni16si);
+  /* mperm re-interleaves the even/odd results into pixel order; packs keeps the low halfword of each 32-bit lane */
+  const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F};
+  const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F};
+
+  const int shift_dst = (unsigned int) (dst) & 15;
+  /* 4x4 dest luma blocks are aligned or misaligned by 4, 8 or 12; any other offset leaves dstmask all-zero */
+  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
+  const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
+
+  switch(shift_dst){
+    case 0:  dstmask = dst4mask0;
+             break;
+    case 4:  dstmask = dst4mask4;
+             break;
+    case 8:  dstmask = dst4mask8;
+             break;
+    case 12: dstmask = dst4mask12;
+             break;
+  }
+  /* rewind to the first tmp row written by the first pass */
+  int16_t *tmpbis = tmp - (tmpStride * (h+5));
+  /* prime the six-row sliding window with its first five rows */
+  vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+  vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+  vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+  vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+  vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis);
+  tmpbis += tmpStride;
+
+  for (i = 0 ; i < h ; i++) {
+    const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis);
+    tmpbis += tmpStride;
+
+    const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA);
+    const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA);
+    const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA);
+    /* slide the window down by one row for the next iteration */
+    tmpM2ssA = tmpM1ssA;
+    tmpM1ssA = tmpP0ssA;
+    tmpP0ssA = tmpP1ssA;
+    tmpP1ssA = tmpP2ssA;
+    tmpP2ssA = tmpP3ssA;
+    /* work in 32 bits from here on: spu_mule / spu_mulo yield the even / odd halfword products */
+    const vsint32_t pp1Ae = spu_mule(sum1A, v20ss);
+    const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss);
+    const vsint32_t pp2Ae = spu_mule(sum2A, v5ss);
+    const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss);
+    /* sum3 has weight 1: its even halfwords are sign-extended with an arithmetic shift (sum3 can be negative, and a zero-filling shift would widen it as +65536); the odd halfwords are sign-extended by a multiply by 1 */
+    const vsint32_t pp3Ae = spu_rlmaska((vsint32_t)sum3A, nv16si);
+    const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss);
+    /* + 512 is the rounding term for the >> 10 below */
+    const vsint32_t pp1cAe = spu_add(pp1Ae, v512si);
+    const vsint32_t pp1cAo = spu_add(pp1Ao, v512si);
+
+    const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae);
+    const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao);
+
+    const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae);
+    const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao);
+    /* >> 10; only the low 16 bits of each shifted lane are kept by packs below, so a negative sum still reads as negative in the 16-bit clamp */
+    const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si);
+    const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si);
+
+    vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs);
+    vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs);
+
+    //Saturation to 0 and 255
+    sat = spu_cmpgt(ssume,(vsint16_t)vzero);
+    ssume = spu_and(ssume,(vsint16_t)sat);
+    sat = spu_cmpgt(ssume,vmax);
+    ssume = spu_sel(ssume,vmax,sat);
+    sat = spu_cmpgt(ssumo,(vsint16_t)vzero);
+    ssumo = spu_and(ssumo,(vsint16_t)sat);
+    sat = spu_cmpgt(ssumo,vmax);
+    ssumo = spu_sel(ssumo,vmax,sat);
+
+    const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu);
+
+    const vuint8_t sum = spu_shuffle(sumv, sumv, mperm);
+    /* read-modify-write of the destination quadword: dstmask replaces only 4 of its 16 bytes with the new pixels */
+    const vuint8_t dst1 = *(vuint8_t *)dst;
+
+    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
+    vuint8_t fsum;
+    OP_U8_SPU(fsum, dsum, dst1);
+
+    *(vuint8_t *)dst=fsum;
+
+    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
+
+  }
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_mc_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_mc_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2009 TUDelft 
+ * 
+ * Cell Parallel SPU - 2DWave Macroblock Decoding. 
+ */
+
+/**
+ * @file libavcodec/cell/h264_mc_spu.c
+ * Cell Parallel SPU - 2DWave Macroblock Decoding
+ * @author C C Chi <c.c.chi@student.tudelft.nl>
+ * 
+ * SIMD kernels 
+ * H.264/AVC motion compensation
+ * @author Mauricio Alvarez <alvarez@ac.upc.edu>
+ * @author Albert Paradis <apar7632@hotmail.com>
+ */ 
+
+
+#include <stdio.h>
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <assert.h>
+
+#include "h264_mc_spu.h"
+#include "h264_dma.h"
+#include "h264_tables.h"
+#include "h264_decode_mb_spu.h"
+
+
+// Scratch buffers that receive the list-1 prediction during bi-weighted motion compensation
+DECLARE_ALIGNED_16(uint8_t, tmp_y_ls[48*16]);	      		
+DECLARE_ALIGNED_16(uint8_t, tmp_cb_ls[32*8]);
+DECLARE_ALIGNED_16(uint8_t, tmp_cr_ls[32*8]);
+
+// Local buffers for the fetched reference blocks, one per mc_idx (double buffered)
+DECLARE_ALIGNED_16(uint8_t, mc_ref[2][16*(4+5)*48 + 2*16*(2+1)*32]);
+uint8_t* ref_ptr;
+
+/** Motion Compensation functions*/
+
+static void fill_mc_part(H264mc *mc, int n, int chroma_height, int x_offset, int y_offset, int itp, int weight, int list0, int list1){
+	/* Appends one partition descriptor to mc->mc_part[] and increments mc->npart (no capacity check). */
+	H264mc_part *const slot = &mc->mc_part[mc->npart];
+	mc->npart += 1;
+	slot->list1 = list1;
+	slot->list0 = list0;
+	slot->weight = weight;
+	slot->itp = itp;
+	slot->y_offset = y_offset;
+	slot->x_offset = x_offset;
+	slot->chroma_height = chroma_height;
+	slot->n = n;
+}
+
+void calc_mc_params(H264Mb* mb, H264mc *mc){
+	int mb_type = mb->mb_type;
+	mc->npart=0;
+	/* Rebuilds mc->mc_part[] from the partitioning of an inter macroblock. fill_mc_part() arguments: n, chroma_height, x_offset, y_offset, itp, weight, list0, list1. */
+	assert(!IS_INTRA(mb_type));
+	if(IS_16X16(mb_type)){
+		fill_mc_part(mc, 0, 8, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+    }else if(IS_16X8(mb_type)){ /* both halves share weight index 1, just as both 8x16 halves share index 2 */
+		fill_mc_part(mc, 0, 4, 0, 0, 0, 1, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+		fill_mc_part(mc, 8, 4, 0, 4, 0, 1, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
+    }else if(IS_8X16(mb_type)){
+		fill_mc_part(mc, 0, 8, 0, 0, 1, 2, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+		fill_mc_part(mc, 4, 8, 4, 0, 1, 2, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
+    }else{
+        int i;
+        assert(IS_8X8(mb_type));
+        /* four sub-macroblocks, each with its own sub_mb_type; at most 4*4 = 16 partitions in total */
+        for(i=0; i<4; i++){
+            const int sub_mb_type= mb->sub_mb_type[i];
+            const int n= 4*i;
+            int x_offset= (i&1)<<2;
+            int y_offset= (i&2)<<1;
+            /* offsets are 0 or 4 here; mc_part() doubles them for luma, so presumably they count chroma pixels */
+			if(IS_SUB_8X8(sub_mb_type)){
+				fill_mc_part(mc, n, 4, x_offset, y_offset, 1, 3, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+            }else if(IS_SUB_8X4(sub_mb_type)){
+				fill_mc_part(mc, n, 2, x_offset, y_offset, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+				fill_mc_part(mc, n+2, 2, x_offset, y_offset+2, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+            }else if(IS_SUB_4X8(sub_mb_type)){
+				fill_mc_part(mc, n, 4, x_offset, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+				fill_mc_part(mc, n+1, 4, x_offset+2, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+            }else{
+                int j;
+                assert(IS_SUB_4X4(sub_mb_type));
+                for(j=0; j<4; j++){
+                    int sub_x_offset= x_offset + 2*(j&1);
+                    int sub_y_offset= y_offset +   (j&2);
+					fill_mc_part(mc, n+j, 2, sub_x_offset, sub_y_offset, 2, 6, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+                }
+            }
+        }
+    }
+}
+
+/**
+*	Hands out the next `size` bytes at ref_ptr and advances ref_ptr past them (no bounds check).
+*/
+static void* alloc_mc_buf(int size){
+	uint8_t* const chunk = ref_ptr;
+	ref_ptr = chunk + size;
+	return chunk;
+}
+
+#define TAG_OFFSET_MC MBD_mc_buf1
+static uint8_t* get_mc_data(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){
+	/* Reserves blk_h*stride bytes of the reference buffer and calls get_dma_list() to fill them from the
+	 * picture position (pic_xoffset, pic_yoffset), using tag idx + TAG_OFFSET_MC; this function does not
+	 * wait for the transfer. The source address is passed with its low 4 bits cleared, and the returned
+	 * pointer is advanced by those 4 bits so that it designates the requested sample in the local copy. */
+	assert(src_ea);
+	uint8_t* const local = alloc_mc_buf(blk_h*stride);
+	uint8_t* const first_px = src_ea + pic_xoffset + pic_yoffset*linesize;
+	const unsigned ea_bits = (unsigned) first_px;
+	const unsigned dma_base = ea_bits & 0xFFFFFFF0;
+	const int skew = ea_bits & 0xF;
+	get_dma_list(local, (void *)dma_base, stride, blk_h, linesize, idx + TAG_OFFSET_MC, 0);
+	return (local + skew);
+}
+
+static uint8_t* get_mc_data_blocking(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){
+	/* Same fetch as get_mc_data(), but always on tag MBD_mc_buf1 (idx is not used) and followed by
+	 * wait_dma_id() on that tag before returning. The source address is passed to get_dma_list()
+	 * with its low 4 bits cleared; the returned pointer is advanced by those 4 bits so that it
+	 * designates the requested sample inside the local copy. */
+	assert(src_ea);
+	uint8_t* const local = alloc_mc_buf(blk_h*stride);
+	uint8_t* const first_px = src_ea + pic_xoffset + pic_yoffset*linesize;
+	const unsigned ea_bits = (unsigned) first_px;
+	const unsigned dma_base = ea_bits & 0xFFFFFFF0;
+	const int skew = ea_bits & 0xF;
+	get_dma_list(local, (void *)dma_base, stride, blk_h, linesize, MBD_mc_buf1, 0);
+	wait_dma_id(MBD_mc_buf1);
+	return (local + skew);
+}
+
+//#undef TAG_OFFSET_MC
+
+static void get_mc_components(H264Context_spu *h, H264Mb *mb, H264mc_part* mc_part, Picture_spu *pic, int n, int chroma_height, int list, int src_x_offset, int src_y_offset, int idx){
+	assert(pic);
+	H264slice *s = h->s;
+	ref_data *ref = &mc_part->ref[list];
+    const int mx= mb->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
+    const int my= mb->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
+	/* Fetches the luma and both chroma reference blocks of partition n for `list` from pic and stores the resulting local pointers in mc_part->ref[list]. */
+    const int pic_width  = 16*s->mb_width;
+    const int pic_height = 16*s->mb_height;
+	/* luma rows to fetch: twice the chroma height plus 5 extra rows (the hv luma filter reads h+5 rows) */
+	int blk_h= chroma_height*2+5;
+	/* no block width is computed: get_mc_data() is given the row stride (STRIDE_Y / STRIDE_C) instead */
+	/* chroma rows to fetch: the chroma height plus 1 extra row */
+	int blk_h_c= chroma_height+1;
+	/* integer sample positions: the motion vector is shifted right by 2 for luma and by 3 for chroma */
+
+	int ymx= mx>>2;
+    int ymy= my>>2;
+    int cmy= my>>3;
+    int cmx= mx>>3;
+
+    //truncate the motion vectors references
+    if(ymy>= pic_height+2){
+        ymy=pic_height+1;
+    }else if(ymy <=-19){
+        ymy=-18;
+    }
+    if(ymx>= pic_width+2){
+        ymx= pic_width+1;
+    }else if(ymx<=-19){
+        ymx=-18; /* same lower bound as ymy; NOTE(review): presumably sized to the padded border of the reference frames - confirm */
+    }
+	/* chroma positions are limited to -8 .. (size/2)-1 */
+	if(cmy >= pic_height>>1){
+        cmy = (pic_height>>1) -1;
+    }else if(cmy<=-9){
+        cmy=-8;
+    }
+    if(cmx >= pic_width>>1){
+        cmx = (pic_width>>1) -1;
+    }else if(cmx<=-9){
+        cmx=-8;
+    }
+	if (!h->blocking){
+		ref->data[0]=get_mc_data(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx);
+		ref->data[1]=get_mc_data(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
+		ref->data[2]=get_mc_data(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
+	} else {
+		ref->data[0]=get_mc_data_blocking(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx);
+		ref->data[1]=get_mc_data_blocking(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
+		ref->data[2]=get_mc_data_blocking(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
+		/* the luma fetch starts 2 columns and 2 rows early; mc_dir_part() adds 2 + 2*STRIDE_Y back */
+	}
+
+}
+
+static void get_ref_data(H264Context_spu *h, H264Mb *mb, H264mc_part *mc_part, int idx){
+	/* Requests the reference blocks of one partition for every reference list it uses (list 0 first,
+	 * then list 1). The partition offset is combined with 8*mb_x / 8*mb_y to obtain its position in
+	 * the picture before it is passed on to get_mc_components(), which also receives the reference
+	 * picture selected through mb->ref_cache for that list.
+	 */
+	H264slice *slice = h->s;
+	const int blk = mc_part->n;
+	const int rows_c = mc_part->chroma_height;
+	const int pic_x = mc_part->x_offset + 8*mb->mb_x;
+	const int pic_y = mc_part->y_offset + 8*mb->mb_y;
+	const int wanted[2] = { mc_part->list0, mc_part->list1 };
+	int list;
+
+	for(list=0; list<2; list++){
+		Picture_spu *refpic;
+		if(!wanted[list])
+			continue;
+		refpic= &slice->ref_list[list][ mb->ref_cache[list][ scan8[blk] ] ];
+		get_mc_components(h, mb, mc_part, refpic, blk, rows_c, list, pic_x, pic_y, idx);
+	}
+}
+
+void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc){
+	/* Requests the reference data of every partition in mc, placing it in reference buffer h->mc_idx. */
+	const int buf = h->mc_idx;
+	int k;
+	get_list = get_list_buf;
+	ref_ptr = mc_ref[buf];	/* allocation by alloc_mc_buf() restarts at the beginning of that buffer */
+	for(k=0; k<mc->npart; k++){
+		get_ref_data(h, mb, mc->mc_part + k, buf);
+	}
+}
+
+static void mc_dir_part(H264Context_spu *h, H264mc_part* mc_part, int n, int chroma_height, int list, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, int stride_y, int stride_c){
+	/* Predicts one partition from one reference list: luma through the qpix_op entry selected by the two
+	 * low bits of each motion vector component, then Cb and Cr through chroma_op with the three low bits. */
+	ref_data* const planes = &mc_part->ref[list];
+	const int pos = scan8[n];
+	const int mvx = h->mb->mv_cache[list][pos][0];
+	const int mvy = h->mb->mv_cache[list][pos][1];
+	const int luma_sel = (mvx&3) + ((mvy&3)<<2);
+	/* the luma block was fetched starting 2 columns and 2 rows early (see get_mc_components) */
+	uint8_t* const luma = planes->data[0] + 2 + 2*STRIDE_Y;
+	uint8_t* const cb = planes->data[1];
+	uint8_t* const cr = planes->data[2];
+
+	qpix_op[luma_sel](dest_y, luma, stride_y, chroma_height*2);
+	chroma_op(dest_cb, cb, stride_c, chroma_height, mvx&7, mvy&7);
+	chroma_op(dest_cr, cr, stride_c, chroma_height, mvx&7, mvy&7);
+}
+
+
+static void mc_part_biweighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg){
+	/* Bi-predicted partition with weighted prediction: the list-0 prediction is written to the destination,
+	 * the list-1 prediction to the tmp_*_ls scratch buffers, and the weight functions then combine the two. */
+	H264Mb *cur = h->mb;
+	H264slice *slice = h->s;
+	const int blk = mc_part->n;
+	const int rows_c = mc_part->chroma_height;
+	const int tab = mc_part->itp;
+	const int r0 = cur->ref_cache[0][ scan8[blk] ];
+	const int r1 = cur->ref_cache[1][ scan8[blk] ];
+	qpel_mc_func *luma_put = h->dsp.put_h264_qpel_pixels_tab[tab];
+	h264_chroma_mc_func chroma_put = h->dsp.put_h264_chroma_pixels_tab[tab];
+	uint8_t *const dest_c[2] = { dest_cb, dest_cr };
+	uint8_t *const tmp_c[2] = { tmp_cb_ls, tmp_cr_ls };
+	int c;
+
+	// don't optimize for luma-only case, since B-frames usually
+	// use implicit weights => chroma too.
+	mc_dir_part(h, mc_part, blk, rows_c, 0, dest_y, dest_cb, dest_cr, luma_put, chroma_put, stride_y, stride_c);
+	mc_dir_part(h, mc_part, blk, rows_c, 1, tmp_y_ls, tmp_cb_ls, tmp_cr_ls, luma_put, chroma_put, STRIDE_Y, STRIDE_C);
+	if(slice->use_weight == 2){
+		/* use_weight == 2: one weight from implicit_weight for list 0, its complement to 64 for list 1, denominator 5, no offset */
+		const int w0 = slice->implicit_weight[r0][r1][cur->mb_y&1];
+		luma_weight_avg(dest_y, tmp_y_ls, stride_y, STRIDE_Y, 5, w0, 64 - w0, 0);
+		for(c=0; c<2; c++)
+			chroma_weight_avg(dest_c[c], tmp_c[c], stride_c, STRIDE_C, 5, w0, 64 - w0, 0);
+		return;
+	}
+	luma_weight_avg(dest_y, tmp_y_ls, stride_y, STRIDE_Y, slice->luma_log2_weight_denom, slice->luma_weight[r0][0][0], slice->luma_weight[r1][1][0], slice->luma_weight[r0][0][1] + slice->luma_weight[r1][1][1]);
+	for(c=0; c<2; c++)
+		chroma_weight_avg(dest_c[c], tmp_c[c], stride_c, STRIDE_C, slice->chroma_log2_weight_denom, slice->chroma_weight[r0][0][c][0], slice->chroma_weight[r1][1][c][0], slice->chroma_weight[r0][0][c][1] + slice->chroma_weight[r1][1][c][1]);
+}
+
+static void mc_part_weighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, int list1){
+	/* Weighted prediction from a single reference list: the partition is first predicted into the
+	 * destination with the "put" functions, then the destination is rescaled in place with the
+	 * weights stored for the reference index in use. A non-zero list1 selects list 1, otherwise
+	 * list 0 is used.
+	 */
+	H264slice *slice = h->s;
+	const int blk = mc_part->n;
+	const int tab = mc_part->itp;
+	const int list = (list1 != 0);
+	const int refn = h->mb->ref_cache[list][ scan8[blk] ];
+	uint8_t *const dest_c[2] = { dest_cb, dest_cr };
+	int c;
+
+	mc_dir_part(h, mc_part, blk, mc_part->chroma_height, list, dest_y, dest_cb, dest_cr, h->dsp.put_h264_qpel_pixels_tab[tab], h->dsp.put_h264_chroma_pixels_tab[tab], stride_y, stride_c);
+
+	luma_weight_op(dest_y, stride_y, slice->luma_log2_weight_denom, slice->luma_weight[refn][list][0], slice->luma_weight[refn][list][1]);
+	if(!slice->use_weight_chroma)
+		return;
+	/* Cb then Cr */
+	for(c=0; c<2; c++)
+		chroma_weight_op(dest_c[c], stride_c, slice->chroma_log2_weight_denom, slice->chroma_weight[refn][list][c][0], slice->chroma_weight[refn][list][c][1]);
+}
+
+
+static void mc_part_std(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, int list0, int list1){
+	/* Unweighted prediction of one partition. The first list that is used writes ("put") its
+	 * prediction to the destination; when both lists are used, the list-1 prediction is combined
+	 * with it through the "avg" functions instead.
+	 */
+	const int blk = mc_part->n;
+	const int rows_c = mc_part->chroma_height;
+	const int tab = mc_part->itp;
+	qpel_mc_func *luma_put = h->dsp.put_h264_qpel_pixels_tab[tab];
+	h264_chroma_mc_func chroma_put = h->dsp.put_h264_chroma_pixels_tab[tab];
+	/* functions applied to list 1: "avg" only if list 0 has already written the destination */
+	qpel_mc_func *luma_l1 = list0 ? h->dsp.avg_h264_qpel_pixels_tab[tab] : luma_put;
+	h264_chroma_mc_func chroma_l1 = list0 ? h->dsp.avg_h264_chroma_pixels_tab[tab] : chroma_put;
+
+	if(list0)
+		mc_dir_part(h, mc_part, blk, rows_c, 0, dest_y, dest_cb, dest_cr, luma_put, chroma_put, stride_y, stride_c);
+	if(list1)
+		mc_dir_part(h, mc_part, blk, rows_c, 1, dest_y, dest_cb, dest_cr, luma_l1, chroma_l1, stride_y, stride_c);
+}
+
+static void mc_part(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
+	/* Motion-compensates one partition. The destination pointers are first moved to the partition's
+	 * position (the luma offset is twice the stored offset), then one of three paths is taken:
+	 * bi-weighted when both lists are used and use_weight is non-zero, single-list weighted when
+	 * use_weight is 1, and plain put/avg prediction otherwise.
+	 */
+	const int mode = h->s->use_weight;
+	const int widx = mc_part->weight;
+	const int l0 = mc_part->list0;
+	const int l1 = mc_part->list1;
+	uint8_t *out_y  = dest_y  + (2*mc_part->x_offset + 2*mc_part->y_offset*stride_y);
+	uint8_t *out_cb = dest_cb + (  mc_part->x_offset +   mc_part->y_offset*stride_c);
+	uint8_t *out_cr = dest_cr + (  mc_part->x_offset +   mc_part->y_offset*stride_c);
+
+	if(l0 && l1 && mode != 0){
+		h264_biweight_func *blend = &h->dsp.biweight_h264_pixels_tab[widx];
+		mc_part_biweighted(h, mc_part, out_y, out_cb, out_cr, stride_y, stride_c, blend[0], blend[3]);
+		return;
+	}
+	if((l0 || l1) && mode == 1){
+		h264_weight_func *scale = &h->dsp.weight_h264_pixels_tab[widx];
+		mc_part_weighted(h, mc_part, out_y, out_cb, out_cr, stride_y, stride_c, scale[0], scale[3], l1);
+		return;
+	}
+	mc_part_std(h, mc_part, out_y, out_cb, out_cr, stride_y, stride_c, l0, l1);
+}
+
+void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
+	/* Runs motion compensation for every partition recorded in h->mc, in the order they were added. */
+	H264mc *const plan = h->mc;
+	int k;
+	for(k=0; k<plan->npart; k++)
+		mc_part(h, plan->mc_part + k, dest_y, dest_cb, dest_cr, stride_y, stride_c);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_mc_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_mc_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,53 @@
+#ifndef H264_MC_SPU_H
+#define H264_MC_SPU_H
+
+//#include "types_spu.h"
+
+// motion compensation constants:
+#define MB_TYPE_16x16      0x0008
+#define MB_TYPE_16x8       0x0010
+#define MB_TYPE_8x16       0x0020
+#define MB_TYPE_8x8        0x0040
+#define MB_TYPE_P0L0       0x1000
+#define IS_16X16(a)        ((a)&MB_TYPE_16x16)
+#define IS_16X8(a)         ((a)&MB_TYPE_16x8)
+#define IS_8X16(a)         ((a)&MB_TYPE_8x16)
+#define IS_8X8(a)          ((a)&MB_TYPE_8x8)
+#define IS_SUB_8X8(a)      ((a)&MB_TYPE_16x16) //note reused
+#define IS_SUB_8X4(a)      ((a)&MB_TYPE_16x8)  //note reused
+#define IS_SUB_4X8(a)      ((a)&MB_TYPE_8x16)  //note reused
+#define IS_SUB_4X4(a)      ((a)&MB_TYPE_8x8)   //note reused
+#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list))))
+
+#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
+#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
+
+//Motion compensation buffer strides
+#define STRIDE_Y 48 
+#define STRIDE_C 32
+
+typedef struct ref_data{ /* location of the fetched reference block of one partition for one reference list */
+	uint8_t *data[3]; /* pointers into the local reference buffer: [0] luma, [1] Cb, [2] Cr */
+}ref_data;
+
+typedef struct H264mc_part{ /* one motion-compensation partition, as recorded by fill_mc_part() */
+	int n;             /* block index; scan8[n] selects the motion vector and reference index */
+	int chroma_height; /* partition height in chroma rows; the luma height is twice this */
+	int x_offset;      /* horizontal offset inside the macroblock; doubled for luma */
+	int y_offset;      /* vertical offset inside the macroblock; doubled for luma */
+	int itp;           /* index into the put/avg qpel and chroma function tables */
+	int weight;        /* index into the weight / biweight function tables */
+	int list0;         /* non-zero when the partition is predicted from reference list 0 */
+	int list1;         /* non-zero when the partition is predicted from reference list 1 */
+	int use_weight;    /* NOTE(review): not accessed in h264_mc_spu.c - confirm whether it is still needed */
+	ref_data ref[2];   /* fetched reference blocks, one entry per reference list */
+
+}H264mc_part;
+
+typedef struct H264mc{ /* partition list of one macroblock, rebuilt by calc_mc_params() */
+	H264mc_part mc_part[16]; /* at most 16 entries: four sub-macroblocks split into four blocks each */
+	int npart;               /* number of valid entries in mc_part[] */
+}H264mc;
+
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_pred_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_pred_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,90 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 prediction functions.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_H264PRED_H
+#define AVCODEC_H264PRED_H
+
+//#include "libavutil/common.h"
+//#include "dsputil.h"
+
+/**
+ * Prediction types
+ */
+//@{
+#define VERT_PRED             0
+#define HOR_PRED              1
+#define DC_PRED               2
+#define DIAG_DOWN_LEFT_PRED   3
+#define DIAG_DOWN_RIGHT_PRED  4
+#define VERT_RIGHT_PRED       5
+#define HOR_DOWN_PRED         6
+#define VERT_LEFT_PRED        7
+#define HOR_UP_PRED           8
+
+#define LEFT_DC_PRED          9
+#define TOP_DC_PRED           10
+#define DC_128_PRED           11
+
+#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN   12
+#define HOR_UP_PRED_RV40_NODOWN           13
+#define VERT_LEFT_PRED_RV40_NODOWN        14
+
+#define DC_PRED8x8            0
+#define HOR_PRED8x8           1
+#define VERT_PRED8x8          2
+#define PLANE_PRED8x8         3
+
+#define LEFT_DC_PRED8x8       4
+#define TOP_DC_PRED8x8        5
+#define DC_128_PRED8x8        6
+
+#define ALZHEIMER_DC_L0T_PRED8x8 7
+#define ALZHEIMER_DC_0LT_PRED8x8 8
+#define ALZHEIMER_DC_L00_PRED8x8 9
+#define ALZHEIMER_DC_0L0_PRED8x8 10
+//@}
+
+/**
+ * Context for storing H.264 prediction functions
+ */
+typedef struct H264PredContext{ /* arrays of prediction function pointers; the array sizes match the counts of the prediction-type constants defined above */
+    void (*pred4x4  [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
+    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
+    void (*pred8x8  [4+3+4])(uint8_t *src, int stride); /* 11 entries, the same count as the *_PRED8x8 constants 0..10 */
+    void (*pred16x16[4+3])(uint8_t *src, int stride);
+    /* the *_add variants additionally take a coefficient block (and, for 8x8 / 16x16, a block_offset table); NOTE(review): presumably prediction plus residual - confirm */
+    void (*pred4x4_add  [2])(uint8_t *pix/*align  4*/, const DCTELEM *block/*align 16*/, int stride);
+    void (*pred8x8l_add [2])(uint8_t *pix/*align  8*/, const DCTELEM *block/*align 16*/, int stride);
+    void (*pred8x8_add  [3])(uint8_t *pix/*align  8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
+    void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
+}H264PredContext;
+
+void ff_h264_pred_init(H264PredContext *h);
+void ff_h264_pred_init_arm(H264PredContext *h);
+
+
+#endif /* AVCODEC_H264PRED_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_tables.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_tables.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,26 @@
+#include <stdint.h>
+#include "h264_tables.h"
+
+uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP] = {0, }; // clamp lookup table; populated by ff_cropTbl_init()
+
+int block_offset[16+4+4]; // populated by init_block_offset(): entries 0..15 use linesize, 16..19 and 20..23 use uvlinesize
+
+void ff_cropTbl_init(){ /* build the clamp table: index (v + MAX_NEG_CROP) yields v limited to 0..255 */
+    int v, pad;
+    for(pad=0; pad<MAX_NEG_CROP; pad++) {
+        ff_cropTbl[pad] = 0;                          /* guard band below 0 */
+        ff_cropTbl[MAX_NEG_CROP + 256 + pad] = 255;   /* guard band above 255 */
+    }
+    for(v=0; v<256; v++) ff_cropTbl[MAX_NEG_CROP + v] = v; /* identity range */
+}
+
+void init_block_offset(int linesize, int uvlinesize){ /* byte offset of every 4x4 block inside its plane */
+    int blk;
+    for(blk=0; blk<16; blk++){
+        const int col = (scan8[blk] - scan8[0])&7;   /* 4x4 column relative to block 0 */
+        const int row = (scan8[blk] - scan8[0])>>3;  /* 4x4 row relative to block 0 */
+        block_offset[blk] = 4*col + 4*linesize*row;
+        if(blk<4) /* the two chroma groups reuse the layout of the first four blocks, with the chroma stride */
+            block_offset[16+blk] = block_offset[20+blk] = 4*col + 4*uvlinesize*row;
+    }
+}
\ No newline at end of file
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_tables.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_tables.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,83 @@
+#ifndef H264_TABLES_H
+#define H264_TABLES_H
+
+#define MAX_NEG_CROP       1024
+
+extern uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP];
+extern int block_offset[16+4+4];
+
+static const uint8_t scan8[16 + 2*4]={ // per block index: a position written as x + y*8, i.e. a cell in an 8-column grid
+	4+1*8, 5+1*8, 4+2*8, 5+2*8,
+	6+1*8, 7+1*8, 6+2*8, 7+2*8,
+	4+3*8, 5+3*8, 4+4*8, 5+4*8,
+	6+3*8, 7+3*8, 6+4*8, 7+4*8,
+	1+1*8, 2+1*8,
+	1+2*8, 2+2*8,
+	1+4*8, 2+4*8,
+	1+5*8, 2+5*8,
+};
+
+static const uint8_t ff_zigzag_direct[64] = { // zigzag visiting order of an 8x8 block; each entry is a raster index row*8 + col
+    0,   1,  8, 16,  9,  2,  3, 10,
+    17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34,
+    27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36,
+    29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63
+};
+
+static const uint8_t zigzag_scan[16]={ // 4x4 zigzag order; entries are written as x + y*4
+ 0+0*4, 1+0*4, 0+1*4, 0+2*4,
+ 1+1*4, 2+0*4, 3+0*4, 2+1*4,
+ 1+2*4, 0+3*4, 1+3*4, 2+2*4,
+ 3+1*4, 3+2*4, 2+3*4, 3+3*4,
+};
+
+static const uint8_t luma_dc_zigzag_scan[16]={ // entries are written as a*16 + b*64 with a,b in 0..3 (largest value 240 still fits uint8_t)
+ 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
+ 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
+ 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
+ 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
+};
+
+static const uint8_t chroma_dc_scan[4]={ // entries are written as (x + y*2)*16 for x,y in 0..1
+ (0+0*2)*16, (1+0*2)*16,
+ (0+1*2)*16, (1+1*2)*16,  //FIXME
+};
+
+static const uint8_t rem6[52]={ // rem6[q] == q % 6 for q = 0..51
+0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+};
+
+static const uint8_t div6[52]={ // div6[q] == q / 6 for q = 0..51
+0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
+};
+
+static const uint8_t dequant4_coeff_init[6][3]={ // row = rem6[qp]; column = (x&1) + ((x>>2)&1) for 4x4 position x (see init_dequant4_coeff_table)
+  {10,13,16},
+  {11,14,18},
+  {13,16,20},
+  {14,18,23},
+  {16,20,25},
+  {18,23,29},
+};
+
+static const uint8_t dequant8_coeff_init_scan[16] = { // column selector into dequant8_coeff_init, looked up with ((x>>1)&12) | (x&3) for 8x8 position x
+  0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
+};
+static const uint8_t dequant8_coeff_init[6][6]={ // row = rem6[qp]; column comes from dequant8_coeff_init_scan
+  {20,18,32,19,25,24},
+  {22,19,35,21,28,26},
+  {26,23,42,24,33,31},
+  {28,25,45,26,35,33},
+  {32,28,51,30,40,38},
+  {36,32,58,34,46,43},
+};
+
+
+void init_block_offset(int linesize, int uvlinesize);
+void ff_cropTbl_init();
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_types_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_types_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,203 @@
+#ifndef H264_CELL_TYPES_H
+#define H264_CELL_TYPES_H
+
+#include <libsync.h>
+#include <libavcodec/avcodec.h>
+
+typedef struct spe_pos{ // progress record exchanged between SPEs by DMA
+	volatile int count;		//number of mb processed -- NOTE(review): spe_ed.c stores a count of finished MB lines here (total_lines); confirm the unit per user
+	uint32_t pad[3];		// with a 4-byte int this rounds the struct up to 16 bytes -- presumably to satisfy DMA size rules; confirm
+}spe_pos;
+
+// Reduced picture descriptor: only the plane pointers of the full picture struct are needed here.
+typedef struct Picture_spu {
+	uint8_t* data[3]; // one pointer per plane -- presumably Y, Cb, Cr in that order; TODO confirm
+} Picture_spu;
+
+/// For Cell it might be a good idea to use this struct for everything instead.
+// Struct that contains the parameters that change per slice.
+typedef struct H264slice{
+	int deblocking_filter;
+    int linesize;
+    int uvlinesize;
+	int mb_width;
+	int mb_height;
+	// weighted-prediction parameters
+    int use_weight;
+    int use_weight_chroma;
+    int luma_log2_weight_denom;
+    int chroma_log2_weight_denom;
+
+    int16_t luma_weight[16][2][2];
+    int16_t chroma_weight[16][2][2][2];
+    int16_t implicit_weight[16][16][2];
+
+	// ref picture ptr
+    Picture_spu ref_list[2][16]; // two lists of up to 16 reference pictures
+	int state;
+	int emu_edge_width;
+    int emu_edge_height;
+
+    int slice_type;
+	int slice_type_nos;
+	int slice_alpha_c0_offset;
+    int slice_beta_offset;
+
+	uint8_t chroma_qp_table[2][64];
+
+	H264Mb *blocks;
+	uint8_t  *dst_y, *dst_cb, *dst_cr;
+
+    //uint32_t pad[2];		// padding the structure for multiple of 16 bytes (currently disabled)
+}H264slice;
+
+typedef struct 	H264spe{
+#define EDIP 0
+#define EDB  1
+#define MBD  2
+	int type;	// presumably one of EDIP / EDB / MBD defined just above -- TODO confirm
+	int idx;
+	int spe_id;	// spe_ed.c treats id 0 specially in dep_resolved()
+	int spe_total;
+	int mb_width;
+	int mb_stride;
+	int mb_height;
+	int linesize;
+	int uvlinesize;
+	//H264slice* slice_params;
+	void* src_spe;
+	void* tgt_spe;	// spe_ed.c adds a local-store address to this to form the DMA target for progress updates
+	// synchronisation objects (types come from <libsync.h>)
+	mutex_ea_t lock;
+	cond_ea_t cond;
+	atomic_ea_t cnt;
+	// second lock/cond/counter set; NOTE(review): "rl" is not explained in this header -- confirm its meaning with the users
+	mutex_ea_t rl_lock;
+	cond_ea_t rl_cond;
+	atomic_ea_t rl_cnt;
+}H264spe;
+
+typedef struct H264Cabac_spu{
+	int blocking;	// when nonzero, spe_ed.c waits for each DMA right after issuing it instead of overlapping transfers
+
+    int top_cbp;
+    int left_cbp;
+    int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct
+    // dequant tables: *_buffer hold the data as [matrix][qp][coef]; *_coeff point into them and may alias when matrices are equal
+    uint32_t dequant4_buffer[6][52][16];
+    uint32_t dequant8_buffer[2][52][64];
+    uint32_t (*dequant4_coeff[6])[16];
+    uint32_t (*dequant8_coeff[2])[64];
+    // per-line context: each X / X_top pair points at two line buffers that spe_ed.c swaps after every macroblock row
+    uint8_t (*non_zero_count_top)[32];
+	uint8_t (*non_zero_count)[32];
+
+	uint8_t (*mvd_top[2])[2];
+	uint8_t (*mvd[2])[2];
+
+	uint8_t *direct_top;
+	uint8_t *direct;    
+
+	uint8_t *chroma_pred_mode_top;
+	uint8_t *chroma_pred_mode;    
+
+	int8_t  *intra4x4_pred_mode_top;
+    int8_t  *intra4x4_pred_mode;	
+
+	uint16_t *cbp_top;
+	uint16_t *cbp;    
+
+	int8_t *qscale_top;
+	int8_t *qscale;	
+
+	int8_t *ref_index_top[2];
+	int8_t *ref_index[2];
+
+	int16_t (*motion_val_top[2])[2];
+	int16_t (*motion_val[2])[2];
+	uint32_t *mb_type_top;
+	uint32_t *mb_type;
+	// data of the list1 picture, fetched by DMA for B slices only
+	int8_t *list1_ref_index[2];		
+	uint32_t *list1_mb_type;
+	DECLARE_ALIGNED_16(int16_t, list1_motion_val[2][4*4][2]); // fill for a macroblock when required
+
+	int b_stride;	// init_cabac() sets this to 4*mb_width
+	int mb_stride;
+	int mb_width;
+	int mb_height;
+	// transposed copies of the zigzag tables, built once by init_cabac()
+    uint8_t zigzag_scan[16];
+    uint8_t zigzag_scan8x8[64];
+
+    uint8_t direct_cache[5*8];
+    // Used to calculate loopfilter bS.
+    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2];
+    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8];
+    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
+    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2];
+
+} H264Cabac_spu;
+
+typedef struct EDSlice_spu{
+    PPS pps;                 ///< current pps
+    
+    H264Mb *mbs;             // destination array for decoded macroblocks; spe_ed.c DMAs each one to mbs[mb_y*mb_width + mb_x]
+
+    int state;
+    int qp_thresh;      ///< QP threshold to skip loopfilter
+
+	PictureInfo pic;         // current picture: spe_ed.c writes finished mb_type/ref_index/motion_val lines into its arrays
+	PictureInfo list1;       // spe_ed.c reads mb_type/ref_index lines from this for B slices
+//    Picture *ref_list[2][16];         ///Reordered version of default_ref_list according to picture reordering in slice header
+    int ref_count[2];   ///< counts frames or fields, depending on current mb mode
+	int slice_type;
+    int slice_type_nos;
+	int direct_8x8_inference_flag;
+
+    uint8_t list_count;
+    uint32_t coded_pic_num;
+///stuff only needed for nal/entropy decoding
+    H264Mb *m;               // macroblock currently being decoded; set before every ff_h264_decode_mb_cabac() call
+    //GetBitContext gb;
+	const uint8_t *bytestream_start; // start of the slice's CABAC data, handed to ff_init_cabac_decoder()
+	int byte_bufsize;                // its size in bytes
+    int transform_bypass;
+    int direct_spatial_mv_pred;
+    int map_col_to_list0[2][16];
+    int dist_scale_factor[16];
+
+    int cabac_init_idc;
+    int ref2frm[2][64];  ///< reference to frame number lists, the first 2 are for -2,-1
+    int qscale;
+    int chroma_qp[2]; //QPc
+    int last_qscale_diff;
+
+//  Picture* release_ref[MAX_MMCO_COUNT];
+//   int release_cnt;
+
+
+//     int use_weight;
+//     int use_weight_chroma;
+//    int luma_log2_weight_denom;
+//    int chroma_log2_weight_denom;
+
+//     int8_t luma_weight[16][2][2];
+//     int8_t chroma_weight[16][2][2][2];
+//     int8_t implicit_weight[16][16][2];
+
+
+
+//  int slice_alpha_c0_offset;
+//  int slice_beta_offset;
+    
+//    int nal_ref_idc;
+//    int nal_unit_type;
+//     uint8_t *rbsp_buffer;
+//     unsigned int rbsp_buffer_size;
+
+
+
+} EDSlice_spu;
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/mathops_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/mathops_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,137 @@
+/*
+ * simple math operations
+ * Copyright (c) 2001, 2002 Fabrice Bellard
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_MATHOPS_H
+#define AVCODEC_MATHOPS_H
+
+// #include "libavutil/common.h"
+// #include "libavutil/internal.h"
+// 
+// /* generic implementation */
+// 
+// #ifndef MULL
+// #   define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s))
+// #endif
+// 
+// #ifndef MULH
+// //gcc 3.4 creates an incredibly bloated mess out of this
+// //#    define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32)
+// 
+// static av_always_inline int MULH(int a, int b){
+//     return ((int64_t)(a) * (int64_t)(b))>>32;
+// }
+// #endif
+// 
+// #ifndef UMULH
+// static av_always_inline unsigned UMULH(unsigned a, unsigned b){
+//     return ((uint64_t)(a) * (uint64_t)(b))>>32;
+// }
+// #endif
+// 
+// #ifndef MUL64
+// #   define MUL64(a,b) ((int64_t)(a) * (int64_t)(b))
+// #endif
+// 
+// #ifndef MAC64
+// #   define MAC64(d, a, b) ((d) += MUL64(a, b))
+// #endif
+// 
+// #ifndef MLS64
+// #   define MLS64(d, a, b) ((d) -= MUL64(a, b))
+// #endif
+// 
+// /* signed 16x16 -> 32 multiply add accumulate */
+// #ifndef MAC16
+// #   define MAC16(rt, ra, rb) rt += (ra) * (rb)
+// #endif
+// 
+// /* signed 16x16 -> 32 multiply */
+// #ifndef MUL16
+// #   define MUL16(ra, rb) ((ra) * (rb))
+// #endif
+// 
+// #ifndef MLS16
+// #   define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb))
+// #endif
+
+/* median of 3 */
+#ifndef mid_pred
+#define mid_pred mid_pred
+/**
+ * Return the median of three integers.
+ *
+ * Selection is done purely with comparisons, never with arithmetic on the
+ * operands.  (A branchless variant built on sign-masked differences used to
+ * sit here under "#if 0"; it was never compiled.)
+ *
+ * @param a first value
+ * @param b second value
+ * @param c third value
+ * @return the middle value of a, b and c
+ */
+static inline av_const int mid_pred(int a, int b, int c)
+{
+    if(a>b){
+        /* b is the lower of (a,b): it is the median unless c lies above it */
+        if(c<=b)
+            return b;
+        return c>a ? a : c;
+    }
+    /* here a<=b: b is the median unless c lies below it */
+    if(b<=c)
+        return b;
+    return c>a ? c : a;
+}
+#endif
+
+// #ifndef sign_extend
+// static inline av_const int sign_extend(int val, unsigned bits)
+// {
+//     return (val << (INT_BIT - bits)) >> (INT_BIT - bits);
+// }
+// #endif
+// 
+// #ifndef zero_extend
+// static inline av_const unsigned zero_extend(unsigned val, unsigned bits)
+// {
+//     return (val << (INT_BIT - bits)) >> (INT_BIT - bits);
+// }
+// #endif
+// 
+// #ifndef COPY3_IF_LT
+// #define COPY3_IF_LT(x, y, a, b, c, d)\
+// if ((y) < (x)) {\
+//     (x) = (y);\
+//     (a) = (b);\
+//     (c) = (d);\
+// }
+// #endif
+// 
+// #ifndef NEG_SSR32
+// #   define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s)))
+// #endif
+// 
+// #ifndef NEG_USR32
+// #   define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s)))
+// #endif
+
+#endif /* AVCODEC_MATHOPS_H */
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/rectangle_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/rectangle_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,92 @@
+/*
+ * rectangle filling function
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * useful rectangle filling function
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_RECTANGLE_H
+#define AVCODEC_RECTANGLE_H
+
+#include <assert.h>
+
+#define STRIDE_ALIGN 16
+
+
+/**
+ * fill a rectangle.
+ * @param h height of the rectangle, should be a constant
+ * @param w width of the rectangle, should be a constant
+ * @param size the size of val (1, 2 or 4), should be a constant
+ */
+static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
+    uint8_t *p= (uint8_t*)vp;
+    assert(size==1 || size==2 || size==4);
+    assert(w<=4);
+    // from here on w and stride are byte counts rather than element counts
+    w      *= size;
+    stride *= size;
+    // vp must be aligned to the row width in bytes (capped at STRIDE_ALIGN) and stride must be a multiple of that width
+    assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
+    assert((stride&(w-1))==0);
+    if(w==2){ // 2-byte rows: one 16-bit store per row
+        const uint16_t v= size==4 ? val : val*0x0101; // *0x0101 copies a byte value into both bytes
+        *(uint16_t*)(p + 0*stride)= v;
+        if(h==1) return;
+        *(uint16_t*)(p + 1*stride)= v;
+        if(h==2) return;
+        *(uint16_t*)(p + 2*stride)= v;
+        *(uint16_t*)(p + 3*stride)= v;
+    }else if(w==4){ // 4-byte rows: one 32-bit store per row
+        const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101; // replicate val to fill 32 bits
+        *(uint32_t*)(p + 0*stride)= v;
+        if(h==1) return;
+        *(uint32_t*)(p + 1*stride)= v;
+        if(h==2) return;
+        *(uint32_t*)(p + 2*stride)= v;
+        *(uint32_t*)(p + 3*stride)= v;
+    }else if(w==8){ // 8-byte rows: one 64-bit store per row
+        const uint64_t v=  size==2 ? val*0x0001000100010001ULL : val*0x0100000001ULL; // four 16-bit copies, or two 32-bit copies
+        *(uint64_t*)(p + 0*stride)= v;
+        if(h==1) return;
+        *(uint64_t*)(p + 1*stride)= v;
+        if(h==2) return;
+        *(uint64_t*)(p + 2*stride)= v;
+        *(uint64_t*)(p + 3*stride)= v;
+    }else if(w==16){ // 16-byte rows: two 64-bit stores per row
+        const uint64_t v= val*0x0100000001ULL; // val in both 32-bit halves
+        *(uint64_t*)(p + 0+0*stride)= v;
+        *(uint64_t*)(p + 8+0*stride)= v;
+        *(uint64_t*)(p + 0+1*stride)= v; // note: rows 0 and 1 are always written in this branch; h==1 is not special-cased
+        *(uint64_t*)(p + 8+1*stride)= v;
+        if(h==2) return;
+        *(uint64_t*)(p + 0+2*stride)= v;
+        *(uint64_t*)(p + 8+2*stride)= v;
+        *(uint64_t*)(p + 0+3*stride)= v;
+        *(uint64_t*)(p + 8+3*stride)= v;
+    }else
+        assert(0); // any other byte width is unsupported
+    assert(h==4); // only reached when no early return fired, i.e. all four rows were written
+}
+
+#endif /* AVCODEC_RECTANGLE_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/spe_ed.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/spe_ed.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,508 @@
+#define CELL_SPE
+
+#include <string.h>
+#include <stdio.h>
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include "libavcodec/avcodec.h"
+#include "h264_cabac_spu.h"
+#include "cabac_spu.h"
+#include "h264_types_spu.h"
+#include "h264_tables.h"
+#include "h264_dma.h"
+#include "h264_tables.h"
+
+#define MB_WIDTH 240
+#define MB_STRIDE (MB_WIDTH+16)
+
+H264Cabac_spu hcabac;
+CABACContext cabac;
+DECLARE_ALIGNED_16(EDSlice_spu, slice[2]);
+DECLARE_ALIGNED_16(H264Mb, mb[2]);
+DECLARE_ALIGNED_16(H264spe, spe);
+
+DECLARE_ALIGNED_16(uint8_t, non_zero_count_table[2][MB_STRIDE][32]); // first index: which of the two line buffers (top/current, swapped every MB row)
+DECLARE_ALIGNED_16(uint8_t, mvd_table[2][2][8*MB_STRIDE][2]); // indexed [list][line buffer]
+DECLARE_ALIGNED_16(uint8_t, direct_table[2][4*MB_STRIDE]);
+DECLARE_ALIGNED_16(uint8_t, chroma_pred_mode_table[2][MB_STRIDE]);
+DECLARE_ALIGNED_16(uint8_t, intra4x4_pred_mode_table[2][8*MB_STRIDE]);
+DECLARE_ALIGNED_16(uint16_t,cbp_table[2][MB_STRIDE]);
+DECLARE_ALIGNED_16(uint8_t, qscale_table[2][MB_STRIDE]); // users take the address of element 1, so index -1 is addressable
+// the three tables below are also written back to main memory after every MB row (see update_entropy_buf)
+DECLARE_ALIGNED_16(uint32_t, mb_type_table[2][MB_STRIDE]); // users take the address of element 1, so index -1 is addressable
+DECLARE_ALIGNED_16(int8_t, ref_index_table[2][2][4*MB_STRIDE]); // indexed [list][line buffer]
+DECLARE_ALIGNED_16(int16_t, motion_val_table[2][2][4*4*MB_WIDTH][2]); // indexed [list][line buffer]; sized with MB_WIDTH, not MB_STRIDE
+// local-store staging buffer for the CABAC bytestream, filled by DMA in ff_init_cabac_decoder()
+DECLARE_ALIGNED(128, uint8_t, bytestream_ls[4096]);
+DECLARE_ALIGNED_16(uint32_t, list1_mb_type_table[2][MB_STRIDE]);
+DECLARE_ALIGNED_16(int8_t, list1_ref_index_table[2][2][4*MB_STRIDE]); // NOTE: indexed [line buffer][list], the reverse of ref_index_table
+
+DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending
+//mb position of neighbouring spes
+DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1
+static int total_lines;
+
+static inline int dep_resolved(H264spe *p){
+	volatile int lines_proc = src_spe.count; /* progress value last DMAed into our src_spe slot */
+	int limit = lines_proc-1;
+	if (p->spe_id==0)
+		limit += p->mb_height; /* SPE 0 is granted mb_height extra lines of slack */
+	/* 1 while our own line counter is still below the limit, 0 otherwise */
+	return total_lines < limit;
+}
+
+static void update_tgt_spe_dep(H264spe *p, int end){
+	// Count one more finished line and publish the counter to the target SPE. (A former `if (end)` guard around the body is disabled.)
+   total_lines++;
+   spe_pos* dma_spe = &dma_temp;
+   spe_pos* tgt_spe = p->tgt_spe + (unsigned) &src_spe; //located in target spe local store (base p->tgt_spe plus the local address of src_spe; assumes both SPEs share the same layout -- confirm)
+   dma_spe->count = end? total_lines+1: total_lines; // a nonzero `end` publishes one extra line
+   spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), ED_put);
+   // the transfer is only queued here; callers wait on the ED_put tag later
+   
+}
+
+static int init_cabac(H264spe *p, H264Cabac_spu *hc){ /* one-time setup of hc from the SPE descriptor p; always returns 0 */
+	/* picture geometry; b_stride counts four entries per macroblock column */
+	hc->mb_height = p->mb_height;
+	hc->mb_width = p->mb_width;
+	hc->b_stride = 4*p->mb_width;
+	hc->mb_stride = p->mb_stride;
+	/* both zigzag tables are stored transposed: the row and column bits of every index are swapped */
+	for(int i=0; i<16; i++){
+		const int pos = zigzag_scan[i];
+		hc->zigzag_scan[i] = (pos>>2) | ((pos<<2) & 0xF);
+	}
+	for(int i=0; i<64; i++){
+		const int pos = ff_zigzag_direct[i];
+		hc->zigzag_scan8x8[i] = (pos>>3) | ((pos&7)<<3);
+	}
+	return 0; /* the function is declared int but used to fall off the end without a value */
+}
+
+static void reset_cabac_buffers(){ /* zero the per-line context tables; non_zero_count_table and the list1 tables are not touched here */
+	memset(intra4x4_pred_mode_table, 0, sizeof intra4x4_pred_mode_table);
+	memset(chroma_pred_mode_table,   0, sizeof chroma_pred_mode_table);
+	memset(direct_table,             0, sizeof direct_table);
+	memset(mvd_table,                0, sizeof mvd_table);
+	memset(cbp_table,                0, sizeof cbp_table);
+	memset(qscale_table,             0, sizeof qscale_table);
+	memset(mb_type_table,            0, sizeof mb_type_table);
+	memset(ref_index_table,          0, sizeof ref_index_table);
+	memset(motion_val_table,         0, sizeof motion_val_table);
+}
+
+static void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int bufsize){ // start decoding the bufsize bytes at main-memory address buf
+	int align = (unsigned) buf & 0xF; // offset of buf inside its 16-byte line
+	int dma_size;
+	// remember the main-memory extent of the stream (the two *_ea cursors are overwritten again further down)
+	c->bytestream_ea_start=
+	c->bytestream_ea= buf;
+	c->bytestream_ea_end= buf + bufsize;
+	c->bufsize = bufsize;
+	// choose how much to fetch: the whole staging buffer, or the stream rounded up to a multiple of 16 bytes
+	if (bufsize + align >= sizeof(bytestream_ls)){
+		dma_size = sizeof(bytestream_ls);
+		c->bufsize = c->bufsize +align - sizeof(bytestream_ls);				// bytes that did not fit into this first fetch
+	}else{
+		int align_end = (bufsize+align) &0xF;
+		if (align_end)
+			dma_size = bufsize+align + 16-align_end;
+		else
+			dma_size = bufsize+align;
+		c->bufsize = 0; // everything fits, nothing is left to fetch
+	}
+// 	printf("%d\n", dma_size);
+	c->bytestream_end  = &bytestream_ls[dma_size]; 
+	c->bytestream_start= c->bytestream = &bytestream_ls[align]; // first real stream byte inside the staging buffer
+ 	spu_dma_get(bytestream_ls, (unsigned) buf - align, dma_size, ED_get ); // fetch starts at the 16-byte aligned address below buf
+	c->bytestream_ea_start=
+	c->bytestream_ea= buf + dma_size -align; // main-memory address just past the fetched part
+
+	wait_dma_id(ED_get);
+	// prime c->low: three bytes when buf is odd-aligned, two bytes plus the constant 2<<8 otherwise
+	if (align %2){
+		c->low =  (*c->bytestream++)<<18;
+		c->low+=  (*c->bytestream++)<<10;
+		c->low+= ((*c->bytestream++)<<2) + 2;
+	}else {
+		c->low =  (*c->bytestream++)<<18;
+		c->low+=  (*c->bytestream++)<<10;
+		c->low+=  (2<<8); // NOTE(review): presumably leaves c->bytestream 2-byte aligned for later 16-bit refills -- confirm in cabac_spu.h
+	}
+
+	c->range= 0x1FE;
+	bytecount=0; // counter defined outside this function; main() prints it on decode errors
+}
+
+static void init_dequant8_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){ /* build the 8x8 dequant tables for all 52 QPs */
+    const int transpose = HAVE_ALTIVEC;
+    int list, qp, pos;
+    for(list=0; list<2; list++)
+        hc->dequant8_coeff[list] = hc->dequant8_buffer[list];
+    for(list=0; list<2; list++){
+        /* second scaling matrix identical to the first: let table 1 alias table 0 and stop */
+        if(list && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
+            hc->dequant8_coeff[1] = hc->dequant8_buffer[0];
+            break;
+        }
+        for(qp=0; qp<52; qp++){
+            const uint8_t *base = dequant8_coeff_init[rem6[qp]];
+            const int shift = div6[qp];
+            for(pos=0; pos<64; pos++){
+                const int dst = transpose ? (pos>>3)|((pos&7)<<3) : pos;
+                const uint32_t coef = base[ dequant8_coeff_init_scan[((pos>>1)&12) | (pos&3)] ];
+                hc->dequant8_coeff[list][qp][dst] = (coef * s->pps.scaling_matrix8[list][pos]) << shift;
+            }
+        }
+    }
+}
+
+/* Build the six 4x4 dequant tables for all 52 QPs; lists with equal scaling matrices share one buffer. */
+static void init_dequant4_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){
+    const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON;
+    int list, prev, qp, pos;
+    for(list=0; list<6; list++){
+        /* look for the first earlier list with an identical scaling matrix */
+        for(prev=0; prev<list; prev++)
+            if(!memcmp(s->pps.scaling_matrix4[prev], s->pps.scaling_matrix4[list], 16*sizeof(uint8_t)))
+                break;
+        hc->dequant4_coeff[list] = hc->dequant4_buffer[prev]; /* prev == list when nothing matched */
+        if(prev != list)
+            continue;
+        /* no match: fill this list's own buffer */
+        for(qp=0; qp<52; qp++){
+            const uint8_t *base = dequant4_coeff_init[rem6[qp]];
+            const int shift = div6[qp] + 2;
+            for(pos=0; pos<16; pos++){
+                const int dst = transpose ? (pos>>2)|((pos<<2)&0xF) : pos;
+                const uint32_t coef = base[(pos&1) + ((pos>>2)&1)];
+                hc->dequant4_coeff[list][qp][dst] = (coef * s->pps.scaling_matrix4[list][pos]) << shift;
+            }
+        }
+    }
+}
+
+static void init_dequant_tables(EDSlice_spu *s, H264Cabac_spu *hc){ /* (re)build all dequant tables for the slice in s */
+    int list, pos;
+    init_dequant4_coeff_table(s, hc);
+    if(s->pps.transform_8x8_mode)
+        init_dequant8_coeff_table(s, hc);
+    if(!s->transform_bypass)
+        return;
+    for(list=0; list<6; list++) /* bypass: the QP 0 row of every table becomes the constant 1<<6 */
+        for(pos=0; pos<16; pos++)
+            hc->dequant4_coeff[list][0][pos] = 1<<6;
+    if(!s->pps.transform_8x8_mode)
+        return;
+    for(list=0; list<2; list++)
+        for(pos=0; pos<64; pos++)
+            hc->dequant8_coeff[list][0][pos] = 1<<6;
+}
+
+static void init_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s){ // point hc at the line buffers for the first MB row: buffer 0 is "top", buffer 1 is "current"
+	hc->non_zero_count_top 		= non_zero_count_table[0];
+	hc->non_zero_count     		= non_zero_count_table[1];
+	hc->mvd_top[0]				= mvd_table[0][0];
+	hc->mvd[0]					= mvd_table[0][1];
+	hc->mvd_top[1]				= mvd_table[1][0];
+	hc->mvd[1]					= mvd_table[1][1];
+	hc->direct_top		   		= direct_table[0];
+	hc->direct			   		= direct_table[1];
+	hc->chroma_pred_mode_top	= chroma_pred_mode_table[0];
+	hc->chroma_pred_mode  		= chroma_pred_mode_table[1];
+	hc->intra4x4_pred_mode_top	= intra4x4_pred_mode_table[0];
+	hc->intra4x4_pred_mode  	= intra4x4_pred_mode_table[1];
+	hc->cbp_top			   		= cbp_table[0];
+	hc->cbp				   		= cbp_table[1];
+	hc->qscale_top			   	= qscale_table[0] +1;
+	hc->qscale				   	= qscale_table[1] +1;
+	// qscale and mb_type pointers start at element 1, so index -1 stays inside the table
+	hc->mb_type_top 			= mb_type_table[0]+1;
+	hc->mb_type		 			= mb_type_table[1]+1;
+	hc->ref_index_top[0]		= ref_index_table[0][0];
+	hc->ref_index_top[1]		= ref_index_table[1][0];
+	hc->ref_index[0]			= ref_index_table[0][1];
+	hc->ref_index[1]			= ref_index_table[1][1];
+	hc->motion_val_top[0] 		= motion_val_table[0][0];
+	hc->motion_val_top[1] 		= motion_val_table[1][0];
+	hc->motion_val[0] 			= motion_val_table[0][1];
+	hc->motion_val[1] 			= motion_val_table[1][1];
+
+	int mb_stride = hc->mb_stride;
+	// B slices also need list1 data: fetch row 0 and wait for it, then queue the fetch of row 1 without waiting
+	if (s->slice_type_nos == FF_B_TYPE){
+		while(!dep_resolved(&spe)); // busy-wait until dep_resolved() reports that the source SPE is far enough ahead
+		spu_dma_get(list1_mb_type_table[0], (unsigned) (s->list1.mb_type -1), mb_stride*sizeof(uint32_t), ED_get);
+		spu_dma_get(list1_ref_index_table[0][0], (unsigned) s->list1.ref_index[0], mb_stride*4*sizeof(int8_t), ED_get);
+		spu_dma_get(list1_ref_index_table[0][1], (unsigned) s->list1.ref_index[1], mb_stride*4*sizeof(int8_t), ED_get);
+		wait_dma_id(ED_get);
+		spu_dma_get(list1_mb_type_table[1], (unsigned) (s->list1.mb_type -1 + mb_stride), mb_stride*sizeof(uint32_t), ED_get);
+		spu_dma_get(list1_ref_index_table[1][0], (unsigned) (s->list1.ref_index[0] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
+		spu_dma_get(list1_ref_index_table[1][1], (unsigned) (s->list1.ref_index[1] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
+		hc->list1_mb_type = list1_mb_type_table[0]+1;
+		hc->list1_ref_index[0] = list1_ref_index_table[0][0];
+		hc->list1_ref_index[1] = list1_ref_index_table[0][1];
+	}	
+
+}
+
+static void update_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s, int line){ // called after MB row `line` is decoded: swap line buffers and write the finished row out
+	int mb_stride = hc->mb_stride;
+	int mb_width = hc->mb_width;
+	int top = (line+1)%2;	// buffer that holds the row just decoded; it becomes the "top" row for the next one
+	int cur = line%2;		// buffer the next row is decoded into
+	int bottom = (line+1)%2; //same as top, but to identify prebuffering of next line.
+
+	hc->non_zero_count_top 		= non_zero_count_table[top];
+	hc->non_zero_count     		= non_zero_count_table[cur];
+	hc->mvd_top[0]				= mvd_table[0][top];
+	hc->mvd[0]					= mvd_table[0][cur];
+	hc->mvd_top[1]				= mvd_table[1][top];
+	hc->mvd[1]					= mvd_table[1][cur];
+	hc->direct_top		   		= direct_table[top];
+	hc->direct			   		= direct_table[cur];
+	hc->chroma_pred_mode_top	= chroma_pred_mode_table[top];
+	hc->chroma_pred_mode  		= chroma_pred_mode_table[cur];
+	hc->intra4x4_pred_mode_top	= intra4x4_pred_mode_table[top];
+	hc->intra4x4_pred_mode  	= intra4x4_pred_mode_table[cur];
+	hc->cbp_top			   		= cbp_table[top];
+	hc->cbp				   		= cbp_table[cur];
+	hc->qscale_top			   	= qscale_table[top] +1;
+	hc->qscale				   	= qscale_table[cur] +1;
+
+	hc->mb_type_top 			= mb_type_table[top]+1;
+	hc->mb_type		 			= mb_type_table[cur]+1;
+	hc->ref_index_top[0]		= ref_index_table[0][top];
+	hc->ref_index_top[1]		= ref_index_table[1][top];
+	hc->ref_index[0]			= ref_index_table[0][cur];
+	hc->ref_index[1]			= ref_index_table[1][cur];
+	hc->motion_val_top[0] 		= motion_val_table[0][top];
+	hc->motion_val_top[1] 		= motion_val_table[1][top];
+	hc->motion_val[0] 			= motion_val_table[0][cur];
+	hc->motion_val[1] 			= motion_val_table[1][cur];
+	// the previous call's puts read from the buffer that is now `cur`; they must be complete before it is overwritten
+	wait_dma_id(ED_put);
+	// write the finished row of mb_type / ref_index / motion_val into the current picture's arrays at row `line`
+	spu_dma_put(mb_type_table[top], (unsigned) (s->pic.mb_type -1 + line*mb_stride), mb_stride*sizeof(uint32_t), ED_put);
+	spu_dma_put(ref_index_table[0][top], (unsigned) (s->pic.ref_index[0] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put);
+	spu_dma_put(ref_index_table[1][top], (unsigned) (s->pic.ref_index[1] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put);
+	spu_dma_put(motion_val_table[0][top], (unsigned) (s->pic.motion_val[0]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put);
+	spu_dma_put(motion_val_table[1][top], (unsigned) (s->pic.motion_val[1]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put);
+
+	if (s->slice_type_nos == FF_B_TYPE){
+		update_tgt_spe_dep(&spe, 0); // publish our progress to the target SPE
+		wait_dma_id(ED_get); // the list1 row prefetched earlier must have arrived before hc is pointed at it
+		// prefetch list1 row line+2 into the buffer whose row has just been consumed
+		if (line + 2 < hc->mb_height){
+			while(!dep_resolved(&spe));
+			spu_dma_get(list1_mb_type_table[cur], (unsigned) (s->list1.mb_type -1 + (line+2)*mb_stride), mb_stride*sizeof(uint32_t), ED_get);
+			spu_dma_get(list1_ref_index_table[cur][0], (unsigned) (s->list1.ref_index[0] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
+			spu_dma_get(list1_ref_index_table[cur][1], (unsigned) (s->list1.ref_index[1] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
+		}
+		hc->list1_mb_type = list1_mb_type_table[bottom]+1;
+		hc->list1_ref_index[0] = list1_ref_index_table[bottom][0];
+		hc->list1_ref_index[1] = list1_ref_index_table[bottom][1];
+	}
+
+}
+
+// void printmbdiff(EDSlice_spu *s, H264Cabac_spu *hc, H264Mb *mp, H264Mb *ms){
+// 
+// 	printf("mb_x %d, %d\n", mp->mb_x, ms->mb_x);
+// 	printf("mb_y %d, %d\n", mp->mb_y, ms->mb_y);
+// 	printf("mb_xy %d, %d\n", mp->mb_xy, ms->mb_xy);
+// 	printf("top_mb_xy %d, %d\n", mp->top_mb_xy, ms->top_mb_xy);
+// 	printf("left_mb_xy %d, %d\n", mp->left_mb_xy, ms->left_mb_xy);
+// 	printf("chroma_pred_mode %d, %d\n", mp->chroma_pred_mode, ms->chroma_pred_mode);
+// 	printf("intra16x16_pred_mode %d, %d\n", mp->intra16x16_pred_mode, ms->intra16x16_pred_mode);
+// 	printf("topleft_samples %d, %d\n", mp->topleft_samples_available, ms->topleft_samples_available);
+// 	printf("topright_samples %d, %d\n", mp->topright_samples_available, ms->topright_samples_available);
+// 	printf("top_samples %d, %d\n", mp->top_samples_available, ms->top_samples_available);
+// 	printf("left_samples %d, %d\n", mp->left_samples_available, ms->left_samples_available);
+// 
+// 	if (memcmp(mp->intra4x4_pred_mode_cache, ms->intra4x4_pred_mode_cache, 40)){
+// 		for (int i=0; i<5; i++){
+// 			for (int j=0; j<8; j++){
+// 				printf("%d, %d\t", mp->intra4x4_pred_mode_cache[i*8+j],ms->intra4x4_pred_mode_cache[i*8+j]);
+// 			}
+// 			printf("\n");
+// 		}
+// 	}
+// 
+// 	if (memcmp(mp->non_zero_count_cache, ms->non_zero_count_cache, 48)){
+// 		for (int i=0; i<6; i++){
+// 			for (int j=0; j<8; j++){
+// 				printf("%u, %u\t", mp->non_zero_count_cache[i*8+j],ms->non_zero_count_cache[i*8+j]);
+// 			}
+// 			printf("\n");
+// 		}
+// 	}
+// 
+// 	if (memcmp(mp->sub_mb_type, ms->sub_mb_type, 8)){
+// 		for (int i=0; i<4; i++){
+// 			printf("%u, %u\t", mp->sub_mb_type[i], mp->sub_mb_type[i]);
+// 			printf("\n");
+// 		}
+// 	}
+// 
+// 	if (memcmp(mp->mv_cache, ms->mv_cache, 320)){
+// 		for (int k=0; k<2; k++){
+// 			for (int i=0; i<5; i++){
+// 				for (int j=0; j<8; j++){
+// 					printf("%d, %d, %d, %d\t", mp->mv_cache[k][i*8+j][0], mp->mv_cache[k][i*8+j][1], ms->mv_cache[k][i*8+j][0], ms->mv_cache[k][i*8+j][1]);
+// 				}
+// 				printf("\n");
+// 			}
+// 		}
+// 	}
+// 
+// 	if (memcmp(mp->ref_cache, ms->ref_cache, 80)){
+// 		for (int k=0; k<2; k++){
+// 			for (int i=0; i<5; i++){
+// 				for (int j=0; j<8; j++){
+// 					printf("%d, %d\t", mp->ref_cache[k][i*8+j], ms->ref_cache[k][i*8+j]);
+// 				}
+// 				printf("\n");
+// 			}
+// 		}
+// 	}
+// 
+// 	printf("cbp %d, %d\n", mp->cbp, ms->cbp);
+// 	for (int i=0; i<hc->mb_stride; i++){
+//    		printf("%d, ", hc->cbp[i]); fflush(0);
+//    	}
+// 	printf("\n");
+// 
+// 	printf("mb_type %x, %x\n", mp->mb_type, ms->mb_type);
+// 	printf("mb_type IS_INTRA %d, IS_INTRA16x16 %d, IS_DIRECT %d\n", IS_INTRA(ms->mb_type), IS_INTRA16x16(ms->mb_type), IS_DIRECT(ms->mb_type) );
+// 	printf("left_type %d, %d\n", mp->left_type, ms->left_type);
+// 	printf("top_type %d, %d\n", mp->top_type, ms->top_type);
+// 	printf("qscale_mb_xy %d, %d\n", mp->qscale_mb_xy, ms->qscale_mb_xy);
+// 	printf("qscale_left_mb_xy %d, %d\n", mp->qscale_left_mb_xy, ms->qscale_left_mb_xy);
+// 	printf("qscale_top_mb_xy %d, %d\n", mp->qscale_top_mb_xy, ms->qscale_top_mb_xy);
+// // 	for (int i=0; i<hc->mb_stride; i++){
+// // 		printf("%d, ", qscale_table[0][i]); fflush(0);
+// // 	}
+// 
+// 	if (memcmp(mp->mb, ms->mb, 768)){
+// 		for (int i=0; i<16; i++){
+// 			for (int j=0; j<16; j++){
+// 				printf("%d, %d\t", mp->mb[j + i*16], ms->ref_cache[j + i*16]);
+// 			}
+// 			printf("\n");
+// 		}
+// 		for (int i=0; i<8; i++){
+// 			for (int j=0; j<8; j++){
+// 				printf("%d, %d\t", mp->mb[256 + j + i*8], ms->ref_cache[j + i*8]);
+// 			}
+// 			printf("\n");
+// 		}
+// 		for (int i=0; i<8; i++){
+// 			for (int j=0; j<8; j++){
+// 				printf("%d, %d\t", mp->mb[320+ j + i*8], ms->ref_cache[j + i*8]);
+// 			}
+// 			printf("\n");
+// 		}
+// 	}
+// 
+// 	if (memcmp(mp->bS, ms->bS, 32)){
+// 		for (int k=0; k<2; k++){
+// 			for (int i=0; i<4; i++){
+// 				for (int j=0; j<4; j++){
+// 					printf("%d, %d\t", mp->bS[k][i][j], mp->mv_cache[k][i][j]);
+// 				}
+// 				printf("\n");
+// 			}
+// 		}
+// 	}
+// 	if (memcmp(mp->edges, ms->edges, 4)){
+// 		printf("edges %d, %d, %d, %d\n", mp->edges[0], ms->edges[0], mp->edges[1], ms->edges[1]);
+// 		printf("deblock %d, %d\n", mp->deblock_mb, ms->deblock_mb);
+// 	}
+// 
+// 	printf("dequant4_coeff_y %d, %d\n", mp->dequant4_coeff_y, ms->dequant4_coeff_y);
+// 	printf("dequant4_coeff_cb %d, %d\n", mp->dequant4_coeff_cb, ms->dequant4_coeff_cb);
+// 	printf("dequant4_coeff_cr %d, %d\n", mp->dequant4_coeff_cr, ms->dequant4_coeff_cr);
+// }
+// DECLARE_ALIGNED_16(H264Mb, tmp);
+
+
+int main(unsigned long long id, unsigned long long argp){ // SPU entry point of the entropy decoder; argp is the main-memory address of this SPE's H264spe descriptor
+	EDSlice_spu *s;
+	H264Cabac_spu *hc = &hcabac;
+	CABACContext *c = &cabac;
+	H264spe *p = &spe;
+	// announce the local-store address of slice[] through the outbound mailbox, then fetch our descriptor
+	spu_write_out_mbox((unsigned) slice);
+	spu_dma_get(p, (unsigned) argp, sizeof(H264spe), ED_spe); // the ED_spe tag is used for this one-off transfer
+	wait_dma_id(ED_spe);
+
+	ff_init_cabac_states();
+	init_cabac(p, hc);
+	hc->blocking=0;
+	for(;;){ // one iteration per inbound mailbox message; the loop has no exit other than the error return below
+		spu_read_in_mbox(); // blocks until a message arrives -- presumably sent once slice[0] has been filled in; confirm with the sender
+		s = &slice[0];
+		reset_cabac_buffers();
+		init_entropy_buf(hc, s);
+
+		if (hc->blocking) wait_dma_id(ED_get);
+		//printf("framesize %d\n", s->byte_bufsize);fflush(0);
+ 		init_dequant_tables(s, hc);
+		ff_init_cabac_decoder( c, s->bytestream_start, s->byte_bufsize );
+ 		ff_h264_init_cabac_states(s, c);
+
+		int mb_slot=0; // which of the two mb[] staging slots is decoded into next
+ 		for(int j=0; j<hc->mb_height; j++){
+			for(int i=0; i<hc->mb_width; i++){
+				int eos,ret;
+				H264Mb *m = &mb[mb_slot];
+				m->mb_x=i;
+				m->mb_y=j;
+				s->m = m;
+
+				ret = ff_h264_decode_mb_cabac(hc, s, c);
+
+// 				spu_dma_get(&tmp, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_get);
+// 				wait_dma_id(ED_get);
+// 				if (memcmp(&tmp, m, sizeof(H264Mb))){
+// 					printf("coded pic num %d\n", s->coded_pic_num);
+// 					printmbdiff(s, hc,&tmp, m);
+// 					return 0;
+// 				}
+				//printf("qscale %d\n", m->qscale_mb_xy);
+				if (!hc->blocking){ // double buffering: start this slot's put, then wait for the other slot's put before that slot is reused
+					if (mb_slot){
+						spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb1);
+						wait_dma_id(ED_putmb0);
+					}else {
+						spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0);
+						wait_dma_id(ED_putmb1);
+					}
+					mb_slot++; mb_slot%=2;
+				}else { // blocking mode: a single slot, each put is waited for immediately
+					spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0);
+					wait_dma_id(ED_putmb0);
+				}
+				
+				// the terminate bin is read after every macroblock, but eos is not acted upon
+				eos = get_cabac_terminate( c);
+				// note: the macroblock has already been written out above when a decode error is detected here
+				if( ret < 0) {
+					fprintf(stderr, "error at %d bytecount\n", bytecount);
+					return -1;
+				}
+			}
+			update_entropy_buf(hc, s, j); // swap line buffers and write row j's tables out
+			if (hc->blocking){ wait_dma_id(ED_get); wait_dma_id(ED_put);}
+		}
+		wait_dma_id(ED_put);
+		spu_write_out_mbox(1); // signal completion through the outbound mailbox
+
+	}
+
+	return 0;
+
+
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/spe_mbd.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/spe_mbd.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2009 TUDelft 
+ * 
+ * Cell Parallel SPU - 2DWave Macroblock Decoding. 
+ */
+
+/**
+ * @file libavcodec/cell/spe_mbd.c
+ * Cell Parallel SPU - 2DWave Macroblock Decoding
+ * @author C C Chi <c.c.chi@student.tudelft.nl>
+ * 
+ * SIMD kernels 
+ * H.264/AVC motion compensation
+ * @author Mauricio Alvarez <alvarez@ac.upc.edu>
+ * @author Albert Paradis <apar7632@hotmail.com>
+ */ 
+
+
+/* Uncomment these lines to enable simulator statistics or to generate traces */
+
+//#define ENABLE_SIMULATOR
+//#define ENABLE_PARAVER_TRACING_CELL
+
+#ifdef ENABLE_SIMULATOR
+	#include "/opt/ibm/systemsim-cell/include/callthru/spu/profile.h"
+#endif
+
+#if defined(ENABLE_TRACES) || defined(ENABLE_PARAVER_TRACING_CELL) /* the SPUtrace_* wrappers below compile under ENABLE_PARAVER_TRACING_CELL; assumes spu_trace.h declares them - confirm */
+	#include "spu_trace.h"
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <libsync.h>
+#include <sys/time.h>
+#include <assert.h>
+
+//#include "dsputil_cell.h"
+#include "types_spu.h"
+#include "h264_intra_spu.h"
+#include "h264_decode_mb_spu.h"
+#include "h264_mc_spu.h"
+#include "h264_tables.h"
+#include "h264_dma.h"
+
+
+/** Paraver tracing hooks for the SPU.
+ *  trace_init_SPU() calls SPUtrace_init() when ENABLE_PARAVER_TRACING_CELL is defined; otherwise its body is empty.
+ */
+inline void trace_init_SPU(){
+#ifdef ENABLE_PARAVER_TRACING_CELL
+	SPUtrace_init ();
+#endif /* ENABLE_PARAVER_TRACING_CELL */
+}
+
+inline void trace_fini_SPU(){ // calls SPUtrace_fini() only when ENABLE_PARAVER_TRACING_CELL is defined; otherwise the body is empty
+#ifdef ENABLE_PARAVER_TRACING_CELL
+	SPUtrace_fini ();
+#endif /* ENABLE_PARAVER_TRACING_CELL */
+}
+
+inline void trace_event_SPU(int event, int id){ // forwards (event, id) to SPUtrace_event() when ENABLE_PARAVER_TRACING_CELL is defined
+#ifdef ENABLE_PARAVER_TRACING_CELL
+	SPUtrace_event (event, id);
+#else
+	(void) event; // tracing compiled out: reference both parameters so they do not count as unused
+	(void) id;
+#endif /* ENABLE_PARAVER_TRACING_CELL */
+}
+
+// Simulator statistics hooks: clear_statistic() calls prof_clear() when ENABLE_SIMULATOR is defined; otherwise its body is empty.
+inline void clear_statistic(){
+#ifdef ENABLE_SIMULATOR
+	prof_clear();
+#endif /* ENABLE_SIMULATOR */
+}
+
+inline void start_statistic(){ // calls prof_start() only when ENABLE_SIMULATOR is defined; otherwise the body is empty
+#ifdef ENABLE_SIMULATOR
+	prof_start();
+#endif /* ENABLE_SIMULATOR */
+}
+
+inline void stop_statistic(){ // calls prof_stop() only when ENABLE_SIMULATOR is defined; otherwise the body is empty
+#ifdef ENABLE_SIMULATOR
+	prof_stop();
+#endif /* ENABLE_SIMULATOR */
+}
+
+H264Context_spu h_context;  // decoder context holding all the parameters needed to decode a macroblock; main() works on &h_context
+
+DECLARE_ALIGNED_16(spe_pos, dma_temp); // staging copy of the progress counter: update_tgt_spe_dep() fills it and DMA-puts it to the target SPE
+// macroblock progress of the neighbouring SPE
+DECLARE_ALIGNED_16(volatile spe_pos, src_spe); // written by SPE_ID -1 (per the original note); dep_resolved() reads src_spe.count
+//DECLARE_ALIGNED_16(spe_pos, tgt_spe); //written by SPE_ID +1
+
+/**
+*	Sets up the buffering of the mb data and the associated mc data. init_mb_buffer must be
+*	called exactly once at the beginning of each slice, before any call to get_next_mb.
+*
+*	Note: init_mb_buffer and get_next_mb expect the width of the picture to be more than 2 mb's
+*/
+#define TAG_OFFSET_MB MBD_buf1
+#define TAG_OFFSET_MC MBD_mc_buf1
+static void init_mb_buffer(H264Context_spu* h){
+	/* Resets the buffer cursors, positions the context on the first macroblock
+	 * of row h->curr_line, starts the DMA of the first two macroblocks and
+	 * prepares motion-compensation slot 0 for the first one. */
+	H264slice *slice = h->s;
+	int rows = slice->mb_height;
+	int cols = slice->mb_width;
+
+	/* every buffer cursor restarts at slot 0 */
+	h->mc_idx = 0;
+	h->mb_dec = 0;
+	h->mb_mc = 0;
+	h->mb_dma = 0;
+
+	/* fold the current row back into the picture, then aim all three
+	 * macroblock counters at the first macroblock of that row */
+	h->curr_line %= rows;
+	h->next_mb_idx = h->curr_line * cols;
+	h->mb_id = h->curr_line * cols;
+	h->n_mc = h->curr_line * cols;
+
+	/* queue two macroblock transfers, each on the DMA tag of its buffer slot */
+	for (int slot = 0; slot < 2; slot++){
+		H264Mb *src_mb = slice->blocks + h->mb_id;
+		spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) src_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB);
+		h->mb_dma++;
+		h->mb_id++;
+	}
+	wait_dma_id(0 + TAG_OFFSET_MB); /* only the first transfer is awaited here */
+
+	if(!IS_INTRA(h->mb_buf[0].mb_type)){
+		calc_mc_params(&h->mb_buf[0], &h->mc_buf[0]);
+		fill_ref_buf(h, &h->mb_buf[0], &h->mc_buf[0]);
+	}
+	h->n_mc++;
+	h->mb_mc++;
+}
+
+static void *get_next_mb(H264Context_spu *h){ // pipelined fetch: returns the next macroblock buffer (an H264Mb *) to decode, or NULL
+	H264slice *s = h->s;
+	H264spe *spe = &h->spe;
+	H264Mb *mb_buf = h->mb_buf;	
+	H264mc *mc_buf = h->mc_buf;
+	H264Mb *next_mb;
+	H264Mb *next_dma_mb;
+	// NULL is returned once curr_line has moved past the last macroblock row
+	if (h->curr_line >= s->mb_height)
+		return NULL;
+	// stage 1: start the DMA fetch of macroblock mb_id into slot mb_dma of the 3-slot mb_buf ring (one DMA tag per slot)
+	if (h->mb_id < h->mb_total){
+		next_dma_mb = s->blocks + h->mb_id;
+		spu_dma_get(&mb_buf[h->mb_dma], (unsigned) next_dma_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB);
+		h->mb_dma = (h->mb_dma+1)%3;
+		h->mb_id++;
+		if (h->mb_id%s->mb_width ==0){
+			h->mb_id+=(spe->spe_total-1)*s->mb_width; // end of a row: jump (spe_total-1) rows ahead
+		}
+	}
+	// stage 2: publish the mc buffer that goes with the macroblock returned below and wait on that mc slot's DMA tag
+	h->mc = &mc_buf[h->mc_idx];
+	wait_dma_id(h->mc_idx + TAG_OFFSET_MC);
+	h->mc_idx = (h->mc_idx+1)%2; // switch to the other of the two mc buffers
+	if (h->n_mc < h->mb_total){ // stage 3: after the macroblock in slot mb_mc has arrived, prepare its mc data in the other mc buffer
+		wait_dma_id(h->mb_mc + TAG_OFFSET_MB);
+		H264Mb *mb = &mb_buf[h->mb_mc];
+		H264mc *mc = &mc_buf[h->mc_idx];
+		if(!IS_INTRA(mb->mb_type)){
+			calc_mc_params(mb, mc);
+			fill_ref_buf(h, mb, mc);
+		}
+		h->n_mc++;
+		if (h->n_mc%s->mb_width ==0){
+			h->n_mc+=(spe->spe_total-1)*s->mb_width; // same row jump as for mb_id above
+		}
+	}
+	h->next_mb_idx++;
+	if (h->next_mb_idx % s->mb_width ==0){
+		h->next_mb_idx+=(spe->spe_total-1)*s->mb_width;
+		h->curr_line+=spe->spe_total; // the row just handed out is complete: move curr_line on by spe_total rows
+	}
+	// stage 4: advance the mc-preparation and decode cursors of the ring and hand out slot mb_dec
+	h->mb_mc = (h->mb_mc+1)%3;	
+	next_mb = &mb_buf[h->mb_dec];
+	h->mb_dec = (h->mb_dec+1)%3;
+	return next_mb;
+}
+
+static void *get_next_mb_blocking(H264Context_spu *h){
+	/*
+	 * Blocking counterpart of get_next_mb(): no prefetching and no buffer
+	 * rotation. Each call transfers exactly one macroblock into slot 0 of
+	 * mb_buf, waits for that transfer, prepares slot 0 of mc_buf for it and
+	 * returns the macroblock.
+	 *
+	 * Returns NULL, without starting a transfer, once h->mb_id has reached
+	 * h->mb_total.
+	 *
+	 * Only h->mb_id and h->mc are modified. The cursors used by the pipelined
+	 * path (mb_dma, mb_mc, mb_dec, mc_idx, n_mc, next_mb_idx, curr_line) are
+	 * left untouched.
+	 */
+	H264slice *slice = h->s;
+	H264Mb *cur_mb = &h->mb_buf[0];
+	H264mc *cur_mc = &h->mc_buf[0];
+
+	if (h->mb_id >= h->mb_total)
+		return NULL;
+
+	/* start the transfer of macroblock h->mb_id on tag MBD_buf1 */
+	spu_dma_get(cur_mb, (unsigned) (slice->blocks + h->mb_id), sizeof(H264Mb), MBD_buf1);
+
+	/* step to the following macroblock; when a row has been completed, jump
+	 * (spe_total-1) rows further so that the next call starts on the row that
+	 * lies spe_total rows below the one just finished */
+	h->mb_id++;
+	if (h->mb_id % slice->mb_width == 0){
+		h->mb_id += (h->spe.spe_total-1)*slice->mb_width;
+	}
+
+	/* block until the transfer issued above has completed */
+	wait_dma_id(MBD_buf1);
+
+	/* h->mc is pointed at mc slot 0 before that slot is filled, as in the
+	 * pipelined path */
+	h->mc = cur_mc;
+
+	/* intra macroblocks skip calc_mc_params() and fill_ref_buf(); for them
+	 * mc slot 0 keeps whatever it held before */
+	if(!IS_INTRA(cur_mb->mb_type)){
+		calc_mc_params(cur_mb, cur_mc);
+		fill_ref_buf(h, cur_mb, cur_mc);
+	}
+	return cur_mb;
+}
+
+
+#undef TAG_OFFSET_MB
+#undef TAG_OFFSET_MC
+static inline int dep_resolved(H264Context_spu *h){
+	/* Returns 1 while h->mb_proc is below the progress count published in
+	 * src_spe minus one; for SPE 0 that limit is raised by mb_width. Else 0. */
+	volatile int published = src_spe.count;
+	if (h->spe.spe_id == 0)
+		return h->mb_proc < published-1 + h->s->mb_width;
+	else
+		return h->mb_proc < published-1;
+}
+
+void update_tgt_spe_dep(H264Context_spu *h, int end){
+	// Publishes this SPE's progress (h->mb_proc) into src_spe of the target SPE when due, then counts one more macroblock.
+	H264slice *s = h->s;
+	H264spe *spe = &h->spe;
+	// main() calls this with end set right after get_next_mb*() stored NULL in h->mb, so h->mb is only read when !end.
+	int mb_x = end ? 0 : h->mb->mb_x;
+	if (end || (mb_x%2==0 && mb_x!=0) || mb_x==s->mb_width-1){ // notify at the end, on even columns > 0 and on the last column
+		spe_pos* dma_spe = &dma_temp;
+		spe_pos* tgt_spe = (spe_pos*) ((unsigned) spe->tgt_spe + (unsigned) &src_spe); //located in target spe local store
+		dma_spe->count = end? h->mb_proc+1: h->mb_proc;
+		spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), MBD_put);
+	}
+	h->mb_proc++;
+}
+
+
+int main(unsigned long long id, unsigned long long argp)
+{ // SPU entry point. argp carries the address of this SPE's H264spe parameters; id is unused. Returns 0 after a slice with state < 0 is seen.
+	(void) id;
+	H264Context_spu* h = &h_context;
+	H264spe *spe_params = (H264spe *) (unsigned) argp;    
+	// copy this SPE's parameter block into the local context
+	spu_dma_get(&h->spe, (unsigned) spe_params, sizeof(H264spe), MBD_slice); // the MBD_slice tag is reused here out of convenience
+	wait_dma_id(MBD_slice);
+
+    //clear_statistic();
+    dsputil_h264_init_cell(&h->dsp);
+    ff_cropTbl_init();
+    init_pred_ptrs(&h->hpc);
+
+	// send the address of slice_buf to the PPE through the outbound mailbox
+	spu_write_out_mbox((unsigned) h->slice_buf);
+	h->sl_idx=0;
+	// paraver tracing is currently not initialized (call left disabled):
+    //trace_init_SPU();
+	h->frames =0;	
+	src_spe.count =0;
+	h->mb_proc = 0;
+
+	h->mb_id=0;
+	h->mc_idx=0;
+	h->mb_dec=0;
+	h->mb_mc=0;
+	h->mb_dma=0;
+	h->next_mb_idx=0;
+
+	h->blocking=0;
+
+	// this SPE starts on row spe_id; get_next_mb() moves curr_line on by spe_total after each completed row
+	H264spe* p = &h->spe;
+	h->curr_line =p->spe_id;
+	h->mb_total = p->mb_height*p->mb_width;
+	int stride_y = 32;
+	int stride_c = 16;
+	//init block_offset array
+	init_block_offset(stride_y, stride_c);
+	for(;;){
+		spu_read_in_mbox(); // read (and discard) one inbound mailbox word before each slice; presumably blocks until the PPE signals - confirm
+		// the two slice_buf entries are used alternately
+		h->s = &h->slice_buf[h->sl_idx];
+		h->sl_idx++; h->sl_idx%=2;
+
+		if (h->s->state< 0){ // a negative slice state ends the decode loop
+			break;
+		}
+
+		{
+			if(!h->blocking){
+				init_mb_buffer(h);
+				while((h->mb=(H264Mb *)get_next_mb(h))){
+					while(!dep_resolved(h)); // spin until dep_resolved() reports that this macroblock may be decoded
+					//printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p->spe_id);
+					hl_decode_mb_internal(h, stride_y, stride_c);
+				}
+				update_tgt_spe_dep(h, 1); // end of slice: publish the final progress count (h->mb is NULL at this point)
+			}else{ // blocking path; h->blocking is set to 0 above and is not changed anywhere in this function
+				h->mb_id=0;
+				while((h->mb=(H264Mb *)get_next_mb_blocking(h))){
+					while(!dep_resolved(h));
+					//printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p->spe_id);
+					hl_decode_mb_internal(h, stride_y, stride_c);
+				}
+				update_tgt_spe_dep(h, 1);
+			}
+			
+		}
+
+		h->frames++;
+		// only the SPE whose id equals (frames*mb_height - 1) % spe_total increments rl_cnt
+		if (p->spe_id == ((h->frames*p->mb_height -1)%p->spe_total)){
+			//printf("spe %d, %d\n", atomic_read(p->rl_cnt), h->frames);
+			//MBSlice is copied beforehand.
+			//only inc cnt.
+			atomic_inc(p->rl_cnt);		
+		}
+		{
+			atomic_dec(p->cnt); // every SPE decrements cnt once per processed slice
+		}
+	}
+	
+	return 0;
+}
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/types_spu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/types_spu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef TYPES_SPU_H
+#define TYPES_SPU_H
+
+/***********************************************************************
+ * Scalar types (fixed-width names defined locally; no signed 64-bit type is defined here)
+ **********************************************************************/
+    typedef signed char  int8_t;
+    typedef signed short int16_t;
+    typedef signed int   int32_t;
+    typedef unsigned char  uint8_t;
+    typedef unsigned short uint16_t;
+    typedef unsigned int   uint32_t;
+    typedef unsigned long long uint64_t;
+
+//     typedef short DCTELEM;		// transform coefficients of dct
+
+/***********************************************************************
+ * Vector types (short names for the compiler's 'vector' types)
+ **********************************************************************/
+    typedef	vector	signed int	vsint32_t;
+    typedef	vector	unsigned int	vuint32_t;
+    typedef	vector	signed short	vsint16_t;
+    typedef	vector	unsigned short	vuint16_t;
+    typedef	vector	signed char	vsint8_t;
+    typedef	vector	unsigned char	vuint8_t;
+
+/***********************************************************************
+ * Function pointer types (motion compensation, IDCT, weighting and intra prediction, going by their names)
+ **********************************************************************/
+    typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h);
+    typedef void (*h264_chroma_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h, int x, int y);
+    typedef void (*h264_idct_func)(uint8_t *dst, short *block, int stride);
+    typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
+    typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd,
+                  int weights, int offset);
+    typedef void(* intra_pred4x4)(uint8_t *src, uint8_t *topright, int stride);
+    typedef void(* intra_pred16x16)(uint8_t *src, int stride);
+    typedef void(* intra_pred8x8)(uint8_t *src, int stride);
+    typedef void(* intra_pred8x8l)(uint8_t *src, int topleft, int topright, int stride);
+
+
+#define AVV(x...) {x} // wraps its arguments in braces; "x..." is the GNU named-variadic macro syntax
+	
+	
+#endif // TYPES_SPU_H
+
+
+
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/dsputil.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/dsputil.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1057 @@
+/*
+ * DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * DSP utils
+ */
+
+#include "libavutil/log.h"
+#include "dsputil.h"
+#include "simple_idct.h"
+#include "mathops.h"
+#include "config.h"
+
+uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; /* all zero at load time; accessed below as ff_cropTbl + MAX_NEG_CROP; presumably filled by an init routine elsewhere - confirm */
+uint32_t ff_squareTbl[512] = {0, }; /* all zero at load time; presumably filled by an init routine elsewhere - confirm */
+
+const uint8_t ff_zigzag_direct[64] = { /* zigzag scan of an 8x8 block: entry k is the raster index (8*row + col) of the k-th position in scan order */
+    0,   1,  8, 16,  9,  2,  3, 10,
+    17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34,
+    27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36,
+    29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63
+};
+
+
+#define PIXOP2(OPNAME, OP) /* generates the OPNAME##_pixels* helper family; OP(dst, value) stores or combines value into dst */\
+static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
+        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
+}\
+/* _l2 helpers: for each of h rows, OP the rnd_avg32()/no_rnd_avg32() of two sources (each with its own stride) into dst */\
+static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= AV_RN32(&src1[i*src_stride1  ]);\
+        b= AV_RN32(&src2[i*src_stride2  ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
+        a= AV_RN32(&src1[i*src_stride1+4]);\
+        b= AV_RN32(&src2[i*src_stride2+4]);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
+    }\
+}\
+\
+static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= AV_RN32(&src1[i*src_stride1  ]);\
+        b= AV_RN32(&src2[i*src_stride2  ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
+        a= AV_RN32(&src1[i*src_stride1+4]);\
+        b= AV_RN32(&src2[i*src_stride2+4]);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
+    }\
+}\
+\
+static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= AV_RN32(&src1[i*src_stride1  ]);\
+        b= AV_RN32(&src2[i*src_stride2  ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
+    }\
+}\
+\
+static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= AV_RN16(&src1[i*src_stride1  ]);\
+        b= AV_RN16(&src2[i*src_stride2  ]);\
+        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
+    }\
+}\
+\
+static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
+    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
+    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+\
+static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
+    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
+    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+/* _x2 / _y2 wrappers: run the _l2 helpers on pixels and its right (pixels+1) or lower (pixels+line_size) neighbour */\
+static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
+}\
+/* _l4 helpers: four-source average; the low 2 bits (l0,l1) and high 6 bits (h0,h1) of every byte are summed separately so the bytes packed in a uint32_t do not carry into each other */\
+static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
+                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a, b, c, d, l0, l1, h0, h1;\
+        a= AV_RN32(&src1[i*src_stride1]);\
+        b= AV_RN32(&src2[i*src_stride2]);\
+        c= AV_RN32(&src3[i*src_stride3]);\
+        d= AV_RN32(&src4[i*src_stride4]);\
+        l0=  (a&0x03030303UL)\
+           + (b&0x03030303UL)\
+           + 0x02020202UL;\
+        h0= ((a&0xFCFCFCFCUL)>>2)\
+          + ((b&0xFCFCFCFCUL)>>2);\
+        l1=  (c&0x03030303UL)\
+           + (d&0x03030303UL);\
+        h1= ((c&0xFCFCFCFCUL)>>2)\
+          + ((d&0xFCFCFCFCUL)>>2);\
+        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+        a= AV_RN32(&src1[i*src_stride1+4]);\
+        b= AV_RN32(&src2[i*src_stride2+4]);\
+        c= AV_RN32(&src3[i*src_stride3+4]);\
+        d= AV_RN32(&src4[i*src_stride4+4]);\
+        l0=  (a&0x03030303UL)\
+           + (b&0x03030303UL)\
+           + 0x02020202UL;\
+        h0= ((a&0xFCFCFCFCUL)>>2)\
+          + ((b&0xFCFCFCFCUL)>>2);\
+        l1=  (c&0x03030303UL)\
+           + (d&0x03030303UL);\
+        h1= ((c&0xFCFCFCFCUL)>>2)\
+          + ((d&0xFCFCFCFCUL)>>2);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+    }\
+}\
+\
+static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
+                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a, b, c, d, l0, l1, h0, h1;\
+        a= AV_RN32(&src1[i*src_stride1]);\
+        b= AV_RN32(&src2[i*src_stride2]);\
+        c= AV_RN32(&src3[i*src_stride3]);\
+        d= AV_RN32(&src4[i*src_stride4]);\
+        l0=  (a&0x03030303UL)\
+           + (b&0x03030303UL)\
+           + 0x01010101UL;\
+        h0= ((a&0xFCFCFCFCUL)>>2)\
+          + ((b&0xFCFCFCFCUL)>>2);\
+        l1=  (c&0x03030303UL)\
+           + (d&0x03030303UL);\
+        h1= ((c&0xFCFCFCFCUL)>>2)\
+          + ((d&0xFCFCFCFCUL)>>2);\
+        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+        a= AV_RN32(&src1[i*src_stride1+4]);\
+        b= AV_RN32(&src2[i*src_stride2+4]);\
+        c= AV_RN32(&src3[i*src_stride3+4]);\
+        d= AV_RN32(&src4[i*src_stride4+4]);\
+        l0=  (a&0x03030303UL)\
+           + (b&0x03030303UL)\
+           + 0x01010101UL;\
+        h0= ((a&0xFCFCFCFCUL)>>2)\
+          + ((b&0xFCFCFCFCUL)>>2);\
+        l1=  (c&0x03030303UL)\
+           + (d&0x03030303UL);\
+        h1= ((c&0xFCFCFCFCUL)>>2)\
+          + ((d&0xFCFCFCFCUL)>>2);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+    }\
+}\
+static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
+                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
+    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
+    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
+}\
+static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
+                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
+    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
+    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
+}\
+/* _xy2 helpers: 2x2 box average; each loop iteration emits two rows and reuses the horizontal sums of the row before */\
+static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+        int i, a0, b0, a1, b1;\
+        a0= pixels[0];\
+        b0= pixels[1] + 2;\
+        a0 += b0;\
+        b0 += pixels[2];\
+\
+        pixels+=line_size;\
+        for(i=0; i<h; i+=2){\
+            a1= pixels[0];\
+            b1= pixels[1];\
+            a1 += b1;\
+            b1 += pixels[2];\
+\
+            block[0]= (a1+a0)>>2; /* FIXME non put */\
+            block[1]= (b1+b0)>>2;\
+\
+            pixels+=line_size;\
+            block +=line_size;\
+\
+            a0= pixels[0];\
+            b0= pixels[1] + 2;\
+            a0 += b0;\
+            b0 += pixels[2];\
+\
+            block[0]= (a1+a0)>>2;\
+            block[1]= (b1+b0)>>2;\
+            pixels+=line_size;\
+            block +=line_size;\
+        }\
+}\
+\
+static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+        int i;\
+        const uint32_t a= AV_RN32(pixels  );\
+        const uint32_t b= AV_RN32(pixels+1);\
+        uint32_t l0=  (a&0x03030303UL)\
+                    + (b&0x03030303UL)\
+                    + 0x02020202UL;\
+        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+                   + ((b&0xFCFCFCFCUL)>>2);\
+        uint32_t l1,h1;\
+\
+        pixels+=line_size;\
+        for(i=0; i<h; i+=2){\
+            uint32_t a= AV_RN32(pixels  );\
+            uint32_t b= AV_RN32(pixels+1);\
+            l1=  (a&0x03030303UL)\
+               + (b&0x03030303UL);\
+            h1= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+            a= AV_RN32(pixels  );\
+            b= AV_RN32(pixels+1);\
+            l0=  (a&0x03030303UL)\
+               + (b&0x03030303UL)\
+               + 0x02020202UL;\
+            h0= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+        }\
+}\
+\
+static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int j;\
+    for(j=0; j<2; j++){\
+        int i;\
+        const uint32_t a= AV_RN32(pixels  );\
+        const uint32_t b= AV_RN32(pixels+1);\
+        uint32_t l0=  (a&0x03030303UL)\
+                    + (b&0x03030303UL)\
+                    + 0x02020202UL;\
+        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+                   + ((b&0xFCFCFCFCUL)>>2);\
+        uint32_t l1,h1;\
+\
+        pixels+=line_size;\
+        for(i=0; i<h; i+=2){\
+            uint32_t a= AV_RN32(pixels  );\
+            uint32_t b= AV_RN32(pixels+1);\
+            l1=  (a&0x03030303UL)\
+               + (b&0x03030303UL);\
+            h1= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+            a= AV_RN32(pixels  );\
+            b= AV_RN32(pixels+1);\
+            l0=  (a&0x03030303UL)\
+               + (b&0x03030303UL)\
+               + 0x02020202UL;\
+            h0= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+        }\
+        pixels+=4-line_size*(h+1);\
+        block +=4-line_size*h;\
+    }\
+}\
+\
+static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int j;\
+    for(j=0; j<2; j++){\
+        int i;\
+        const uint32_t a= AV_RN32(pixels  );\
+        const uint32_t b= AV_RN32(pixels+1);\
+        uint32_t l0=  (a&0x03030303UL)\
+                    + (b&0x03030303UL)\
+                    + 0x01010101UL;\
+        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+                   + ((b&0xFCFCFCFCUL)>>2);\
+        uint32_t l1,h1;\
+\
+        pixels+=line_size;\
+        for(i=0; i<h; i+=2){\
+            uint32_t a= AV_RN32(pixels  );\
+            uint32_t b= AV_RN32(pixels+1);\
+            l1=  (a&0x03030303UL)\
+               + (b&0x03030303UL);\
+            h1= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+            a= AV_RN32(pixels  );\
+            b= AV_RN32(pixels+1);\
+            l0=  (a&0x03030303UL)\
+               + (b&0x03030303UL)\
+               + 0x01010101UL;\
+            h0= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+        }\
+        pixels+=4-line_size*(h+1);\
+        block +=4-line_size*h;\
+    }\
+}\
+/* CALL_2X_PIXELS is defined elsewhere; presumably it builds the 16-wide function from two 8-wide calls - confirm */\
+CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
+
+#define op_avg(a, b) a = rnd_avg32(a, b)
+/* op_avg combines the new value with the destination through rnd_avg32(); op_put overwrites the destination */
+#define op_put(a, b) a = b
+/* instantiate the avg_* and put_* helper families; both op macros are local to these two expansions */
+PIXOP2(avg, op_avg)
+PIXOP2(put, op_put)
+#undef op_avg
+#undef op_put
+
+
+#define H264_CHROMA_MC(OPNAME, OP) /* generates OPNAME##h264_chroma_mc{2,4,8}_c: weighted 2x2 interpolation of 2-, 4- or 8-pixel-wide rows; x and y must be in 0..7 (asserted) */\
+static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+    const int A=(8-x)*(8-y);\
+    const int B=(  x)*(8-y);\
+    const int C=(8-x)*(  y);\
+    const int D=(  x)*(  y);\
+    int i;\
+    /* A+B+C+D == 64 for every x, y: the four weights are in 1/64 units */\
+    assert(x<8 && y<8 && x>=0 && y>=0);\
+\
+    if(D){ /* x and y both non-zero: all four neighbours contribute */\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
+            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    }else{ /* x == 0 or y == 0, so B or C is 0: two-tap interpolation along 'step' (stride if C != 0, else 1) */\
+        const int E= B+C;\
+        const int step= C ? stride : 1;\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + E*src[step+0]));\
+            OP(dst[1], (A*src[1] + E*src[step+1]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    }\
+}\
+/* 4-pixel-wide variant: same scheme as the mc2 function above */\
+static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+    const int A=(8-x)*(8-y);\
+    const int B=(  x)*(8-y);\
+    const int C=(8-x)*(  y);\
+    const int D=(  x)*(  y);\
+    int i;\
+    \
+    assert(x<8 && y<8 && x>=0 && y>=0);\
+\
+    if(D){\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
+            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
+            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
+            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    }else{\
+        const int E= B+C;\
+        const int step= C ? stride : 1;\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + E*src[step+0]));\
+            OP(dst[1], (A*src[1] + E*src[step+1]));\
+            OP(dst[2], (A*src[2] + E*src[step+2]));\
+            OP(dst[3], (A*src[3] + E*src[step+3]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    }\
+}\
+/* 8-pixel-wide variant: same scheme as the mc2 function above */\
+static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+    const int A=(8-x)*(8-y);\
+    const int B=(  x)*(8-y);\
+    const int C=(8-x)*(  y);\
+    const int D=(  x)*(  y);\
+    int i;\
+    \
+    assert(x<8 && y<8 && x>=0 && y>=0);\
+\
+    if(D){\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
+            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
+            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
+            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
+            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
+            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
+            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
+            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    }else{\
+        const int E= B+C;\
+        const int step= C ? stride : 1;\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + E*src[step+0]));\
+            OP(dst[1], (A*src[1] + E*src[step+1]));\
+            OP(dst[2], (A*src[2] + E*src[step+2]));\
+            OP(dst[3], (A*src[3] + E*src[step+3]));\
+            OP(dst[4], (A*src[4] + E*src[step+4]));\
+            OP(dst[5], (A*src[5] + E*src[step+5]));\
+            OP(dst[6], (A*src[6] + E*src[step+6]));\
+            OP(dst[7], (A*src[7] + E*src[step+7]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    }\
+}
+
+#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1) /* round b (add 32, shift right by 6), then average it with the existing a, rounding up */
+#define op_put(a, b) a = (((b) + 32)>>6) /* store b rounded: add 32, shift right by 6 */
+
+H264_CHROMA_MC(put_       , op_put) /* instantiates the put_h264_chroma_mc*_c functions */
+H264_CHROMA_MC(avg_       , op_avg) /* instantiates the avg_h264_chroma_mc*_c functions */
+#undef op_avg /* both names are defined again below with different rounding for the luma filters */
+#undef op_put
+
+
+#define H264_LOWPASS(OPNAME, OP, OP2) /* emits the OPNAME-prefixed h264_qpel{2,4,8,16} h/v/hv lowpass functions; OP stores single-pass sums, OP2 stores two-pass (hv) sums */\
+static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 2x2 output; horizontal 6-tap filter with taps (1,-5,20,20,-5,1) */\
+    const int h=2;\
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* never named in this body, but the op_put/op_avg macros passed in as OP index cm[] */\
+    int i;\
+    for(i=0; i<h; i++)\
+    {\
+        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3])); /* each row reads src[-2] .. src[4] */\
+        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
+        dst+=dstStride;\
+        src+=srcStride;\
+    }\
+}\
+\
+static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 2x2 output; the same 6-tap filter applied down each column */\
+    const int w=2;\
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* used through the OP macro expansion */\
+    int i;\
+    for(i=0; i<w; i++)\
+    {\
+        const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above src; src0..src4 are rows 0..4 */\
+        const int srcA= src[-1*srcStride];\
+        const int src0= src[0 *srcStride];\
+        const int src1= src[1 *srcStride];\
+        const int src2= src[2 *srcStride];\
+        const int src3= src[3 *srcStride];\
+        const int src4= src[4 *srcStride];\
+        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
+        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
+        dst++; /* move one column to the right */\
+        src++;\
+    }\
+}\
+\
+static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 2x2 output; horizontal pass into tmp, then vertical pass over tmp */\
+    const int h=2;\
+    const int w=2;\
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* used through the OP2 macro expansion */\
+    int i;\
+    src -= 2*srcStride; /* start two rows above the block: the vertical pass reads 2 rows above and 3 rows below it */\
+    for(i=0; i<h+5; i++)\
+    {\
+        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]); /* raw horizontal sums: no rounding, shift or clipping yet */\
+        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
+        tmp+=tmpStride;\
+        src+=srcStride;\
+    }\
+    tmp -= tmpStride*(h+5-2); /* rewind to the third tmp row written, i.e. the one matching output row 0 */\
+    for(i=0; i<w; i++)\
+    {\
+        const int tmpB= tmp[-2*tmpStride];\
+        const int tmpA= tmp[-1*tmpStride];\
+        const int tmp0= tmp[0 *tmpStride];\
+        const int tmp1= tmp[1 *tmpStride];\
+        const int tmp2= tmp[2 *tmpStride];\
+        const int tmp3= tmp[3 *tmpStride];\
+        const int tmp4= tmp[4 *tmpStride];\
+        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); /* OP2 scales the doubly filtered sum; the op2_* macros passed in add 512 and shift right by 10 */\
+        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
+        dst++;\
+        tmp++;\
+    }\
+}\
+static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 4x4 output; horizontal 6-tap filter with taps (1,-5,20,20,-5,1) */\
+    const int h=4;\
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* never named in this body, but the op_put/op_avg macros passed in as OP index cm[] */\
+    int i;\
+    for(i=0; i<h; i++)\
+    {\
+        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3])); /* each row reads src[-2] .. src[6] */\
+        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
+        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
+        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
+        dst+=dstStride;\
+        src+=srcStride;\
+    }\
+}\
+\
+static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 4x4 output; the same 6-tap filter applied down each column */\
+    const int w=4;\
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* used through the OP macro expansion */\
+    int i;\
+    for(i=0; i<w; i++)\
+    {\
+        const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above src; src0..src6 are rows 0..6 */\
+        const int srcA= src[-1*srcStride];\
+        const int src0= src[0 *srcStride];\
+        const int src1= src[1 *srcStride];\
+        const int src2= src[2 *srcStride];\
+        const int src3= src[3 *srcStride];\
+        const int src4= src[4 *srcStride];\
+        const int src5= src[5 *srcStride];\
+        const int src6= src[6 *srcStride];\
+        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
+        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
+        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
+        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
+        dst++; /* move one column to the right */\
+        src++;\
+    }\
+}\
+\
+static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 4x4 output; horizontal pass into tmp, then vertical pass over tmp */\
+    const int h=4;\
+    const int w=4;\
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* used through the OP2 macro expansion */\
+    int i;\
+    src -= 2*srcStride; /* start two rows above the block: the vertical pass reads 2 rows above and 3 rows below it */\
+    for(i=0; i<h+5; i++)\
+    {\
+        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]); /* raw horizontal sums: no rounding, shift or clipping yet */\
+        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
+        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
+        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
+        tmp+=tmpStride;\
+        src+=srcStride;\
+    }\
+    tmp -= tmpStride*(h+5-2); /* rewind to the third tmp row written, i.e. the one matching output row 0 */\
+    for(i=0; i<w; i++)\
+    {\
+        const int tmpB= tmp[-2*tmpStride];\
+        const int tmpA= tmp[-1*tmpStride];\
+        const int tmp0= tmp[0 *tmpStride];\
+        const int tmp1= tmp[1 *tmpStride];\
+        const int tmp2= tmp[2 *tmpStride];\
+        const int tmp3= tmp[3 *tmpStride];\
+        const int tmp4= tmp[4 *tmpStride];\
+        const int tmp5= tmp[5 *tmpStride];\
+        const int tmp6= tmp[6 *tmpStride];\
+        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); /* OP2 scales the doubly filtered sum; the op2_* macros passed in add 512 and shift right by 10 */\
+        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
+        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
+        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
+        dst++;\
+        tmp++;\
+    }\
+}\
+\
+static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 8x8 output; horizontal 6-tap filter with taps (1,-5,20,20,-5,1) */\
+    const int h=8;\
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* never named in this body, but the op_put/op_avg macros passed in as OP index cm[] */\
+    int i;\
+    for(i=0; i<h; i++)\
+    {\
+        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ])); /* each row reads src[-2] .. src[10] */\
+        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
+        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
+        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
+        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
+        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
+        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
+        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
+        dst+=dstStride;\
+        src+=srcStride;\
+    }\
+}\
+\
+static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 8x8 output; the same 6-tap filter applied down each column */\
+    const int w=8;\
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* used through the OP macro expansion */\
+    int i;\
+    for(i=0; i<w; i++)\
+    {\
+        const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above src; src0..src10 are rows 0..10 */\
+        const int srcA= src[-1*srcStride];\
+        const int src0= src[0 *srcStride];\
+        const int src1= src[1 *srcStride];\
+        const int src2= src[2 *srcStride];\
+        const int src3= src[3 *srcStride];\
+        const int src4= src[4 *srcStride];\
+        const int src5= src[5 *srcStride];\
+        const int src6= src[6 *srcStride];\
+        const int src7= src[7 *srcStride];\
+        const int src8= src[8 *srcStride];\
+        const int src9= src[9 *srcStride];\
+        const int src10=src[10*srcStride];\
+        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
+        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
+        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
+        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
+        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
+        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
+        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
+        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
+        dst++; /* move one column to the right */\
+        src++;\
+    }\
+}\
+\
+static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 8x8 output; horizontal pass into tmp, then vertical pass over tmp */\
+    const int h=8;\
+    const int w=8;\
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* used through the OP2 macro expansion */\
+    int i;\
+    src -= 2*srcStride; /* start two rows above the block: the vertical pass reads 2 rows above and 3 rows below it */\
+    for(i=0; i<h+5; i++)\
+    {\
+        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]); /* raw horizontal sums: no rounding, shift or clipping yet */\
+        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
+        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
+        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
+        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
+        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
+        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
+        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
+        tmp+=tmpStride;\
+        src+=srcStride;\
+    }\
+    tmp -= tmpStride*(h+5-2); /* rewind to the third tmp row written, i.e. the one matching output row 0 */\
+    for(i=0; i<w; i++)\
+    {\
+        const int tmpB= tmp[-2*tmpStride];\
+        const int tmpA= tmp[-1*tmpStride];\
+        const int tmp0= tmp[0 *tmpStride];\
+        const int tmp1= tmp[1 *tmpStride];\
+        const int tmp2= tmp[2 *tmpStride];\
+        const int tmp3= tmp[3 *tmpStride];\
+        const int tmp4= tmp[4 *tmpStride];\
+        const int tmp5= tmp[5 *tmpStride];\
+        const int tmp6= tmp[6 *tmpStride];\
+        const int tmp7= tmp[7 *tmpStride];\
+        const int tmp8= tmp[8 *tmpStride];\
+        const int tmp9= tmp[9 *tmpStride];\
+        const int tmp10=tmp[10*tmpStride];\
+        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); /* OP2 scales the doubly filtered sum; the op2_* macros passed in add 512 and shift right by 10 */\
+        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
+        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
+        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
+        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
+        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
+        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
+        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
+        dst++;\
+        tmp++;\
+    }\
+}\
+\
+static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16x16 vertical lowpass assembled from four 8x8 calls */\
+    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride); /* top-left 8x8 */\
+    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride); /* top-right 8x8 */\
+    src += 8*srcStride; /* step both pointers down eight rows for the bottom half */\
+    dst += 8*dstStride;\
+    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
+    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16x16 horizontal lowpass assembled from four 8x8 calls */\
+    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride); /* top-left 8x8 */\
+    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride); /* top-right 8x8 */\
+    src += 8*srcStride; /* step both pointers down eight rows for the bottom half */\
+    dst += 8*dstStride;\
+    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
+    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 16x16 two-pass lowpass assembled from four 8x8 calls */\
+    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride); /* top-left 8x8 */\
+    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride); /* top-right 8x8, using the tmp columns starting at offset 8 */\
+    src += 8*srcStride; /* tmp is not advanced: the bottom pair of calls overwrites the same scratch rows */\
+    dst += 8*dstStride;\
+    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
+    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
+}\
+
+#define H264_MC(OPNAME, SIZE) /* emits the sixteen OPNAME h264_qpel SIZE _mcXY_c functions for one SIZE x SIZE block size */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* no filtering: hands src to the OPNAME pixels SIZE _c helper (defined outside this view) with height SIZE */\
+    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* combines src with its horizontally filtered version */\
+    uint8_t half[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride); /* always the put_ variant: half receives the plain filtered block, row pitch SIZE */\
+    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE); /* NOTE(review): _l2 is defined outside this view; presumably a rounded average of its two inputs - confirm */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass written straight to dst */\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* like mc10, but combines the filtered block with src shifted one pixel right */\
+    uint8_t half[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* combines src with its vertically filtered version */\
+    uint8_t full[SIZE*(SIZE+5)]; /* SIZE+5 rows of SIZE bytes: the block plus the extra rows the vertical filter reads */\
+    uint8_t * const full_mid= full + SIZE*2; /* row of full that corresponds to src (two rows in) */\
+    uint8_t half[SIZE*SIZE];\
+    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5); /* NOTE(review): copy_block is defined outside this view; called with SIZE+5 rows starting two rows above src */\
+    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* vertical lowpass written straight to dst */\
+    uint8_t full[SIZE*(SIZE+5)];\
+    uint8_t * const full_mid= full + SIZE*2; /* row of full that corresponds to src */\
+    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
+    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* like mc01, but combines the filtered block with the source one row lower */\
+    uint8_t full[SIZE*(SIZE+5)];\
+    uint8_t * const full_mid= full + SIZE*2;\
+    uint8_t half[SIZE*SIZE];\
+    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
+    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE); /* full_mid+SIZE is the next row of the SIZE-wide copy */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the horizontal lowpass of src with the vertical lowpass of src */\
+    uint8_t full[SIZE*(SIZE+5)]; /* SIZE-wide copy of the SIZE+5 source rows the vertical filter reads */\
+    uint8_t * const full_mid= full + SIZE*2; /* row of full that corresponds to src */\
+    uint8_t halfH[SIZE*SIZE];\
+    uint8_t halfV[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
+    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
+    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* like mc11, but the vertical filter runs on the columns one pixel to the right */\
+    uint8_t full[SIZE*(SIZE+5)];\
+    uint8_t * const full_mid= full + SIZE*2;\
+    uint8_t halfH[SIZE*SIZE];\
+    uint8_t halfV[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
+    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5); /* +1: copy starts one pixel to the right */\
+    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* like mc11, but the horizontal filter runs on the rows one line lower */\
+    uint8_t full[SIZE*(SIZE+5)];\
+    uint8_t * const full_mid= full + SIZE*2;\
+    uint8_t halfH[SIZE*SIZE];\
+    uint8_t halfV[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride); /* + stride: filter starts one row down */\
+    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
+    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* like mc11 with both shifts: horizontal filter one row down, vertical filter one pixel right */\
+    uint8_t full[SIZE*(SIZE+5)];\
+    uint8_t * const full_mid= full + SIZE*2;\
+    uint8_t halfH[SIZE*SIZE];\
+    uint8_t halfV[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
+    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
+    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* two-pass (hv) lowpass written straight to dst */\
+    int16_t tmp[SIZE*(SIZE+5)]; /* scratch for the horizontal pass: SIZE+5 rows at pitch SIZE */\
+    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the horizontal lowpass of src with the hv lowpass */\
+    int16_t tmp[SIZE*(SIZE+5)];\
+    uint8_t halfH[SIZE*SIZE];\
+    uint8_t halfHV[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE); /* NOTE(review): _l2 is defined outside this view; presumably a rounded average of its two inputs - confirm */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* like mc21, but the horizontal filter runs on the rows one line lower */\
+    int16_t tmp[SIZE*(SIZE+5)];\
+    uint8_t halfH[SIZE*SIZE];\
+    uint8_t halfHV[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the vertical lowpass of src with the hv lowpass */\
+    uint8_t full[SIZE*(SIZE+5)]; /* SIZE-wide copy of the SIZE+5 source rows the vertical filter reads */\
+    uint8_t * const full_mid= full + SIZE*2; /* row of full that corresponds to src */\
+    int16_t tmp[SIZE*(SIZE+5)];\
+    uint8_t halfV[SIZE*SIZE];\
+    uint8_t halfHV[SIZE*SIZE];\
+    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
+    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
+    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* like mc12, but the vertical filter runs on the columns one pixel to the right */\
+    uint8_t full[SIZE*(SIZE+5)];\
+    uint8_t * const full_mid= full + SIZE*2;\
+    int16_t tmp[SIZE*(SIZE+5)];\
+    uint8_t halfV[SIZE*SIZE];\
+    uint8_t halfHV[SIZE*SIZE];\
+    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5); /* +1: copy starts one pixel to the right */\
+    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
+    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
+}\
+
+#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1) /* round b (add 16, shift right by 5), map it through cm[], then average with the existing a, rounding up */
+#define op_put(a, b)  a = cm[((b) + 16)>>5] /* round b (add 16, shift right by 5) and map it through cm[] */
+#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1) /* two-pass variant of op_avg: add 512, shift right by 10 */
+#define op2_put(a, b)  a = cm[((b) + 512)>>10] /* two-pass variant of op_put: add 512, shift right by 10 */
+
+H264_LOWPASS(put_       , op_put, op2_put) /* cm is the local each generated lowpass function declares */
+H264_LOWPASS(avg_       , op_avg, op2_avg)
+H264_MC(put_, 2) /* put_ is generated for sizes 2, 4, 8 and 16 */
+H264_MC(put_, 4)
+H264_MC(put_, 8)
+H264_MC(put_, 16)
+H264_MC(avg_, 4) /* avg_ is generated for sizes 4, 8 and 16 only; there is no avg_ size 2 */
+H264_MC(avg_, 8)
+H264_MC(avg_, 16)
+
+#undef op_avg
+#undef op_put
+#undef op2_avg
+#undef op2_put
+
+static void clear_block_c(DCTELEM *block) /* zero one block of 64 DCTELEM coefficients */
+{
+    memset(block, 0, sizeof(DCTELEM)*64);
+}
+
+/**
+ * Zero six consecutive 64-coefficient blocks: memset(blocks, 0, sizeof(DCTELEM)*6*64)
+ */
+static void clear_blocks_c(DCTELEM *blocks)
+{
+    memset(blocks, 0, sizeof(DCTELEM)*6*64);
+}
+
+static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; } /* no-op; dsputil_init installs it as the default c->prefetch */
+
+/* init static data: fills ff_cropTbl (a clamp-to-0..255 lookup biased by MAX_NEG_CROP) and ff_squareTbl ((i-256) squared) */
+av_cold void dsputil_static_init(void)
+{
+    int i;
+
+    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; /* middle 256 entries: identity */
+    for(i=0;i<MAX_NEG_CROP;i++) {
+        ff_cropTbl[i] = 0; /* entries below the middle range clamp to 0 */
+        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255; /* entries above the middle range clamp to 255 */
+    }
+
+    for(i=0;i<512;i++) {
+        ff_squareTbl[i] = (i - 256) * (i - 256); /* index is biased by 256, so entry i holds the square of i-256 */
+    }
+}
+
+int ff_check_alignment(void){ /* returns 0 when a DECLARE_ALIGNED(16, ...) stack variable really sits on a 16-byte boundary, -1 otherwise */
+    static int did_fail=0; /* set on the first failure so the message below is logged at most once */
+    DECLARE_ALIGNED(16, int, aligned);
+
+    if((intptr_t)&aligned & 15){ /* any of the low four address bits set: not 16-byte aligned */
+        if(!did_fail){
+#if HAVE_MMX || HAVE_ALTIVEC
+            av_log(AV_LOG_ERROR,
+                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
+                "and may be very slow or crash. This is not a bug in libavcodec,\n"
+                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
+                "Do not report crashes to FFmpeg developers.\n");
+#endif
+            did_fail=1; /* recorded even when the message is compiled out */
+        }
+        return -1;
+    }
+    return 0;
+}
+
+av_cold void dsputil_init(DSPContext* c) /* fills c with the C implementations from this file, then calls the MMX/ARM/PPC init hooks when enabled */
+{
+    (void) avg_pixels2_c; // kill a warning, avg_pixels2_c is a macro created function.
+    ff_check_alignment(); /* return value is ignored here; the call only triggers the one-time warning */
+    dsputil_static_init(); /* (re)fills ff_cropTbl and ff_squareTbl on every call */
+ 
+    c->idct_put= ff_simple_idct_put;
+    c->idct_add= ff_simple_idct_add;
+    c->idct    = ff_simple_idct;
+
+    c->clear_block = clear_block_c;
+    c->clear_blocks = clear_blocks_c;
+
+#define dspfunc(PFX, IDX, NUM) /* fills row IDX of c->PFX_pixels_tab with the sixteen PFX NUM _mcXY_c functions; slot index is X + 4*Y */ \
+    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
+    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
+    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
+    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
+    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
+    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
+    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
+    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
+    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
+    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
+    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
+    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
+    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
+    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
+    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
+    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
+
+
+    dspfunc(put_h264_qpel, 0, 16); /* rows 0..3 hold block sizes 16, 8, 4, 2 */
+    dspfunc(put_h264_qpel, 1, 8);
+    dspfunc(put_h264_qpel, 2, 4);
+    dspfunc(put_h264_qpel, 3, 2);
+    dspfunc(avg_h264_qpel, 0, 16); /* avg has no size-2 row: this function leaves avg_h264_qpel_pixels_tab[3] unset */
+    dspfunc(avg_h264_qpel, 1, 8);
+    dspfunc(avg_h264_qpel, 2, 4);
+
+#undef dspfunc
+    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c; /* chroma tables: index 0, 1, 2 hold widths 8, 4, 2 */
+    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
+    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
+    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
+    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
+    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
+
+
+    c->prefetch= just_return; /* default prefetch is a no-op */
+
+    if (HAVE_MMX)        dsputil_init_mmx   (c); /* NOTE(review): these hooks are defined outside this view; presumably they replace some of the pointers set above - confirm */
+    if (ARCH_ARM)        dsputil_init_arm   (c);
+    if (HAVE_ALTIVEC)    dsputil_init_ppc   (c); //fixme PPC prefetch
+}
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/dsputil.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/dsputil.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,465 @@
+/*
+ * DSP utils
+ * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * DSP utils.
+ * note, many functions in here may use MMX which trashes the FPU state, it is
+ * absolutely necessary to call emms_c() between dsp & float/double code
+ */
+
+#ifndef AVCODEC_DSPUTIL_H
+#define AVCODEC_DSPUTIL_H
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "h264_idct.h"
+// 
+void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
+                             const float *win, float add_bias, int len);
+void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels);
+
+/* encoding scans */
+extern const uint8_t ff_alternate_horizontal_scan[64];
+extern const uint8_t ff_alternate_vertical_scan[64];
+extern const uint8_t ff_zigzag_direct[64];
+extern const uint8_t ff_zigzag248_direct[64];
+
+/* pixel operations */
+#define MAX_NEG_CROP 1024
+
+/* temporary */
+extern uint32_t ff_squareTbl[512];
+extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
+
+/* VP3 DSP functions */
+void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
+void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
+
+void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
+void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
+
+/* VP6 DSP functions */
+void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride,
+                           const int16_t *h_weights, const int16_t *v_weights);
+
+/* Bink functions */
+void ff_bink_idct_c    (DCTELEM *block);
+void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
+void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
+
+/* CAVS functions */
+void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
+void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
+
+/* VC1 functions */
+void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
+void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
+
+/* EA functions */
+void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
+
+/* 1/2^n downscaling functions from imgconvert.c */
+void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+
+void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
+
+/* minimum alignment rules ;)
+If you notice errors in the align stuff, need more alignment for some ASM code
+for some CPU or need to use a function with less aligned data then send a mail
+to the ffmpeg-devel mailing list, ...
+
+!warning These alignments might not match reality, (missing attribute((align))
+stuff somewhere possible).
+I (Michael) did not check them, these are just the alignments which I think
+could be reached easily ...
+
+!future video codecs might need functions with less strict alignment
+*/
+
+/*
+void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
+void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
+void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
+void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
+void clear_blocks_c(DCTELEM *blocks);
+*/
+
+/* add and put pixel (decoding) */
+// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
+//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller than 4
+typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
+typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
+typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
+typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
+
+typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);
+
+#define DEF_OLD_QPEL(name) /* declares the ff_put_, ff_put_no_rnd_ and ff_avg_ prototypes for one qpel function name */\
+void ff_put_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
+void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
+void ff_avg_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
+
+DEF_OLD_QPEL(qpel16_mc11_old_c) /* 16-wide variants */
+DEF_OLD_QPEL(qpel16_mc31_old_c)
+DEF_OLD_QPEL(qpel16_mc12_old_c)
+DEF_OLD_QPEL(qpel16_mc32_old_c)
+DEF_OLD_QPEL(qpel16_mc13_old_c)
+DEF_OLD_QPEL(qpel16_mc33_old_c)
+DEF_OLD_QPEL(qpel8_mc11_old_c) /* 8-wide variants */
+DEF_OLD_QPEL(qpel8_mc31_old_c)
+DEF_OLD_QPEL(qpel8_mc12_old_c)
+DEF_OLD_QPEL(qpel8_mc32_old_c)
+DEF_OLD_QPEL(qpel8_mc13_old_c)
+DEF_OLD_QPEL(qpel8_mc33_old_c)
+
+#define CALL_2X_PIXELS(a, b, n) /* defines static function a, which calls b twice: once at block/pixels and once n bytes further right */\
+static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    b(block  , pixels  , line_size, h);\
+    b(block+n, pixels+n, line_size, h);\
+}
+
+/* motion estimation */
+// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller than 2
+// although currently h<4 is not used as functions with width <8 are neither used nor implemented
+typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
+
+/**
+ * Scantable: a scan order plus 64-entry tables. NOTE(review): presumably filled in by ff_init_scantable() - confirm.
+ */
+typedef struct ScanTable{
+    const uint8_t *scantable; /* not owned: const pointer to a scan-order table */
+    uint8_t permutated[64];
+    uint8_t raster_end[64];
+#if ARCH_PPC
+    /** Used by dct_quantize_altivec to find last-non-zero */
+    DECLARE_ALIGNED(16, uint8_t, inverse)[64];
+#endif
+} ScanTable;
+
+void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
+
+void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize,
+                         int block_w, int block_h,
+                         int src_x, int src_y, int w, int h);
+
+
+/**
+ * DSPContext.
+ */
+typedef struct DSPContext {
+    /* pixel ops : interface with DCT */
+    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
+    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
+    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
+    void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
+    void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
+    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
+    void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
+    void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
+    
+    void (*clear_block)(DCTELEM *block/*align 16*/);
+    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
+
+
+    /**
+     * Halfpel motion compensation with rounding (a+b+1)>>1.
+     * this is an array[4][4] of motion compensation functions for 4
+     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
+     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+     * @param block destination where the result is stored
+     * @param pixels source
+     * @param line_size number of bytes in a horizontal line of block
+     * @param h height
+     */
+    op_pixels_func put_pixels_tab[4][4];
+
+    /**
+     * Halfpel motion compensation with rounding (a+b+1)>>1.
+     * This is an array[4][4] of motion compensation functions for 4
+     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
+     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+     * @param block destination into which the result is averaged (a+b+1)>>1
+     * @param pixels source
+     * @param line_size number of bytes in a horizontal line of block
+     * @param h height
+     */
+    op_pixels_func avg_pixels_tab[4][4];
+
+    /**
+     * Halfpel motion compensation with no rounding (a+b)>>1.
+     * this is an array[2][4] of motion compensation functions for 2
+     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
+     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+     * @param block destination where the result is stored
+     * @param pixels source
+     * @param line_size number of bytes in a horizontal line of block
+     * @param h height
+     */
+    op_pixels_func put_no_rnd_pixels_tab[4][4];
+
+    /**
+     * Halfpel motion compensation with no rounding (a+b)>>1.
+     * this is an array[2][4] of motion compensation functions for 2
+     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
+     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+     * @param block destination into which the result is averaged (a+b)>>1
+     * @param pixels source
+     * @param line_size number of bytes in a horizontal line of block
+     * @param h height
+     */
+    op_pixels_func avg_no_rnd_pixels_tab[4][4];
+
+    void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
+
+
+    qpel_mc_func put_qpel_pixels_tab[2][16];
+    qpel_mc_func avg_qpel_pixels_tab[2][16];
+    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
+    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
+    qpel_mc_func put_mspel_pixels_tab[8];
+
+    /**
+     * h264 Chroma MC
+     */
+    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
+    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
+    /* This is really one func used in VC-1 decoding */
+    h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3];
+    h264_chroma_mc_func avg_no_rnd_vc1_chroma_pixels_tab[3];
+
+    qpel_mc_func put_h264_qpel_pixels_tab[4][16];
+    qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
+
+    qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
+    qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];
+
+   
+    /* (I)DCT */
+    void (*fdct)(DCTELEM *block/* align 16*/);
+    void (*fdct248)(DCTELEM *block/* align 16*/);
+
+    /* IDCT really*/
+    void (*idct)(DCTELEM *block/* align 16*/);
+
+    /**
+     * block -> idct -> clip to unsigned 8 bit -> dest.
+     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
+     * @param line_size size in bytes of a horizontal line of dest
+     */
+    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+
+    /**
+     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
+     * @param line_size size in bytes of a horizontal line of dest
+     */
+    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+
+    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w);
+#define EDGE_WIDTH 32
+
+    void (*prefetch)(void *mem, int stride, int h);
+
+} DSPContext;
+
+void dsputil_static_init(void);
+void dsputil_init(DSPContext* p);
+
+int ff_check_alignment(void);
+
+/**
+ * permute block according to permutation.
+ * @param last last non zero element in scantable order
+ */
+void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
+
+void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);
+
+#define         BYTE_VEC32(c)   ((c)*0x01010101UL)
+
+static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
+{
+    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
+}
+
+static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
+{
+    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
+}
+
+
+/**
+ * Empty mmx state.
+ * this must be called between any dsp function and float/double code.
+ * for example sin(); dsp->idct_put(); emms_c(); cos()
+ */
+#define emms_c()
+
+/* should be defined by architectures supporting
+   one or more MultiMedia extension */
+int mm_support(void);
+extern int mm_flags;
+
+void dsputil_init_arm(DSPContext* c);
+void dsputil_init_mmx(DSPContext* c);
+void dsputil_init_ppc(DSPContext* c);
+
+void ff_dsputil_init_dwt(DSPContext *c);
+
+#if HAVE_MMX
+
+#undef emms_c
+
+static inline void emms(void)
+{
+    __asm__ volatile ("emms;":::"memory");
+}
+
+
+#define emms_c() \
+do { /* do/while(0) makes "emms_c();" one statement, so it is safe in an unbraced if/else */\
+    if (mm_flags & FF_MM_MMX)\
+        emms();\
+} while (0)
+
+#elif ARCH_ARM
+
+#if HAVE_NEON
+#   define STRIDE_ALIGN 16
+#endif
+
+#elif ARCH_PPC || ARCH_PPC64 || ARCH_CELL
+
+#define STRIDE_ALIGN 16
+
+#endif
+
+#ifndef STRIDE_ALIGN
+#   define STRIDE_ALIGN 8
+#endif
+
+#define WRAPPER8_16(name8, name16)\
+static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
+    return name8(s, dst           , src           , stride, h)\
+          +name8(s, dst+8         , src+8         , stride, h);\
+}
+
+#define WRAPPER8_16_SQ(name8, name16)\
+static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
+    int score=0;\
+    score +=name8(s, dst           , src           , stride, 8);\
+    score +=name8(s, dst+8         , src+8         , stride, 8);\
+    if(h==16){\
+        dst += 8*stride;\
+        src += 8*stride;\
+        score +=name8(s, dst           , src           , stride, 8);\
+        score +=name8(s, dst+8         , src+8         , stride, 8);\
+    }\
+    return score;\
+}
+
+static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        AV_WN16(dst   , AV_RN16(src   ));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        AV_WN32(dst   , AV_RN32(src   ));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        AV_WN32(dst   , AV_RN32(src   ));
+        AV_WN32(dst+4 , AV_RN32(src+4 ));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        AV_WN32(dst   , AV_RN32(src   ));
+        AV_WN32(dst+4 , AV_RN32(src+4 ));
+        dst[8]= src[8];
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        AV_WN32(dst   , AV_RN32(src   ));
+        AV_WN32(dst+4 , AV_RN32(src+4 ));
+        AV_WN32(dst+8 , AV_RN32(src+8 ));
+        AV_WN32(dst+12, AV_RN32(src+12));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        AV_WN32(dst   , AV_RN32(src   ));
+        AV_WN32(dst+4 , AV_RN32(src+4 ));
+        AV_WN32(dst+8 , AV_RN32(src+8 ));
+        AV_WN32(dst+12, AV_RN32(src+12));
+        dst[16]= src[16];
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+#endif /* AVCODEC_DSPUTIL_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/get_bits.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/get_bits.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,325 @@
+/*
+ * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * bitstream reader API header.
+ */
+
+#ifndef AVCODEC_GET_BITS_H
+#define AVCODEC_GET_BITS_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "libavutil/bswap.h"
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/log.h"
+#include "mathops.h"
+
+
+typedef struct GetBitContext { /* state of the bitstream reader implemented by the macros and inline functions in this header */
+    uint8_t *rbsp;                      /* not referenced by any reader in this header */
+    unsigned int rbsp_size;             /* not referenced by any reader in this header */
+    uint8_t *raw;                       /* not referenced by any reader in this header */
+    const uint8_t *buffer, *buffer_end; /* bitstream start and start + byte size, both set by init_get_bits() */
+    unsigned int alloc_size;            /* not referenced by any reader in this header */
+    unsigned int buf_size;              /* not referenced by any reader in this header */
+    uint32_t *buffer_ptr;               /* next 32-bit word UPDATE_CACHE will load; init_get_bits() aligns it down to 4 bytes */
+    uint32_t cache0;                    /* word returned by GET_CACHE; SHOW_UBITS/SHOW_SBITS read from it */
+    uint32_t cache1;                    /* second cache word; SKIP_CACHE shifts its bits into cache0 */
+    int bit_count;                      /* SKIP_COUNTER adds to it; UPDATE_CACHE refills when it is > 0, then subtracts 32 */
+    int size_in_bits;                   /* bit_size passed to init_get_bits(); read by get_bits_left() */
+} GetBitContext;
+
+/* Bitstream reader API docs:
+name
+    arbitrary name which is used as prefix for the internal variables
+
+gb
+    getbitcontext
+
+OPEN_READER(name, gb)
+    loads gb into local variables
+
+CLOSE_READER(name, gb)
+    stores local vars in gb
+
+UPDATE_CACHE(name, gb)
+    refills the internal cache from the bitstream
+    after this call at least MIN_CACHE_BITS will be available,
+
+GET_CACHE(name, gb)
+    will output the contents of the internal cache, next bit is MSB of 32 or 64 bit (FIXME 64bit)
+
+SHOW_UBITS(name, gb, num)
+    will return the next num bits
+
+SHOW_SBITS(name, gb, num)
+    will return the next num bits and do sign extension
+
+SKIP_BITS(name, gb, num)
+    will skip over the next num bits
+    note, this is equivalent to SKIP_CACHE; SKIP_COUNTER
+
+SKIP_CACHE(name, gb, num)
+    will remove the next num bits from the cache (note SKIP_COUNTER MUST be called before UPDATE_CACHE / CLOSE_READER)
+
+SKIP_COUNTER(name, gb, num)
+    will increment the internal bit counter (see SKIP_CACHE & SKIP_BITS)
+
+LAST_SKIP_CACHE(name, gb, num)
+    will remove the next num bits from the cache if it is needed for UPDATE_CACHE otherwise it will do nothing
+
+LAST_SKIP_BITS(name, gb, num)
+    is equivalent to LAST_SKIP_CACHE; SKIP_COUNTER
+
+for examples see get_bits, show_bits, skip_bits, get_vlc
+*/
+
+#define MIN_CACHE_BITS 32
+
+#define OPEN_READER(name, gb)\
+	int name##_bit_count=(gb)->bit_count;\
+	uint32_t name##_cache0= (gb)->cache0;\
+	uint32_t name##_cache1= (gb)->cache1;\
+	uint32_t * name##_buffer_ptr=(gb)->buffer_ptr;\
+
+#define CLOSE_READER(name, gb)\
+	(gb)->bit_count= name##_bit_count;\
+	(gb)->cache0= name##_cache0;\
+	(gb)->cache1= name##_cache1;\
+	(gb)->buffer_ptr= name##_buffer_ptr;\
+
+#define UPDATE_CACHE(name, gb)\
+	if(name##_bit_count > 0){\
+		const uint32_t next= be2me_32( *name##_buffer_ptr );\
+		name##_cache0 |= NEG_USR32(next,name##_bit_count);\
+		name##_cache1 |= next<<name##_bit_count;\
+		name##_buffer_ptr++;\
+		name##_bit_count-= 32;\
+	}\
+
+#if ARCH_X86
+#   define SKIP_CACHE(name, gb, num)\
+        __asm__(\
+            "shldl %2, %1, %0          \n\t"\
+            "shll %2, %1               \n\t"\
+            : "+r" (name##_cache0), "+r" (name##_cache1)\
+            : "Ic" ((uint8_t)(num))\
+           );
+#else
+#   define SKIP_CACHE(name, gb, num)\
+        name##_cache0 <<= (num);\
+        name##_cache0 |= NEG_USR32(name##_cache1,num);\
+        name##_cache1 <<= (num);
+#endif
+
+#define SKIP_COUNTER(name, gb, num)\
+	name##_bit_count += (num);\
+
+#define SKIP_BITS(name, gb, num)\
+	{\
+		SKIP_CACHE(name, gb, num)\
+		SKIP_COUNTER(name, gb, num)\
+	}\
+
+#define LAST_SKIP_BITS(name, gb, num) SKIP_BITS(name, gb, num)
+#define LAST_SKIP_CACHE(name, gb, num) SKIP_CACHE(name, gb, num)
+
+#define SHOW_UBITS(name, gb, num)\
+	NEG_USR32(name##_cache0, num)
+
+#define SHOW_SBITS(name, gb, num)\
+        NEG_SSR32(name##_cache0, num)
+
+#define GET_CACHE(name, gb)\
+	(name##_cache0)
+
+static inline int get_bits_count(const GetBitContext *s){
+    return ((uint8_t*)s->buffer_ptr - s->buffer)*8 - 32 + s->bit_count;
+}
+
+static inline void skip_bits_long(GetBitContext *s, int n){
+    OPEN_READER(re, s)
+    re_bit_count += n;
+    re_buffer_ptr += re_bit_count>>5;
+    re_bit_count &= 31;
+    re_cache0 = be2me_32( re_buffer_ptr[-1] ) << re_bit_count;
+    re_cache1 = 0;
+    UPDATE_CACHE(re, s)
+    CLOSE_READER(re, s)
+}
+
+/**
+ * read mpeg1 dc style vlc (sign bit + mantissa with no MSB).
+ * if MSB not set it is negative
+ * @param n length in bits
+ * @author BERO
+ */
+static inline int get_xbits(GetBitContext *s, int n){
+    register int sign;
+    register int32_t cache;
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    cache = GET_CACHE(re,s);
+    sign=(~cache)>>31;
+    LAST_SKIP_BITS(re, s, n)
+    CLOSE_READER(re, s)
+    return (NEG_USR32(sign ^ cache, n) ^ sign) - sign;
+}
+
+static inline int get_sbits(GetBitContext *s, int n){
+    register int tmp;
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    tmp= SHOW_SBITS(re, s, n);
+    LAST_SKIP_BITS(re, s, n)
+    CLOSE_READER(re, s)
+    return tmp;
+}
+
+/**
+ * reads 1-17 bits.
+ * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't
+ */
+static inline unsigned int get_bits(GetBitContext *s, int n){
+    register int tmp;
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    tmp= SHOW_UBITS(re, s, n);
+    LAST_SKIP_BITS(re, s, n)
+    CLOSE_READER(re, s)
+    return tmp;
+}
+
+/**
+ * shows 1-17 bits.
+ * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't
+ */
+static inline unsigned int show_bits(GetBitContext *s, int n){
+    register int tmp;
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    tmp= SHOW_UBITS(re, s, n);
+//    CLOSE_READER(re, s)
+    return tmp;
+}
+
+static inline void skip_bits(GetBitContext *s, int n){
+ //Note gcc seems to optimize this to s->index+=n for the ALT_READER :))
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    LAST_SKIP_BITS(re, s, n)
+    CLOSE_READER(re, s)
+}
+
+static inline unsigned int get_bits1(GetBitContext *s){
+    return get_bits(s, 1);
+}
+
+static inline unsigned int show_bits1(GetBitContext *s){
+    return show_bits(s, 1);
+}
+
+static inline void skip_bits1(GetBitContext *s){
+    skip_bits(s, 1);
+}
+
+/**
+ * reads 0-32 bits; n == 0 returns 0 and leaves the reader state untouched.
+ */
+static inline unsigned int get_bits_long(GetBitContext *s, int n){
+    unsigned int ret;
+    if(!n) return 0; /* get_bits() is documented for n >= 1; n == 0 would hand a zero count to NEG_USR32 (presumably a 32-bit shift) */
+    if(n<=MIN_CACHE_BITS) return get_bits(s, n);
+    ret= get_bits(s, 16) << (n-16); /* head is read first; kept as its own statement so the two reads stay ordered */
+    return ret | get_bits(s, n-16);
+}
+
+/**
+ * reads 0-32 bits as a signed integer.
+ */
+static inline int get_sbits_long(GetBitContext *s, int n) {
+    return sign_extend(get_bits_long(s, n), n);
+}
+
+/**
+ * shows 0-32 bits without advancing the reader; n == 0 yields 0.
+ */
+static inline unsigned int show_bits_long(GetBitContext *s, int n){
+    GetBitContext gb;
+    if(!n) return 0; /* show_bits() is documented for n >= 1; n == 0 would hand a zero count to NEG_USR32 */
+    if(n<=MIN_CACHE_BITS) return show_bits(s, n);
+    gb= *s; /* read from a copy so that *s is not advanced */
+    return get_bits_long(&gb, n);
+}
+
+static inline int check_marker(GetBitContext *s, const char *msg)
+{
+    int bit= get_bits1(s);
+    if(!bit)
+        av_log(AV_LOG_INFO, "Marker bit missing %s\n", msg);
+
+    return bit;
+}
+
+/**
+ * init GetBitContext.
+ * @param buffer bitstream buffer, must be FF_INPUT_BUFFER_PADDING_SIZE bytes larger than the actual read bits
+ * because some optimized bitstream readers read 32 or 64 bit at once and could read over the end
+ * @param bit_size the size of the buffer in bits
+ *
+ * While GetBitContext stores the buffer size, for performance reasons you are
+ * responsible for checking for the buffer end yourself (take advantage of the padding)!
+ */
+static inline void init_get_bits(GetBitContext *s,
+                   const uint8_t *buffer, int bit_size)
+{
+    int buffer_size= (bit_size+7)>>3;
+    if(buffer_size < 0 || bit_size < 0) {
+        buffer_size = bit_size = 0;
+        buffer = NULL;
+    }
+
+    s->buffer= buffer;
+    s->size_in_bits= bit_size;
+    s->buffer_end= buffer + buffer_size;
+
+    s->buffer_ptr = (uint32_t*)((intptr_t)buffer&(~3));
+    s->bit_count = 32 + 8*((intptr_t)buffer&3);
+    skip_bits_long(s, 0);
+}
+
+static inline void align_get_bits(GetBitContext *s)
+{
+    int n= (-get_bits_count(s)) & 7;
+    if(n) skip_bits(s, n);
+}
+
+#define tprintf(p, ...) {}
+
+static inline int get_bits_left(GetBitContext *gb)
+{
+    return gb->size_in_bits - get_bits_count(gb);
+}
+
+#endif /* AVCODEC_GET_BITS_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/golomb.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/golomb.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,184 @@
+/*
+ * exp golomb vlc stuff
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @brief
+ *     exp golomb vlc stuff
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "libavutil/common.h"
+
+const uint8_t ff_log2_tab[256]={
+    0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+    5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+const uint8_t ff_golomb_vlc_len[512]={
+14,13,12,12,11,11,11,11,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+};
+
+const uint8_t ff_ue_golomb_vlc_code[512]={
+31,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,
+ 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+const int8_t ff_se_golomb_vlc_code[512]={
+ 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  8, -8,  9, -9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15,
+  4,  4,  4,  4, -4, -4, -4, -4,  5,  5,  5,  5, -5, -5, -5, -5,  6,  6,  6,  6, -6, -6, -6, -6,  7,  7,  7,  7, -7, -7, -7, -7,
+  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+};
+
+
+const uint8_t ff_ue_golomb_len[256]={
+ 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,11,
+11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,13,
+13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,15,
+15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,17,
+};
+
+const uint8_t ff_interleaved_golomb_vlc_len[256]={
+9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
+9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
+9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+};
+
+const uint8_t ff_interleaved_ue_golomb_vlc_code[256]={
+ 15,16,7, 7, 17,18,8, 8, 3, 3, 3, 3, 3, 3, 3, 3,
+ 19,20,9, 9, 21,22,10,10,4, 4, 4, 4, 4, 4, 4, 4,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 23,24,11,11,25,26,12,12,5, 5, 5, 5, 5, 5, 5, 5,
+ 27,28,13,13,29,30,14,14,6, 6, 6, 6, 6, 6, 6, 6,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+const int8_t ff_interleaved_se_golomb_vlc_code[256]={
+  8, -8,  4,  4,  9, -9, -4, -4,  2,  2,  2,  2,  2,  2,  2,  2,
+ 10,-10,  5,  5, 11,-11, -5, -5, -2, -2, -2, -2, -2, -2, -2, -2,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+ 12,-12,  6,  6, 13,-13, -6, -6,  3,  3,  3,  3,  3,  3,  3,  3,
+ 14,-14,  7,  7, 15,-15, -7, -7, -3, -3, -3, -3, -3, -3, -3, -3,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+};
+
+const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]={
+0, 1, 0, 0, 2, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+4, 5, 2, 2, 6, 7, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+8, 9, 4, 4, 10,11,5, 5, 2, 2, 2, 2, 2, 2, 2, 2,
+12,13,6, 6, 14,15,7, 7, 3, 3, 3, 3, 3, 3, 3, 3,
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,};
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/golomb.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/golomb.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,410 @@
+/*
+ * exp golomb vlc stuff
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2004 Alex Beregszaszi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @brief
+ *     exp golomb vlc stuff
+ * @author Michael Niedermayer <michaelni@gmx.at> and Alex Beregszaszi
+ */
+
+#ifndef AVCODEC_GOLOMB_H
+#define AVCODEC_GOLOMB_H
+
+#include <stdint.h>
+#include "get_bits.h"
+
+#define INVALID_VLC           0x80000000
+
+extern const uint8_t ff_golomb_vlc_len[512];
+extern const uint8_t ff_ue_golomb_vlc_code[512];
+extern const  int8_t ff_se_golomb_vlc_code[512];
+extern const uint8_t ff_ue_golomb_len[256];
+
+extern const uint8_t ff_interleaved_golomb_vlc_len[256];
+extern const uint8_t ff_interleaved_ue_golomb_vlc_code[256];
+extern const  int8_t ff_interleaved_se_golomb_vlc_code[256];
+extern const uint8_t ff_interleaved_dirac_golomb_vlc_code[256];
+
+
+/**
+ * Read an unsigned exp golomb code; cache values >= 1<<27 are resolved through the 512-entry tables.
+ */
+static inline int get_ue_golomb(GetBitContext *gb){
+    unsigned int buf;
+    int log;
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf=GET_CACHE(re, gb);
+
+    if(buf >= (1<<27)){
+        buf >>= 32 - 9; /* shift down to a 9-bit index for the length and code tables */
+        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
+        CLOSE_READER(re, gb);
+
+        return ff_ue_golomb_vlc_code[buf];
+    }else{
+        log= 2*av_log2_c(buf) - 31; /* shift amount derived from the position of the highest set bit */
+        buf>>= log;
+        buf--; /* value returned is the shifted cache minus one */
+        LAST_SKIP_BITS(re, gb, 32 - log); /* 32 - log bits are consumed */
+        CLOSE_READER(re, gb);
+
+        return buf;
+    }
+}
+
+/**
+ * Read an unsigned exp golomb code, constrained to a maximum of 31.
+ * The return value is undefined if the stored value exceeds 31.
+ */
+static inline int get_ue_golomb_31(GetBitContext *gb){
+    unsigned int buf;
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf=GET_CACHE(re, gb);
+
+    buf >>= 32 - 9; /* always takes the table path: 9-bit index, no range check */
+    LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
+    CLOSE_READER(re, gb);
+
+    return ff_ue_golomb_vlc_code[buf];
+}
+
+static inline int svq3_get_ue_golomb(GetBitContext *gb){
+    uint32_t buf;
+    /* Reads an unsigned code through the interleaved golomb tables, 8 cache bits per lookup. */
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf=GET_CACHE(re, gb);
+
+    if(buf&0xAA800000){ /* a bit of this mask is set: one table lookup decodes the whole code */
+        buf >>= 32 - 8;
+        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
+        CLOSE_READER(re, gb);
+
+        return ff_interleaved_ue_golomb_vlc_code[buf];
+    }else{
+        int ret = 1; /* accumulator; 1 is subtracted again on return */
+
+        while (1) {
+            buf >>= 32 - 8;
+            LAST_SKIP_BITS(re, gb, FFMIN(ff_interleaved_golomb_vlc_len[buf], 8)); /* never consume more than the 8 bits looked at */
+
+            if (ff_interleaved_golomb_vlc_len[buf] != 9){ /* any length other than 9 ends the loop */
+                ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1;
+                ret |= ff_interleaved_dirac_golomb_vlc_code[buf];
+                break;
+            }
+            ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf]; /* length 9: append 4 bits and fetch the next 8 */
+            UPDATE_CACHE(re, gb);
+            buf = GET_CACHE(re, gb);
+        }
+
+        CLOSE_READER(re, gb);
+        return ret - 1;
+    }
+}
+
+/**
+ * Read an unsigned truncated exp golomb code; a range of 1 yields 0 without reading any bits.
+ */
+static inline int get_te0_golomb(GetBitContext *gb, int range){
+    assert(range >= 1);
+
+    if (range == 1) return 0;
+    if (range != 2) return get_ue_golomb(gb);
+    return get_bits1(gb) ^ 1; /* range 2: a single inverted bit */
+}
+
+/**
+ * Read an unsigned truncated exp golomb code: one inverted bit for range 2, a ue code otherwise.
+ */
+static inline int get_te_golomb(GetBitContext *gb, int range){
+    assert(range >= 1);
+
+    if (range != 2) return get_ue_golomb(gb);
+    return get_bits1(gb) ^ 1;
+}
+
+
+/**
+ * Read a signed exp golomb code; cache values >= 1<<27 are resolved through the 512-entry tables.
+ */
+static inline int get_se_golomb(GetBitContext *gb){
+    unsigned int buf;
+    int log;
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf=GET_CACHE(re, gb);
+
+    if(buf >= (1<<27)){
+        buf >>= 32 - 9; /* 9-bit index for the length and signed code tables */
+        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
+        CLOSE_READER(re, gb);
+
+        return ff_se_golomb_vlc_code[buf];
+    }else{
+        log= 2*av_log2_c(buf) - 31;
+        buf>>= log;
+
+        LAST_SKIP_BITS(re, gb, 32 - log);
+        CLOSE_READER(re, gb);
+
+        if(buf&1) buf= -(buf>>1); /* odd shifted value: negative result */
+        else      buf=  (buf>>1); /* even shifted value: positive result */
+
+        return buf;
+    }
+}
+
+static inline int svq3_get_se_golomb(GetBitContext *gb){ /* signed code via the interleaved tables; may return INVALID_VLC */
+    unsigned int buf;
+    int log;
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf=GET_CACHE(re, gb);
+
+    if(buf&0xAA800000){ /* a bit of this mask is set: one table lookup decodes the whole code */
+        buf >>= 32 - 8;
+        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
+        CLOSE_READER(re, gb);
+
+        return ff_interleaved_se_golomb_vlc_code[buf];
+    }else{
+        LAST_SKIP_BITS(re, gb, 8);
+        UPDATE_CACHE(re, gb);
+        buf |= 1 | (GET_CACHE(re, gb) >> 8); /* merge the refilled cache below the 8 bits already skipped; force bit 0 */
+
+        if((buf & 0xAAAAAAAA) == 0)
+            return INVALID_VLC; /* no bit of the 0xAAAAAAAA mask set; note: returns without CLOSE_READER */
+
+        for(log=31; (buf & 0x80000000) == 0; log--){ /* runs until the top bit of buf is set */
+            buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30);
+        }
+
+        LAST_SKIP_BITS(re, gb, 63 - 2*log - 8); /* the 8 bits skipped above are already accounted for */
+        CLOSE_READER(re, gb);
+
+        return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1;
+    }
+}
+
+static inline int dirac_get_se_golomb(GetBitContext *gb){ /* svq3_get_ue_golomb() value followed by one sign bit when non-zero */
+    uint32_t buf;
+    uint32_t ret;
+
+    ret = svq3_get_ue_golomb(gb);
+
+    if (ret) { /* zero carries no sign bit */
+        OPEN_READER(re, gb);
+        UPDATE_CACHE(re, gb);
+        buf = SHOW_SBITS(re, gb, 1); /* presumably 0 or all ones (sign-extended bit) - confirm in get_bits.h */
+        LAST_SKIP_BITS(re, gb, 1);
+        ret = (ret ^ buf) - buf; /* unchanged for buf == 0, negated for buf == all ones */
+        CLOSE_READER(re, gb);
+    }
+
+    return ret;
+}
+
+/**
+ * Read an unsigned golomb rice code (ffv1); when log <= 31-limit an esc_len-bit escape value is read instead.
+ */
+static inline int get_ur_golomb(GetBitContext *gb, int k, int limit, int esc_len){
+    unsigned int buf;
+    int log;
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf=GET_CACHE(re, gb);
+
+    log= av_log2_c(buf);
+
+    if(log > 31-limit){ /* regular code: decoded straight from the cache */
+        buf >>= log - k;
+        buf += (30-log)<<k;
+        LAST_SKIP_BITS(re, gb, 32 + k - log);
+        CLOSE_READER(re, gb);
+
+        return buf;
+    }else{ /* escape: skip limit bits, then read esc_len bits */
+        LAST_SKIP_BITS(re, gb, limit);
+        UPDATE_CACHE(re, gb);
+
+        buf = SHOW_UBITS(re, gb, esc_len);
+
+        LAST_SKIP_BITS(re, gb, esc_len);
+        CLOSE_READER(re, gb);
+
+        return buf + limit - 1;
+    }
+}
+
+/**
+ * Read an unsigned golomb rice code (jpegls); returns -1 when the zero-bit prefix is longer than limit - 1.
+ */
+static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit, int esc_len){
+    unsigned int buf;
+    int log;
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf=GET_CACHE(re, gb);
+
+    log= av_log2_c(buf);
+
+    if(log - k >= 32-MIN_CACHE_BITS+(MIN_CACHE_BITS==32) && 32-log < limit){ /* fast path: decode straight from the cache */
+        buf >>= log - k;
+        buf += (30-log)<<k;
+        LAST_SKIP_BITS(re, gb, 32 + k - log);
+        CLOSE_READER(re, gb);
+
+        return buf;
+    }else{
+        int i;
+        for(i=0; SHOW_UBITS(re, gb, 1) == 0; i++){ /* count zero bits one at a time, refilling the cache after each */
+            LAST_SKIP_BITS(re, gb, 1);
+            UPDATE_CACHE(re, gb);
+        }
+        SKIP_BITS(re, gb, 1); /* skip the bit that ended the run */
+
+        if(i < limit - 1){
+            if(k){
+                buf = SHOW_UBITS(re, gb, k);
+                LAST_SKIP_BITS(re, gb, k);
+            }else{
+                buf=0; /* k == 0: no suffix bits are read */
+            }
+
+            CLOSE_READER(re, gb);
+            return buf + (i<<k);
+        }else if(i == limit - 1){ /* escape: esc_len bits follow */
+            buf = SHOW_UBITS(re, gb, esc_len);
+            LAST_SKIP_BITS(re, gb, esc_len);
+            CLOSE_READER(re, gb);
+
+            return buf + 1;
+        }else
+            return -1; /* note: this path returns without CLOSE_READER */
+    }
+}
+
+/**
+ * Read a signed golomb rice code (ffv1).
+ * With c = unsigned code + 1: odd c yields c>>1, even c yields -(c>>1).
+ */
+static inline int get_sr_golomb(GetBitContext *gb, int k, int limit, int esc_len){
+    const int code = get_ur_golomb(gb, k, limit, esc_len) + 1;
+    const int half = code >> 1;
+
+    if (!(code & 1))
+        return -half;
+    return half;
+}
+
+/**
+ * Read a signed golomb rice code (flac): bit 0 of the jpegls code selects the complement of the remaining bits.
+ */
+static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit, int esc_len){
+    const int code = get_ur_golomb_jpegls(gb, k, limit, esc_len);
+    return -(code & 1) ^ (code >> 1);
+}
+
+/**
+ * Read an unsigned golomb rice code (shorten): the jpegls reader with limit INT_MAX and esc_len 0.
+ */
+static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k){
+    return (unsigned int)get_ur_golomb_jpegls(gb, k, INT_MAX, 0);
+}
+
+/**
+ * Read a signed golomb rice code (shorten) from a jpegls code read with parameter k + 1.
+ */
+static inline int get_sr_golomb_shorten(GetBitContext* gb, int k)
+{
+    const int code = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0);
+    const int half = code >> 1;
+
+    /* odd codes map to the one's complement of the halved value */
+    return (code & 1) ? ~half : half;
+}
+
+
+
+#ifdef TRACE
+
+static inline int get_ue(GetBitContext *s, char *file, const char *func, int line){
+    /* TRACE helper: decode one ue code and log its raw bits, length and bit position. */
+    const int peeked   = show_bits(s, 24);
+    const int start    = get_bits_count(s);
+    const int value    = get_ue_golomb(s);
+    const int consumed = get_bits_count(s) - start;
+    const int raw      = peeked >> (24 - consumed);
+
+    print_bin(raw, consumed);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue  @%5d in %s %s:%d\n", raw, consumed, value, start, file, func, line);
+
+    return value;
+}
+
+static inline int get_se(GetBitContext *s, char *file, const char *func, int line){
+    /* TRACE helper: decode one se code and log its raw bits, length and bit position. */
+    const int peeked   = show_bits(s, 24);
+    const int start    = get_bits_count(s);
+    const int value    = get_se_golomb(s);
+    const int consumed = get_bits_count(s) - start;
+    const int raw      = peeked >> (24 - consumed);
+
+    print_bin(raw, consumed);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se  @%5d in %s %s:%d\n", raw, consumed, value, start, file, func, line);
+
+    return value;
+}
+
+static inline int get_te(GetBitContext *s, int r, char *file, const char *func, int line){
+    /* TRACE helper: decode one te code through get_te0_golomb() and log its raw bits, length and position. */
+    const int peeked   = show_bits(s, 24);
+    const int start    = get_bits_count(s);
+    const int value    = get_te0_golomb(s, r);
+    const int consumed = get_bits_count(s) - start;
+    const int raw      = peeked >> (24 - consumed);
+
+    print_bin(raw, consumed);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te  @%5d in %s %s:%d\n", raw, consumed, value, start, file, func, line);
+
+    return value;
+}
+
+#define get_ue_golomb(a) get_ue(a, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+#define get_se_golomb(a) get_se(a, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+#define get_te_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+#define get_te0_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+
+#endif
+
+
+#endif /* AVCODEC_GOLOMB_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,215 @@
+#include "config.h"
+#include "h264.h"
+#include "h264_misc.h"
+#include <math.h>
+
+H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int width, int height, h264_options *opts){
+    /* Builds the decoder context for a width x height stream: copies the options, sizes buffers and queues, and
+     * initialises the locks. Returns NULL (nothing unwound) if the context or a ring line entry cannot be allocated. */
+    int i;
+    const int mb_height = (height + 15) / 16;
+    const int mb_width  = (width  + 15) / 16;
+    const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16
+
+    ff_init_cabac_states();
+
+    H264Context *h= av_mallocz(sizeof(H264Context));
+    if (!h) return NULL; // nothing below can work without the context
+    start_timer(h, TOTAL);
+    h->file_name = file_name;
+    h->profile = opts->profile;
+    for (i=0; i<PROFILE_STAGES; i++)
+        h->total_time[i]=0;
+
+    h->ifile=ifile;
+    h->ofile =ofile;
+
+    h->verbose =opts->verbose;
+    h->no_mbd =opts->no_mbd;
+    h->static_3d =opts->static_3d;
+    h->pipe_bufs = opts->pipe_bufs;
+    h->slice_bufs = opts->slice_bufs;
+
+    h->ed_ppe_threads =0;
+    if (opts->ppe_ed){
+        h->ed_ppe_threads = (opts->threads >opts->ppe_ed)? opts->ppe_ed :opts->threads; // min(threads, ppe_ed)
+    }
+
+    h->threads = opts->threads - h->ed_ppe_threads; // threads left after the ppe entropy threads
+    h->smt = opts->smt;
+    if (h->smt){
+        h->threads *= 2;
+    }
+
+    h->num_frames = opts->numframes;
+
+    h->frame_width = width;
+    h->frame_height = height;
+
+    // round width up so that width/2 is a multiple of STRIDE_ALIGN; computed in one step so it always terminates
+    width += 2 * ((STRIDE_ALIGN - (width/2) % STRIDE_ALIGN) % STRIDE_ALIGN);
+    h->width = width;
+    h->height = mb_height*16;
+
+    h->mb_height = mb_height;
+    h->mb_width = mb_width;
+    h->mb_stride = mb_stride;
+    h->b4_stride = mb_width*4 + 1;
+    h->b_stride = mb_width*4;
+
+    h->smb_width = opts->smb_size[0];
+    h->smb_height = opts->smb_size[1] < h->smb_width ?  opts->smb_size[1]  : h->smb_width; // never taller than wide
+    h->smbc = getSuperMBContext(h, h->smb_width, h->smb_height);
+
+    h->wave_order = opts->wave_order;
+
+    h->pipe_bufs = opts->pipe_bufs;
+    h->max_dpb_cnt = DPB_SIZE + opts->pipe_bufs;
+    h->free_dpb_cnt = h->max_dpb_cnt;
+    h->dpb = av_mallocz (h->max_dpb_cnt* sizeof (DecodedPicture));
+
+    h->free_sb_cnt = h->threads*opts->slice_bufs + (h->no_mbd != 0) ;  //one extra to overlap some latency of signaling/freeing slicebuffers in entropy only mode
+    h->sb_size = h->free_sb_cnt;
+    h->sb = av_mallocz(h->sb_size* sizeof(SliceBufferEntry));
+
+    h->rl_q.size = FFMAX(1, FFMIN( (h->height-3 - 512)/16, h->mb_width/2)) +1;
+    h->rl_q.free = h->rl_q.size -1;
+    h->rl_q.ready=0;
+    h->rl_q.fi = h->rl_q.fo= 0;
+    h->rl_q.queue = av_malloc(h->rl_q.size* sizeof(RingLineEntry*));
+    for (i=0; i<h->rl_q.size; i++){
+        if( posix_memalign((void**)&h->rl_q.queue[i],64,sizeof(RingLineEntry)))
+            return NULL; // no entry: it cannot be filled here or linked into the ring below
+        h->rl_q.queue[i]->top = av_malloc(h->mb_width*sizeof(TopBorder));
+    }
+
+    h->rl_q.queue[0]->prev_line = h->rl_q.queue[h->rl_q.size-1]; // close the ring: first entry follows the last
+    for (i=1; i<h->rl_q.size; i++){
+        h->rl_q.queue[i]->prev_line = h->rl_q.queue[i-1];
+    }
+
+    if( HAVE_MMX | HAVE_ALTIVEC| HAVE_NEON ){
+        for(i=0; i<16; i++){
+            #define T(x) (x>>2) | ((x<<2) & 0xF)
+            h->zigzag_scan[i] = T(zigzag_scan[i]);
+            #undef T
+        }
+        for(i=0; i<64; i++){
+            #define T(x) (x>>3) | ((x&7)<<3)
+            h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
+            #undef T
+        }
+    }else{
+        memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
+        memcpy(h->zigzag_scan8x8, ff_zigzag_direct, 64*sizeof(uint8_t));
+    }
+
+    pthread_mutex_init(&h->smb_lock, NULL);
+    pthread_mutex_init(&h->sdl_lock, NULL);
+    pthread_cond_init(&h->sdl_cond, NULL);
+
+    ///pthread initialization
+    pthread_mutex_init(&h->ilock, NULL);
+    pthread_cond_init(&h->icond, NULL);
+    pthread_mutex_init(&h->slock, NULL);
+    pthread_cond_init(&h->scond, NULL);
+    pthread_mutex_init(&h->tlock, NULL);
+    pthread_cond_init(&h->tcond, NULL);
+    pthread_mutex_init(&h->tdlock, NULL);
+    pthread_cond_init(&h->tdcond, NULL);
+    h->start =!opts->numamap; //default dont wait for start signal
+    h->statmbd = opts->statmbd;
+    h->rl_side_touch= opts->numamap;
+    h->touch_start=0;
+    h->setaff =opts->statsched;
+    h->init_threads=0;
+
+    pthread_mutex_init(&h->task_lock, NULL);
+    pthread_cond_init(&h->task_cond, NULL);
+    for (i=0; i<STAGES; i++){
+        pthread_mutex_init (&h->lock[i], NULL);
+        pthread_cond_init (&h->cond[i], NULL);
+
+        pthread_mutex_init (&h->sb_q[i].lock, NULL);
+        pthread_cond_init (&h->sb_q[i].cond, NULL);
+        h->sb_q[i].size = h->free_sb_cnt; //change to num threads later
+        h->sb_q[i].queue = av_malloc(h->free_sb_cnt* sizeof(SliceBufferEntry*));
+        h->sb_q[i].cnt = h->sb_q[i].fi = h->sb_q[i].fo =0;
+    }
+
+#if HAVE_LIBSDL2
+    h->sdlq.size=2;
+    h->sdlq.ready=2;
+    h->sdlq.queue = av_malloc(2* sizeof(SDL_Texture*));
+    pthread_mutex_init (&h->sdlq.sdl_lock, NULL);
+    pthread_cond_init (&h->sdlq.sdl_cond, NULL);
+#endif
+
+    h->display=opts->display;
+    h->fullscreen=opts->fullscreen;
+
+    return h;
+}
+
+
+void free_h264dec_context(H264Context *h) { /* releases what get_h264dec_context() set up, prints timings, frees h */
+    int i;
+
+    for(i=0; i<h->max_dpb_cnt; i++)
+        free_dp(&h->dpb[i]);
+    av_free (h->dpb);
+
+    for(i=0; i<h->sb_size; i++){
+        if (h->sb[i].initialized){ /* only entries flagged as initialized are released */
+            free_sb_entry(&h->sb[i]);
+        }
+    }
+    av_freep(&h->sb);
+
+    for (i=0; i<h->rl_q.size; i++){
+        av_freep(&h->rl_q.queue[i]->top); /* free the per-entry top border before the entry itself */
+        av_freep(&h->rl_q.queue[i]);
+    }
+    av_freep(&h->rl_q.queue);
+
+    ///pthread cleanup - NOTE(review): tlock/tcond/tdlock/tdcond are initialised in get_h264dec_context() but not destroyed here; confirm
+    pthread_mutex_destroy (&h->task_lock);
+    pthread_cond_destroy (&h->task_cond);
+    for (i=0; i<STAGES; i++){
+        pthread_mutex_destroy (&h->lock[i]);
+        pthread_cond_destroy (&h->cond[i]);
+
+        pthread_mutex_destroy (&h->sb_q[i].lock);
+        pthread_cond_destroy (&h->sb_q[i].cond);
+        av_freep( &h->sb_q[i].queue);
+    }
+    pthread_mutex_destroy (&h->slock);
+    pthread_cond_destroy (&h->scond);
+    pthread_mutex_destroy (&h->ilock);
+    pthread_cond_destroy (&h->icond);
+
+    pthread_mutex_destroy(&h->smb_lock);
+    pthread_mutex_destroy (&h->sdl_lock);
+    pthread_cond_destroy (&h->sdl_cond);
+#if HAVE_LIBSDL2
+    av_free(h->sdlq.queue);
+    pthread_mutex_destroy (&h->sdlq.sdl_lock);
+    pthread_cond_destroy (&h->sdlq.sdl_cond);
+#endif
+
+    stop_timer(h, TOTAL);
+    if (h->threads==0){ /* the timing report is produced only when threads == 0 */
+        for (i=0; i<PROFILE_STAGES; i++)
+            h->total_time[i] /= h->num_frames; /* NOTE(review): num_frames is not checked for 0 */
+        double others = h->total_time[TOTAL];
+        for (i=1; i<PROFILE_STAGES; i++)
+            others-=h->total_time[i]; /* whatever the stages with index >= 1 do not account for */
+        if (h->profile == 1){
+            printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [MBREC %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED], h->total_time[REC], others);
+        }else if (h->profile ==2){ /* same values as profile 1; only the REC label differs */
+            printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [PRED  %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED],h->total_time[REC], others);
+        }
+    }
+
+    av_free(h);
+}
\ No newline at end of file
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,76 @@
+/*
+* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+/**
+* @file
+* H.264 / AVC / MPEG4 part10 codec.
+* @author Michael Niedermayer <michaelni@gmx.at>
+*/
+
+#ifndef H264_H
+#define H264_H
+
+#include "h264_entropy.h"
+#include "h264_data.h"
+#include "h264_mc.h"
+#include "h264_misc.h"
+#include "h264_dsp.h"
+#include "h264_pred.h"
+#include "h264_parser.h"
+#include "h264_nal.h"
+#include "h264_rec.h"
+#include "h264_deblock.h"
+#include "h264_types.h"
+
+typedef struct h264_options{ // consumed by get_h264dec_context(); notes below describe what it does with each field
+    int statsched;      // stored as H264Context.setaff
+    int statmbd;        // stored as H264Context.statmbd
+    int numamap;        // stored as rl_side_touch; when set, H264Context.start begins at 0
+    int no_mbd;         // non-zero adds one extra slice buffer (entropy only mode)
+    int numframes;      // stored as H264Context.num_frames
+    int display;        // stored as H264Context.display
+    int fullscreen;     // stored as H264Context.fullscreen
+    int verbose;        // stored as H264Context.verbose
+    int ppe_ed;         // only useful for Cell
+    int profile;        // 1 or 2 selects the timing line printed by free_h264dec_context()
+    int threads;        // H264Context.threads = threads - min(threads, ppe_ed), doubled when smt is set
+    int smb_size[2];    // only useful for OmpSs
+    int wave_order;     // stored as H264Context.wave_order
+    int static_3d;      // stored as H264Context.static_3d
+    int pipe_bufs;      // added to DPB_SIZE to size the decoded picture buffer
+    int slice_bufs;     // multiplied by the thread count to size the slice buffer pool
+    int smt;            // non-zero doubles H264Context.threads
+}h264_options;
+
+int h264_decode_cell(H264Context *h);
+int h264_decode_cell_seq(H264Context *h);
+
+int h264_decode_ompss(H264Context *h);
+
+int h264_decode_pthread(H264Context *h);
+int h264_decode_seq(H264Context *h);
+
+
+H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int frame_width, int frame_height, h264_options *opts);
+void free_h264dec_context(H264Context *h);
+
+
+#endif /* H264_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_cell.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_cell.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1242 @@
+
+#include "h264_types.h"
+#include "h264_parser.h"
+#include "h264_nal.h"
+#include "h264_entropy.h"
+#include "h264_rec.h"
+#include "h264_misc.h"
+#include "cell/h264_types_spu.h"
+#include "h264_pthread.h"
+
+#include <pthread.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include <libspe2.h>
+#include <ppu_intrinsics.h>
+#include <cbe_mfc.h>
+#include <libsync.h>
+
+// spe global variables
+unsigned rl_cnt_var, rl_mutex_var, rl_cond_var;
+atomic_ea_t rl_cnt;
+cond_ea_t rl_cond;
+mutex_ea_t rl_lock;
+
+H264spe * spe_params;
+unsigned mutex_var[16];
+unsigned cond_var[16];
+unsigned atomic_var[16];
+
+pthread_t * spe_tid;
+spe_context_ptr_t *spe_context;
+void** spe_control_area;
+void** spe_ls_area;
+H264slice **spe_slice_buf;
+
+H264spe * spe_ed_params;
+unsigned mutex_ed_var[16];
+unsigned cond_ed_var[16];
+unsigned atomic_ed_var[16];
+
+pthread_t * spe_ed_tid;
+spe_context_ptr_t *spe_ed_context;
+void** spe_ed_control_area;
+void** spe_ed_ls_area;
+EDSlice_spu **spe_ed_slice_buf;
+
+//structs to propagate stop signal
+MBSlice last_slice;
+EDSlice last_ed_slice;
+DecodedPicture last_pic;
+RawFrame last_frm;
+
+static int direct_B_resolved(EDSlice *s, int *poc_list, int *poc_cnt){
+    const int total = *poc_cnt;
+    int idx = 0;
+    /* Returns 1 if the POC of s->ref_list[1][0] is among the first *poc_cnt entries of poc_list, else 0. */
+    while (idx < total && poc_list[idx] != s->ref_list[1][0]->poc)
+        idx++;
+    if (idx == total) return 0;
+    /* on a hit the list is cut right after the match and the dropped entries are zeroed */
+    *poc_cnt = idx + 1;
+    for (idx++; idx < total; idx++)
+        poc_list[idx] = 0;
+    return 1;
+}
+
+static void update_IP_poc_list(int *poc_list, int *poc_cnt, int poc) {
+    /* Inserts poc in front of the first used entry that is not greater than it (or appends it) and increments *poc_cnt; poc_list needs room for one more entry. */
+    int i=0;
+    int cnt = *poc_cnt;
+
+    while (i < cnt && poc_list[i] > poc) { i++;} /* never scan past the cnt entries in use */
+    if ( i< cnt)
+        memmove(&poc_list[i+1], &poc_list[i], (cnt-i)*sizeof(int)); /* open a gap at i */
+    poc_list[i]=poc;
+    (*poc_cnt)++;
+}
+
+static void *spe_ed_thread(void *arg){
+    /* pthread entry point: runs the SPE context selected by the idx field of the H264spe argument */
+    H264spe *spe_args = arg;
+    unsigned int entry_point = SPE_DEFAULT_ENTRY;
+    const unsigned int no_flags = 0;
+    spe_context_ptr_t ctx = spe_ed_context[spe_args->idx];
+
+    spe_context_run(ctx, &entry_point, no_flags, spe_args, NULL, NULL);
+    pthread_exit(NULL); /* thread ends once the SPE program has finished */
+}
+
+static void create_spe_ED_threads(H264Context *h, int ip_threads, int b_threads) {
+    int i;
+    int num_threads = ip_threads+b_threads;
+    spe_program_handle_t * spe_program = spe_image_open("spe_ed");
+    // Starts ip_threads EDIP plus b_threads EDB SPE threads; first reserve per-thread id, context and argument storage
+    spe_ed_tid = av_malloc(num_threads * sizeof (pthread_t));
+    spe_ed_context = av_malloc(num_threads * sizeof (spe_context_ptr_t));
+    spe_ed_params = av_malloc(num_threads * sizeof (H264spe));
+    spe_ed_control_area = av_malloc(num_threads * sizeof (void*));
+    spe_ed_ls_area = av_malloc(num_threads * sizeof (void*));
+    spe_ed_slice_buf = av_malloc(num_threads * sizeof (void*));
+
+    if (spe_program == NULL) // NOTE(review): only logged; spe_program is still passed to spe_program_load() below
+        av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno));
+
+    for (i = 0; i < num_threads; i++) {
+        // create context for spe program
+        spe_ed_context[i] = spe_context_create(SPE_MAP_PS, NULL);
+        if (spe_ed_context[i] == NULL)
+            av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno));
+        // load SPE program into main memory
+        if ((spe_program_load(spe_ed_context[i], spe_program)) == -1)
+            av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno));
+        //get the control_area for fast mailboxing
+        if ((spe_ed_control_area[i] = spe_ps_area_get(spe_ed_context[i], SPE_CONTROL_AREA)) == NULL)
+            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno));
+        //get ls area for inter spe communication
+        if ((spe_ed_ls_area[i] = spe_ls_area_get(spe_ed_context[i])) == NULL)
+            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno));
+    }
+
+    for (i = 0; i < ip_threads; i++) { // EDIP threads occupy indices 0 .. ip_threads-1
+        spe_ed_params[i].mb_width = h->mb_width;
+        spe_ed_params[i].mb_stride = h->mb_stride;
+        spe_ed_params[i].mb_height = h->mb_height;
+        spe_ed_params[i].type = EDIP;
+        spe_ed_params[i].spe_id = i;
+        spe_ed_params[i].idx = i;
+        //spe_ed_params[i].spe_total = ip_threads; //not used
+        //spe_params[i].slice_params= &slice_params;
+        spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads]; // previous SPE, wrapping over all num_threads
+        spe_ed_params[i].tgt_spe = spe_ed_ls_area[(i+1)%num_threads]; // next SPE, wrapping over all num_threads
+
+        spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i]; // NOTE(review): the *_ed_var arrays hold 16 entries; i is not checked against that
+        spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i];
+        spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0);
+
+        mutex_init(spe_ed_params[i].lock);
+        cond_init(spe_ed_params[i].cond);
+        if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i]))
+            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
+
+        // slice buffer address, read from the outbound mailbox of the SPE just started
+        spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]);
+        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
+    }
+    for (int j = 0; j < b_threads; j++) { // EDB threads follow at indices ip_threads .. num_threads-1
+        i = j+ip_threads;
+        spe_ed_params[i].mb_width = h->mb_width;
+        spe_ed_params[i].mb_stride = h->mb_stride;
+        spe_ed_params[i].mb_height = h->mb_height;
+        spe_ed_params[i].type = EDB;
+        spe_ed_params[i].idx = i;
+        spe_ed_params[i].spe_id = j;
+        spe_ed_params[i].spe_total = b_threads;
+        //spe_params[i].slice_params= &slice_params;
+        //spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads];
+        spe_ed_params[i].tgt_spe = spe_ed_ls_area[((j+1)%b_threads) + ip_threads]; // next EDB SPE, wrapping within the EDB group only
+
+        spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i];
+        spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i];
+        spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0);
+
+        mutex_init(spe_ed_params[i].lock);
+        cond_init(spe_ed_params[i].cond);
+        if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i]))
+            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
+
+        // slice buffer address, read from the outbound mailbox of the SPE just started
+        spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]);
+        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
+    }
+    spe_image_close(spe_program);
+
+}
+
+static void fill_EDSlice_spu(EDSlice_spu *dst, EDSlice *src){ /* fills dst from src; byte-aligns src->gb as a side effect */
+    dst->pps 	= src->pps;
+    dst->mbs 	= src->mbs;
+    dst->state 	= src->state;
+    dst->qp_thresh = src->qp_thresh;
+    dst->pic	= *src->current_picture; /* copied by value, not by pointer */
+
+    dst->ref_count[0] = src->ref_count[0];
+    dst->ref_count[1] = src->ref_count[1];
+    dst->slice_type	  = src->slice_type;
+    dst->slice_type_nos = src->slice_type_nos;
+    dst->direct_8x8_inference_flag = src->direct_8x8_inference_flag;
+    dst->list_count = src->list_count;
+    dst->coded_pic_num = src->coded_pic_num;
+
+    GetBitContext *gb = &src->gb;
+    align_get_bits( gb); /* operates on src->gb itself, not on a copy */
+    dst->bytestream_start = gb->buffer + get_bits_count(gb)/8; /* buffer address at the current read position */
+    dst->byte_bufsize = (get_bits_left(gb) + 7)/8; /* remaining bits rounded up to whole bytes */
+
+    dst->transform_bypass = src->transform_bypass;
+    dst->direct_spatial_mv_pred = src->direct_spatial_mv_pred;
+    memcpy(dst->map_col_to_list0, src->map_col_to_list0, 2*16*sizeof(int)); /* assumes 2x16 ints on both sides - TODO confirm */
+    memcpy(dst->dist_scale_factor, src->dist_scale_factor, 16*sizeof(int)); /* assumes 16 ints on both sides - TODO confirm */
+    dst->cabac_init_idc = src->cabac_init_idc;
+    memcpy(dst->ref2frm, src->ref2frm, 2*64*sizeof(int)); /* assumes 2x64 ints on both sides - TODO confirm */
+    dst->chroma_qp[0]= src->chroma_qp[0];
+    dst->chroma_qp[1]= src->chroma_qp[1];
+    dst->qscale = src->qscale;
+    dst->last_qscale_diff = src->last_qscale_diff;
+
+    if (src->slice_type_nos == FF_B_TYPE) dst->list1 = *src->ref_list[1][0]; /* list1 is filled for B slices only */
+}
+
+static void send_slice_to_spe_and_wait(EDSlice_spu *s, int id){ /* transfers *s to SPE id, then polls until the SPE posts a mailbox entry */
+    unsigned status;
+
+    spe_mfcio_get(spe_ed_context[id], (unsigned) spe_ed_slice_buf[id], s, sizeof(EDSlice_spu), 14, 0, 0); /* transfer issued with tag 14 */
+    spe_mfcio_tag_status_read(spe_ed_context[id], 1<<14, SPE_TAG_ALL, &status); /* wait on tag 14 before going on */
+
+
+    _spe_in_mbox_write(spe_ed_control_area[id], 0); /* write 0 to the SPE inbound mailbox */
+
+    while (!spe_out_mbox_status(spe_ed_context[id])){ /* poll the outbound mailbox, sleeping 1000 us between checks */
+        //pthread_yield();
+        usleep(1000);
+    }
+    _spe_out_mbox_read(spe_ed_control_area[id]); /* consume the entry; its value is not used */
+}
+
+static int decode_slice_entropy_cell(EntropyContext *ec, EDSlice *s, int id){
+    /* Converts the slice for the SPE and hands it to SPE id, waiting for it to finish.
+     * Returns 0, or -1 when the stream is not CABAC coded; ec is not used. */
+    DECLARE_ALIGNED(16, EDSlice_spu, spu_slice);
+
+    if (s->pps.cabac) {
+        fill_EDSlice_spu(&spu_slice, s);
+        send_slice_to_spe_and_wait(&spu_slice, id);
+        return 0;
+    }
+
+    av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
+    return -1;
+}
+
+static int decode_slice_entropy_cell_seq(H264Context *h, EntropyContext *ec, EDSlice *s){
+    /* Sequential variant: sends the slice to SPE 0, then calls release_pib_entry() for every queued
+     * release_ref and for current_picture and frees gb.raw/gb.rbsp. Returns 0, or -1 for non-CABAC input. */
+    DECLARE_ALIGNED(16, EDSlice_spu, spu_slice);
+
+    if (!s->pps.cabac) {
+        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
+        return -1;
+    }
+
+    fill_EDSlice_spu(&spu_slice, s);
+    send_slice_to_spe_and_wait(&spu_slice, 0);
+
+    /* hand back the references queued for release */
+    if (s->release_cnt > 0) {
+        for (int n = 0; n < s->release_cnt; n++)
+            release_pib_entry(h, s->release_ref[n], 2);
+        s->release_cnt = 0;
+    }
+
+    release_pib_entry(h, s->current_picture, 1);
+    av_freep(&s->gb.raw);
+    if (s->gb.rbsp) av_freep(&s->gb.rbsp);
+
+    return 0;
+}
+
+static void *entr_IP_spe_thread(void *arg){ /* worker loop: take a slice from ed_q, entropy-decode it, queue the result in ed_reorder_q */
+    EDThreadContext *eip = (EDThreadContext *) arg;
+    H264Context *h = eip->h;
+// 	printf("eip %d, pid %d\n", eip->thread_num, syscall(SYS_gettid));
+    for (int i=0; i<SLICE_BUFS; i++){
+        eip->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); /* NOTE(review): not freed in this function */
+    }
+
+    EntropyContext *ec = get_entropy_context(h);
+    EDSlice *s;
+
+    for(;;){
+        {
+            pthread_mutex_lock(&eip->ed_lock);
+            while (eip->ed_cnt <= 0) /* wait for a queued slice */
+                pthread_cond_wait(&eip->ed_cond, &eip->ed_lock);
+            s = &eip->ed_q[eip->ed_fo]; /* pointer into the queue; ed_cnt is decremented only at the end of the iteration */
+            eip->ed_fo++; eip->ed_fo %= MAX_SLICE_COUNT;
+            pthread_mutex_unlock(&eip->ed_lock);
+        }
+
+        if (s->state<0) /* a negative state ends the thread */
+            break;
+        {
+            pthread_mutex_lock(&eip->mbs_lock);
+            while (eip->mbs_cnt <= 0) /* wait for a free macroblock buffer */
+                pthread_cond_wait(&eip->mbs_cond, &eip->mbs_lock);
+
+            s->mbs = eip->mbs[eip->mbs_fo];
+            s->ed = eip;
+            eip->mbs_cnt--;
+            eip->mbs_fo++; eip->mbs_fo%=SLICE_BUFS;
+            pthread_mutex_unlock(&eip->mbs_lock);
+        }
+        if (eip->cell){
+            decode_slice_entropy_cell(ec, s, eip->thread_num); /* return value is ignored */
+        }else{
+            decode_slice_entropy(ec, s);
+        }
+
+//         {
+//             pthread_mutex_lock(&h->lock[ENTROPY2]);
+//             h->ed_poc[h->ed_poc_fi++ % MAX_SLICE_COUNT] = s->current_picture->poc;
+//             while (h->ed_poc_fi > h->ed_poc_fo + MAX_SLICE_COUNT)
+//                 h->ed_poc_fo++;
+//
+//             pthread_cond_signal(&h->cond[ENTROPY2]);
+//             pthread_mutex_unlock(&h->lock[ENTROPY2]);
+//         }
+
+        {
+            pthread_mutex_lock(&h->lock[ENTROPY4]);
+            while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) /* wait for room in the reorder queue */
+                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
+            h->ed_reorder_q[h->ed_reorder_fi] = *s; /* the slice is copied into the reorder queue */
+            h->ed_reorder_cnt++;
+            h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
+            pthread_cond_signal(&h->cond[ENTROPY4]);
+            pthread_mutex_unlock(&h->lock[ENTROPY4]);
+        }
+
+        {
+            pthread_mutex_lock(&eip->ed_lock);
+            eip->ed_cnt--; /* the ed_q slot taken at the top of the loop is given up only now */
+            pthread_cond_signal(&eip->ed_cond);
+            pthread_mutex_unlock(&eip->ed_lock);
+        }
+    }
+
+    free_entropy_context(ec);
+
+    pthread_exit(NULL);
+    return NULL; /* placed after pthread_exit() */
+}
+
+static void *entr_B_spe_thread(void *arg){
+    EDThreadContext *eb = (EDThreadContext *) arg;
+    H264Context *h = eb->h;
+    // Worker thread for B slices: entropy-decodes the slices queued in eb->ed_q and passes them on to h->ed_reorder_q.
+    for (int i=0; i<SLICE_BUFS; i++){
+        eb->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); // macroblock storage for a whole picture; the result is not checked
+    }
+    // NOTE(review): the eb->mbs[] buffers are not freed anywhere in this function.
+    EntropyContext *ec = get_entropy_context(h);
+    EDSlice *s;
+
+    for(;;){
+        { // Take the oldest queued slice; its slot stays reserved until ed_cnt is decremented at the bottom of the loop.
+            pthread_mutex_lock(&eb->ed_lock);
+            while (eb->ed_cnt <= 0)
+                pthread_cond_wait(&eb->ed_cond, &eb->ed_lock);
+            s = &eb->ed_q[eb->ed_fo];
+            eb->ed_fo++; eb->ed_fo %= MAX_SLICE_COUNT;
+            pthread_mutex_unlock(&eb->ed_lock);
+        }
+
+        if (s->state<0) // a negative state is the shutdown sentinel
+            break;
+        { // Claim the next macroblock buffer; mbs_cnt is incremented again by mbdec_cell_thread.
+            pthread_mutex_lock(&eb->mbs_lock);
+            while (eb->mbs_cnt <= 0)
+                pthread_cond_wait(&eb->mbs_cond, &eb->mbs_lock);
+            s->mbs = eb->mbs[eb->mbs_fo];
+            s->ed = eb; // lets the consumer find this context when it hands the buffer back
+            eb->mbs_cnt--;
+            eb->mbs_fo++; eb->mbs_fo%=SLICE_BUFS;
+            pthread_mutex_unlock(&eb->mbs_lock);
+        }
+        // The third argument is offset by h->edip_threads; presumably it is an SPE index placed after the I/P workers' SPEs (TODO confirm).
+        decode_slice_entropy_cell(ec, s, eb->thread_num + h->edip_threads);
+
+        { // Append a copy of the decoded slice to the shared reorder queue, waiting while that queue is full.
+            pthread_mutex_lock(&h->lock[ENTROPY4]);
+            while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
+                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
+            h->ed_reorder_q[h->ed_reorder_fi] = *s;
+            h->ed_reorder_cnt++;
+            h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
+            pthread_cond_signal(&h->cond[ENTROPY4]);
+            pthread_mutex_unlock(&h->lock[ENTROPY4]);
+
+        }
+
+        { // Only now release the input queue slot and wake a producer waiting for space.
+            pthread_mutex_lock(&eb->ed_lock);
+            eb->ed_cnt--;
+            pthread_cond_signal(&eb->ed_cond);
+            pthread_mutex_unlock(&eb->ed_lock);
+        }
+    }
+    eb->lines_cnt++; // NOTE(review): the purpose of this increment after the loop is not visible in this file section
+
+    free_entropy_context(ec);
+
+    pthread_exit(NULL);
+    return NULL; // not reached: pthread_exit() does not return
+}
+
+static void *entr_B_distribute(void *arg){
+    H264Context *h = (H264Context *) arg;
+    EDSlice *s;
+    // Dispatcher for B slices: starts the B entropy workers, then moves slices from h->ed_B_q to them round-robin.
+    int i, n=0, poc=0; // poc: last h->ed_poc value seen; initialised because it is compared below before its first assignment
+
+    // Prepare each worker's buffer bookkeeping, input queue and synchronisation objects before starting it.
+
+    for(i=0; i<h->edb_threads; i++){
+        h->b[i].h =h;
+        h->b[i].thread_num =i;
+        h->b[i].thread_total =h->edb_threads;
+        pthread_mutex_init(&h->b[i].mbs_lock, NULL);
+        pthread_cond_init(&h->b[i].mbs_cond, NULL);
+        h->b[i].mbs_fo = 0;
+        h->b[i].mbs_cnt = SLICE_BUFS;
+        h->b[i].ed_fi =0;
+        h->b[i].ed_fo =0;
+        h->b[i].ed_cnt =0;
+        h->b[i].lines_cnt =0;
+        h->b[i].prev_ed = &h->b[(i-1 +h->edb_threads) % h->edb_threads]; // previous worker, wrapping around
+        pthread_mutex_init(&h->b[i].ed_lock, NULL);
+        pthread_cond_init(&h->b[i].ed_cond, NULL);
+        pthread_create(&h->ed_B_thr[i], NULL, entr_B_spe_thread, &h->b[i]);
+    }
+
+    for(;;){
+        { // Wait for the next B slice; its queue slot stays reserved until ed_B_cnt is decremented at the end of the iteration.
+            pthread_mutex_lock(&h->lock[ENTROPY3B]);
+            while (h->ed_B_cnt<=0)
+                pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
+            s= &h->ed_B_q[h->ed_B_fo];
+            h->ed_B_fo++; h->ed_B_fo %= MAX_SLICE_COUNT;
+            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
+
+        }
+        if (s->state<0) // shutdown sentinel
+            break;
+
+        if (s->ref_list[1][0]->slice_type_nos != FF_B_TYPE){
+            while (poc < s->ref_list[1][0]->poc){ // hold the slice until h->ed_poc has reached the POC of its first list-1 reference
+                pthread_mutex_lock(&h->lock[ENTROPY2]);
+                while (poc == h->ed_poc)
+                    pthread_cond_wait(&h->cond[ENTROPY2], &h->lock[ENTROPY2]);
+                poc = h->ed_poc;
+                pthread_mutex_unlock(&h->lock[ENTROPY2]);
+            }
+        }
+        { // Copy the slice into worker n's queue (waiting while it is full), then advance the round-robin index.
+            pthread_mutex_lock(&h->b[n].ed_lock);
+            while (h->b[n].ed_cnt >= MAX_SLICE_COUNT)
+                pthread_cond_wait(&h->b[n].ed_cond, &h->b[n].ed_lock);
+            h->b[n].ed_q[ h->b[n].ed_fi] = *s;
+            h->b[n].ed_cnt++;
+            h->b[n].ed_fi++; h->b[n].ed_fi %= MAX_SLICE_COUNT;
+            pthread_cond_signal(&h->b[n].ed_cond);
+            pthread_mutex_unlock(&h->b[n].ed_lock);
+
+            n++; n%=h->edb_threads;
+        }
+        { // Release the input slot now that the slice has been copied out.
+            pthread_mutex_lock(&h->lock[ENTROPY3B]);
+            h->ed_B_cnt--;
+            pthread_cond_signal(&h->cond[ENTROPY3B]);
+            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
+
+        }
+
+    }
+    // Shutdown: s still points at the sentinel slice; give every worker its own copy so that each one terminates.
+    for (i=0; i<h->edb_threads; i++){
+        pthread_mutex_lock(&h->b[i].ed_lock);
+        while (h->b[i].ed_cnt >= MAX_SLICE_COUNT)
+            pthread_cond_wait(&h->b[i].ed_cond, &h->b[i].ed_lock);
+        h->b[i].ed_q[ h->b[i].ed_fi] = *s;
+        h->b[i].ed_cnt++;
+        h->b[i].ed_fi++; h->b[i].ed_fi %= MAX_SLICE_COUNT;
+        pthread_cond_signal(&h->b[i].ed_cond);
+        pthread_mutex_unlock(&h->b[i].ed_lock);
+
+    }
+    for(i=0; i<h->edb_threads; i++){
+        pthread_join(h->ed_B_thr[i], NULL);
+    }
+    pthread_exit(NULL);
+    return NULL; // not reached: pthread_exit() does not return
+}
+
+
+static void *entr_IPB_distribute(void *arg){
+    H264Context *h = (H264Context *) arg;
+    EDSlice *s;
+    int i,n=0; // n: index of the I/P worker that receives the next non-B slice
+    // Top-level dispatcher: routes B slices from h->ed_q to entr_B_distribute and all other slices round-robin to the I/P workers.
+    create_spe_ED_threads(h, h->edip_threads, h->edb_threads);
+    pthread_create(&h->ed_B_dist, NULL, entr_B_distribute, h);
+    for(i=0; i<h->edip_threads + h->edip_ppe_threads; i++){
+        h->ip[i].h =h;
+        h->ip[i].cell = (i >= h->edip_ppe_threads); // the first edip_ppe_threads workers take the non-Cell decode path
+        pthread_mutex_init(&h->ip[i].mbs_lock, NULL);
+        pthread_cond_init(&h->ip[i].mbs_cond, NULL);
+        h->ip[i].thread_num = i - h->edip_ppe_threads; // negative for the non-Cell workers; entr_IP_spe_thread reads it only on the Cell path
+        h->ip[i].thread_total=h->edip_threads+ h->edip_ppe_threads;
+        h->ip[i].mbs_fo = 0;
+        h->ip[i].mbs_cnt = SLICE_BUFS;
+        h->ip[i].ed_fi =0;
+        h->ip[i].ed_fo =0; h->ip[i].ed_cnt =0; // ed_cnt is reset explicitly, as entr_B_distribute does for its workers
+        pthread_mutex_init(&h->ip[i].ed_lock, NULL);
+        pthread_cond_init(&h->ip[i].ed_cond, NULL);
+        pthread_create(&h->ed_IP_thr[i], NULL, entr_IP_spe_thread, &h->ip[i]);
+    }
+
+    for(;;){
+        { // Wait for the next slice; its queue slot stays reserved until ed_cnt is decremented at the end of the iteration.
+            pthread_mutex_lock(&h->lock[ENTROPY]);
+            while (h->ed_cnt<=0)
+                pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]);
+            s= &h->ed_q[h->ed_fo];
+            h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT;
+            pthread_mutex_unlock(&h->lock[ENTROPY]);
+
+        }
+        if (s->state<0) // shutdown sentinel
+            break;
+
+        assert(s->current_picture);
+        if (s->slice_type_nos == FF_B_TYPE )
+        {
+            pthread_mutex_lock(&h->lock[ENTROPY3B]);
+            while (h->ed_B_cnt>=MAX_SLICE_COUNT)
+                pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
+            h->ed_B_q[h->ed_B_fi] = *s;
+            h->ed_B_cnt++;
+            h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT;
+            pthread_cond_signal(&h->cond[ENTROPY3B]);
+            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
+        }else
+        {
+            ///round robin now, change to based on rawframes size.
+            pthread_mutex_lock(&h->ip[n].ed_lock);
+            while (h->ip[n].ed_cnt >= MAX_SLICE_COUNT)
+                pthread_cond_wait(&h->ip[n].ed_cond, &h->ip[n].ed_lock);
+            h->ip[n].ed_q[ h->ip[n].ed_fi] = *s;
+            h->ip[n].ed_cnt++;
+            h->ip[n].ed_fi++; h->ip[n].ed_fi %= MAX_SLICE_COUNT;
+            pthread_cond_signal(&h->ip[n].ed_cond);
+            pthread_mutex_unlock(&h->ip[n].ed_lock);
+
+            n++; n %=(h->edip_threads+h->edip_ppe_threads);
+        }
+        { // Release the input slot now that the slice has been copied out.
+            pthread_mutex_lock(&h->lock[ENTROPY]);
+            h->ed_cnt--;
+            pthread_cond_signal(&h->cond[ENTROPY]);
+            pthread_mutex_unlock(&h->lock[ENTROPY]);
+
+        }
+    }
+    // Shutdown: s still points at the sentinel slice; copy it to the B dispatcher, every I/P worker and the reorder queue.
+    {
+        pthread_mutex_lock(&h->lock[ENTROPY3B]);
+        while (h->ed_B_cnt>=MAX_SLICE_COUNT)
+            pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
+        h->ed_B_q[h->ed_B_fi] = *s;
+        h->ed_B_cnt++;
+        h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT;
+        pthread_cond_signal(&h->cond[ENTROPY3B]);
+        pthread_mutex_unlock(&h->lock[ENTROPY3B]);
+    }
+    {
+        for (i=0; i<h->edip_threads + h->edip_ppe_threads; i++){
+            pthread_mutex_lock(&h->ip[i].ed_lock);
+            while (h->ip[i].ed_cnt >= MAX_SLICE_COUNT)
+                pthread_cond_wait(&h->ip[i].ed_cond, &h->ip[i].ed_lock);
+            h->ip[i].ed_q[ h->ip[i].ed_fi] = *s;
+            h->ip[i].ed_cnt++;
+            h->ip[i].ed_fi++; h->ip[i].ed_fi %= MAX_SLICE_COUNT;
+            pthread_cond_signal(&h->ip[i].ed_cond);
+            pthread_mutex_unlock(&h->ip[i].ed_lock);
+        }
+    }
+    {
+        pthread_mutex_lock(&h->lock[ENTROPY4]);
+        while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
+            pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
+        h->ed_reorder_q[h->ed_reorder_fi] = *s;
+        h->ed_reorder_cnt++;
+        h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
+        pthread_cond_signal(&h->cond[ENTROPY4]);
+        pthread_mutex_unlock(&h->lock[ENTROPY4]);
+
+    }
+    pthread_join(h->ed_B_dist, NULL);
+    for(i=0; i<h->edip_threads + h->edip_ppe_threads; i++){ // join every worker created above, including the non-Cell ones
+        pthread_join(h->ed_IP_thr[i], NULL);
+    }
+    pthread_exit(NULL);
+    return NULL; // not reached: pthread_exit() does not return
+}
+
+static pthread_t ed_IPB_dist; // distributor thread started and joined by entropy_IPB_cell_thread
+static void *entropy_IPB_cell_thread(void *arg){
+    H264Context *h = (H264Context *) arg;
+    int i;
+    EDSlice reorder[MAX_SLICE_COUNT]; // decoded slices not yet forwarded, kept in descending coded_pic_num order
+    int ip_poc[MAX_SLICE_COUNT][2]={0,}; // pending {ip_id, poc} pairs of non-B slices, kept in ascending ip_id order
+    int next_ip_id=0; // next ip_id whose POC may be published in h->ed_poc
+    int ip_poc_cnt=0; // number of valid pairs in ip_poc[]
+    EDSlice *s;
+    int reorder_cnt=0; // number of slices currently held in reorder[]
+    unsigned next_pic_num=0; // coded_pic_num of the next slice to forward
+    // Collects entropy-decoded slices from h->ed_reorder_q and forwards them to h->mbdec_q in coded_pic_num order.
+    pthread_create(&ed_IPB_dist, NULL, entr_IPB_distribute, h);
+    int count =0; // NOTE(review): never used in this function
+    for(;;){
+        //signals received from the entropy decoders
+        {
+            pthread_mutex_lock(&h->lock[ENTROPY4]);
+            while (h->ed_reorder_cnt<=0)
+                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
+            s= &h->ed_reorder_q[h->ed_reorder_fo];
+            h->ed_reorder_fo++; h->ed_reorder_fo %=MAX_SLICE_COUNT;
+            pthread_mutex_unlock(&h->lock[ENTROPY4]);
+        }
+        // Non-B slice: record its {ip_id, poc}, then publish POCs through h->ed_poc strictly in ip_id order.
+        if (s->state >=0 && s->slice_type_nos != FF_B_TYPE){
+            for (i=0; i<ip_poc_cnt; i++){
+                if (s->ip_id < ip_poc[i][0]){
+                    memmove(ip_poc[i+1], ip_poc[i], 2*(ip_poc_cnt-i)*sizeof(int)); // open a gap at index i
+                    break;
+                }
+            }
+            ip_poc[i][0]= s->ip_id;
+            ip_poc[i][1]= s->current_picture->poc;
+            ip_poc_cnt++;
+
+            while (next_ip_id == ip_poc[0][0]){ // NOTE(review): ip_poc_cnt is not tested here; the loop relies on a stale ip_poc[0][0] not matching
+                pthread_mutex_lock(&h->lock[ENTROPY2]);
+                h->ed_poc = ip_poc[0][1];
+
+                pthread_cond_signal(&h->cond[ENTROPY2]); // entr_B_distribute waits on this condition for h->ed_poc to change
+                pthread_mutex_unlock(&h->lock[ENTROPY2]);
+                memmove(ip_poc[0], ip_poc[1], 2*(ip_poc_cnt-1)*sizeof(int));
+                ip_poc_cnt--;
+                next_ip_id++;
+            }
+        }
+        // Insert *s into reorder[]: entries whose coded_pic_num is not larger than the new one move up one index.
+        for(i=reorder_cnt; i>0; i--){
+            if (s->coded_pic_num < reorder[i-1].coded_pic_num)
+                break;
+            reorder[i]=reorder[i-1];
+        }
+        reorder[i]=*s;
+        // During this loop reorder_cnt is the index of the last entry, i.e. the smallest pending coded_pic_num.
+        while(reorder_cnt>=0){
+            if (next_pic_num!=reorder[reorder_cnt].coded_pic_num){
+                break;
+            }
+            EDSlice *es = &reorder[reorder_cnt];
+
+            { // Hand the slice to the macroblock decoding stage, waiting while its queue is full.
+                pthread_mutex_lock(&h->lock[MBDEC]);
+                while (h->mbdec_cnt >= MAX_SLICE_COUNT)
+                    pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
+                copyEDtoMBSlice(&h->mbdec_q[h->mbdec_fi], es);
+
+                h->mbdec_cnt++;
+                h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
+                pthread_cond_signal(&h->cond[MBDEC]);
+                pthread_mutex_unlock(&h->lock[MBDEC]);
+
+            }
+
+            if (es->state<0) // the sentinel has already been forwarded above; stop here
+                goto end;
+            // Drop this stage's references to the picture info entries and free the slice's bitstream buffers.
+            assert(es->current_picture);
+            for (int i=0; i<es->release_cnt; i++){
+                release_pib_entry(h, es->release_ref[i], 2);
+            }
+            release_pib_entry(h, es->current_picture, 1);
+            av_freep(&es->gb.raw);
+            if (es->gb.rbsp)
+                av_freep(&es->gb.rbsp);
+
+            next_pic_num++;
+            reorder_cnt--;
+        }
+        reorder_cnt++; // back from "last index" to "number of entries"
+
+        { // Release the slot of h->ed_reorder_q that s pointed into.
+            pthread_mutex_lock(&h->lock[ENTROPY4]);
+            h->ed_reorder_cnt--;
+            pthread_cond_signal(&h->cond[ENTROPY4]);
+            pthread_mutex_unlock(&h->lock[ENTROPY4]);
+        }
+    }
+
+end:
+    pthread_join(ed_IPB_dist, NULL);
+    pthread_exit(NULL);
+    return NULL; // not reached: pthread_exit() does not return
+}
+
+
+static void fill_spe_slice(H264slice *dst, const MBSlice *src, H264Context *h){
+    /* Packs what an SPE worker is given for one slice into *dst: picture geometry from h,
+     * slice parameters and weights from src, and the plane pointers of the current
+     * picture and of every reference picture. */
+
+    /* geometry and output planes */
+    dst->mb_width = h->mb_width;
+    dst->mb_height = h->mb_height;
+    dst->linesize = src->current_picture->linesize[0];
+    dst->uvlinesize = src->current_picture->linesize[1];
+    dst->dst_y = src->current_picture->data[0];
+    dst->dst_cb = src->current_picture->data[1];
+    dst->dst_cr = src->current_picture->data[2];
+    dst->blocks = src->mbs;
+    dst->emu_edge_width  =32;
+    dst->emu_edge_height =32;
+
+    /* slice state, type and deblocking parameters */
+    dst->state = src->state;
+    dst->slice_type = src->slice_type;
+    dst->slice_type_nos = src->slice_type_nos;
+    dst->deblocking_filter =1;
+    dst->slice_alpha_c0_offset = src->slice_alpha_c0_offset;
+    dst->slice_beta_offset = src->slice_beta_offset;
+    memcpy(dst->chroma_qp_table, src->pps.chroma_qp_table, 2*64);
+
+    /* weighted prediction */
+    dst->use_weight = src->use_weight;
+    dst->use_weight_chroma = src->use_weight_chroma;
+    dst->luma_log2_weight_denom = src->luma_log2_weight_denom;
+    dst->chroma_log2_weight_denom = src->chroma_log2_weight_denom;
+    memcpy(dst->luma_weight, src->luma_weight, 16*2*2*sizeof(int16_t));
+    memcpy(dst->chroma_weight, src->chroma_weight, 16*2*2*2*sizeof(int16_t));
+    memcpy(dst->implicit_weight, src->implicit_weight, 16*16*2*sizeof(int16_t));
+    /* reference planes; entries whose source pointer is NULL are left as they were */
+    for(int list=0; list<2; list++){
+        for (int ref=0; ref<src->ref_count[list]; ref++){
+            DecodedPicture *pic = src->ref_list[list][ref];
+            for (int plane=0; pic && plane<3; plane++)
+                dst->ref_list[list][ref].data[plane] = pic->data[plane];
+        }
+    }
+}
+
+static void decode_slice_mb_seq_cell(H264Context *h, MBRecContext *d, MBSlice *s, DecodedPicture *tmp){
+    static int rl_fi=0; // index of the next slice buffer slot on SPE 0; alternates between 0 and 1
+    // Sequential (single SPE) macroblock decoding of one slice; tmp is referenced only by the disabled debug code below.
+    DECLARE_ALIGNED(16, H264slice, spe_slice);
+    H264spe *p=&spe_params[0]; // NOTE(review): not used in the active code
+    unsigned status;
+    uint8_t *dst_y, *dst_cb, *dst_cr; // NOTE(review): not used in the active code
+
+    DecodedPicture *dp;
+    // Resolve each reference given as a coded picture number (cpn) to its entry in h->dpb; -1 marks an unused slot.
+    for (int i=0; i<2; i++){
+        for(int j=0; j< s->ref_count[i]; j++){
+            if (s->ref_list_cpn[i][j] ==-1)
+                continue;
+            int k;
+            for (k=0; k<DPB_SIZE; k++){
+                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
+                    s->ref_list[i][j] = &h->dpb[k];
+                    break;
+                }
+            }
+        }
+    }
+
+    dp = get_dpb_entry(h);
+    init_dpb_entry(dp, s, d->width, d->height);
+
+    if (h->no_mbd) // macroblock decoding disabled; NOTE(review): this path skips the reference release at the end
+        return;
+
+    // Copy the slice description into the SPE's slice buffer, wait for that transfer, then notify the SPE by mailbox.
+    fill_spe_slice(&spe_slice, s, h);
+    spe_mfcio_get(spe_context[0], (unsigned) (spe_slice_buf[0] + rl_fi), &spe_slice, sizeof(H264slice), 15, 0, 0);
+    spe_mfcio_tag_status_read(spe_context[0], 1<<15, SPE_TAG_ALL, &status);
+    rl_fi++; rl_fi %= 2;
+
+    _spe_in_mbox_write(spe_control_area[0], 0);
+    while (atomic_read(rl_cnt)<=0){ // poll until rl_cnt becomes positive; presumably the SPE raises it when the slice is done (TODO confirm)
+        //pthread_yield();
+        usleep(1000);
+    }
+    atomic_dec(rl_cnt);
+
+
+/** This is error free, no visual artifacts, however, md5sum fails.... (WTF) **/
+// 	memcpy(tmp->data[0], s->current_picture->data[0], tmp->linesize[0]*h->mb_height*16);
+// 	memcpy(tmp->data[1], s->current_picture->data[1], tmp->linesize[1]*h->mb_height*8);
+// 	memcpy(tmp->data[2], s->current_picture->data[2], tmp->linesize[1]*h->mb_height*8);
+//
+// 	memset(s->current_picture->data[0], 0, tmp->linesize[0]*h->mb_height*16);
+// 	memset(s->current_picture->data[1], 0, tmp->linesize[1]*h->mb_height*8);
+// 	memset(s->current_picture->data[2], 0, tmp->linesize[1]*h->mb_height*8);
+//
+// 	decode_slice_mb_seq(d, s);
+//
+// 	for (int i=0; i<h->mb_height*16; i++){
+// 		for (int j=0; j<h->width; j++){
+// 			if (tmp->data[0][j + i*tmp->linesize[0]] != s->current_picture->data[0][j + i*tmp->linesize[0]]){
+// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[0][j + i*tmp->linesize[0]], s->current_picture->data[0][j + i*tmp->linesize[0]]);
+// 				return;
+// 			}
+// 		}
+// 	}
+//
+// 	for (int i=0; i<h->mb_height*8; i++){
+// 		for (int j=0; j<h->width/2; j++){
+// 			if (tmp->data[1][j + i*tmp->linesize[1]] != s->current_picture->data[1][j + i*tmp->linesize[1]]){
+// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[1][j + i*tmp->linesize[1]], s->current_picture->data[1][j + i*tmp->linesize[1]]);
+// 				return;
+// 			}
+// 		}
+// 	}
+//
+// 	for (int i=0; i<h->mb_height*8; i++){
+// 		for (int j=0; j<h->width/2; j++){
+// 			if (tmp->data[2][j + i*tmp->linesize[1]] != s->current_picture->data[2][j + i*tmp->linesize[1]]){
+// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[2][j + i*tmp->linesize[1]], s->current_picture->data[2][j + i*tmp->linesize[1]]);
+// 				return;
+// 			}
+// 		}
+// 	}
+
+
+    //printf("dst_y %p\n", dst_y);
+
+    // Release the reference pictures listed in release_ref_cpn, looked up in h->dpb by coded picture number.
+     for (int i=0; i<s->release_cnt; i++){
+        for(int j=0; j<DPB_SIZE; j++){
+            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
+                release_dpb_entry(h, &h->dpb[j], 2);
+                break;
+            }
+        }
+    }
+    s->release_cnt=0;
+
+}
+
+/* Thread body for one SPE worker: runs the loaded SPE program on the context selected by params->spe_id. */
+static void *h264_spe_thread(void * thread_args ) {
+    H264spe *params = (H264spe *)thread_args;
+    unsigned int entry = SPE_DEFAULT_ENTRY;
+    // spe_context_run() returns only when the SPE program stops; a negative result means it failed to run
+    if (spe_context_run(spe_context[params->spe_id], &entry, 0, (void*) params, NULL, NULL) < 0)
+        av_log(AV_LOG_ERROR, "PPE: error running SPE context:%d. error=%s \n", errno, strerror(errno));
+    pthread_exit(NULL);
+    return NULL; // not reached; matches the other thread bodies and gives the non-void function a return
+}
+
+static int create_spe_MBR_threads(H264Context *h, int num_threads) {
+    int i;
+    // Creates num_threads SPE contexts running the "spe_mbd" image, starts one pthread per context and fills the file-level spe_* arrays. Always returns 0.
+    // reserve memory for spe thread id, context and argument addresses
+    spe_tid = av_malloc(num_threads * sizeof (pthread_t));
+    spe_context = av_malloc(num_threads * sizeof (spe_context_ptr_t));
+    spe_params = av_malloc(num_threads * sizeof (H264spe));
+    spe_control_area = av_malloc(num_threads * sizeof (void*));
+    spe_ls_area = av_malloc(num_threads * sizeof (void*));
+    spe_slice_buf = av_malloc(num_threads * sizeof (void*));
+    // NOTE(review): none of the allocations above is checked.
+    spe_program_handle_t *spe_program = spe_image_open("spe_mbd");
+    // NOTE(review): every failure below is only logged; execution continues with the NULL/invalid handle.
+    if (spe_program == NULL)
+        av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno));
+
+    for (i = 0; i < num_threads; i++) {
+        // create context for spe program
+        spe_context[i] = spe_context_create(SPE_MAP_PS, NULL);
+        if (spe_context[i] == NULL)
+            av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno));
+        // load SPE program into main memory
+        if ((spe_program_load(spe_context[i], spe_program)) == -1)
+            av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno));
+        //get the control_area for fast mailboxing
+        if ((spe_control_area[i] = spe_ps_area_get(spe_context[i], SPE_CONTROL_AREA)) == NULL)
+            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno));
+        //get ls area for inter spe communication
+        if ((spe_ls_area[i] = spe_ls_area_get(spe_context[i])) == NULL)
+            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno));
+    }
+    // Second pass: all local store addresses are known now, so each SPE can be given its ring neighbours.
+    for (i = 0; i < num_threads; i++) {
+        spe_params[i].mb_width = h->mb_width;
+        spe_params[i].mb_height = h->mb_height;
+        spe_params[i].mb_stride = h->mb_stride;
+        spe_params[i].spe_id = i;
+        spe_params[i].spe_total = num_threads;
+        //spe_params[i].slice_params= &slice_params;
+        spe_params[i].src_spe = spe_ls_area[(i-1+num_threads)%num_threads]; // local store of the previous SPE, wrapping around
+        spe_params[i].tgt_spe = spe_ls_area[(i+1)%num_threads]; // local store of the next SPE, wrapping around
+
+        spe_params[i].rl_lock = rl_lock;
+        spe_params[i].rl_cond = rl_cond;
+        spe_params[i].rl_cnt = rl_cnt;
+        spe_params[i].lock = (mutex_ea_t) (unsigned) &mutex_var[i]; // NOTE(review): pointer converted via unsigned; presumably assumes 32-bit pointers
+        spe_params[i].cond = (cond_ea_t) (unsigned) &cond_var[i];
+        spe_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_var[i]; atomic_set(spe_params[i].cnt, 0);
+
+        mutex_init(spe_params[i].lock);
+        cond_init(spe_params[i].cond);
+        if (pthread_create(&spe_tid[i], NULL, h264_spe_thread, (void *) &spe_params[i]))
+            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
+
+        //slicebufaddr: the value read from the SPE's outbound mailbox is used as the address of its slice buffer
+        spe_slice_buf[i] = (H264slice *) _spe_out_mbox_read(spe_control_area[i]);
+
+        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
+    }
+    spe_image_close(spe_program);
+    return 0;
+}
+
+//_spe_out_mbox_read(spe_control_area[i]);
+/**
+ * Shuts down and joins all SPE worker threads, then frees the per-SPE bookkeeping arrays.
+ *
+ * @param s            slice record sent to every SPE as its final work item (rl_dist_thread passes one with state -1)
+ * @param num_threads  number of SPE workers that were created by create_spe_MBR_threads()
+ * @param rl_fi        per-SPE index of the slice buffer slot to write the final record into
+ */
+static void join_spe_worker_threads(H264slice *s, int num_threads, int *rl_fi) {
+    int i;
+
+    for (i=0; i<num_threads; i++){
+        H264spe *p=&spe_params[i];
+        unsigned status;
+
+        while (atomic_read(p->cnt)>=2) {//double buffered: wait until one of the two slots is free
+            usleep(1000);
+        }
+        spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), s, sizeof(H264slice), 15, 0, 0);
+        spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status);
+        _spe_in_mbox_write(spe_control_area[i], 0);
+    }
+
+    for (i=0; i<num_threads; i++){
+        pthread_join(spe_tid[i], NULL);
+    }
+    for (i=0; i<num_threads; i++){
+        spe_context_destroy(spe_context[i]);
+    }
+    atomic_inc(rl_cnt); // makes one more h->mbrel_q entry visible to mbdec_cell_thread, which polls rl_cnt
+
+    // free the arrays allocated by create_spe_MBR_threads(); spe_ls_area is included so that it no longer leaks
+    av_freep(&spe_tid);
+    av_freep(&spe_context);
+    av_freep(&spe_params);
+    av_freep(&spe_control_area);
+    av_freep(&spe_ls_area);
+    av_freep(&spe_slice_buf);
+}
+
+
+static void *rl_dist_thread(void *arg){
+    int i;
+    H264Context *h = (H264Context *) arg;
+    MBSlice *s;
+    DecodedPicture *dp;
+    int rl_fi[16]={0,}; // per-SPE index of the next slice buffer slot; NOTE(review): fixed size 16, h->rl_threads is not checked against it
+    DECLARE_ALIGNED(16, H264slice, spe_slice);
+    // Takes slices from h->mbdec_q, records them in h->mbrel_q and sends their description to every SPE worker.
+    create_spe_MBR_threads(h, h->rl_threads);
+    for(;;){
+        { // Wait for the next slice; its queue slot stays reserved until mbdec_cnt is decremented at the end of the iteration.
+            pthread_mutex_lock(&h->lock[MBDEC]);
+            while (h->mbdec_cnt<=0)
+                pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
+            s= &h->mbdec_q[h->mbdec_fo];
+            h->mbdec_fo++; h->mbdec_fo %= MAX_SLICE_COUNT;
+            pthread_mutex_unlock(&h->lock[MBDEC]);
+        }
+
+        if (s->state<0){ // shutdown sentinel
+            break;
+        }
+        for (int i=0; i<2; i++){ // resolve each reference's coded picture number (cpn) to its h->dpb entry; -1 marks an unused slot
+            for(int j=0; j< s->ref_count[i]; j++){
+                if (s->ref_list_cpn[i][j] ==-1)
+                    continue;
+                int k;
+                for (k=0; k<DPB_SIZE; k++){
+                    if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
+                        s->ref_list[i][j] = &h->dpb[k];
+                        break;
+                    }
+                }
+
+            }
+        }
+        dp = get_dpb_entry(h);
+        init_dpb_entry(dp, s, h->width, h->height);
+        assert(s->current_picture);
+        { // Store a copy in h->mbrel_q; rl_cnt is polled instead of using a lock and condition variable.
+            while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){
+                usleep(1000);
+            }
+            h->mbrel_q[h->mbrel_fi] = *s;
+
+            h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT;
+        }
+        {
+            if(h->no_mbd){
+                atomic_inc(rl_cnt); // nothing is sent to the SPEs, so the entry is made visible to the consumer here
+            }else {
+                fill_spe_slice(&spe_slice, s, h); // rl_cnt is not touched on this path; presumably the SPE side increments it (TODO confirm)
+                for (i=0; i<h->rl_threads; i++){
+                    H264spe *p=&spe_params[i];
+                    unsigned status;
+                    while (atomic_read(p->cnt)>=2){ //double buffered
+                        usleep(1000);
+                        //cond_wait(p->cond, p->lock);
+                    }
+                    spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), &spe_slice, sizeof(H264slice), 15, 0, 0);
+                    spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status);
+                    rl_fi[i]++; rl_fi[i] %= 2;
+                    atomic_inc(p->cnt);
+
+                    _spe_in_mbox_write(spe_control_area[i], 0);
+                }
+            }
+        }
+
+        { // Release the h->mbdec_q slot that s pointed into.
+            pthread_mutex_lock(&h->lock[MBDEC]);
+            h->mbdec_cnt--;
+            pthread_cond_signal(&h->cond[MBDEC]);
+            pthread_mutex_unlock(&h->lock[MBDEC]);
+        }
+
+    }
+    // Shutdown: queue the sentinel for the consumer; rl_cnt is incremented for it inside join_spe_worker_threads.
+    {
+        while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){
+            usleep(1000);
+        }
+        h->mbrel_q[h->mbrel_fi] = *s;
+
+        h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT;
+    }
+    spe_slice.state=-1; // final record sent to the SPEs
+    join_spe_worker_threads(&spe_slice, h->rl_threads, rl_fi);
+    pthread_exit(NULL);
+    return NULL; // not reached: pthread_exit() does not return
+}
+
+static void *mbdec_cell_thread(void *arg){
+    H264Context *h = (H264Context *) arg;
+    // Consumes finished slices from h->mbrel_q: releases their references, returns the macroblock buffer and queues the picture for output.
+    rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var;
+    rl_cond = (cond_ea_t) (unsigned) &rl_cond_var;
+    rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var;
+    atomic_set(rl_cnt, 0);
+    mutex_init(rl_lock);
+    cond_init(rl_cond);
+    // The shared counter must be initialised before the distributor thread, which also uses it, is started.
+    pthread_create(&h->rl_dist_thr, NULL, rl_dist_thread, h);
+
+    for(;;){
+        MBSlice *s=NULL;
+        { // Poll until rl_cnt shows at least one entry; it is decremented only at the end of the iteration.
+            while (atomic_read(rl_cnt)<=0){
+                usleep(1000);
+            }
+            s= &h->mbrel_q[h->mbrel_fo];
+            h->mbrel_fo++; h->mbrel_fo %= MAX_SLICE_COUNT;
+        }
+
+        if (s->state<0) // shutdown sentinel
+            break;
+        // Release the reference pictures listed in release_ref_cpn, looked up in h->dpb by coded picture number.
+        for (int i=0; i<s->release_cnt; i++){
+            for(int j=0; j<DPB_SIZE; j++){
+                if(h->dpb[j].cpn== s->release_ref_cpn[i]){
+                    release_dpb_entry(h, &h->dpb[j], 2);
+                    break;
+                }
+            }
+        }
+
+        { // Give the macroblock buffer back to the entropy worker recorded in s->ed.
+            EDThreadContext *ed = s->ed;
+            pthread_mutex_lock(&ed->mbs_lock);
+            ed->mbs_cnt++;
+            pthread_cond_signal(&ed->mbs_cond);
+            pthread_mutex_unlock(&ed->mbs_lock);
+        }
+
+        { // Queue the decoded picture in h->write_q, waiting while that queue is full.
+            pthread_mutex_lock(&h->lock[WRITE]);
+            while (h->write_cnt>= DPB_SIZE)
+                pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
+            assert(s);
+            assert(s->current_picture);
+            h->write_q[h->write_fi]= s->current_picture;
+            h->write_cnt++;
+            h->write_fi++; h->write_fi %= DPB_SIZE;
+            pthread_cond_signal(&h->cond[WRITE]);
+            pthread_mutex_unlock(&h->lock[WRITE]);
+
+        }
+        { // Free the h->mbrel_q entry for the producer.
+            atomic_dec(rl_cnt);
+        }
+
+    }
+
+    {//propagate exit: last_pic with reference == -1 is queued as the end marker
+        pthread_mutex_lock(&h->lock[WRITE]);
+        while (h->write_cnt>= DPB_SIZE)
+            pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
+        last_pic.reference = -1;
+        h->write_q[h->write_fi] = &last_pic;
+        h->write_cnt++;
+        h->write_fi++; h->write_fi %= DPB_SIZE;
+        pthread_cond_signal(&h->cond[WRITE]);
+        pthread_mutex_unlock(&h->lock[WRITE]);
+
+    }
+    pthread_join(h->rl_dist_thr, NULL);
+    pthread_exit(NULL);
+    return NULL; // not reached: pthread_exit() does not return
+}
+
+/*
+* Runs the threaded Cell decoder: starts read_thread, parsenal_thread,
+* entropy_IPB_cell_thread, mbdec_cell_thread and write_thread on the shared context h
+* and returns 0 once all five threads have been joined.
+*/
+int h264_decode_cell(H264Context *h) {
+    /* Thread entry points, in creation (and join) order. */
+    void *(*const stage_fn[])(void *) = {
+        read_thread, parsenal_thread, entropy_IPB_cell_thread, mbdec_cell_thread, write_thread,
+    };
+    enum { STAGE_COUNT = sizeof(stage_fn) / sizeof(stage_fn[0]) };
+    pthread_t stage_thr[STAGE_COUNT];
+    int k;
+
+    start_timer();
+
+    for (k = 0; k < STAGE_COUNT; k++)
+        pthread_create(&stage_thr[k], NULL, stage_fn[k], h);
+    for (k = 0; k < STAGE_COUNT; k++)
+        pthread_join(stage_thr[k], NULL);
+
+    return 0;
+}
+
+/*
+* Sequential Cell decoder: reads, parses, entropy-decodes, reconstructs and outputs one frame at a time in the calling thread. Returns 0.
+*/
+int h264_decode_cell_seq(H264Context *h) {
+ParserContext *pc;
+    NalContext *nc;
+    EntropyContext *ec;
+    MBRecContext *rc;
+    OutputContext *oc;
+
+    RawFrame frm;
+    EDSlice slice, *s=&slice;
+    MBSlice mbslice, *s2=&mbslice;
+    PictureInfo *pic=NULL; // NOTE(review): shadowed by the declaration inside the loop and otherwise unused
+    DecodedPicture *out;
+    int size; // NOTE(review): never used
+    int frames=0;
+    
+    pc = get_parse_context(h->ifile);
+    nc = get_nal_context(h->width, h->height);
+    ec = get_entropy_context( h );
+    rc = get_mbrec_context(h);
+    oc = get_output_context( h );
+    // Same setup of the shared rl_* synchronisation objects as in mbdec_cell_thread.
+    rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var;
+    rl_cond = (cond_ea_t) (unsigned) &rl_cond_var;
+    rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var;
+    atomic_set(rl_cnt, 0);
+    mutex_init(rl_lock);
+    cond_init(rl_cond);
+
+    memset(s, 0, sizeof(EDSlice));
+    ff_init_slice(nc, s);
+    s->mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb)); // one macroblock buffer reused for every frame; result is not checked
+
+    DecodedPicture tmp; // debug scratch picture; only base[0] is initialised
+    tmp.base[0]=0;
+    ///fix this when want to debug the Cell errors
+    //init_dpb_entry(&tmp, h->width, h->height);
+
+    create_spe_ED_threads(h, 1, 0);
+    create_spe_MBR_threads(h, 1); // NOTE(review): these SPE threads are not joined anywhere in this function
+    
+    start_timer();
+
+    while(!pc->final_frame && frames++ < h->num_frames){
+
+        av_read_frame_internal(pc, &frm);
+        
+        PictureInfo *pic=get_pib_entry(h);
+        ff_alloc_picture_info(nc, s, pic);
+        decode_nal_units(nc, s, &frm);
+        // NOTE(review): the MBSlice copy is taken before entropy decoding runs; presumably both share the same mbs buffer (TODO confirm)
+        copyEDtoMBSlice(s2, s);
+        decode_slice_entropy_cell_seq(h, ec, s);
+        
+        decode_slice_mb_seq_cell(h, rc, s2, &tmp);
+
+        out =output_frame(h, oc, s2->current_picture, h->ofile, h->frame_width, h->frame_height);
+        
+        if (out){
+            release_dpb_entry(h, out, 1);
+        }
+        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
+    }
+    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; // call with a NULL picture until nothing more is returned
+
+    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
+
+    /* finished ! */
+    av_freep(&s->mbs);
+
+    free_parse_context(pc);
+    free_nal_context  (nc);
+    free_entropy_context(ec);
+    free_mbrec_context(rc);
+    free_output_context(oc);                
+    return 0;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_data.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_data.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,243 @@
+/*
+ * H26L/H264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @brief
+ *     H264 / AVC / MPEG4 part10 codec data table
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_H264DATA_H
+#define AVCODEC_H264DATA_H
+
+#include <stdint.h>
+#include "avcodec.h"
+//#include "h264.h"
+
+/*
+o-o o-o
+ / / /
+o-o o-o
+ ,---'
+o-o o-o
+ / / /
+o-o o-o
+*/
+// Entries are written as x + y*8. This table must be here because scan8[constant] must be known at compile time.
+static const uint8_t scan8[16 + 2*4]={
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,   // entries 0..15: columns 4..7, rows 1..4 (presumably the luma 4x4 blocks - confirm)
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,                 // entries 16..19: columns 1..2, rows 1..2 (presumably first chroma plane - confirm)
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,                 // entries 20..23: columns 1..2, rows 4..5 (presumably second chroma plane - confirm)
+ 1+5*8, 2+5*8,
+};
+
+static const uint8_t golomb_to_pict_type[5]=
+{FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
+
+static const uint8_t golomb_to_intra4x4_cbp[48]={
+ 47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46,
+ 16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4,
+  8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41
+};
+
+static const uint8_t golomb_to_inter_cbp[48]={
+  0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13,
+ 14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46,
+ 17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41
+};
+
+static const uint8_t zigzag_scan[16]={
+ 0+0*4, 1+0*4, 0+1*4, 0+2*4,
+ 1+1*4, 2+0*4, 3+0*4, 2+1*4,
+ 1+2*4, 0+3*4, 1+3*4, 2+2*4,
+ 3+1*4, 3+2*4, 2+3*4, 3+3*4,
+};
+
+static const uint8_t field_scan[16]={
+ 0+0*4, 0+1*4, 1+0*4, 0+2*4,
+ 0+3*4, 1+1*4, 1+2*4, 1+3*4,
+ 2+0*4, 2+1*4, 2+2*4, 2+3*4,
+ 3+0*4, 3+1*4, 3+2*4, 3+3*4,
+};
+
+static const uint8_t luma_dc_zigzag_scan[16]={
+ 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
+ 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
+ 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
+ 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
+};
+
+static const uint8_t luma_dc_field_scan[16]={
+ 0*16 + 0*64, 2*16 + 0*64, 1*16 + 0*64, 0*16 + 2*64,
+ 2*16 + 2*64, 3*16 + 0*64, 1*16 + 2*64, 3*16 + 2*64,
+ 0*16 + 1*64, 2*16 + 1*64, 0*16 + 3*64, 2*16 + 3*64,
+ 1*16 + 1*64, 3*16 + 1*64, 1*16 + 3*64, 3*16 + 3*64,
+};
+
+static const uint8_t chroma_dc_scan[4]={
+ (0+0*2)*16, (1+0*2)*16,
+ (0+1*2)*16, (1+1*2)*16,  //FIXME
+};
+
+
+static const uint8_t field_scan8x8[64]={
+ 0+0*8, 0+1*8, 0+2*8, 1+0*8,
+ 1+1*8, 0+3*8, 0+4*8, 1+2*8,
+ 2+0*8, 1+3*8, 0+5*8, 0+6*8,
+ 0+7*8, 1+4*8, 2+1*8, 3+0*8,
+ 2+2*8, 1+5*8, 1+6*8, 1+7*8,
+ 2+3*8, 3+1*8, 4+0*8, 3+2*8,
+ 2+4*8, 2+5*8, 2+6*8, 2+7*8,
+ 3+3*8, 4+1*8, 5+0*8, 4+2*8,
+ 3+4*8, 3+5*8, 3+6*8, 3+7*8,
+ 4+3*8, 5+1*8, 6+0*8, 5+2*8,
+ 4+4*8, 4+5*8, 4+6*8, 4+7*8,
+ 5+3*8, 6+1*8, 6+2*8, 5+4*8,
+ 5+5*8, 5+6*8, 5+7*8, 6+3*8,
+ 7+0*8, 7+1*8, 6+4*8, 6+5*8,
+ 6+6*8, 6+7*8, 7+2*8, 7+3*8,
+ 7+4*8, 7+5*8, 7+6*8, 7+7*8,
+};
+
+typedef struct IMbInfo{
+    uint16_t type;        // MB_TYPE_* flags of the table entry
+    uint8_t pred_mode;    // second column of i_mb_type_info; a -1 initializer is stored as 255
+    uint8_t cbp;          // third column of i_mb_type_info; a -1 initializer is stored as 255
+} IMbInfo;
+
+static const IMbInfo i_mb_type_info[26]={
+{MB_TYPE_INTRA4x4  , -1, -1},
+{MB_TYPE_INTRA16x16,  2,  0},
+{MB_TYPE_INTRA16x16,  1,  0},
+{MB_TYPE_INTRA16x16,  0,  0},
+{MB_TYPE_INTRA16x16,  3,  0},
+{MB_TYPE_INTRA16x16,  2,  16},
+{MB_TYPE_INTRA16x16,  1,  16},
+{MB_TYPE_INTRA16x16,  0,  16},
+{MB_TYPE_INTRA16x16,  3,  16},
+{MB_TYPE_INTRA16x16,  2,  32},
+{MB_TYPE_INTRA16x16,  1,  32},
+{MB_TYPE_INTRA16x16,  0,  32},
+{MB_TYPE_INTRA16x16,  3,  32},
+{MB_TYPE_INTRA16x16,  2,  15+0},
+{MB_TYPE_INTRA16x16,  1,  15+0},
+{MB_TYPE_INTRA16x16,  0,  15+0},
+{MB_TYPE_INTRA16x16,  3,  15+0},
+{MB_TYPE_INTRA16x16,  2,  15+16},
+{MB_TYPE_INTRA16x16,  1,  15+16},
+{MB_TYPE_INTRA16x16,  0,  15+16},
+{MB_TYPE_INTRA16x16,  3,  15+16},
+{MB_TYPE_INTRA16x16,  2,  15+32},
+{MB_TYPE_INTRA16x16,  1,  15+32},
+{MB_TYPE_INTRA16x16,  0,  15+32},
+{MB_TYPE_INTRA16x16,  3,  15+32},
+{MB_TYPE_INTRA_PCM , -1, -1},
+};
+
+typedef struct PMbInfo{
+    uint16_t type;              // MB_TYPE_* partition-size and list-usage flags of the table entry
+    uint8_t partition_count;    // 1, 2 or 4 in the tables of this header
+} PMbInfo;
+
+static const PMbInfo p_mb_type_info[5]={
+{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0, 4},
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4},
+};
+
+static const PMbInfo p_sub_mb_type_info[4]={
+{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
+{MB_TYPE_16x8 |MB_TYPE_P0L0             , 2},
+{MB_TYPE_8x16 |MB_TYPE_P0L0             , 2},
+{MB_TYPE_8x8  |MB_TYPE_P0L0             , 4},
+};
+
+static const PMbInfo b_mb_type_info[23]={
+{MB_TYPE_DIRECT2|MB_TYPE_L0L1                                      , 1, },
+{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
+{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
+{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
+{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
+{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
+};
+
+static const PMbInfo b_sub_mb_type_info[13]={
+{MB_TYPE_DIRECT2                                                   , 1, },
+{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
+{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
+{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
+{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
+{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
+{MB_TYPE_8x8  |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 4, },
+{MB_TYPE_8x8               |MB_TYPE_P0L1             |MB_TYPE_P1L1, 4, },
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
+};
+
+static const uint8_t dequant4_coeff_init[6][3]={
+  {10,13,16},
+  {11,14,18},
+  {13,16,20},
+  {14,18,23},
+  {16,20,25},
+  {18,23,29},
+};
+
+static const uint8_t dequant8_coeff_init_scan[16] = {
+  0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
+};
+static const uint8_t dequant8_coeff_init[6][6]={
+  {20,18,32,19,25,24},
+  {22,19,35,21,28,26},
+  {26,23,42,24,33,31},
+  {28,25,45,26,35,33},
+  {32,28,51,30,40,38},
+  {36,32,58,34,46,43},
+};
+
+#endif /* AVCODEC_H264DATA_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_deblock.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_deblock.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,507 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... loop filter
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 loop filter.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "dsputil.h"
+#include "mathops.h"
+#include "rectangle.h"
+#include "h264_types.h"
+#include "h264_misc.h"
+#include "h264_data.h"
+//#undef NDEBUG
+#include <assert.h>
+
+/* Deblocking filter (p153) */
+static const uint8_t alpha_table[52*3] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+    80, 90,101,113,127,144,162,182,203,226,
+   255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+};
+static const uint8_t beta_table[52*3] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+    18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+};
+static const uint8_t tc0_table[52*3][4] = {
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
+    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
+    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
+    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
+    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
+    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
+    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+};
+
+/* Filter one vertical luma edge (used for dir == 0); alpha, beta and tc0 come from qp plus the slice offsets. */
+av_always_inline static void filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s) {
+    const unsigned int tab_idx = qp + s->slice_alpha_c0_offset;
+    const int alpha = alpha_table[tab_idx];
+    const int beta  = beta_table[qp + s->slice_beta_offset];
+    int8_t tc[4];
+    int i;
+
+    if (!alpha || !beta) return;   /* either threshold is zero: leave the edge untouched */
+    if (bS[0] >= 4) {              /* only bS[0] decides between the intra and the normal filter */
+        mrc->hdsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
+        return;
+    }
+    for (i = 0; i < 4; i++)
+        tc[i] = tc0_table[tab_idx][bS[i]];
+    mrc->hdsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
+}
+
+/* Filter one vertical chroma edge (used for dir == 0); tc0 + 1 is handed to the chroma filter. */
+av_always_inline static void filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
+    const unsigned int tab_idx = qp + s->slice_alpha_c0_offset;
+    const int alpha = alpha_table[tab_idx];
+    const int beta  = beta_table[qp + s->slice_beta_offset];
+    int8_t tc[4];
+    int i;
+
+    if (!alpha || !beta) return;   /* either threshold is zero: leave the edge untouched */
+    if (bS[0] >= 4) {              /* only bS[0] decides between the intra and the normal filter */
+        mrc->hdsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
+        return;
+    }
+    for (i = 0; i < 4; i++)
+        tc[i] = tc0_table[tab_idx][bS[i]]+1;
+    mrc->hdsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
+}
+
+
+/* Filter one horizontal luma edge (used for dir != 0); alpha, beta and tc0 come from qp plus the slice offsets. */
+av_always_inline static void filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
+    const unsigned int tab_idx = qp + s->slice_alpha_c0_offset;
+    const int alpha = alpha_table[tab_idx];
+    const int beta  = beta_table[qp + s->slice_beta_offset];
+    int8_t tc[4];
+    int i;
+
+    if (!alpha || !beta) return;   /* either threshold is zero: leave the edge untouched */
+    if (bS[0] >= 4) {              /* only bS[0] decides between the intra and the normal filter */
+        mrc->hdsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
+        return;
+    }
+    for (i = 0; i < 4; i++)
+        tc[i] = tc0_table[tab_idx][bS[i]];
+    mrc->hdsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
+}
+
+/* Filter one horizontal chroma edge (used for dir != 0); tc0 + 1 is handed to the chroma filter. */
+av_always_inline static void filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
+    const unsigned int tab_idx = qp + s->slice_alpha_c0_offset;
+    const int alpha = alpha_table[tab_idx];
+    const int beta  = beta_table[qp + s->slice_beta_offset];
+    int8_t tc[4];
+    int i;
+
+    if (!alpha || !beta) return;   /* either threshold is zero: leave the edge untouched */
+    if (bS[0] >= 4) {              /* only bS[0] decides between the intra and the normal filter */
+        mrc->hdsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
+        return;
+    }
+    for (i = 0; i < 4; i++)
+        tc[i] = tc0_table[tab_idx][bS[i]]+1;
+    mrc->hdsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
+}
+
+/* Deblock one macroblock in one direction (dir 0: left MB edge and inner vertical edges,
+ * otherwise: top MB edge and inner horizontal edges) using the strengths in mrs->bS[dir].
+ * The MB-boundary edge is filtered only when the left/top neighbour type is nonzero. */
+static av_always_inline void filter_mb_dir(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, int dir) {
+    const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type;
+    const int qp_xy= m->qscale_mb_xy;
+    const int qp_dir = dir == 0 ? m->qscale_left_mb_xy : m->qscale_top_mb_xy;
+    const int linesize = mrc->linesize;
+    const int uvlinesize = mrc->uvlinesize;
+    const int mb_type = m->mb_type;
+    int edge;
+    const int edges = mrs->edges[dir];
+
+    if(mbm_type){
+        int16_t* bS=mrs->bS[dir][0];
+        /* Filter edge */
+        // Do not use s->qscale as luma quantizer because it has not the same
+        // value in IPCM macroblocks.
+        if(bS[0]+bS[1]+bS[2]+bS[3]){
+            int qp = ( qp_xy + qp_dir + 1 ) >> 1;
+            // Cb averages chroma QP table 0 and Cr table 1, the same tables the inner edges below use.
+            int qp_cb = ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
+            int qp_cr = ( get_chroma_qp(s, 1, qp_xy) + get_chroma_qp( s, 1, qp_dir) + 1 ) >> 1;
+            if( dir == 0 ) {
+                filter_mb_edgev( &img_y[0], linesize, bS, qp, mrc, s );
+                filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp_cb, mrc, s);
+                filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp_cr, mrc, s);
+            } else {
+                filter_mb_edgeh( &img_y[0], linesize, bS, qp, mrc, s );
+                filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp_cb, mrc, s);
+                filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp_cr, mrc, s);
+            }
+        }
+    }
+
+    for( edge = 1; edge < edges; edge++ ) {
+        int16_t* bS=mrs->bS[dir][edge];
+        int qp = qp_xy;
+
+        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
+            continue;
+
+        if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
+            continue;
+
+        /* Filter edge */
+        // Do not use s->qscale as luma quantizer because it has not the same
+        // value in IPCM macroblocks.
+
+        if( dir == 0 ) {
+            filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, mrc, s);
+            if( (edge&1) == 0 ) {
+                filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s);
+                filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s);
+            }
+        } else {
+            filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, mrc, s );
+            if( (edge&1) == 0 ) {
+                filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s);
+                filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s);
+            }
+        }
+    }
+}
+
+/* Compare reference and motion data of cache entries b_idx and bn_idx; nonzero means they do not match. */
+static int check_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, long b_idx, long bn_idx, int mvy_limit){
+    int mismatch = mrs->ref_cache[0][b_idx] != mrs->ref_cache[0][bn_idx];
+    // (unsigned)(d + 3) >= 7U is true exactly when d lies outside [-3, 3]
+    if(mismatch == 0 && mrs->ref_cache[0][b_idx] != -1)
+        mismatch = ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) |
+        ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit);
+
+    if(s->list_count != 2)
+        return mismatch;
+
+    if(mismatch == 0)
+        mismatch = (mrs->ref_cache[1][b_idx] != mrs->ref_cache[1][bn_idx]) |
+        ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) |
+        ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit);
+    if(mismatch == 0)
+        return 0;
+
+    // same-list comparison failed: compare list 0 of one entry against list 1 of the other
+    if((mrs->ref_cache[0][b_idx] != mrs->ref_cache[1][bn_idx]) |
+        (mrs->ref_cache[1][b_idx] != mrs->ref_cache[0][bn_idx]))
+        return 1;
+    return
+    ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) |
+    ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit) |
+    ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) |
+    ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit);
+}
+
+/* Compute the boundary strengths of one macroblock for one direction and store them in
+ * mrs->bS[dir][edge][0..3]; the number of edges to process is stored in mrs->edges[dir].
+ * Strength is 4 (MB-boundary edge) or 3 (inner edge) for intra, 2 where either side has a nonzero
+ * non_zero_count_cache entry, otherwise the result of check_mv() (or 0 where the edge is masked). */
+static void calc_bS_values(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mvy_limit, int dir) {
+    int mb_type = m->mb_type;
+    int edge;
+    const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type;
+
+    // how often to recheck mv-based bS when iterating between edges
+    static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1},
+    {0,3,1,1,3,3,3,3}};
+    const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7];
+    const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4;
+    // how often to recheck mv-based bS when iterating along each edge
+    const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
+
+    mrs->edges[dir]= edges;
+
+    if(mbm_type){
+        int16_t* bS=mrs->bS[dir][0];
+        if( IS_INTRA(mb_type|mbm_type)) {
+            AV_WN64A(bS, 0x0004000400040004ULL);
+        } else {
+            int i;
+            int mv_done;
+            if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
+                int b_idx= 8 + 4;
+                int bn_idx= b_idx - (dir ? 8:1);
+
+                bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
+                mv_done = 1;
+            }
+            else
+                mv_done = 0;
+
+            for( i = 0; i < 4; i++ ) {
+                int x = dir == 0 ? 0 : i;
+                int y = dir == 0 ? i    : 0;
+                int b_idx= 8 + 4 + x + 8*y;
+                int bn_idx= b_idx - (dir ? 8:1);
+
+                if( mrs->non_zero_count_cache[b_idx] |
+                    mrs->non_zero_count_cache[bn_idx] ) {
+                    bS[i] = 2;
+                }
+                else if(!mv_done)
+                {
+                    bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
+                }
+            }
+        }
+    }
+
+    /* Calculate bS */
+    for( edge = 1; edge < edges; edge++ ) {
+        int16_t* bS=mrs->bS[dir][edge];
+
+        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
+            continue;
+
+        if( IS_INTRA(mb_type)) {
+            AV_WN64A(bS, 0x0003000300030003ULL);
+        } else {
+            int i;
+            int mv_done;
+
+            if( edge & mask_edge ) {
+                AV_ZERO64(bS);
+                mv_done = 1;
+            }
+            else if( mask_par0 ) {
+                int b_idx= 8 + 4 + edge * (dir ? 8:1);
+                int bn_idx= b_idx - (dir ? 8:1);
+
+                bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
+                mv_done = 1;
+            }
+            else
+                mv_done = 0;
+
+            for( i = 0; i < 4; i++ ) {
+                int x = dir == 0 ? edge : i;
+                int y = dir == 0 ? i    : edge;
+                int b_idx= 8 + 4 + x + 8*y;
+                int bn_idx= b_idx - (dir ? 8:1);
+
+                if( mrs->non_zero_count_cache[b_idx] |
+                    mrs->non_zero_count_cache[bn_idx] ) {
+                    bS[i] = 2;
+                }
+                else if(!mv_done)
+                {
+                    bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
+                }
+            }
+        }
+    }
+}
+
+
+/**
+* Decide whether the macroblock needs deblocking and fill the nnz, reference and motion-vector caches in mrs.
+* @return zero if the loop filter can be skipped
+*/
+static int fill_filter_caches(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
+    H264Mb *m_top = m - mrc->mb_width;
+    H264Mb *m_left = m - 1;
+    const int mb_x = m->mb_x;
+    const int mb_y = m->mb_y;
+    int top_type, left_type;
+    int qp, top_qp, left_qp;
+    int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice
+
+    qp = m->qscale_mb_xy ;
+    left_qp = m->qscale_left_mb_xy ;
+    top_qp  = m->qscale_top_mb_xy ;
+
+    //for sufficiently low qp, filtering wouldn't do anything
+    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
+    if(qp <= qp_thresh
+        && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh)
+        && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){
+        return 0;
+    }
+
+    if(IS_INTRA(mb_type)){
+        return 1;
+    }
+
+    {
+        int list;
+        for(list=0; list<s->list_count; list++){
+            int8_t *ref;
+
+            if(!USES_LIST(mb_type, list)){
+                // list unused by this macroblock: zero the motion vectors, mark all references unused
+                fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
+                AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+                AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+                AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+                AV_WN32A(&mrs->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+                continue;
+            }
+
+            ref = &mrs->ref_index[list][4*mb_x];
+            {
+                int (*ref2frm)[64] =(void *) (s->ref2frm[0] +  2);
+                AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
+                AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
+                ref += 2;
+
+                AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
+                AV_WN32A(&mrs->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
+            }
+        }
+    }
+
+    /*
+    0 . T T. T T T T
+    1 L . .L . . . .
+    2 L . .L . . . .
+    3 . T TL . . . .
+    4 L . .L . . . .
+    5 L . .. . . . .
+    */
+
+    if (IS_SKIP(mb_type)){
+        memset(mrs->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
+    }
+
+    //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
+    top_type  = mrs->top_type;
+    left_type = mrs->left_type;
+    if(top_type){
+        AV_COPY32(&mrs->non_zero_count_cache[4+8*0], &m_top->non_zero_count[3*4]);
+    }
+
+    if(left_type){
+        mrs->non_zero_count_cache[3+8*1]= m_left->non_zero_count[3+0*4];
+        mrs->non_zero_count_cache[3+8*2]= m_left->non_zero_count[3+1*4];
+        mrs->non_zero_count_cache[3+8*3]= m_left->non_zero_count[3+2*4];
+        mrs->non_zero_count_cache[3+8*4]= m_left->non_zero_count[3+3*4];
+    }
+
+    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
+        int list;
+        for(list=0; list<s->list_count; list++){
+            if(USES_LIST(top_type, list)){
+                const int b_xy= 4*mb_x + 3*mrc->b_stride;
+                const int b8_x= 4*mb_x + 2;
+                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
+                AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]);
+
+                mrs->ref_cache[list][scan8[0] + 0 - 1*8]=
+                mrs->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 0]];
+                mrs->ref_cache[list][scan8[0] + 2 - 1*8]=
+                mrs->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 1]];
+            }else{
+                AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]);
+                AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u);
+            }
+
+            if(USES_LIST(left_type, list)){
+                const int b_x = 4*(mb_x-1) + 3;
+                const int b8_x= 4*(mb_x-1) + 1;
+                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
+                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 0 ], mrs->motion_val[list][b_x + mrc->b_stride*0]);
+                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 8 ], mrs->motion_val[list][b_x + mrc->b_stride*1]);
+                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +16 ], mrs->motion_val[list][b_x + mrc->b_stride*2]);
+                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +24 ], mrs->motion_val[list][b_x + mrc->b_stride*3]);
+
+                mrs->ref_cache[list][scan8[0] - 1 + 0 ]=
+                mrs->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*0]];
+                mrs->ref_cache[list][scan8[0] - 1 +16 ]=
+                mrs->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*1]];
+
+            }else{
+                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 0 ]);
+                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 8 ]);
+                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +16 ]);
+                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +24 ]);
+
+                mrs->ref_cache[list][scan8[0] - 1 + 0  ]=
+                mrs->ref_cache[list][scan8[0] - 1 + 8  ]=
+                mrs->ref_cache[list][scan8[0] - 1 + 16 ]=
+                mrs->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED;
+            }
+        }
+    }
+    return 1;
+}
+
+/* Deblock one macroblock: fill the caches, derive bS for both directions, then filter dir 0 followed by dir 1. */
+void ff_h264_filter_mb(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) {
+    if (!fill_filter_caches(mrc, mrs, s, m, m->mb_type)) return;   /* zero: the loop filter can be skipped */
+    calc_bS_values(mrc, mrs, s, m, 4, 0);
+    calc_bS_values(mrc, mrs, s, m, 4, 1);
+    filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 0);
+    filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 1);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_deblock.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_deblock.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,8 @@
+#ifndef H264_LOOPFILTER_H
+#define H264_LOOPFILTER_H
+
+#include "h264_types.h"
+/* Run the H.264 loop filter on macroblock m; img_y/img_cb/img_cr are the plane pointers handed to the edge filters. */
+void ff_h264_filter_mb(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_dsp.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_dsp.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,320 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 DSP functions.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include <stdint.h>
+#include "avcodec.h"
+#include "h264_dsp.h"
+
+/* op_scale1 weights block[x] in place; op_scale2 blends src[x] into dst[x] with two weights. Both add the pre-scaled rounding offset, shift, and pass the result through av_clip_uint8(). */
+/* H264_WEIGHT(W,H) emits the WxH kernels; "if(W==n) continue" ends each row after W samples. Offsets are shifted as unsigned because left-shifting a negative int is undefined. */
+#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
+#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
+#define H264_WEIGHT(W,H) \
+static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
+    int y; \
+    offset = (int)((unsigned)offset << log2_denom); \
+    if(log2_denom) offset += 1<<(log2_denom-1); \
+    for(y=0; y<H; y++, block += stride){ \
+        op_scale1(0); op_scale1(1); \
+        if(W==2) continue; \
+        op_scale1(2); \
+        op_scale1(3); \
+        if(W==4) continue; \
+        op_scale1(4); \
+        op_scale1(5); \
+        op_scale1(6); \
+        op_scale1(7); \
+        if(W==8) continue; \
+        op_scale1(8); \
+        op_scale1(9); \
+        op_scale1(10); \
+        op_scale1(11); \
+        op_scale1(12); \
+        op_scale1(13); \
+        op_scale1(14); \
+        op_scale1(15); \
+    } \
+} \
+static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+    int y; \
+    offset = (int)((unsigned)((offset + 1) | 1) << log2_denom); \
+    for(y=0; y<H; y++, dst += stride, src += stride){ \
+        op_scale2(0); op_scale2(1); \
+        if(W==2) continue; \
+        op_scale2(2); \
+        op_scale2(3); \
+        if(W==4) continue; \
+        op_scale2(4); \
+        op_scale2(5); \
+        op_scale2(6); \
+        op_scale2(7); \
+        if(W==8) continue; \
+        op_scale2(8); \
+        op_scale2(9); \
+        op_scale2(10); \
+        op_scale2(11); \
+        op_scale2(12); \
+        op_scale2(13); \
+        op_scale2(14); \
+        op_scale2(15); \
+    } \
+}
+
+H264_WEIGHT(16,16)
+H264_WEIGHT(16,8)
+H264_WEIGHT(8,16)
+H264_WEIGHT(8,8)
+H264_WEIGHT(8,4)
+H264_WEIGHT(4,8)
+H264_WEIGHT(4,4)
+H264_WEIGHT(4,2)
+H264_WEIGHT(2,4)
+H264_WEIGHT(2,2)
+
+#undef op_scale1
+#undef op_scale2
+#undef H264_WEIGHT
+
+/* tc0-clipped luma deblocking along a 16-position edge: xstride steps across the edge (p2 p1 p0 | q0 q1 q2), ystride along it; tc0[i] governs positions 4*i..4*i+3 and a negative value skips them. */
+static av_always_inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
+{
+    int i, d;
+    for( i = 0; i < 4; i++ ) {
+        if( tc0[i] < 0 ) {
+            pix += 4*ystride;
+            continue;
+        }
+        for( d = 0; d < 4; d++ ) {
+            const int p0 = pix[-1*xstride];
+            const int p1 = pix[-2*xstride];
+            const int p2 = pix[-3*xstride];
+            const int q0 = pix[0];
+            const int q1 = pix[1*xstride];
+            const int q2 = pix[2*xstride];
+
+            if( FFABS( p0 - q0 ) < alpha &&
+                FFABS( p1 - p0 ) < beta &&
+                FFABS( q1 - q0 ) < beta ) {
+                int tc = tc0[i];
+                int i_delta;
+                /* A side with |x2 - x0| < beta gets its x1 sample adjusted (only when tc0[i] != 0) and always widens the p0/q0 clip range by one. */
+                if( FFABS( p2 - p0 ) < beta ) {
+                    if(tc0[i])
+                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
+                    tc++;
+                }
+                if( FFABS( q2 - q0 ) < beta ) {
+                    if(tc0[i])
+                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
+                    tc++;
+                }
+                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );    /* "* 4" rather than "<< 2": q0 - p0 may be negative, and shifting a negative int left is undefined */
+                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
+                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
+            }
+            pix += ystride;
+        }
+    }
+}
+/* v_: samples across the edge are `stride` bytes apart and successive positions 1 byte apart; h_: the transpose. */
+static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
+}
+static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
+}
+
+/**
+ * Intra luma deblocking along a 16-position edge (no tc0 clipping).
+ * xstride is the distance between samples across the edge (p3..p0 | q0..q3), ystride the distance between successive positions along it.
+ */
+static av_always_inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
+{
+    int n;
+    for( n = 0; n < 16; n++, pix += ystride ) {
+        const int p0 = pix[-1*xstride];
+        const int p1 = pix[-2*xstride];
+        const int p2 = pix[-3*xstride];
+        const int q0 = pix[ 0*xstride];
+        const int q1 = pix[ 1*xstride];
+        const int q2 = pix[ 2*xstride];
+        const int gap = FFABS( p0 - q0 );
+
+        if( gap >= alpha || FFABS( p1 - p0 ) >= beta || FFABS( q1 - q0 ) >= beta )
+            continue;    /* thresholds not met: this position is left as is */
+
+        if( gap >= (( alpha >> 2 ) + 2 ) ) {
+            /* only p0 and q0 are rewritten */
+            pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+            pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+            continue;
+        }
+
+        /* p side: three samples change when |p2 - p0| < beta, otherwise p0 alone */
+        if( FFABS( p2 - p0 ) < beta ) {
+            const int p3 = pix[-4*xstride];
+            pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;    /* p0' */
+            pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;               /* p1' */
+            pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;      /* p2' */
+        } else {
+            pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;                  /* p0' */
+        }
+
+        /* q side: same rule with |q2 - q0| < beta */
+        if( FFABS( q2 - q0 ) < beta ) {
+            const int q3 = pix[3*xstride];
+            pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;     /* q0' */
+            pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;                /* q1' */
+            pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;       /* q2' */
+        } else {
+            pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;                   /* q0' */
+        }
+    }
+}
+/* v_: samples across the edge are `stride` bytes apart and successive positions 1 byte apart; h_: the transpose. */
+static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
+}
+static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
+}
+
+/* tc0-clipped chroma deblocking along an 8-position edge: tc0[i] governs positions 2*i and 2*i+1 and a value <= 0 skips them; only p0 and q0 are modified. xstride steps across the edge, ystride along it. */
+static av_always_inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
+{
+    int i, d;
+    for( i = 0; i < 4; i++ ) {
+        const int tc = tc0[i];
+        if( tc <= 0 ) {
+            pix += 2*ystride;
+            continue;
+        }
+        for( d = 0; d < 2; d++ ) {
+            const int p0 = pix[-1*xstride];
+            const int p1 = pix[-2*xstride];
+            const int q0 = pix[0];
+            const int q1 = pix[1*xstride];
+
+            if( FFABS( p0 - q0 ) < alpha &&
+                FFABS( p1 - p0 ) < beta &&
+                FFABS( q1 - q0 ) < beta ) {
+                int delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );    /* "* 4" rather than "<< 2": q0 - p0 may be negative, and shifting a negative int left is undefined */
+                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
+                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
+            }
+            pix += ystride;
+        }
+    }
+}
+/* v_: samples across the edge are `stride` bytes apart and successive positions 1 byte apart; h_: the transpose. */
+static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
+}
+static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
+}
+
+/* Intra chroma deblocking along an 8-position edge: where the alpha/beta tests pass, p0 and q0 are each replaced by a rounded (2,1,1)/4 average. */
+/* xstride is the distance between samples across the edge (p1 p0 | q0 q1), ystride the distance between successive positions along it. */
+static av_always_inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
+{
+    int n;
+    for( n = 0; n < 8; n++, pix += ystride ) {
+        const int p1 = pix[-2*xstride];
+        const int p0 = pix[-1*xstride];
+        const int q0 = pix[0];
+        const int q1 = pix[1*xstride];
+
+        if( FFABS( p0 - q0 ) >= alpha || FFABS( p1 - p0 ) >= beta || FFABS( q1 - q0 ) >= beta )
+            continue;    /* thresholds not met: this position is left as is */
+
+        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+    }
+}
+/* v_: samples across the edge are `stride` bytes apart and successive positions 1 byte apart; h_: the transpose. */
+static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
+}
+static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
+}
+
+/* Install the C routines in *c (h264_loop_filter_strength is set to NULL), then call every architecture initialiser whose ARCH_/HAVE_ flag is non-zero so it can replace entries. */
+void ff_h264dsp_init(H264DSPContext *c)
+{
+    h264_weight_func   *const weight   = c->weight_h264_pixels_tab;
+    h264_biweight_func *const biweight = c->biweight_h264_pixels_tab;
+    c->h264_idct_add        = ff_h264_idct_add_c;            /* inverse transforms */
+    c->h264_idct8_add       = ff_h264_idct8_add_c;
+    c->h264_idct_dc_add     = ff_h264_idct_dc_add_c;
+    c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_c;
+    c->h264_idct_add16      = ff_h264_idct_add16_c;
+    c->h264_idct8_add4      = ff_h264_idct8_add4_c;
+    c->h264_idct_add8       = ff_h264_idct_add8_c;
+    c->h264_idct_add16intra = ff_h264_idct_add16intra_c;
+    weight[0]   = weight_h264_pixels16x16_c;                 /* weighted / bi-weighted kernels, one slot per block size */
+    biweight[0] = biweight_h264_pixels16x16_c;
+    weight[1]   = weight_h264_pixels16x8_c;
+    biweight[1] = biweight_h264_pixels16x8_c;
+    weight[2]   = weight_h264_pixels8x16_c;
+    biweight[2] = biweight_h264_pixels8x16_c;
+    weight[3]   = weight_h264_pixels8x8_c;
+    biweight[3] = biweight_h264_pixels8x8_c;
+    weight[4]   = weight_h264_pixels8x4_c;
+    biweight[4] = biweight_h264_pixels8x4_c;
+    weight[5]   = weight_h264_pixels4x8_c;
+    biweight[5] = biweight_h264_pixels4x8_c;
+    weight[6]   = weight_h264_pixels4x4_c;
+    biweight[6] = biweight_h264_pixels4x4_c;
+    weight[7]   = weight_h264_pixels4x2_c;
+    biweight[7] = biweight_h264_pixels4x2_c;
+    weight[8]   = weight_h264_pixels2x4_c;
+    biweight[8] = biweight_h264_pixels2x4_c;
+    weight[9]   = weight_h264_pixels2x2_c;
+    biweight[9] = biweight_h264_pixels2x2_c;
+    c->h264_v_loop_filter_luma         = h264_v_loop_filter_luma_c;          /* deblocking filters */
+    c->h264_h_loop_filter_luma         = h264_h_loop_filter_luma_c;
+    c->h264_v_loop_filter_luma_intra   = h264_v_loop_filter_luma_intra_c;
+    c->h264_h_loop_filter_luma_intra   = h264_h_loop_filter_luma_intra_c;
+    c->h264_v_loop_filter_chroma       = h264_v_loop_filter_chroma_c;
+    c->h264_h_loop_filter_chroma       = h264_h_loop_filter_chroma_c;
+    c->h264_v_loop_filter_chroma_intra = h264_v_loop_filter_chroma_intra_c;
+    c->h264_h_loop_filter_chroma_intra = h264_h_loop_filter_chroma_intra_c;
+    c->h264_loop_filter_strength       = NULL;
+    if (ARCH_ARM)     ff_h264dsp_init_arm(c);                /* must stay last: these may overwrite the entries above */
+    if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c);
+    if (HAVE_MMX)     ff_h264dsp_init_x86(c);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_dsp.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_dsp.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 DSP functions.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_H264DSP_H
+#define AVCODEC_H264DSP_H
+
+#include <stdint.h>
+#include "dsputil.h"
+
+//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
+typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
+typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
+
+/**
+ * Context for storing H.264 DSP functions: a table of function pointers that ff_h264dsp_init() fills with the C versions before the architecture initialisers run.
+ */
+typedef struct H264DSPContext{
+    /* weighted MC; ff_h264dsp_init() fills indices 0..9 in the order 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4, 4x2, 2x4, 2x2 */
+    h264_weight_func weight_h264_pixels_tab[10];
+    h264_biweight_func biweight_h264_pixels_tab[10];
+
+    /* loop filter */
+    void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
+    void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
+    /* v/h_loop_filter_luma_intra: align 16 */
+    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
+    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
+    void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
+    void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
+    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
+    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
+    // h264_loop_filter_strength: simd only. the C version is inlined in h264.c; ff_h264dsp_init() stores NULL here before the architecture initialisers run
+    void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
+                                      int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field);
+
+    /* IDCT */
+    /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them
+       NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them
+        The reason for above, is that no 2 out of one list may use a different permutation.
+    */
+    void (*h264_idct_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
+    void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
+    void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
+    void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
+    void (*h264_dct)(DCTELEM block[4][4]); /* not assigned by the generic ff_h264dsp_init() */
+    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    /* pointers to rows of 16 qpel_mc_func; not assigned by the generic ff_h264dsp_init() -- presumably pointed at tables owned elsewhere, TODO confirm */
+    qpel_mc_func (*qpel_put)[16];
+    qpel_mc_func (*qpel_avg)[16];
+}H264DSPContext;
+
+void ff_h264dsp_init(H264DSPContext *c);
+void ff_h264dsp_init_arm(H264DSPContext *c);
+void ff_h264dsp_init_ppc(H264DSPContext *c);
+void ff_h264dsp_init_x86(H264DSPContext *c);
+
+#endif /* AVCODEC_H264DSP_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_entropy.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_entropy.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,2065 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 cabac decoding.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "avcodec.h"
+#include "h264_types.h"
+#include "h264_data.h"
+#include "cabac.h"
+#include "rectangle.h"
+#include "h264_misc.h"
+
+// #undef NDEBUG
+#include <assert.h>
+
+/* Cabac pre state table */
+
+static const int8_t cabac_context_init_I[460][2] =
+{
+    /* 0 - 10 */
+    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
+    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
+    { -6,  53 }, { -1, 54 },  {  7,  51 },
+
+    /* 11 - 23 unused for I */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },
+
+    /* 24- 39 */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+
+    /* 40 - 53 */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 54 - 59 */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 60 - 69 */
+    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+    { 13, 41 },  { 3, 62 },
+
+    /* 70 -> 87 */
+    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
+    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
+    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
+    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
+    { -12, 115 },{ -16, 122 },
+
+    /* 88 -> 104 */
+    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
+    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
+    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
+    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
+    { -22, 125 },
+
+    /* 105 -> 135 */
+    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
+    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
+    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
+    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
+    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
+    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
+    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
+    { 14, 62 },  { -13, 108 },{ -15, 100 },
+
+    /* 136 -> 165 */
+    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
+    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
+    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
+    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
+    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
+    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
+    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
+    { 0, 62 },   { 12, 72 },
+
+    /* 166 -> 196 */
+    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
+    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
+    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
+    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
+    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
+    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
+    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
+    { 0, 89 },   { 26, -19 }, { 22, -17 },
+
+    /* 197 -> 226 */
+    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
+    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
+    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
+    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
+    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
+    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
+    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
+    { 12, 68 },  { 2, 97 },
+
+    /* 227 -> 251 */
+    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
+    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
+    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
+    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
+    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
+    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
+    { -4, 65 },
+
+    /* 252 -> 275 */
+    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
+    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
+    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
+    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
+    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
+    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
+
+    /* 276 a bit special (not used, bypass is used instead) */
+    { 0, 0 },
+
+    /* 277 -> 307 */
+    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
+    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
+    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
+    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
+    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
+    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
+    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
+    { 9, 64 },   { -12, 104 },{ -11, 97 },
+
+    /* 308 -> 337 */
+    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
+    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
+    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
+    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
+    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
+    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
+    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
+    { 5, 64 },   { 12, 70 },
+
+    /* 338 -> 368 */
+    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
+    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
+    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
+    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
+    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
+    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
+    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
+    { -12, 109 },{ 36, -35 }, { 36, -34 },
+
+    /* 369 -> 398 */
+    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
+    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
+    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
+    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
+    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
+    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
+    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
+    { 29, 39 },  { 19, 66 },
+
+    /* 399 -> 435 */
+    {  31,  21 }, {  31,  31 }, {  25,  50 },
+    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
+    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
+    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
+    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
+    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
+    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
+    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
+    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
+    {   0,  68 }, {  -9,  92 },
+
+    /* 436 -> 459 */
+    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
+    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
+    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
+    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
+    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
+};
+
+static const int8_t cabac_context_init_PB[3][460][2] =
+{
+    /* i_cabac_init_idc == 0 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
+        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
+        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
+        {  17,  50 },
+
+        /* 24 - 39 */
+        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
+        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
+        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
+        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
+
+        /* 40 - 53 */
+        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
+        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
+        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
+        {  -3,  81 }, {   0,  88 },
+
+        /* 54 - 59 */
+        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
+        {  -7,  72 }, {   1,  58 },
+
+        /* 60 - 69 */
+        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
+        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
+        {  13,  41 }, {   3,  62 },
+
+        /* 70 - 104 */
+        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
+        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
+        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
+        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
+        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
+        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
+        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
+        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
+        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
+
+        /* 105 -> 165 */
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
+        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
+        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
+        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
+        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
+        {   9,  69 },
+
+        /* 166 - 226 */
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
+        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
+        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
+        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
+        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
+        {  -9, 108 },
+
+        /* 227 - 275 */
+        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
+        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
+        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
+        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
+        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
+        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
+        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
+        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
+        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
+        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
+        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
+        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
+        {  -8,  85 },
+
+        /* 276 a bit special (not used, bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
+        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
+        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
+        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
+        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
+        {  26,  43 },
+
+        /* 338 - 398 */
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
+        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
+        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
+        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
+        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
+        {  11,  86 },
+
+        /* 399 - 435 */
+        {  12,  40 }, {  11,  51 }, {  14,  59 },
+        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
+        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
+        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
+        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
+        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
+        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
+        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
+        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
+        {  -8,  66 }, {  -8,  76 },
+
+        /* 436 - 459 */
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
+        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
+        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
+    },
+
+    /* i_cabac_init_idc == 1 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
+        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
+        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
+        {  10,  54 },
+
+        /* 24 - 39 */
+        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
+        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
+        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
+        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
+
+        /* 40 - 53 */
+        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
+        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
+        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
+        {  -7,  86 },{  -5,  95 },
+
+        /* 54 - 59 */
+        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
+        {  -5,  72 },{   0,  61 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
+        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
+        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
+        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
+        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
+        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
+        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
+        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
+        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
+
+        /* 105 -> 165 */
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
+        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
+        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
+        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
+        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
+        {   0,  89 },
+
+        /* 166 - 226 */
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
+        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
+        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
+        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
+        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
+        { -10, 116 },
+
+        /* 227 - 275 */
+        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
+        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
+        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
+        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
+        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
+        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
+        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
+        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
+        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
+        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
+        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
+        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
+        {  -4,  78 },
+
+        /* 276 a bit special (not used, bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
+        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
+        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
+        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
+        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
+        {  18,  50 },
+
+        /* 338 - 398 */
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
+        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
+        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
+        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
+        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
+        {  11,  83 },
+
+        /* 399 - 435 */
+        {  25,  32 }, {  21,  49 }, {  21,  54 },
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
+        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
+        {  -4,  67 }, {  -7,  82 },
+
+        /* 436 - 459 */
+        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
+        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
+        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
+        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+    },
+
+    /* i_cabac_init_idc == 2 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
+        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
+        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
+        {  14,  57 },
+
+        /* 24 - 39 */
+        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
+        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
+        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
+        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
+
+        /* 40 - 53 */
+        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
+        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
+        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
+        {  -3,  90 },{  -1,  101 },
+
+        /* 54 - 59 */
+        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
+        {  -7,  50 },{   1,  60 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
+        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
+        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
+        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
+        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
+        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
+        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
+        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
+        {   3,  68 }, {  -8,  71 }, { -13,  98 },
+
+        /* 105 -> 165 */
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
+        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
+        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
+        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
+        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
+        { -22, 127 },
+
+        /* 166 - 226 */
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
+        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
+        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
+        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
+        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
+        { -24, 127 },
+
+        /* 227 - 275 */
+        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
+        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
+        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
+        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
+        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
+        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
+        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
+        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
+        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
+        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
+        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
+        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
+        { -10,  87 },
+
+        /* 276 a bit special (not used, bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
+        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
+        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
+        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
+        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
+        {  25,  42 },
+
+        /* 338 - 398 */
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
+        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
+        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
+        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
+        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
+        {  25,  61 },
+
+        /* 399 - 435 */
+        {  21,  33 }, {  19,  50 }, {  17,  61 },
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
+        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
+        {  -6,  68 }, { -10,  79 },
+
+        /* 436 - 459 */
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+    }
+};
+
+static const uint8_t left_block_options[4][16]={ /* NOTE(review): only row 0, entries 0..3, is read by fill_decode_caches below - confirm other users */
+    {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
+    {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
+    {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
+    {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
+};
+
+static const uint8_t rem6[52]={ /* rem6[q] == q % 6 for q in 0..51 */
+0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+};
+
+static const uint8_t div6[52]={ /* div6[q] == q / 6 for q in 0..51 */
+0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
+};
+
+static void init_dequant8_coeff_table(H264Slice *s, EntropyContext *ec){
+    const int swap_rc = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; /* nonzero: store entries transposed */
+    int list, qp, pos;
+    ec->dequant8_coeff[0] = ec->dequant8_buffer[0];
+    ec->dequant8_coeff[1] = ec->dequant8_buffer[1];
+    /* one table per 8x8 scaling matrix, each with 52 rows of 64 entries */
+    for(list=0; list<2; list++){
+        if(list==1 && memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))==0){ /* equal matrices: share table 0 */
+            ec->dequant8_coeff[1] = ec->dequant8_buffer[0];
+            break;
+        }
+        for(qp=0; qp<52; qp++){
+            const int shift = div6[qp];
+            const int idx = rem6[qp];
+            for(pos=0; pos<64; pos++){
+                const int dst = swap_rc ? (pos>>3)|((pos&7)<<3) : pos;
+                const uint32_t base = dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((pos>>1)&12) | (pos&3)] ];
+                ec->dequant8_coeff[list][qp][dst] = (base * s->pps.scaling_matrix8[list][pos]) << shift;
+            }
+        }
+    }
+}
+
+static void init_dequant4_coeff_table(H264Slice *s, EntropyContext *ec){
+    const int swap_rc = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; /* nonzero: store entries transposed */
+    int list, prev, qp, pos;
+    for(list=0; list<6; list++){
+        /* look for the first earlier list whose 4x4 scaling matrix is identical */
+        for(prev=0; prev<list; prev++)
+            if(memcmp(s->pps.scaling_matrix4[prev], s->pps.scaling_matrix4[list], 16*sizeof(uint8_t))==0)
+                break;
+        if(prev<list){ /* found one: share its buffer instead of computing a new table */
+            ec->dequant4_coeff[list] = ec->dequant4_buffer[prev];
+            continue;
+        }
+        ec->dequant4_coeff[list] = ec->dequant4_buffer[list];
+        for(qp=0; qp<52; qp++){
+            const int shift = div6[qp] + 2;
+            const int idx = rem6[qp];
+            for(pos=0; pos<16; pos++){
+                const int dst = swap_rc ? (pos>>2)|((pos<<2)&0xF) : pos;
+                const uint32_t base = dequant4_coeff_init[idx][(pos&1) + ((pos>>2)&1)];
+                ec->dequant4_coeff[list][qp][dst] = (base * s->pps.scaling_matrix4[list][pos]) << shift;
+            }
+        }
+    }
+}
+
+void init_dequant_tables(H264Slice *s, EntropyContext *ec){
+    int list, pos;
+    /* 4x4 tables are always built, 8x8 tables only when transform_8x8_mode is set */
+    init_dequant4_coeff_table(s, ec);
+    if(s->pps.transform_8x8_mode)
+        init_dequant8_coeff_table(s, ec);
+    if(!s->transform_bypass)
+        return;
+    for(list=0; list<6; list++)
+        for(pos=0; pos<16; pos++)
+            ec->dequant4_coeff[list][0][pos] = 1<<6; /* bypass: row 0 is forced to 1<<6 */
+    if(s->pps.transform_8x8_mode)
+        for(list=0; list<2; list++)
+            for(pos=0; pos<64; pos++)
+                ec->dequant8_coeff[list][0][pos] = 1<<6;
+}
+
+void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c) {
+    /* I slices have their own init table; every other slice type picks one
+     * of the PB tables through cabac_init_idc. */
+    const int8_t (*init)[2] = (s->slice_type_nos == FF_I_TYPE)
+                            ? cabac_context_init_I
+                            : cabac_context_init_PB[s->cabac_init_idc];
+    int ctx;
+
+    /* derive the initial state of each of the 460 contexts from its table pair and the current qscale */
+    for( ctx = 0; ctx < 460; ctx++ ) {
+        int state = 2*(((init[ctx][0] * ec->curr_qscale) >>4 ) + init[ctx][1]) - 127;
+        /* assumes 32-bit int with arithmetic right shift: negative values become ~state */
+        state ^= state>>31;
+        if(state > 124)
+            state = 124 + (state&1);
+        c->cabac_state[ctx] = state;
+    }
+}
+
+static void fill_decode_neighbors(EntropyContext *ec, H264Slice *s){
+    H264Mb *mb = ec->m;
+    const int col = mb->mb_x;
+    const int has_top_row = mb->mb_y != 0;
+
+    /* left neighbour: read unconditionally from the current-row arrays at col-1 */
+    ec->left_type = ec->mb_type[col-1];
+    mb->qscale_left_mb_xy = ec->qscale[col-1];
+
+    /* row above: everything is zero when mb_y is 0 */
+    if (!has_top_row){
+        ec->top_type = ec->topright_type = ec->topleft_type = 0;
+        mb->qscale_top_mb_xy = 0;
+        return;
+    }
+    ec->top_type      = ec->mb_type_top[col];
+    ec->topright_type = ec->mb_type_top[col+1];
+    ec->topleft_type  = ec->mb_type_top[col-1];
+    mb->qscale_top_mb_xy = ec->qscale_top[col];
+}
+
+static void fill_decode_caches(EntropyContext *ec, H264Slice *s, int mb_type){ /* loads top/left neighbour data into ec's caches for the MB ec->m */
+    H264Mb *m = ec->m;
+    int topleft_type, top_type, topright_type, left_type;
+    const uint8_t * left_block= left_block_options[0]; /* always row 0 of the table in this function */
+	const int mb_x = m->mb_x;
+    int i;
+    /* neighbour types as stored in ec (written by fill_decode_neighbors in this file) */
+    topleft_type = ec->topleft_type;
+	top_type     = ec->top_type;
+    topright_type= ec->topright_type;
+	left_type    = ec->left_type;
+    /* non-zero-count cache and cbp context: skipped entirely for skip macroblocks */
+    if(!IS_SKIP(mb_type)){
+        if(top_type){
+            AV_COPY32(&ec->non_zero_count_cache[4+8*0], &ec->non_zero_count_top[mb_x][0]);
+            ec->non_zero_count_cache[1+8*0]= ec->non_zero_count_top[mb_x][4];
+            ec->non_zero_count_cache[2+8*0]= ec->non_zero_count_top[mb_x][5];
+            ec->non_zero_count_cache[1+8*3]= ec->non_zero_count_top[mb_x][6];
+            ec->non_zero_count_cache[2+8*3]= ec->non_zero_count_top[mb_x][7];
+
+        }else { /* no top neighbour: 0 for non-intra mb_type, 0x40 per byte for intra (assumes AV_WN32A yields the stored value - macro not visible here) */
+            ec->non_zero_count_cache[1+8*0]=
+            ec->non_zero_count_cache[2+8*0]=
+            ec->non_zero_count_cache[1+8*3]=
+            ec->non_zero_count_cache[2+8*3]=
+            AV_WN32A(&ec->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 0 : 0x40404040);
+        }
+
+        if(left_type){
+            for (i=0; i<2; i++) {
+                ec->non_zero_count_cache[3+8*1 + 2*8*i]= ec->non_zero_count_left[i*2+0];
+                ec->non_zero_count_cache[3+8*2 + 2*8*i]= ec->non_zero_count_left[i*2+1];
+                ec->non_zero_count_cache[0+8*1 + 3*8*i]= ec->non_zero_count_left[4+i*2+0];
+                ec->non_zero_count_cache[0+8*2 + 3*8*i]= ec->non_zero_count_left[4+i*2+1];
+            }
+        }
+        else{ /* no left neighbour: same defaults as for the missing top neighbour (0 or 64) */
+            for (i=0; i<2; i++) {
+                ec->non_zero_count_cache[3+8*1 + 2*8*i]=
+                ec->non_zero_count_cache[3+8*2 + 2*8*i]=
+                ec->non_zero_count_cache[0+8*1 + 3*8*i]=
+                ec->non_zero_count_cache[0+8*2 + 3*8*i]= !IS_INTRA(mb_type) ? 0 : 64;
+            }
+        }
+
+		// top_cbp: stored cbp of the MB above, or a default that depends on intra/non-intra
+		if(top_type) {
+			ec->top_cbp = ec->cbp_top[mb_x];
+		} else {
+			ec->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
+		}
+		// left_cbp: bits 0x1f0 kept as-is, bits 1 and 3 rebuilt from the left MB's cbp
+		if (left_type) {
+			ec->left_cbp = (ec->cbp[mb_x-1] & 0x1f0)
+			|  ((ec->cbp[mb_x-1]>>(left_block[0]&(~1)))&2)
+			| (((ec->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2);
+		} else {
+			ec->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
+		}
+    }
+    /* ref / mvd / direct caches: only for inter MBs, or direct MBs when direct_spatial_mv_pred is set */
+    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
+        int list;
+
+        ec->ref_cache[0][scan8[5 ]+1] = ec->ref_cache[0][scan8[7 ]+1] = ec->ref_cache[0][scan8[13]+1] =
+        ec->ref_cache[1][scan8[5 ]+1] = ec->ref_cache[1][scan8[7 ]+1] = ec->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
+
+        for(list=0; list<s->list_count; list++){
+            if(!USES_LIST(mb_type, list)){
+                continue;
+            }
+            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
+
+            if(USES_LIST(top_type, list)){
+                ec->ref_cache[list][scan8[0] + 0 - 1*8]=
+                ec->ref_cache[list][scan8[0] + 1 - 1*8]= ec->ref_index_top[list][4*mb_x + 2];
+                ec->ref_cache[list][scan8[0] + 2 - 1*8]=
+                ec->ref_cache[list][scan8[0] + 3 - 1*8]= ec->ref_index_top[list][4*mb_x + 3];
+            }else{
+                AV_WN32A(&ec->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
+            }
+
+            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
+                for(i=0; i<2; i++){
+                    int cache_idx = scan8[0] - 1 + i*2*8;
+                    if(USES_LIST(left_type, list)){
+                        const int b8_x= 4*(mb_x-1) + 1;
+                        ec->ref_cache[list][cache_idx  ]= ec->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
+                        ec->ref_cache[list][cache_idx+8]= ec->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
+                    }else{
+                        ec->ref_cache[list][cache_idx  ]=
+                        ec->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
+                    }
+                }
+            }else{
+                if(USES_LIST(left_type, list)){
+                    const int b8_x= 4*(mb_x-1) + 1;
+                    ec->ref_cache[list][scan8[0] - 1]= ec->ref_index[list][b8_x + (left_block[0]&~1)];
+                }else{
+                    ec->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+                }
+            }
+
+            if(USES_LIST(topright_type, list)){
+                ec->ref_cache[list][scan8[0] + 4 - 1*8]= ec->ref_index_top[list][4*(mb_x+1) + 2];
+            }else{
+                ec->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+            }
+            if(ec->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ /* top-right entry is negative: also fill the top-left entry */
+                int topleft_partition= -1; /* constant here, so (topleft_partition & 2) below is always 2 */
+                if(USES_LIST(topleft_type, list)){
+                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
+                    ec->ref_cache[list][scan8[0] - 1 - 1*8]= ec->ref_index_top[list][b8_x];
+                }else{
+                    ec->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+                }
+            }
+
+            if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)))
+                continue;
+            /* NOTE(review): the condition below is always true here because of the continue above */
+            if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) {
+                ec->ref_cache[list][scan8[4 ]] =
+                ec->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
+
+				/* XXX yuck, load mvd */
+				if(USES_LIST(top_type, list)){
+					AV_COPY64(ec->mvd_cache[list][scan8[0] + 0 - 1*8], ec->mvd_top[list][8*mb_x + 0]);
+				}else{
+					AV_ZERO64(ec->mvd_cache[list][scan8[0] + 0 - 1*8]);
+				}
+				if(USES_LIST(left_type, list)){
+					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 0*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[0]]);
+					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 1*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[1]]);
+				}else{
+					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 0*8]);
+					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 1*8]);
+				}
+				if(USES_LIST(left_type, list)){ /* NOTE(review): same test as the previous if; the two could be merged */
+					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 2*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[2]]);
+					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 3*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[3]]);
+				}else{
+					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 2*8]);
+					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 3*8]);
+				}
+				AV_ZERO16(ec->mvd_cache [list][scan8[4 ]]);
+				AV_ZERO16(ec->mvd_cache [list][scan8[12]]);
+				if(s->slice_type_nos == FF_B_TYPE){ /* direct_cache is only filled for B slices */
+					fill_rectangle(&ec->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);
+
+					if(IS_DIRECT(top_type)){
+						AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1));
+					}else if(IS_8X8(top_type)){
+						int b8_x = 4*mb_x;
+						ec->direct_cache[scan8[0] + 0 - 1*8]= ec->direct_top[b8_x + 2];
+						ec->direct_cache[scan8[0] + 2 - 1*8]= ec->direct_top[b8_x + 3];
+					}else{
+						AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1));
+					}
+
+					if(IS_DIRECT(left_type))
+						ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1;
+					else if(IS_8X8(left_type))
+						ec->direct_cache[scan8[0] - 1 + 0*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)];
+					else
+						ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1;
+
+					if(IS_DIRECT(left_type))
+						ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1;
+					else if(IS_8X8(left_type))
+						ec->direct_cache[scan8[0] - 1 + 2*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)];
+					else
+						ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1;
+				}
+            }
+        }
+    }
+    ec->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type); /* 0, 1 or 2: how many of the top/left neighbours are flagged IS_8x8DCT */
+}
+
+static inline void write_back_non_zero_count(EntropyContext *ec, H264Slice *s){
+    H264Mb *mb = ec->m;
+    const int col = mb->mb_x;
+
+    /* entries 0..15 of the MB's own array come from cache rows 1..4, starting at column 4 */
+    AV_COPY32(&mb->non_zero_count[ 0], &ec->non_zero_count_cache[4+8*1]);
+    AV_COPY32(&mb->non_zero_count[ 4], &ec->non_zero_count_cache[4+8*2]);
+    AV_COPY32(&mb->non_zero_count[ 8], &ec->non_zero_count_cache[4+8*3]);
+    AV_COPY32(&mb->non_zero_count[12], &ec->non_zero_count_cache[4+8*4]);
+
+    /* bottom nnz, stored per MB column in ec->non_zero_count */
+    AV_COPY32(&ec->non_zero_count[col][0], &ec->non_zero_count_cache[4+8*4] );
+    ec->non_zero_count[col][4] = ec->non_zero_count_cache[1+8*2];
+    ec->non_zero_count[col][5] = ec->non_zero_count_cache[2+8*2];
+    ec->non_zero_count[col][6] = ec->non_zero_count_cache[1+8*5];
+    ec->non_zero_count[col][7] = ec->non_zero_count_cache[2+8*5];
+
+    /* a single pass fills ec->non_zero_count_left and entries 16..23 of the MB's array */
+    for (int k=0; k<2; k++) {
+        ec->non_zero_count_left[2*k    ] = ec->non_zero_count_cache[7+8*1 + 16*k];
+        ec->non_zero_count_left[2*k + 1] = ec->non_zero_count_cache[7+8*2 + 16*k];
+        ec->non_zero_count_left[2*k + 4] = ec->non_zero_count_cache[2+8*1 + 24*k];
+        ec->non_zero_count_left[2*k + 5] = ec->non_zero_count_cache[2+8*2 + 24*k];
+
+        mb->non_zero_count[16 + 2*k] = ec->non_zero_count_cache[8*1 + 8*k + 1];
+        mb->non_zero_count[17 + 2*k] = ec->non_zero_count_cache[8*1 + 8*k + 2];
+        mb->non_zero_count[20 + 2*k] = ec->non_zero_count_cache[8*4 + 8*k + 1];
+        mb->non_zero_count[21 + 2*k] = ec->non_zero_count_cache[8*4 + 8*k + 2];
+    }
+}
+
+static inline void write_back_motion(EntropyContext *ec, H264Slice *s, int mb_type){
+    H264Mb *mb = ec->m;
+    const int col = mb->mb_x;
+    int list;
+
+    for(list=0; list<s->list_count; list++){
+        uint8_t (*dst)[2];
+        uint8_t (*src)[2];
+        int8_t *ref;
+        if(!USES_LIST(mb_type, list))
+            continue;
+
+        /* mvd of this MB column: cleared for skip MBs; otherwise entries 0..3 take cache
+         * row 3 and entries 4, 5, 6 take column 3 of cache rows 2, 1, 0 */
+        dst = (void *) ec->mvd[list][8*col];
+        src = &ec->mvd_cache[list][scan8[0]];
+        if(!IS_SKIP(mb_type)){
+            AV_COPY64(dst, src + 8*3);
+            AV_COPY16(dst + 3 + 1, src + 3 + 8*2);
+            AV_COPY16(dst + 3 + 2, src + 3 + 8*1);
+            AV_COPY16(dst + 3 + 3, src + 3 + 8*0);
+        }else{
+            AV_ZERO128(dst);
+        }
+        /* reference indices cached at scan8[0], scan8[4], scan8[8] and scan8[12] */
+        ref = &ec->ref_index[list][4*col];
+        ref[0] = ec->ref_cache[list][scan8[0]];
+        ref[1] = ec->ref_cache[list][scan8[4]];
+        ref[2] = ec->ref_cache[list][scan8[8]];
+        ref[3] = ec->ref_cache[list][scan8[12]];
+    }
+
+    /* B slices only: keep sub_mb_type 1..3 (shifted right by one) of 8x8 MBs */
+    if(s->slice_type_nos == FF_B_TYPE && IS_8X8(mb_type)){
+        uint8_t *direct = &ec->direct[4*col];
+        direct[1] = mb->sub_mb_type[1]>>1;
+        direct[2] = mb->sub_mb_type[2]>>1;
+        direct[3] = mb->sub_mb_type[3]>>1;
+    }
+}
+
+/* Returns 1 when none of the four sub_mb_type entries (read as one 64-bit word) has a flag that rules out the 8x8 transform. */
+static inline int get_dct8x8_allowed(EntropyContext *ec, H264Slice *s){
+    uint64_t forbidden = MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8;
+    if (!s->direct_8x8_inference_flag)
+        forbidden |= MB_TYPE_DIRECT2; /* without 8x8 inference a direct sub-block also rules it out */
+    return !(AV_RN64A(ec->m->sub_mb_type) & (forbidden*0x0001000100010001ULL));
+}
+
+/**
+ * Decodes a P_SKIP or B_SKIP macroblock: fills the list-0 reference cache
+ * with 0, writes the motion state back and clears the non-zero counts.
+ */
+static void decode_mb_skip(EntropyContext *ec, H264Slice *s){
+    H264Mb *m = ec->m;
+    const int col = m->mb_x;
+    const int is_b = s->slice_type_nos == FF_B_TYPE;
+    /* the stored mb_type differs between B and non-B slices */
+    const int skip_type = is_b ? (MB_TYPE_16x16|MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP)
+                               : (MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP);
+
+    fill_rectangle(&ec->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
+    write_back_motion(ec, s, skip_type);
+    m->mb_type = ec->mb_type[col] = skip_type;
+    m->qscale_mb_xy = ec->qscale[col] = ec->curr_qscale;
+
+    /* no residual: zero every non-zero-count record of this MB */
+    AV_ZERO64(ec->non_zero_count[col]);
+    AV_ZERO64(ec->non_zero_count_left);
+    memset(m->non_zero_count, 0, 24);
+}
+
+/* Decodes the bins of an intra mb_type. ctx_base is the first cabac_state
+ * index to use; intra_slice is 1 in I slices (neighbour-dependent first bin,
+ * shifted later contexts) and 0 otherwise.
+ * Returns 0 for I4x4, 25 for PCM, else an I16x16 index starting at 1. */
+static int decode_cabac_intra_mb_type(EntropyContext *ec, H264Slice *s, CABACContext *c, int ctx_base, int intra_slice) {
+    const int i16_or_pcm = MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM;
+    uint8_t *state = c->cabac_state + ctx_base;
+    int first_ctx = 0;
+    int type = 1; /* I16x16 */
+
+    /* first bin: context counts I16x16/PCM neighbours in I slices, fixed otherwise */
+    if (intra_slice)
+        first_ctx = !!(ec->left_type & i16_or_pcm) + !!(ec->top_type & i16_or_pcm);
+    if (get_cabac_noinline(c, state + first_ctx) == 0)
+        return 0;   /* I4x4 */
+    if (intra_slice)
+        state += 2;
+
+    if (get_cabac_terminate(c))
+        return 25;  /* PCM */
+
+    /* the remaining bins build the I16x16 index */
+    type += 12 * get_cabac_noinline(c, &state[1]); /* cbp_luma != 0 */
+    if (get_cabac_noinline(c, &state[2])) /* cbp_chroma */
+        type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice]);
+    type += 2 * get_cabac_noinline(c, &state[3+intra_slice]);
+    type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice]);
+    return type;
+}
+
+/* Decodes mb_skip_flag. The context counts the left/top neighbours that
+ * exist (mb_x > 0 / mb_y > 0) and are not skipped; B slices use the context
+ * set 13 entries further on. */
+static int decode_cabac_mb_skip(EntropyContext *ec, H264Slice *s, H264Mb *m, CABACContext *c) {
+    const int left_coded = m->mb_x > 0 && !IS_SKIP(ec->left_type);
+    const int top_coded  = m->mb_y > 0 && !IS_SKIP(ec->top_type);
+    int state_idx = 11 + left_coded + top_coded;
+
+    if (s->slice_type_nos == FF_B_TYPE)
+        state_idx += 13;
+    return get_cabac_noinline(c, &c->cabac_state[state_idx]);
+}
+
+/* Decodes the intra 4x4 prediction-mode syntax: returns -1 when the first
+ * bin (context 68) is set, otherwise a 3-bit value read LSB first with
+ * context 69. */
+static int decode_cabac_mb_intra4x4_pred_mode_delta( CABACContext *c) {
+    int rem_mode = 0;
+
+    if (get_cabac(c, &c->cabac_state[68]))
+        return -1;
+    for (int bit = 0; bit < 3; bit++)
+        rem_mode += (1 << bit) * get_cabac(c, &c->cabac_state[69]);
+    return rem_mode;
+}
+
+/* Decodes intra_chroma_pred_mode (0..3) as a truncated unary code. The first
+ * bin's context counts the neighbours with non-zero left_type/top_type whose
+ * stored chroma prediction mode is non-zero; later bins share context 64+3. */
+static int decode_cabac_mb_chroma_pre_mode(EntropyContext *ec, H264Slice *s, CABACContext *c) {
+    const int col = ec->m->mb_x;
+    uint8_t *state = &c->cabac_state[64];
+    int mode = 0;
+    int inc = 0;
+
+    /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */
+    if (ec->left_type && ec->chroma_pred_mode[col-1] != 0)
+        inc++;
+    if (ec->top_type && ec->chroma_pred_mode_top[col] != 0)
+        inc++;
+
+    if (get_cabac_noinline(c, state + inc) == 0)
+        return mode;
+    /* up to two more bins, stopping at the first zero */
+    for (mode = 1; mode < 3; mode++)
+        if (get_cabac_noinline(c, state + 3) == 0)
+            break;
+    return mode;
+}
+
+/* Decodes the four luma bits of coded_block_pattern, bit 0 first. Each bin's
+ * context adds 1 when the tested bit of the left source is clear and 2 when
+ * the tested bit of the top source is clear; the sources are left_cbp/top_cbp
+ * or the bits decoded so far, exactly as spelled out per line below. */
+static int decode_cabac_mb_cbp_luma(EntropyContext *ec, CABACContext *c) {
+    const int left = ec->left_cbp;
+    const int top  = ec->top_cbp;
+    uint8_t *state = &c->cabac_state[73];
+    int bits = 0;
+
+    bits += get_cabac_noinline(c, state + !(left & 0x02) + 2 * !(top  & 0x04));
+    bits += get_cabac_noinline(c, state + !(bits & 0x01) + 2 * !(top  & 0x08)) << 1;
+    bits += get_cabac_noinline(c, state + !(left & 0x08) + 2 * !(bits & 0x01)) << 2;
+    bits += get_cabac_noinline(c, state + !(bits & 0x04) + 2 * !(bits & 0x02)) << 3;
+    return bits;
+}
+/* Decodes the chroma part of coded_block_pattern: 0, or 1 plus a second bin.
+ * Contexts come from the neighbours' chroma cbp (bits 4..5 of left_cbp and
+ * top_cbp): left adds 1 and top adds 2, testing "> 0" for the first bin and
+ * "== 2" for the second. */
+static int decode_cabac_mb_cbp_chroma(EntropyContext *ec, CABACContext *c) {
+    const int left = (ec->left_cbp >> 4) & 0x03;
+    const int top  = (ec->top_cbp  >> 4) & 0x03;
+    uint8_t *state = &c->cabac_state[77];
+    int inc;
+
+    inc = (left > 0) + 2 * (top > 0);
+    if (get_cabac_noinline(c, state + inc) == 0)
+        return 0;
+
+    /* the second bin uses the next group of four contexts */
+    inc = 4 + (left == 2) + 2 * (top == 2);
+    return 1 + get_cabac_noinline(c, state + inc);
+}
+
+/* Decodes a P sub_mb_type index: 0 = 8x8, 1 = 8x4, 2 = 4x8, 3 = 4x4. */
+static int decode_cabac_p_mb_sub_type( CABACContext *c) {
+    uint8_t *state = &c->cabac_state[21];
+    if (get_cabac(c, state))
+        return 0;
+    if (get_cabac(c, state + 1) == 0)
+        return 1;
+    return get_cabac(c, state + 2) ? 2 : 3;
+}
+/* Decodes a B sub_mb_type index from its bin tree in contexts 36..39. */
+static int decode_cabac_b_mb_sub_type(CABACContext *c) {
+    uint8_t *state = &c->cabac_state[36];
+    int base = 3;
+    if (get_cabac(c, state + 0) == 0)
+        return 0;   /* B_Direct_8x8 */
+    if (get_cabac(c, state + 1) == 0)
+        return 1 + get_cabac(c, state + 3); /* B_L0_8x8, B_L1_8x8 */
+    if (get_cabac(c, state + 2)) {
+        if (get_cabac(c, state + 3))
+            return 11 + get_cabac(c, state + 3); /* B_L1_4x4, B_Bi_4x4 */
+        base = 7;
+    }
+    base += 2 * get_cabac(c, state + 3);
+    return base + get_cabac(c, state + 3);
+}
+
+/* Decodes ref_idx for block n of the given list as a unary code.
+ * The first context counts the left (+1) and top (+2) neighbours whose cached
+ * reference is > 0; in B slices a neighbour whose direct_cache entry has the
+ * (MB_TYPE_DIRECT2>>1) bit set is not counted.
+ * Returns the index, or -1 once 32 one-bins have been read. */
+static int decode_cabac_mb_ref(EntropyContext *ec, H264Slice *s, CABACContext *c, int list, int n ) {
+    const int pos = scan8[n];
+    const int is_b = s->slice_type_nos == FF_B_TYPE;
+    const int direct_bit = MB_TYPE_DIRECT2>>1;
+    int left_set = ec->ref_cache[list][pos - 1] > 0;
+    int top_set  = ec->ref_cache[list][pos - 8] > 0;
+    int inc, ref;
+
+    if (is_b) {
+        left_set = left_set && !(ec->direct_cache[pos - 1] & direct_bit);
+        top_set  = top_set  && !(ec->direct_cache[pos - 8] & direct_bit);
+    }
+    inc = left_set + 2*top_set;
+
+    /* (inc>>2)+4 turns the first context (0..3) into 4 and every later
+     * one into 5 */
+    for (ref = 0; get_cabac(c, &c->cabac_state[54+inc]); inc = (inc>>2)+4) {
+        if (++ref >= 32 /*h->ref_list[list]*/)
+            return -1;
+    }
+    return ref;
+}
+
+/* Decodes one motion vector difference component. amvd picks the first
+ * context: ctxbase+0/1/2 for amvd < 3 / < 33 / larger (the sign-mask trick
+ * relies on an arithmetic right shift of negative ints).
+ * *mvda receives min(|mvd|, 70) and is written on every path, errors included.
+ * Returns the signed MVD, or INT_MIN when the escape prefix is too long. */
+static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) {
+    int mvd = 1;
+    if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){
+        *mvda= 0;
+        return 0;
+    }
+    ctxbase+= 3;
+    while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) {
+        if( mvd < 4 )
+            ctxbase++;
+        mvd++;
+    }
+
+    if( mvd >= 9 ) {
+        int k = 3;
+        while( get_cabac_bypass(c ) ) {
+            mvd += 1 << k;
+            k++;
+            if(k>24){
+                av_log(AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
+                *mvda = 70; /* callers store *mvda unconditionally; never leave it unset */
+                return INT_MIN;
+            }
+        }
+        while( k-- )
+            mvd += get_cabac_bypass(c )<<k;
+    }
+    *mvda = mvd < 70 ? mvd : 70;
+    return get_cabac_bypass_sign(c, -mvd );
+}
+/* Decodes both MVD components of block n into m->mvd[list][mp] and advances mp; uses m, mp, mpx, mpy from the expanding scope. */
+#define DECODE_CABAC_MB_MVD( ec, c, list,  n )\
+{\
+    int amvd0 = ec->mvd_cache[list][scan8[n] - 1][0] +\
+                ec->mvd_cache[list][scan8[n] - 8][0];\
+    int amvd1 = ec->mvd_cache[list][scan8[n] - 1][1] +\
+                ec->mvd_cache[list][scan8[n] - 8][1];\
+    /* amvd0/amvd1: left + top cached values, used to pick the first context; mpx/mpy receive *mvda */\
+    m->mvd[list][mp][0] = decode_cabac_mb_mvd( c, 40, amvd0, &mpx ); \
+    m->mvd[list][mp][1] = decode_cabac_mb_mvd( c, 47, amvd1, &mpy ); \
+    mp++; \
+}
+
+/* Returns the coded_block_flag context offset for block idx of category cat:
+ * 4*cat, plus 1 if the left neighbour value is > 0 and 2 if the top one is.
+ * DC blocks take the neighbour value from left_cbp/top_cbp (bit 8 for cat 0,
+ * bit 6+idx otherwise); other blocks read non_zero_count_cache. */
+static av_always_inline int get_cabac_cbf_ctx(EntropyContext *ec, H264Slice *s, int cat, int idx, int is_dc ) {
+    int left_nz, top_nz;
+    int ctx = 4 * cat;
+
+    if (!is_dc) {
+        assert(cat == 1 || cat == 2 || cat == 4);
+        left_nz = ec->non_zero_count_cache[scan8[idx] - 1];
+        top_nz  = ec->non_zero_count_cache[scan8[idx] - 8];
+    } else if (cat == 0) {
+        left_nz = ec->left_cbp & 0x100;
+        top_nz  = ec->top_cbp  & 0x100;
+    } else {
+        left_nz = (ec->left_cbp >> (6+idx)) & 0x01;
+        top_nz  = (ec->top_cbp  >> (6+idx)) & 0x01;
+    }
+
+    if (left_nz > 0)
+        ctx += 1;
+    if (top_nz > 0)
+        ctx += 2;
+    return ctx;
+}
+/* Per scan position (0..62): offset added to last_coeff_ctx_base when decoding an 8x8 (cat 5) block. */
+DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
+};
+/* First cabac_state index of each context set, indexed [row][cat]; decode_cabac_residual_internal() indexes row 0. */
+static const int significant_coeff_flag_offset[2][6] = {
+    { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
+    { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
+};
+static const int last_coeff_flag_offset[2][6] = { /* same layout, for the "last coefficient" flag */
+    { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
+    { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
+};
+static const int coeff_abs_level_m1_offset[6] = { /* indexed by cat */
+    227+0, 227+10, 227+20, 227+30, 227+39, 426
+};
+static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { /* per scan position of an 8x8 block; row 0 is the one indexed */
+    { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
+    4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
+    7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
+    12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
+    { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
+    6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
+    9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
+    9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
+};
+/* node ctx 0..3: number of level==1 coefficients decoded so far, capped at 3 (no level>1 yet).
+ * node ctx 4..7: number of level>1 coefficients + 3, capped at 7 (the level==1 count no longer matters).
+ * coeff_abs_level1_ctx maps node ctx => cabac ctx of the first bin, which tells level==1 from level>1. */
+static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
+/* map node ctx => cabac ctx for level>1 */
+static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
+static const uint8_t coeff_abs_level_transition[2][8] = {
+    /* update node ctx after decoding a level=1 */
+    { 1, 2, 3, 3, 4, 5, 6, 7 },
+    /* update node ctx after decoding a level>1 */
+    { 4, 4, 4, 4, 5, 6, 7, 7 }
+};
+
+/* Decodes the CABAC residual of one block into block[].
+ * cat picks the context sets, n is the block index, scantable maps coding
+ * order to positions in block[], qmul holds the dequantisation factors (not
+ * read when is_dc is set) and max_coeff is the block's coefficient count.
+ * The escape prefix of large levels is capped so coeff_abs cannot overflow. */
+static av_always_inline void decode_cabac_residual_internal(EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
+    H264Mb *m = ec->m;
+    const int mb_x = m->mb_x;
+    int index[64];
+    int av_unused last;
+    int coeff_count = 0;
+    int node_ctx = 0;
+    uint8_t *significant_coeff_ctx_base;
+    uint8_t *last_coeff_ctx_base;
+    uint8_t *abs_level_m1_ctx_base;
+
+    /* read coded block flag */
+    if( is_dc || cat != 5 ) {
+        if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( ec, s, cat, n, is_dc ) ] ) == 0 ) {
+            if( !is_dc )
+                ec->non_zero_count_cache[scan8[n]] = 0;
+            return;
+        }
+    }
+
+    significant_coeff_ctx_base = c->cabac_state
+        + significant_coeff_flag_offset[0][cat];
+    last_coeff_ctx_base = c->cabac_state
+        + last_coeff_flag_offset[0][cat];
+    abs_level_m1_ctx_base = c->cabac_state
+        + coeff_abs_level_m1_offset[cat];
+    /* collect the coding-order positions of the significant coefficients in index[] */
+    if( !is_dc && cat == 5 ) {
+#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
+        for(last= 0; last < coefs; last++) { \
+            uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
+            if( get_cabac( c, sig_ctx )) { \
+                uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
+                index[coeff_count++] = last; \
+                if( get_cabac( c, last_ctx ) ) { \
+                    last= max_coeff; \
+                    break; \
+                } \
+            } \
+        }\
+        if( last == max_coeff -1 ) {\
+            index[coeff_count++] = last;\
+        }
+
+        const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0];
+        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
+    } else {
+        DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
+    }
+    assert(coeff_count > 0);
+    /* record that this block is coded: cbp bit for DC, coefficient count otherwise */
+    if( is_dc ) {
+        if( cat == 0 )
+            ec->cbp[mb_x] |= 0x100;
+        else
+            ec->cbp[mb_x] |= 0x40 << n;
+    } else {
+        if( cat == 5 )
+            fill_rectangle(&ec->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
+        else {
+            assert( cat == 1 || cat == 2 || cat == 4 );
+            ec->non_zero_count_cache[scan8[n]] = coeff_count;
+        }
+    }
+    /* decode the levels, walking index[] from its last entry back to the first */
+    do {
+        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
+        int j= scantable[index[--coeff_count]];
+
+        if( get_cabac( c, ctx ) == 0 ) {
+            node_ctx = coeff_abs_level_transition[0][node_ctx];
+            if( is_dc ) {
+                block[j] = get_cabac_bypass_sign( c, -1);
+            }else{
+                block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6;
+            }
+        } else {
+            int coeff_abs = 2;
+            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
+            node_ctx = coeff_abs_level_transition[1][node_ctx];
+
+            while( coeff_abs < 15 && get_cabac( c, ctx ) ) {
+                coeff_abs++;
+            }
+            if( coeff_abs >= 15 ) {
+                int prefix_len = 0;
+                while( get_cabac_bypass( c ) && prefix_len < 16+7 ) {
+                    prefix_len++;
+                }
+                /* prefix capped at 16+7 bins (bounds coeff_abs on corrupt data); now read the suffix bits */
+                coeff_abs=1;
+                while( prefix_len-- ) {
+                    coeff_abs += coeff_abs + get_cabac_bypass( c );
+                }
+                coeff_abs+= 14;
+            }
+
+            if( is_dc ) {
+                block[j] = get_cabac_bypass_sign( c, -coeff_abs );
+            }else{
+                block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6;
+            }
+        }
+    } while( coeff_count );
+}
+/* DC variant: forwards to decode_cabac_residual_internal() with is_dc = 1 and a NULL qmul table. */
+static void decode_cabac_residual_dc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
+    decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, NULL, max_coeff, 1);
+}
+/* Non-DC variant: forwards to decode_cabac_residual_internal() with is_dc = 0 and the caller's qmul table. */
+static void decode_cabac_residual_nondc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
+    decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, qmul, max_coeff, 0);
+}
+
+/**
+ * decodes a macroblock
+ * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
+ */
+int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c) {
+    H264Mb *m = ec->m;
+	int mb_x = m->mb_x;
+    int mb_type, partition_count, cbp = 0;
+    int dct8x8_allowed= s->pps.transform_8x8_mode;
+
+    fill_decode_neighbors(ec, s);
+
+    if( s->slice_type_nos != FF_I_TYPE ) {
+        int skip;
+        /* a skipped mb needs the aff flag from the following mb */
+        skip = decode_cabac_mb_skip( ec, s, m, c);
+
+        /* read skip flags */
+        if( skip ) {
+            decode_mb_skip(ec, s);
+            m->cbp = ec->cbp[mb_x] = 0;
+            ec->chroma_pred_mode[mb_x] = 0;
+            ec->last_qscale_diff = 0;
+            return 0;
+        }
+    }
+
+    if( s->slice_type_nos == FF_B_TYPE ) {
+        int ctx = 0;
+
+        if( !IS_DIRECT( ec->left_type-1 ) )
+            ctx++;
+        if( !IS_DIRECT( ec->top_type-1 ) )
+            ctx++;
+
+        if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){
+            mb_type= 0; /* B_Direct_16x16 */
+        }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) {
+            mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */
+        }else{
+            int bits;
+            bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3;
+            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2;
+            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1;
+            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] );
+            if( bits < 8 ){
+                mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
+            }else if( bits == 13 ){
+                mb_type= decode_cabac_intra_mb_type(ec, s, c, 32, 0);
+                goto decode_intra_mb;
+            }else if( bits == 14 ){
+                mb_type= 11; /* B_L1_L0_8x16 */
+            }else if( bits == 15 ){
+                mb_type= 22; /* B_8x8 */
+            }else{
+                bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] );
+                mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
+            }
+        }
+        partition_count= b_mb_type_info[mb_type].partition_count;
+        mb_type=         b_mb_type_info[mb_type].type;
+    } else if( s->slice_type_nos == FF_P_TYPE ) {
+        if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) {
+            /* P-type */
+            if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) {
+                /* P_L0_D16x16, P_8x8 */
+                mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] );
+            } else {
+                /* P_L0_D8x16, P_L0_D16x8 */
+                mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] );
+            }
+            partition_count= p_mb_type_info[mb_type].partition_count;
+            mb_type=         p_mb_type_info[mb_type].type;
+        } else {
+            mb_type= decode_cabac_intra_mb_type(ec, s, c, 17, 0);
+            goto decode_intra_mb;
+        }
+    } else {
+        mb_type= decode_cabac_intra_mb_type(ec, s ,c, 3, 1);
+        if(s->slice_type == FF_SI_TYPE && mb_type)
+            mb_type--;
+        assert(s->slice_type_nos == FF_I_TYPE);
+decode_intra_mb:
+        partition_count = 0;
+        cbp= i_mb_type_info[mb_type].cbp;
+        m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
+        mb_type= i_mb_type_info[mb_type].type;
+    }
+
+    if(IS_INTRA_PCM(mb_type)) {
+        const uint8_t *ptr;
+        // We assume these blocks are very rare so we do not optimize it.
+        // FIXME The two following lines get the bitstream position in the cabac
+        // decode, I think it should be done by a function in cabac.h (or cabac.c).
+        ptr=c->bytestream;
+        if(c->low&0x1) ptr--;
+        if(CABAC_BITS==16){
+            if(c->low&0x1FF) ptr--;
+        }
+		//printf("pcm\n");
+        // The pixels are stored in the same order as levels in h->mb array.
+        memcpy(m->mb, ptr, 256); ptr+=256;
+		memcpy(m->mb+128, ptr, 128); ptr+=128;
+
+        ff_init_cabac_decoder(c, ptr, c->bytestream_end - ptr);
+
+        // All blocks are present
+        m->cbp= ec->cbp[mb_x] = 0x1ef;
+        ec->chroma_pred_mode[mb_x] = 0;
+        // In deblocking, the quantizer is 0
+        m->qscale_mb_xy = ec->qscale[mb_x]= 0;
+        // All coeffs are present
+        memset(ec->non_zero_count[mb_x], 16, 8);
+		m->mb_type = ec->mb_type[mb_x]=  mb_type;
+        ec->last_qscale_diff = 0;
+
+        return 0;
+    }
+
+    fill_decode_caches(ec, s, mb_type);
+
+    int mp = 0;
+    if( IS_INTRA( mb_type ) ) {
+        int i, pred_mode;
+        if( IS_INTRA4x4( mb_type ) ) {
+            if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] ) ) {
+                mb_type |= MB_TYPE_8x8DCT;
+                for( i = 0; i < 16; i+=4 ) {
+                    m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c);
+                }
+            } else {
+                for( i = 0; i < 16; i++ ) {
+                    m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c);
+                }
+            }
+        }
+
+        m->chroma_pred_mode= ec->chroma_pred_mode[mb_x] =
+		pred_mode = decode_cabac_mb_chroma_pre_mode( ec, s, c );
+
+    } else if( partition_count == 4 ) {
+        int i, j, sub_partition_count[4], list;
+
+        if( s->slice_type_nos == FF_B_TYPE ) {
+            for( i = 0; i < 4; i++ ) {
+                m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c );
+                sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
+                m->sub_mb_type[i]=      b_sub_mb_type_info[ m->sub_mb_type[i] ].type;
+            }
+            if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] |
+                          m->sub_mb_type[2] | m->sub_mb_type[3]) ) {
+                ec->ref_cache[0][scan8[4]] =
+                ec->ref_cache[1][scan8[4]] =
+                ec->ref_cache[0][scan8[12]] =
+                ec->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
+
+                for( i = 0; i < 4; i++ )
+                    fill_rectangle( &ec->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 );
+            }
+        } else {
+            for( i = 0; i < 4; i++ ) {
+                m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c );
+                sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
+                m->sub_mb_type[i]=      p_sub_mb_type_info[ m->sub_mb_type[i] ].type;
+            }
+        }
+
+        for( list = 0; list < s->list_count; list++ ) {
+            for( i = 0; i < 4; i++ ) {
+                if(IS_DIRECT(m->sub_mb_type[i])) continue;
+                if(IS_DIR(m->sub_mb_type[i], 0, list)){
+                    if( s->ref_count[list] > 1 ){
+                        m->ref_index[list][i] = decode_cabac_mb_ref(ec, s, c, list, 4*i );
+                        if(m->ref_index[list][i] >= s->ref_count[list]){
+                            av_log(AV_LOG_ERROR, "Reference %d >= %d\n", m->ref_index[list][i], s->ref_count[list]);
+                            return -1;
+                        }
+                    }else
+                        m->ref_index[list][i] = 0;
+                } else {
+                    m->ref_index[list][i] = -1;
+                }
+                ec->ref_cache[list][ scan8[4*i]   ]=ec->ref_cache[list][ scan8[4*i]+1 ]=
+                ec->ref_cache[list][ scan8[4*i]+8 ]=ec->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i];
+            }
+        }
+
+        if(dct8x8_allowed){
+//             assert(0);
+            dct8x8_allowed = get_dct8x8_allowed(ec, s);
+        }
+
+        for(list=0; list<s->list_count; list++){
+            for(i=0; i<4; i++){
+//                 ec->ref_cache[list][ scan8[4*i]   ]=ec->ref_cache[list][ scan8[4*i]+1 ];
+                if(IS_DIRECT(m->sub_mb_type[i])){
+                    fill_rectangle(ec->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2);
+                    continue;
+                }
+
+                if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){
+                    const int sub_mb_type= m->sub_mb_type[i];
+                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
+                    for(j=0; j<sub_partition_count[i]; j++){
+                        int mpx, mpy;
+                        const int index= 4*i + block_width*j;
+                        uint8_t (* mvd_cache)[2]= &ec->mvd_cache[list][ scan8[index]];
+
+                        DECODE_CABAC_MB_MVD( ec, c, list, index)
+
+                        if(IS_SUB_8X8(sub_mb_type)){
+                            mvd_cache[ 1 ][0]=
+                            mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx;
+                            mvd_cache[ 1 ][1]=
+                            mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy;
+                        }else if(IS_SUB_8X4(sub_mb_type)){
+                            mvd_cache[ 1 ][0]=  mpx;
+                            mvd_cache[ 1 ][1]= mpy;
+                        }else if(IS_SUB_4X8(sub_mb_type)){
+                            mvd_cache[ 8 ][0]= mpx;
+                            mvd_cache[ 8 ][1]= mpy;
+                        }
+                        mvd_cache[ 0 ][0]= mpx;
+                        mvd_cache[ 0 ][1]= mpy;
+                    }
+                }else{
+                    fill_rectangle(ec->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2);
+                }
+            }
+        }
+    } else if( IS_DIRECT(mb_type) ) {
+        mb_type |= MB_TYPE_16x16;
+        fill_rectangle(ec->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2);
+        fill_rectangle(ec->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2);
+        dct8x8_allowed &= s->direct_8x8_inference_flag;
+    } else {
+        int list, i;
+        if(IS_16X16(mb_type)){
+            for(list=0; list<s->list_count; list++){
+                if(IS_DIR(mb_type, 0, list)){
+                    int ref;
+                    if(s->ref_count[list] > 1){
+                        ref= decode_cabac_mb_ref(ec, s, c, list, 0);
+                        if(ref >= s->ref_count[list]){
+                            av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]);
+                            return -1;
+                        }
+                    }else
+                        ref=0;
+                    m->ref_index[list][0]= ref;
+                    fill_rectangle(&ec->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
+                }
+            }
+            for(list=0; list<s->list_count; list++){
+                if(IS_DIR(mb_type, 0, list)){
+                    int mpx,mpy;
+                    DECODE_CABAC_MB_MVD( ec, c, list, 0)
+
+                    fill_rectangle(ec->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2);
+                }
+
+            }
+        }
+        else if(IS_16X8(mb_type)){
+            for(list=0; list<s->list_count; list++){
+                for(i=0; i<2; i++){
+                    if(IS_DIR(mb_type, i, list)){
+                        int ref;
+                        if(s->ref_count[list] > 1){
+                            ref= decode_cabac_mb_ref(ec, s, c, list, 8*i );
+                            if(ref >= s->ref_count[list]){
+                                av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]);
+                                return -1;
+                            }
+                        }else
+                            ref=0;
+                        m->ref_index[list][i]= ref;
+                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
+                    }else{
+                        m->ref_index[list][i]= LIST_NOT_USED;
+                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
+                    }
+                }
+            }
+            for(list=0; list<s->list_count; list++){
+                for(i=0; i<2; i++){
+                    if(IS_DIR(mb_type, i, list)){
+                        int mpx,mpy;
+                        DECODE_CABAC_MB_MVD( ec, c, list, 8*i)
+
+                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2);
+                    }else{
+                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2);
+                    }
+                }
+            }
+        }else{
+            assert(IS_8X16(mb_type));
+            for(list=0; list<s->list_count; list++){
+                for(i=0; i<2; i++){
+                    if(IS_DIR(mb_type, i, list)){ //FIXME optimize
+                        int ref;
+                        if(s->ref_count[list] > 1){
+                            ref= decode_cabac_mb_ref(ec, s, c, list, 4*i );
+                            if(ref >= s->ref_count[list]){
+                                av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]);
+                                return -1;
+                            }
+                        }else
+                            ref=0;
+                        m->ref_index[list][i]= ref;
+                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
+                    }else{
+                        m->ref_index[list][i]= LIST_NOT_USED;
+                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
+                    }
+                }
+            }
+            for(list=0; list<s->list_count; list++){
+                for(i=0; i<2; i++){
+                    if(IS_DIR(mb_type, i, list)){
+                        int mpx,mpy;
+                        DECODE_CABAC_MB_MVD( ec, c, list, 4*i)
+
+                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2);
+                    }else{
+                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2);
+                    }
+                }
+            }
+        }
+    }
+
+    if( IS_INTER( mb_type ) ||(IS_DIRECT(mb_type))) {
+        ec->chroma_pred_mode[mb_x] = 0;
+        write_back_motion( ec, s, mb_type );
+    }
+
+    if( !IS_INTRA16x16( mb_type ) ) {
+        cbp  = decode_cabac_mb_cbp_luma( ec, c);
+		cbp |= decode_cabac_mb_cbp_chroma( ec, c ) << 4;
+    }
+
+    ec->cbp[mb_x] = m->cbp = cbp;
+
+    if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
+        int t = get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] );
+        mb_type |= MB_TYPE_8x8DCT * t;
+    }
+    m->mb_type = ec->mb_type[mb_x] = mb_type;
+
+    if( cbp || IS_INTRA16x16( mb_type ) ) {
+        const uint8_t *scan, *scan8x8, *dc_scan;
+        const uint32_t *qmul;
+
+
+        if (s->transform_bypass && ec->curr_qscale){
+            scan8x8= ff_zigzag_direct;
+            scan= zigzag_scan;
+        }else{
+            scan8x8= ec->zigzag_scan8x8;
+            scan= ec->zigzag_scan;
+        }
+        dc_scan= luma_dc_zigzag_scan;
+
+        // decode_cabac_mb_dqp
+        if(get_cabac_noinline(c, &c->cabac_state[60 + (ec->last_qscale_diff != 0)])){
+            int val = 1;
+            int ctx= 2;
+
+            while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) {
+                ctx= 3;
+                val++;
+                if(val > 102){ //prevent infinite loop
+                    av_log(AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", m->mb_x, m->mb_y);
+                    return -1;
+                }
+            }
+
+            if( val&0x01 )
+                val=   (val + 1)>>1 ;
+            else
+                val= -((val + 1)>>1);
+            ec->last_qscale_diff = val;
+            ec->curr_qscale += val;
+            if(((unsigned)ec->curr_qscale) > 51){
+                if(ec->curr_qscale<0) ec->curr_qscale+= 52;
+                else            ec->curr_qscale-= 52;
+            }
+            ec->chroma_qp[0] = get_chroma_qp( s, 0, ec->curr_qscale);
+            ec->chroma_qp[1] = get_chroma_qp( s, 1, ec->curr_qscale);
+        }else
+            ec->last_qscale_diff=0;
+
+        memset(m->mb, 0, 16*16 * sizeof(DCTELEM));
+        if( IS_INTRA16x16( mb_type ) ) {
+            int i;
+
+            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
+            decode_cabac_residual_dc( ec, s, c, m->mb, 0, 0, dc_scan, 16);
+            qmul = ec->dequant4_coeff[0][ec->curr_qscale];
+            if( cbp&15 ) {
+                for( i = 0; i < 16; i++ ) {
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
+                    decode_cabac_residual_nondc( ec, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15);
+                }
+            } else {
+                fill_rectangle(&ec->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
+            }
+            h264_luma_dc_dequant_idct_c(m->mb, qmul[0]);
+        } else {
+
+            int i8x8, i4x4;
+            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
+                if( cbp & (1<<i8x8) ) {
+                    if( IS_8x8DCT(mb_type) ) {
+                        decode_cabac_residual_nondc(ec, s, c, m->mb + 64*i8x8, 5, 4*i8x8,
+                            scan8x8, ec->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][ec->curr_qscale], 64);
+                    } else {
+                        qmul = ec->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][ec->curr_qscale];
+                        for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
+                            const int index = 4*i8x8 + i4x4;
+                            //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
+//START_TIMER
+                            decode_cabac_residual_nondc(ec, s, c, m->mb + 16*index, 2, index, scan, qmul, 16);
+//STOP_TIMER("decode_residual")
+                        }
+                    }
+                } else {
+                    uint8_t * const nnz= &ec->non_zero_count_cache[ scan8[4*i8x8] ];
+                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+                }
+            }
+        }
+
+        if( cbp&0x30 ){
+            memset(m->mb + 256, 0, 2*64 * sizeof(DCTELEM));
+            for( int i = 0; i < 2; i++ ) {
+                const uint32_t dequant4_coeff = ec->dequant4_coeff[IS_INTRA(mb_type) ? 1+i:4+i][ec->chroma_qp[i]][0];
+
+                //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
+                decode_cabac_residual_dc(ec, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4);
+                chroma_dc_dequant_idct_c(m->mb + 256 + 16*4*i, dequant4_coeff);
+            }
+        }
+
+        if( cbp&0x20 ) {
+            int i, j;
+            for( i = 0; i < 2; i++ ) {
+                qmul = ec->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 0:3)][ec->chroma_qp[i]];
+                for( j = 0; j < 4; j++ ) {
+                    const int index = 16 + 4 * i + j;
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
+                    decode_cabac_residual_nondc( ec, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15);
+                }
+            }
+        } else {
+            uint8_t * const nnz= &ec->non_zero_count_cache[0];
+            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        }
+
+    } else {
+        uint8_t * const nnz= &ec->non_zero_count_cache[0];
+        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
+        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        ec->last_qscale_diff = 0;
+    }
+
+    m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale;
+    write_back_non_zero_count(ec, s);
+
+
+    return 0;
+}
+
+/* Frees the row tables allocated by get_entropy_context() and then the context itself. */
+void free_entropy_context(EntropyContext *ec){
+    if (!ec) return; // fix: accept NULL like av_free() does instead of dereferencing it
+    av_freep(&ec->non_zero_count_row[0]);
+    av_freep(&ec->non_zero_count_row[1]);
+    av_freep(&ec->mvd_table[0][0]);
+    av_freep(&ec->mvd_table[0][1]);
+    av_freep(&ec->mvd_table[1][0]);
+    av_freep(&ec->mvd_table[1][1]);
+
+    av_freep(&ec->direct_table[0]);
+    av_freep(&ec->direct_table[1]);
+    av_freep(&ec->chroma_pred_mode_table[0]);
+    av_freep(&ec->chroma_pred_mode_table[1]);
+    av_freep(&ec->cbp_table[0]);
+    av_freep(&ec->cbp_table[1]);
+    av_freep(&ec->qscale_table[0]);
+    av_freep(&ec->qscale_table[1]);
+
+    av_freep(&ec->mb_type_table[0]);
+    av_freep(&ec->mb_type_table[1]);
+    av_freep(&ec->ref_index_table[0][0]);
+    av_freep(&ec->ref_index_table[0][1]);
+    av_freep(&ec->ref_index_table[1][0]);
+    av_freep(&ec->ref_index_table[1][1]);
+    av_free(ec);
+}
+
+/* Allocates a zeroed EntropyContext, copies the macroblock geometry from h and allocates its row-table pairs; returns NULL on failure. */
+EntropyContext *get_entropy_context(H264Context *h){
+    const int mb_height = h->mb_height;
+    const int mb_width  = h->mb_width;
+    const int mb_stride = h->mb_stride;
+    EntropyContext *ec = av_mallocz(sizeof(EntropyContext));
+    if (!ec) return NULL; // fix: the context was dereferenced below without checking the allocation
+    ec->mb_width = mb_width;
+    ec->mb_height = mb_height;
+    ec->b_stride  = mb_width*4;
+    ec->mb_stride = mb_stride;
+
+    FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[0], mb_stride * 8 * sizeof(uint8_t), fail)
+    FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[1], mb_stride * 8 * sizeof(uint8_t), fail)
+
+    FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][0], 16*mb_stride * sizeof(uint8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][1], 16*mb_stride * sizeof(uint8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][0], 16*mb_stride * sizeof(uint8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][1], 16*mb_stride * sizeof(uint8_t), fail);
+
+    FF_ALLOCZ_OR_GOTO(ec->direct_table[0], 4*mb_stride * sizeof(uint8_t) , fail);
+    FF_ALLOCZ_OR_GOTO(ec->direct_table[1], 4*mb_stride * sizeof(uint8_t) , fail);
+
+    FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[0], mb_stride * sizeof(uint8_t), fail)
+    FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[1], mb_stride * sizeof(uint8_t), fail)
+
+    FF_ALLOCZ_OR_GOTO(ec->cbp_table[0], mb_stride * sizeof(uint16_t), fail)
+    FF_ALLOCZ_OR_GOTO(ec->cbp_table[1], mb_stride * sizeof(uint16_t), fail)
+
+    FF_ALLOCZ_OR_GOTO(ec->qscale_table[0], mb_stride * sizeof(uint8_t) , fail)
+    FF_ALLOCZ_OR_GOTO(ec->qscale_table[1], mb_stride * sizeof(uint8_t) , fail)
+
+    FF_ALLOCZ_OR_GOTO(ec->mb_type_table[0] , (mb_stride+1) * sizeof(uint32_t), fail)
+    FF_ALLOCZ_OR_GOTO(ec->mb_type_table[1] , (mb_stride+1) * sizeof(uint32_t), fail)
+
+    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][0], 4*mb_stride * sizeof(int8_t), fail)
+    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][0], 4*mb_stride * sizeof(int8_t), fail)
+    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][1], 4*mb_stride * sizeof(int8_t), fail)
+    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][1], 4*mb_stride * sizeof(int8_t), fail)
+
+    ec->zigzag_scan = h->zigzag_scan; // the scan tables are borrowed from h, not copied
+    ec->zigzag_scan8x8 = h->zigzag_scan8x8;
+
+    return ec;
+fail: // target of FF_ALLOCZ_OR_GOTO (macro defined elsewhere; presumably taken when an allocation fails)
+    free_entropy_context(ec);
+    return NULL;
+}
+
+/* Points the current/top row views of ec at the two alternating row tables for row `line`; always returns 0 (fix: return type now matches the int prototype in h264_entropy.h). */
+int init_entropy_buf(EntropyContext *ec, H264Slice *s, int line){
+    int top = (line+1)%2;
+    int cur = line%2; // assumes line >= 0; a negative line would yield a negative table index
+    ec->non_zero_count_top      = ec->non_zero_count_row[top];
+    ec->non_zero_count          = ec->non_zero_count_row[cur];
+    ec->mvd_top[0]              = ec->mvd_table[0][top];
+    ec->mvd[0]                  = ec->mvd_table[0][cur];
+    ec->mvd_top[1]              = ec->mvd_table[1][top];
+    ec->mvd[1]                  = ec->mvd_table[1][cur];
+    ec->direct_top              = ec->direct_table[top];
+    ec->direct                  = ec->direct_table[cur];
+    ec->chroma_pred_mode_top    = ec->chroma_pred_mode_table[top];
+    ec->chroma_pred_mode        = ec->chroma_pred_mode_table[cur];
+    ec->cbp_top                 = ec->cbp_table[top];
+    ec->cbp                     = ec->cbp_table[cur];
+    ec->qscale_top              = ec->qscale_table[top] +1;
+    ec->qscale                  = ec->qscale_table[cur] +1;
+    ec->mb_type_top             = ec->mb_type_table[top]+1;
+    ec->mb_type                 = ec->mb_type_table[cur]+1;
+    ec->ref_index_top[0]        = ec->ref_index_table[0][top];
+    ec->ref_index_top[1]        = ec->ref_index_table[1][top];
+    ec->ref_index[0]            = ec->ref_index_table[0][cur];
+    ec->ref_index[1]            = ec->ref_index_table[1][cur];
+    return 0;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_entropy.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_entropy.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,20 @@
+#ifndef H264_CABAC_H // NOTE(review): the guard is named after CABAC although this file is h264_entropy.h
+#define H264_CABAC_H
+
+#include "h264_types.h"
+#include "cabac.h"
+
+/**
+ * Decodes a CABAC coded macroblock.
+ * @return 0 if OK, an error value otherwise (NOTE(review): AC_ERROR / DC_ERROR / MV_ERROR are not declared in this header -- confirm the actual codes)
+ */
+
+int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c);
+void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c);
+
+int init_entropy_buf(EntropyContext *ec, H264Slice *s, int line); // selects the current/top row tables of ec for row `line`
+EntropyContext * get_entropy_context(H264Context *h); // allocates a context sized from h; NULL on allocation failure
+void init_dequant_tables(H264Slice *s, EntropyContext *ec);
+void free_entropy_context(EntropyContext *ec); // frees the context together with all of its row tables
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_idct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_idct.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,270 @@
+/*
+ * H.264 IDCT
+ * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 IDCT.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "dsputil.h"
+#include "h264_data.h"
+
+/* 4x4 inverse transform: rows are transformed in place inside block, then each column is shifted right by `shift`, added to add*dst and stored through the ff_cropTbl lookup (presumably the pixel clamp table). */
+static av_always_inline void idct_internal(uint8_t *dst, DCTELEM *block, int stride, int block_stride, int shift, int add){
+    uint8_t *clip = ff_cropTbl + MAX_NEG_CROP;
+    int k;
+    block[0] += 1<<(shift-1); // half of 1<<shift, so the final >>shift rounds instead of truncating
+    for(k=0; k<4; k++){ // pass 1: one row of four coefficients per iteration
+        DCTELEM *row = block + block_stride*k;
+        const int even_sum  =  row[0]     +  row[2];
+        const int even_diff =  row[0]     -  row[2];
+        const int odd_diff  = (row[1]>>1) -  row[3];
+        const int odd_sum   =  row[1]     + (row[3]>>1);
+        row[0]= even_sum  + odd_sum;
+        row[1]= even_diff + odd_diff;
+        row[2]= even_diff - odd_diff;
+        row[3]= even_sum  - odd_sum;
+    }
+
+    for(k=0; k<4; k++){ // pass 2: one column per iteration, written to the destination pixels
+        const DCTELEM *col = block + k;
+        uint8_t *out = dst + k;
+        const int even_sum  =  col[block_stride*0]     +  col[block_stride*2];
+        const int even_diff =  col[block_stride*0]     -  col[block_stride*2];
+        const int odd_diff  = (col[block_stride*1]>>1) -  col[block_stride*3];
+        const int odd_sum   =  col[block_stride*1]     + (col[block_stride*3]>>1);
+        out[0*stride]= clip[ add*out[0*stride] + ((even_sum  + odd_sum ) >> shift) ];
+        out[1*stride]= clip[ add*out[1*stride] + ((even_diff + odd_diff) >> shift) ];
+        out[2*stride]= clip[ add*out[2*stride] + ((even_diff - odd_diff) >> shift) ];
+        out[3*stride]= clip[ add*out[3*stride] + ((even_sum  - odd_sum ) >> shift) ];
+    }
+}
+
+void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride){ // 4x4 inverse transform of block, added onto dst
+    idct_internal(dst, block, stride, 4, 6, 1); // coefficient row stride 4, final shift 6, add=1 keeps the existing pixels
+}
+
+void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block){ // note: stride comes before block in this signature
+    idct_internal(dst, block, stride, 8, 3, 1); // coefficient row stride 8, final shift 3, result added onto dst
+}
+
+void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block){ // note: stride comes before block in this signature
+    idct_internal(dst, block, stride, 8, 3, 0); // add=0: the previous dst contents are multiplied by 0, i.e. overwritten
+}
+
+void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){ // 8x8 inverse transform of block (modified in place), result added onto dst
+    int i;
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+
+    block[0] += 32; // half of 1<<6, so the final >>6 rounds instead of truncating
+
+    for( i = 0; i < 8; i++ ) // pass 1: each row of eight coefficients is transformed in place
+    {
+        const int a0 =  block[0+i*8] + block[4+i*8];
+        const int a2 =  block[0+i*8] - block[4+i*8];
+        const int a4 = (block[2+i*8]>>1) - block[6+i*8];
+        const int a6 = (block[6+i*8]>>1) + block[2+i*8];
+
+        const int b0 = a0 + a6;
+        const int b2 = a2 + a4;
+        const int b4 = a2 - a4;
+        const int b6 = a0 - a6;
+
+        const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1);
+        const int a3 =  block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
+        const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
+        const int a7 =  block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
+
+        const int b1 = (a7>>2) + a1;
+        const int b3 =  a3 + (a5>>2);
+        const int b5 = (a3>>2) - a5;
+        const int b7 =  a7 - (a1>>2);
+
+        block[0+i*8] = b0 + b7;
+        block[7+i*8] = b0 - b7;
+        block[1+i*8] = b2 + b5;
+        block[6+i*8] = b2 - b5;
+        block[2+i*8] = b4 + b3;
+        block[5+i*8] = b4 - b3;
+        block[3+i*8] = b6 + b1;
+        block[4+i*8] = b6 - b1;
+    }
+    for( i = 0; i < 8; i++ ) // pass 2: each column is transformed, shifted by 6, added to dst and looked up through cm
+    {
+        const int a0 =  block[i+0*8] + block[i+4*8];
+        const int a2 =  block[i+0*8] - block[i+4*8];
+        const int a4 = (block[i+2*8]>>1) - block[i+6*8];
+        const int a6 = (block[i+6*8]>>1) + block[i+2*8];
+
+        const int b0 = a0 + a6;
+        const int b2 = a2 + a4;
+        const int b4 = a2 - a4;
+        const int b6 = a0 - a6;
+
+        const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
+        const int a3 =  block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
+        const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
+        const int a7 =  block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
+
+        const int b1 = (a7>>2) + a1;
+        const int b3 =  a3 + (a5>>2);
+        const int b5 = (a3>>2) - a5;
+        const int b7 =  a7 - (a1>>2);
+
+        dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ];
+        dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ];
+        dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ];
+        dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ];
+        dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ];
+        dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ];
+        dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ];
+        dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
+    }
+}
+
+// DC-only 4x4 add: assumes all AC coefs are 0, so one rounded DC term is added to each of the 16 pixels
+void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
+    const uint8_t *const clip = ff_cropTbl + MAX_NEG_CROP;
+    const int offset = (block[0] + 32) >> 6;
+    int row;
+
+    for( row = 0; row < 4; row++, dst += stride ){
+        uint8_t *px = dst;
+        const uint8_t *const row_end = dst + 4;
+        while( px < row_end ) { *px = clip[ *px + offset ]; px++; }
+    }
+}
+
+void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ // DC-only 8x8 add; only block[0] is read
+    const uint8_t *const clip = ff_cropTbl + MAX_NEG_CROP;
+    const int offset = (block[0] + 32) >> 6; // rounded DC term shared by all 64 pixels
+    int row, col;
+
+    for( row = 0; row < 8; row++, dst += stride ){
+        for( col = 0; col < 8; col++ ){
+            dst[col] = clip[ dst[col] + offset ];
+        }
+    }
+}
+
+void ff_h264_idct_add16_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ // adds 4x4 blocks 0..15 onto dst
+    int blk;
+    for(blk=0; blk<16; blk++){
+        const int count = nnzc[ scan8[blk] ];
+        DCTELEM *const coef = block + blk*16;
+        if(!count) continue; // a zero entry in nnzc means the block is skipped entirely
+        if(count==1 && coef[0]) ff_h264_idct_dc_add_c(dst + block_offset[blk], coef, stride); // count of 1 with a non-zero first coefficient: DC shortcut
+        else                    idct_internal        (dst + block_offset[blk], coef, stride, 4, 6, 1);
+    }
+}
+
+void ff_h264_idct_add16intra_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ // blocks 0..15; a zero nnzc entry still gets the DC add when block[0] is set
+    int blk = 0;
+    for(; blk<16; blk++, block += 16){ // block advances one 16-coefficient 4x4 block per iteration
+        if(nnzc[ scan8[blk] ]) idct_internal        (dst + block_offset[blk], block, stride, 4, 6, 1);
+        else if(block[0])      ff_h264_idct_dc_add_c(dst + block_offset[blk], block, stride);
+    }
+}
+
+void ff_h264_idct8_add4_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ // adds the four 8x8 blocks, addressed by 4x4 index 0, 4, 8, 12
+    int quad;
+    for(quad=0; quad<4; quad++){
+        const int first = 4*quad;
+        const int count = nnzc[ scan8[first] ];
+        if(!count) continue; // a zero entry in nnzc means the 8x8 block is skipped
+        if(count==1 && block[first*16]) ff_h264_idct8_dc_add_c(dst + block_offset[first], block + first*16, stride);
+        else                            ff_h264_idct8_add_c   (dst + block_offset[first], block + first*16, stride);
+    }
+}
+
+void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ // adds 4x4 blocks 16..23 onto the two planes in dest
+    int i;
+    for(i=16; i<16+8; i++){
+        const int plane = (i>>2)&1; // blocks 16..19 go to dest[0], blocks 20..23 to dest[1]
+        DCTELEM *const coef = block + i*16;
+        if(nnzc[ scan8[i] ]) ff_h264_idct_add_c   (dest[plane] + block_offset[i], coef, stride);
+        else if(coef[0])     ff_h264_idct_dc_add_c(dest[plane] + block_offset[i], coef, stride);
+    }
+}
+
+/**
+* IDCT transforms the 16 dc values and dequantizes them, in place.
+* @param block coefficient buffer; only the 16 slots block[16*k], k = 0..15, are read and written
+* @param qmul dequantization multiplier; each result is stored as (value*qmul + 128) >> 8
+*/
+void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul){
+	#define stride 16
+	int i;
+	int temp[16]; //FIXME check if this is a good idea
+	static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
+	static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
+	// first pass: combine the four DC values selected by each y_offset into temp
+	for(i=0; i<4; i++){
+		const int offset= y_offset[i];
+		const int z0= block[offset+stride*0] + block[offset+stride*4];
+		const int z1= block[offset+stride*0] - block[offset+stride*4];
+		const int z2= block[offset+stride*1] - block[offset+stride*5];
+		const int z3= block[offset+stride*1] + block[offset+stride*5];
+
+		temp[4*i+0]= z0+z3;
+		temp[4*i+1]= z1+z2;
+		temp[4*i+2]= z1-z2;
+		temp[4*i+3]= z0-z3;
+	}
+
+	for(i=0; i<4; i++){ // second pass: combine across temp, scale by qmul and write back to block
+		const int offset= x_offset[i];
+		const int z0= temp[4*0+i] + temp[4*2+i];
+		const int z1= temp[4*0+i] - temp[4*2+i];
+		const int z2= temp[4*1+i] - temp[4*3+i];
+		const int z3= temp[4*1+i] + temp[4*3+i];
+
+		block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
+		block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
+		block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
+		block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
+	}
+}
+
+#undef xStride
+#undef stride
+
+/* 2x2 butterfly over the four DC slots block[0], block[16], block[32], block[48], in place; each result is (value*qmul) >> 7. */
+void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul){
+	const int col = 16;   // distance between the two slots of one row
+	const int row = 16*2; // distance between the two rows
+
+	const int tl = block[0];
+	const int tr = block[col];
+	const int bl = block[row];
+	const int br = block[row + col];
+
+	// sums and differences of each row
+	const int top_sum  = tl + tr;
+	const int top_diff = tl - tr;
+	const int bot_sum  = bl + br;
+	const int bot_diff = bl - br;
+	block[0]        = ((top_sum  + bot_sum )*qmul) >> 7;
+	block[col]      = ((top_diff + bot_diff)*qmul) >> 7;
+	block[row]      = ((top_sum  - bot_sum )*qmul) >> 7;
+	block[row + col]= ((top_diff - bot_diff)*qmul) >> 7;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_idct.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_idct.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,19 @@
+#ifndef H264_IDCT_H
+#define H264_IDCT_H
+
+#include "avcodec.h"
+
+void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride); // 8x8 inverse transform added onto dst; block is modified
+void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride); // 4x4 inverse transform added onto dst; block is modified
+void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); // 8x8 add using only block[0]
+void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); // 4x4 add using only block[0]
+void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); // note the argument order: stride before block
+void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); // as above, but overwrites dst
+void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); // 4x4 blocks 0..15
+void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); // 4x4 blocks 0..15
+void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); // 8x8 blocks at index 0, 4, 8, 12
+void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); // 4x4 blocks 16..23 onto dest[0] and dest[1]
+void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul); // in-place transform and scaling of the 16 DC slots block[16*k]
+void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul); // in-place transform and scaling of block[0], [16], [32], [48]
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_mc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_mc.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,272 @@
+#include "h264_types.h"
+#include "h264_data.h"
+
+static inline void mc_dir_part(MBRecContext *d, MBRecState *mrs, H264Mb *m, DecodedPicture *pic, int n, int square,
+							   int chroma_height, int delta, int list,uint8_t *dest_y,
+							   uint8_t *dest_cb, uint8_t *dest_cr, int src_x_offset, int src_y_offset,
+							   qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){ // one-direction prediction of partition n from pic into dest_y/dest_cb/dest_cr
+	const int mx= mrs->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; // cached motion vector plus the partition position, same units
+	const int my= mrs->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
+	const int luma_xy= (mx&3) + ((my&3)<<2); // the two low bits of mx and my select the qpix_op entry
+	const int pic_width  = 16*d->mb_width;
+	const int pic_height = 16*d->mb_height;
+
+	uint8_t *src_y, *src_cb, *src_cr;
+	int ymx= mx>>2; // luma source position (mx/my without their two low bits)
+	int ymy= my>>2;
+	int cmy= my>>3; // chroma source position (mx/my without their three low bits)
+	int cmx= mx>>3;
+
+	//truncate the motion vectors references
+	if(ymy>= pic_height+2){
+		ymy=pic_height+1;
+	}else if(ymy <=-19){
+		ymy=-18;
+	}
+	if(ymx>= pic_width+2){
+		ymx= pic_width+1;
+	}else if(ymx<=-19){
+		ymx=-19; // NOTE(review): the vertical clamp above yields -18 for the same threshold; confirm whether -19 is intended here
+	}
+
+	src_y = pic->data[0] + ymx + ymy*d->linesize;
+	qpix_op[luma_xy](dest_y, src_y, d->linesize); //FIXME try variable height perhaps?
+	if(!square){ // non-square partition: a second luma call covers the other half, `delta` away
+		qpix_op[luma_xy](dest_y + delta, src_y + delta, d->linesize);
+	}
+
+	if(cmy >= pic_height>>1){ // same truncation for the chroma position, against half the picture size
+		cmy = (pic_height>>1) -1;
+	}else if(cmy<=-9){
+		cmy=-8;
+	}
+	if(cmx >= pic_width>>1){
+		cmx = (pic_width>>1) -1;
+	}else if(cmx<=-9){
+		cmx=-8;
+	}
+
+	src_cb= pic->data[1] + cmx + cmy*d->uvlinesize;
+	src_cr= pic->data[2] + cmx + cmy*d->uvlinesize;
+
+	chroma_op(dest_cb, src_cb, d->uvlinesize, chroma_height, mx&7, my&7); // the three low bits of mx/my are passed on as the chroma fraction
+	chroma_op(dest_cr, src_cr, d->uvlinesize, chroma_height, mx&7, my&7);
+}
+
+/*
+ * Unweighted motion compensation of partition n.
+ * x_offset/y_offset are applied once to the chroma pointers and doubled for luma;
+ * adding 8 per macroblock (m->mb_x, m->mb_y) then gives the source offsets handed to mc_dir_part().
+ * A list0 prediction is produced with the put functions; a list1 prediction uses
+ * the avg functions if list0 was produced first, otherwise the put functions.
+ */
+static inline void mc_part_std(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta,
+								uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+								int x_offset, int y_offset,
+								qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+								qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+								int list0, int list1){
+	uint8_t *const out_y  = dest_y  + 2*x_offset + 2*y_offset*d->  linesize;
+	uint8_t *const out_cb = dest_cb +   x_offset +   y_offset*d->uvlinesize;
+	uint8_t *const out_cr = dest_cr +   x_offset +   y_offset*d->uvlinesize;
+	const int src_x = x_offset + 8*m->mb_x;
+	const int src_y = y_offset + 8*m->mb_y;
+	const int cache_pos = scan8[n];
+	if(list0){
+		mc_dir_part(d, mrs, m, s->dp_ref_list[0][ mrs->ref_cache[0][cache_pos] ], n, square, chroma_height, delta, 0,
+					out_y, out_cb, out_cr, src_x, src_y, qpix_put, chroma_put);
+	}
+	if(list1){
+		mc_dir_part(d, mrs, m, s->dp_ref_list[1][ mrs->ref_cache[1][cache_pos] ], n, square, chroma_height, delta, 1,
+					out_y, out_cb, out_cr, src_x, src_y,
+					list0 ? qpix_avg : qpix_put, list0 ? chroma_avg : chroma_put);
+	}
+}
+
+static inline void mc_part_weighted(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta,
+									uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+									int x_offset, int y_offset,
+									qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+									h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
+									h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
+									int list0, int list1){ // weighted motion compensation of partition n
+	dest_y  += 2*x_offset + 2*y_offset*d->  linesize; // offsets apply once to chroma and doubled to luma
+	dest_cb +=   x_offset +   y_offset*d->uvlinesize;
+	dest_cr +=   x_offset +   y_offset*d->uvlinesize;
+	x_offset += 8*m->mb_x; // from here on the offsets include the macroblock position
+	y_offset += 8*m->mb_y;
+
+	if(list0 && list1){
+		/* don't optimize for luma-only case, since B-frames usually
+		* use implicit weights => chroma too. */
+		uint8_t *tmp_y  = d->scratchpad_y  + 2*x_offset +16 ; // the list1 prediction goes to the scratchpads, indexed by the x position
+		uint8_t *tmp_cb = d->scratchpad_cb + x_offset + 8;
+		uint8_t *tmp_cr = d->scratchpad_cr + x_offset + 8;
+
+/*
+		uint8_t *tmp_cb = d->scratchpad;
+		uint8_t *tmp_cr = d->scratchpad + 8;
+		uint8_t *tmp_y  = d->scratchpad + 8*d->uvlinesize;*/
+		int refn0 = mrs->ref_cache[0][ scan8[n] ];
+		int refn1 = mrs->ref_cache[1][ scan8[n] ];
+
+		mc_dir_part(d, mrs, m, s->dp_ref_list[0][refn0], n, square, chroma_height, delta, 0,
+					dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put);
+		mc_dir_part(d, mrs, m, s->dp_ref_list[1][refn1], n, square, chroma_height, delta, 1,
+					tmp_y, tmp_cb, tmp_cr, x_offset, y_offset, qpix_put, chroma_put);
+
+		if(s->use_weight == 2){ // weights come from the implicit_weight table and sum to 64, with denominator 5 and offset 0
+			int weight0 = s->implicit_weight[refn0][refn1][m->mb_y&1];
+			int weight1 = 64 - weight0;
+			luma_weight_avg(  dest_y,  tmp_y,  d->  linesize, 5, weight0, weight1, 0);
+			chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, 5, weight0, weight1, 0);
+			chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, 5, weight0, weight1, 0);
+		}else{ // weights and offsets come from the slice's luma_weight / chroma_weight tables
+			luma_weight_avg(dest_y, tmp_y, d->linesize, s->luma_log2_weight_denom,
+							s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0],
+							s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]);
+			chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, s->chroma_log2_weight_denom,
+							s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0],
+							s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]);
+			chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, s->chroma_log2_weight_denom,
+							s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0],
+							s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]);
+		}
+	}else{ // single-list case: predict into the destination, then weight it in place
+		int list = list1 ? 1 : 0;
+		int refn = mrs->ref_cache[list][ scan8[n] ];
+		DecodedPicture *ref= s->dp_ref_list[list][refn];
+		mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, list,
+					dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put);
+
+		luma_weight_op(dest_y, d->linesize, s->luma_log2_weight_denom,
+						s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]);
+		if(s->use_weight_chroma){ // chroma is weighted only when the slice flag is set
+			chroma_weight_op(dest_cb, d->uvlinesize, s->chroma_log2_weight_denom,
+							s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]);
+			chroma_weight_op(dest_cr, d->uvlinesize, s->chroma_log2_weight_denom,
+							s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]);
+		}
+	}
+}
+
+/* Motion compensation of partition n: picks the weighted path when use_weight==1, or when use_weight==2 on a two-list partition whose implicit weight differs from 32; otherwise the unweighted path. */
+static inline void mc_part(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta,
+							uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+							int x_offset, int y_offset,
+							qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+							qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+							h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+							int list0, int list1){
+	const int bipred_implicit = s->use_weight==2 && list0 && list1;
+	if(s->use_weight!=1 && !(bipred_implicit && s->implicit_weight[ mrs->ref_cache[0][scan8[n]] ][ mrs->ref_cache[1][scan8[n]] ][m->mb_y&1] != 32))
+		mc_part_std(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+					x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
+	else
+		mc_part_weighted(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+						x_offset, y_offset, qpix_put, chroma_put,
+						weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
+}
+
+static inline void prefetch_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int list){
+	/* issues prefetches into the reference picture of the first partition, at the motion-compensated
+	* position plus a forward offset (original note: 4 macroblocks ahead, tuned for 64byte cache lines) */
+	const int refn = mrs->ref_cache[list][scan8[0]];
+	int px, py, luma_off, chroma_off;
+	uint8_t **planes;
+
+	if(refn < 0) return; // a negative reference index means there is nothing to prefetch
+	px = (mrs->mv_cache[list][scan8[0]][0]>>2) + 16*m->mb_x + 8;
+	py = (mrs->mv_cache[list][scan8[0]][1]>>2) + 16*m->mb_y;
+	planes = s->dp_ref_list[list][refn]->data;
+	luma_off = px + (py + (m->mb_x&3)*4)*d->linesize + 64;
+	d->dsp.prefetch(planes[0]+luma_off, d->linesize, 4);
+	chroma_off = (px>>1) + ((py>>1) + (m->mb_x&7))*d->uvlinesize + 64;
+	d->dsp.prefetch(planes[1]+chroma_off, planes[2]-planes[1], 2); // the plane distance is passed as the stride argument
+}
+
+void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+					qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+					qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+					h264_weight_func *weight_op, h264_biweight_func *weight_avg){ // inter prediction of macroblock m: one mc_part() call per partition
+	const int mb_type= m->mb_type;
+	assert(IS_INTER(mb_type));
+
+	if (mb_type & MB_TYPE_L0) // prefetch hints for each list the macroblock type uses
+		prefetch_motion(d, mrs, s, m, 0);
+	if (mb_type & MB_TYPE_L1)
+		prefetch_motion(d, mrs, s, m, 1);
+
+	if(IS_16X16(mb_type)){ // a single square partition
+		mc_part(d, mrs, s, m, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
+				qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
+				weight_op, weight_avg,
+				IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+	}else if(IS_16X8(mb_type)){ // two partitions, the second at y_offset 4 with cache index 8
+		mc_part(d, mrs, s, m, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
+				qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
+				&weight_op[1], &weight_avg[1],
+				IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+		mc_part(d, mrs, s, m, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
+				qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
+				&weight_op[1], &weight_avg[1],
+				IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
+	}else if(IS_8X16(mb_type)){ // two partitions, the second at x_offset 4 with cache index 4
+		mc_part(d, mrs, s, m, 0, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 0, 0,
+				qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+				&weight_op[2], &weight_avg[2],
+				IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+		mc_part(d, mrs, s, m, 4, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 4, 0,
+				qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+				&weight_op[2], &weight_avg[2],
+				IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
+	}else{
+		int i;
+
+		assert(IS_8X8(mb_type));
+
+		for(i=0; i<4; i++){ // four 8x8 quadrants, each split according to its own sub_mb_type
+			const int sub_mb_type= m->sub_mb_type[i];
+			const int n= 4*i;
+			int x_offset= (i&1)<<2; // 0 or 4
+			int y_offset= (i&2)<<1; // 0 or 4
+
+			if(IS_SUB_8X8(sub_mb_type)){
+				mc_part(d, mrs, s, m, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+						qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+						&weight_op[3], &weight_avg[3],
+						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+			}else if(IS_SUB_8X4(sub_mb_type)){
+				mc_part(d, mrs, s, m, n, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+						qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
+						&weight_op[4], &weight_avg[4],
+						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+				mc_part(d, mrs, s, m, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
+						qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
+						&weight_op[4], &weight_avg[4],
+						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+			}else if(IS_SUB_4X8(sub_mb_type)){
+				mc_part(d, mrs, s, m, n, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+						qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+						&weight_op[5], &weight_avg[5],
+						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+				mc_part(d, mrs, s, m, n+1, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
+						qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+						&weight_op[5], &weight_avg[5],
+						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+			}else{
+				int j;
+				assert(IS_SUB_4X4(sub_mb_type));
+				for(j=0; j<4; j++){ // four 4x4 blocks inside the quadrant
+					int sub_x_offset= x_offset + 2*(j&1);
+					int sub_y_offset= y_offset +   (j&2);
+					mc_part(d, mrs, s, m, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
+							qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+							&weight_op[6], &weight_avg[6],
+							IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+				}
+			}
+		}
+	}
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_mc.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_mc.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,12 @@
+#ifndef H264_MC_H
+#define H264_MC_H
+
+#include "dsputil.h"
+#include "h264_types.h"
+/* hl_motion(): the caller passes the destination planes, the put/avg qpel and chroma function tables and the weight tables. */
+void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+					qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+					qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+					h264_weight_func *weight_op, h264_biweight_func *weight_avg);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_misc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_misc.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,944 @@
+#include "config.h"
+
+#include "h264_types.h"
+
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <time.h>
+#include <pthread.h>
+#undef NDEBUG
+#include <assert.h>
+
+#if HAVE_LIBSDL2
+#include <SDL2/SDL.h>
+#if HAVE_LIBSDL_TTF
+#include <SDL/SDL_ttf.h>
+#endif
+#endif
+/* Stores the current CLOCK_REALTIME timestamp in h->start_time[stage]. */
+void start_timer(H264Context *h, int stage){
+    clock_gettime(CLOCK_REALTIME, &h->start_time[stage]);
+}
+/* Stores the end timestamp for stage and adds the time elapsed since start_timer(), in milliseconds, to the running total. */
+void stop_timer(H264Context *h, int stage){
+    clock_gettime(CLOCK_REALTIME, &h->end_time[stage]);
+    double time = (double) 1.e3*(h->end_time[stage].tv_sec - h->start_time[stage].tv_sec) + 1.e-6*(h->end_time[stage].tv_nsec - h->start_time[stage].tv_nsec);
+    h->last_time [stage]  = time;
+    h->total_time[stage] += time;
+}
+/* Allocates one H264Mb per macroblock (mb_width*mb_height) for the entry and flags it initialized; the allocation result is not checked. */
+void init_sb_entry(H264Context *h, SliceBufferEntry *sbe){
+    sbe->mbs = av_malloc(h->mb_width*h->mb_height* sizeof(H264Mb));
+    sbe->initialized = 1;
+}
+/* Frees the buffers owned by a slice-buffer entry and clears the pointers so a repeated call is harmless. */
+void free_sb_entry(SliceBufferEntry *sbe){
+    av_freep(&sbe->mbs);
+    av_freep(&sbe->gb.raw);
+    /* av_freep() tolerates a NULL pointer, so rbsp needs no guard */
+    av_freep(&sbe->gb.rbsp);
+    sbe->initialized = 0;
+}
+/* Blocks until a slice-buffer entry is free, marks it in use and returns it with its slice state zeroed. */
+SliceBufferEntry *get_sb_entry(H264Context *h){
+    SliceBufferEntry *sb = NULL;
+
+    pthread_mutex_lock(&h->lock[PARSE]);
+    while (h->free_sb_cnt<=0)
+        pthread_cond_wait(&h->cond[PARSE], &h->lock[PARSE]);
+    /* use first free slice-buffer entry (state==0) */
+    for(int i=0; i<h->sb_size; i++){
+        if(h->sb[i].state==0){
+            sb= &h->sb[i];
+            sb->state=1;
+            sb->lines_taken=0;
+            sb->lines_total=h->mb_height;
+            break;
+        }
+    }
+    h->free_sb_cnt--;
+
+    pthread_mutex_unlock(&h->lock[PARSE]);
+    assert(sb); /* counter and state flags disagree: fail loudly, as get_dpb_entry() does, instead of dereferencing NULL */
+    memset (&sb->slice, 0, sizeof(H264Slice));
+
+    return sb;
+}
+/* Marks sb as free again and signals h->cond[PARSE], the condition get_sb_entry() waits on. */
+void release_sb_entry(H264Context *h, SliceBufferEntry *sb){
+    pthread_mutex_lock(&h->lock[PARSE]);
+
+    sb->state = 0;
+    h->free_sb_cnt++;
+    pthread_cond_signal(&h->cond[PARSE]);
+
+    pthread_mutex_unlock(&h->lock[PARSE]);
+}
+/* Binds pic to slice s and, on first use, allocates its bordered pixel planes and per-picture metadata arrays. Returns 0, or -1 when a metadata allocation fails. */
+int init_dpb_entry(H264Context *h, DecodedPicture *pic, H264Slice *s, int width, int height){
+    int i;
+
+    s->curr_pic=pic;
+    pic->poc = s->poc;
+    pic->key_frame = s->key_frame;
+    pic->mmco_reset = s->mmco_reset;
+    pic->reference = s->nal_ref_idc? 3:1; /* bit mask; release_dpb_entry() clears bits and the entry is free again at 0 */
+    pic->cpn = s->coded_pic_num;
+
+    if(pic->data[0]==NULL) { /* pixel planes not allocated yet */
+        int size[3] = {0};
+
+        width+= EDGE_WIDTH*2; /* room for an EDGE_WIDTH border on both sides */
+        height+= EDGE_WIDTH*2;
+
+        pic->linesize[0]= width;
+        pic->linesize[1]=  pic->linesize[2] = width>>1;
+
+        size[0] = width*height;
+        size[1] = size[2] = width*height>>2; /* each chroma plane is a quarter of the luma plane */
+
+        for(i=0; i<3; i++){
+            pic->base[i]= av_malloc(size[i]); /* NOTE(review): result is not checked before the offsets below are applied */
+        }
+        /* data[] points past the top and left border of each plane */
+        pic->data[0] = pic->base[0] + (pic->linesize[0]*EDGE_WIDTH) + EDGE_WIDTH;
+        pic->data[1] = pic->base[1] + (pic->linesize[1]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1);
+        pic->data[2] = pic->base[2] + (pic->linesize[2]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1);
+    }
+
+    const int big_mb_num= h->mb_stride*(h->mb_height+1) + 1; //the +1 is needed so memset(,,stride*height) does not sig11
+    const int mb_array_size= h->mb_stride*h->mb_height;
+    const int b4_array_size= h->b4_stride*h->mb_height*4;
+
+    if(pic->mb_type_base==NULL){ /* metadata arrays not allocated yet */
+        FF_ALLOCZ_OR_GOTO(pic->mb_type_base , big_mb_num * sizeof(uint32_t), fail)
+        pic->mb_type= pic->mb_type_base + h->mb_stride+1; /* skips one padding row plus one entry */
+
+        for(int i=0; i<2; i++){
+            FF_ALLOCZ_OR_GOTO(pic->motion_val_base[i], 2 * (b4_array_size+4)  * sizeof(int16_t), fail)
+            pic->motion_val[i]= pic->motion_val_base[i]+4;
+            FF_ALLOCZ_OR_GOTO(pic->ref_index[i], 4*mb_array_size * sizeof(uint8_t), fail)
+        }
+        FF_ALLOCZ_OR_GOTO(pic->intra4x4_pred_mode, h->mb_width*h->mb_height * 4* sizeof(int8_t), fail)
+    }
+
+    return 0;
+    fail: /* target of the FF_ALLOCZ_OR_GOTO macros; arrays allocated so far are not released here */
+    return -1;
+}
+/* Releases every buffer owned by pic and NULLs the owning pointers, so the entry can be re-initialised or freed again safely. */
+void free_dp(DecodedPicture *pic){
+    if(pic->base[0]){
+        for (int i=0; i<3; i++){
+            av_freep(&pic->base[i]);
+            pic->data[i]= NULL;
+        }
+    }
+    if (pic->mb_type_base){
+        av_freep(&pic->mb_type_base); /* init_dpb_entry() tests this pointer for NULL before allocating */
+        pic->mb_type= NULL;
+        for(int i=0; i<2; i++){
+            av_freep(&pic->motion_val_base[i]);
+            av_freep(&pic->ref_index[i]);
+        }
+        av_freep(&pic->intra4x4_pred_mode);
+    }
+}
+/* Blocks until the DPB has a free picture (reference==0), initialises it for slice s and returns it. */
+DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s){
+    DecodedPicture *dp = NULL;
+
+    pthread_mutex_lock(&h->lock[REORDER2]);
+    while (h->free_dpb_cnt<=0){
+    #if OMPSS
+        assert(0); /* the OMPSS build aborts here instead of waiting for a free entry */
+    #endif
+        pthread_cond_wait(&h->cond[REORDER2], &h->lock[REORDER2]);
+    }
+    /* use first free picture */
+    for(int i=0; i<h->max_dpb_cnt; i++){
+        if(h->dpb[i].reference==0){
+            dp= &h->dpb[i];
+            break;
+        }
+    }
+    assert(dp);
+    init_dpb_entry(h, dp, s, h->width, h->height); /* NOTE(review): the -1 failure return is ignored */
+    h->free_dpb_cnt--;
+    h->acdpb_cnt++; //debug
+    pthread_mutex_unlock(&h->lock[REORDER2]);
+
+    return dp;
+}
+/* Clears the bits in mode from pic->reference; when none remain the entry counts as free and a waiter on h->cond[REORDER2] is signalled. */
+void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode){
+    pthread_mutex_lock(&h->lock[REORDER2]);
+    pic->reference &= ~mode;
+    if (pic->reference == 0){
+        h->free_dpb_cnt++;
+        h->reldpb_cnt++; //debug
+        pthread_cond_signal(&h->cond[REORDER2]);
+    }
+    pthread_mutex_unlock(&h->lock[REORDER2]);
+}
+
+
+/**
+*   Extends the edges of a macroblock line: replicates the first/last pixel of each row into the left/right border and, for the first and last line, also fills the rows above/below the picture.
+*/
+void draw_edges(MBRecContext *d, H264Slice *s, int line){
+    int i;
+    int mb_width=d->mb_width;
+    int mb_height=d->mb_height;
+    int last = (line+1 == mb_height);
+    int lines = last?16:12; /* luma rows -4..11 of this MB line are padded, -4..15 on the last line */
+    int linesize = d->linesize;
+    int uvlinesize = d->uvlinesize;
+    uint8_t *y = s->curr_pic->data[0] + 16*line*linesize;
+    uint8_t *cb = s->curr_pic->data[1] + 8*line*uvlinesize;
+    uint8_t *cr = s->curr_pic->data[2] + 8*line*uvlinesize;
+
+    for (i=-4; i<lines; i++){ /* luma: left border from the first pixel, right border from the last pixel of the row */
+        memset(y + i*linesize - EDGE_WIDTH, y[i*linesize], EDGE_WIDTH);
+        memset(y + i*linesize + mb_width*16, y[i*linesize +mb_width*16 -1], EDGE_WIDTH);
+    }
+    for (i=-2; i<lines/2; i++){ /* chroma: same scheme with half the row range and half the border width */
+        memset(cb + i*uvlinesize - EDGE_WIDTH/2, cb[i*uvlinesize], EDGE_WIDTH/2);
+        memset(cb + i*uvlinesize + mb_width*8, cb[i*uvlinesize +mb_width*8 -1], EDGE_WIDTH/2);
+        memset(cr + i*uvlinesize - EDGE_WIDTH/2, cr[i*uvlinesize], EDGE_WIDTH/2);
+        memset(cr + i*uvlinesize + mb_width*8, cr[i*uvlinesize +mb_width*8 -1], EDGE_WIDTH/2);
+    }
+
+    if (line==0){
+        y -= EDGE_WIDTH; /* start at the left border so whole padded rows are copied */
+        cb -= EDGE_WIDTH/2;
+        cr -= EDGE_WIDTH/2;
+        for (i=1; i<=21; i++){ /* replicate the first row into the 21 rows above it */
+            memcpy(y -i*linesize, y, linesize);
+        }
+        for (i=1; i<=9; i++){ /* chroma: 9 rows above */
+            memcpy(cb -i*uvlinesize, cb, uvlinesize);
+            memcpy(cr -i*uvlinesize, cr, uvlinesize);
+        }
+    }else if (last){
+        y += -EDGE_WIDTH + 15*linesize; /* last luma row of this MB line, starting at the left border */
+        cb += -EDGE_WIDTH/2 + 7*uvlinesize;
+        cr += -EDGE_WIDTH/2 + 7*uvlinesize;
+        for (i=1; i<=21; i++){ /* replicate the last row into the 21 rows below it */
+            memcpy(y +i*linesize, y, linesize);
+        }
+        for (i=1; i<=9; i++){ /* chroma: 9 rows below */
+            memcpy(cb +i*uvlinesize, cb, uvlinesize);
+            memcpy(cr +i*uvlinesize, cr, uvlinesize);
+        }
+    }
+}
+/* timer_start is the reference time set by av_start_timer(); av_gettime() returns gettimeofday() as microseconds. */
+static int64_t timer_start;
+int64_t av_gettime(void) {
+    struct timeval tv;
+    gettimeofday(&tv,NULL);
+    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
+}
+/* Records the current time as the reference point for the average-fps figures in print_report(). */
+void av_start_timer(){
+    timer_start = av_gettime();
+}
+/* Prints a progress line to stderr at most every 0.5 s when verbose is set, and always prints the final summary when is_last_report is set. */
+void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose) {
+    static int64_t last_time = -1;
+    static int64_t last_frame_number = 0;
+    float t=0, t2=0;
+    int64_t cur_time=0;
+
+    if (!is_last_report) {
+        /* display the report every 0.5 seconds */
+        cur_time = av_gettime();
+        if (last_time == -1) {
+            last_time = cur_time;
+            return;
+        }
+        if ((cur_time - last_time) < 500000)
+            return;
+        t = (cur_time-timer_start) / 1000000.0;
+        t2 = (cur_time-last_time) / 1000000.0;
+    }
+    /* t and t2 are only set on the periodic path; the final report prints its own line below, so skip this one to avoid dividing by zero */
+    if (verbose && !is_last_report){
+        fprintf(stderr, "frame=%5d avgfps=%3d curfps=%3d\r", frame_number, (int)(frame_number/t+0.5), (int)((frame_number - last_frame_number)/t2+0.5) );
+        fflush(stderr);
+    }
+    last_frame_number = frame_number;
+    last_time = cur_time;
+
+    if (is_last_report){
+        t = (av_gettime()-timer_start) / 1000000.0;
+        fprintf(stderr, "%c[2Kframe=%5d avgfps=%3d\r", 27, frame_number, (int)(frame_number/t+0.5)); /* ESC[2K erases the progress line */
+        fprintf(stderr, "\n");
+        fprintf(stderr, "video:%1.0fkB\n", video_size/1024.0);
+        fflush(stderr);
+    }
+}
+
+/* Sort B-frames into display order: picks the queued picture with the lowest POC; it is removed and returned only when flushing or when more than MAX_DELAYED_PIC_COUNT pictures are queued. */
+static DecodedPicture *get_reordered_picture(OutputContext *w, int flush){
+    int i;
+    int out_idx = 0;
+    DecodedPicture *out = w->delayed_pic[0];
+
+    if (!out)
+        return NULL;
+
+    for(i=1; w->delayed_pic[i] && !w->delayed_pic[i]->key_frame && !w->delayed_pic[i]->mmco_reset; i++){ /* search stops at the first key frame or MMCO reset */
+        if(w->delayed_pic[i]->poc < out->poc){
+            out = w->delayed_pic[i];
+            out_idx = i;
+        }
+    }
+
+    if(w->dp_cnt > MAX_DELAYED_PIC_COUNT || flush) {
+        for(i=out_idx; w->delayed_pic[i]; i++) /* close the gap; relies on the NULL entry that terminates the queue */
+            w->delayed_pic[i] = w->delayed_pic[i+1];
+        w->dp_cnt--;
+        return out;
+    }
+    return NULL;
+}
+
+/**
+*  Remove the extra borders, and places the three parts of the image after each other in dest. Returns the number of bytes written (w*h + 2*w2*h2).
+*/
+static int raw_encode(const DecodedPicture* src, int width, int height, unsigned char *dest) {
+    int i, j;
+/** To write entire image including extra borders*/
+//  int w = src->linesize[0];
+//  int h = height+64;
+//  int w2 = w>>1;
+//  int h2 = h>>1;
+//     int data_planes=3;
+//     int size = w * h + 2 *w2*h2;
+//     const unsigned char* s;
+//     for (i=0; i<data_planes; i++) {
+//         if (i == 1) {
+//             w = w2;
+//             h = h2;
+//         }
+//         s = src->base[i];
+//         for(j=0; j<h; j++) {
+//             memcpy(dest, s, src->linesize[i]);
+//             dest += w;
+//             s += src->linesize[i];
+//         }
+//     }
+
+    int w = (width*8 + 7)/8; /* evaluates to width */
+    int h = height;
+    int w2 =((width >>1) * 8 + 7) / 8; /* evaluates to width>>1 */
+    int h2 = ((height+1) >>1); //not sure about +1
+    int data_planes=3;
+    int size = w * h + 2 *w2*h2;
+    const unsigned char* s;
+
+
+    for (i=0; i<data_planes; i++) {
+        if (i == 1) { /* planes 1 and 2 use the chroma dimensions */
+            w = w2;
+            h = h2;
+        }
+        s = src->data[i]; /* data[] already points past the border */
+        for(j=0; j<h; j++) {
+            memcpy(dest, s, w);
+            dest += w; /* output rows are packed tightly */
+            s += src->linesize[i];
+        }
+    }
+    return size;
+}
+/* SDL display path; guarded with #if like the SDL includes at the top of this file and the prototypes in h264_misc.h. */
+#if HAVE_LIBSDL2
+static SDL_Texture *get_next_texture(H264Context *h, int side){ /* side!=0: sender takes the next texture to fill; side==0: receiver takes the next ready texture, or NULL on exit */
+    SDLTextureQueue *sdlq = &h->sdlq;
+    SDL_Texture *texture;
+    pthread_mutex_lock (&sdlq->sdl_lock);
+    if (side ){ //send
+        while (sdlq->ready >= sdlq->size) /* every texture is still waiting to be shown */
+            pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock);
+        texture = sdlq->queue[sdlq->fi];
+        sdlq->fi++; sdlq->fi %= sdlq->size;
+    } else { //recv
+        while (sdlq->ready <= 0 && !sdlq->exit)
+            pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock);
+
+        if (sdlq->ready == 0 && sdlq->exit){ /* nothing left to show and exit was requested */
+            texture = NULL;
+        }else{
+            texture = sdlq->queue[sdlq->fo];
+            sdlq->fo++; sdlq->fo %= sdlq->size;
+        }
+    }
+    pthread_mutex_unlock(&sdlq->sdl_lock);
+
+    return texture;
+}
+/* Adjusts the ready-texture count (side!=0 increments, side==0 decrements) and signals the queue condition. */
+static void signal_texture(H264Context *h, int side){
+    SDLTextureQueue *sdlq = &h->sdlq;
+    pthread_mutex_lock (&sdlq->sdl_lock);
+    if (side)
+        sdlq->ready++;
+    else
+        sdlq->ready--;
+    pthread_cond_signal(&sdlq->sdl_cond);
+    pthread_mutex_unlock(&sdlq->sdl_lock);
+}
+/* Sets the queue's exit flag and signals the condition so a receiver blocked in get_next_texture() can return NULL. */
+void signal_sdl_exit(H264Context *h){
+    SDLTextureQueue *sdlq = &h->sdlq;
+    pthread_mutex_lock (&sdlq->sdl_lock);
+    sdlq->exit=1;
+    pthread_cond_signal(&sdlq->sdl_cond);
+    pthread_mutex_unlock(&sdlq->sdl_lock);
+}
+/* Copies in_picture into the next free streaming texture and marks it ready; droppable frames closer than 8.125 ms to the last shown one are skipped. */
+static void display_frame(H264Context *h, OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height, int dropable){
+    static int64_t last_time = -1;
+    int64_t cur_time;
+//     SDLContext *sdlc = h->sdlc;
+    uint8_t *iyuv_pixels;
+    int pitch;
+
+
+    if (last_time == -1){
+        last_time = av_gettime();
+    }
+
+    
+    /* do not display frames that are less than 8.125 ms apart (120fps)*/
+    if (dropable){
+        cur_time = av_gettime();
+
+        if ((cur_time - last_time) < 8125)
+            return;
+
+        last_time =cur_time;
+    }
+
+    if(in_picture){
+        
+        SDL_Texture *texture= get_next_texture(h, 1);
+
+        SDL_LockTexture( texture, NULL, (void **)&iyuv_pixels, &pitch ); /* stays locked here; sdl_thread() unlocks it before rendering. Return value and pitch are not used */
+
+        raw_encode(in_picture, frame_width, frame_height, iyuv_pixels);
+
+        signal_texture(h, 1);
+    }
+}
+#endif
+
+// TODO: Parallelize the raw_encode (either split frame or over frames)
+/* Strips the borders from in_picture into w->bit_buffer, writes the result to fd and adds its size to w->video_size. */
+static void do_video_out(OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height) {
+    int size=0;
+    //remove extra borders
+    if(in_picture)
+        size= raw_encode(in_picture, frame_width, frame_height, w->bit_buffer);
+    if (size < 0)
+        fprintf(stderr, "Video encoding failed\n");
+    /* write() may transfer fewer bytes than requested (e.g. on a pipe), so keep going until the frame is complete */
+    for (int done=0; done < size; ){
+        ssize_t n = write(fd, (const uint8_t *)w->bit_buffer + done, size - done);
+        if (n <= 0){ fprintf(stderr, "Write frame failed\n"); break; }
+        done += (int)n;
+    }
+    w->video_size += size;
+}
+/* Queues pic (NULL flushes the queue), then emits the next picture in display order, if any, to fd or to the SDL display. Returns the emitted picture or NULL. */
+DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height) {
+    DecodedPicture *out;
+
+    if (pic){
+        oc->delayed_pic[oc->dp_cnt++]=pic;
+        out = get_reordered_picture(oc, 0);
+    }else{
+        out = get_reordered_picture(oc, 1); /* flush: always hand out a picture while any is queued */
+    }
+
+    if (out){
+        if (fd){ /* fd 0 means "no output file" */
+            do_video_out(oc, fd, out, frame_width, frame_height);
+        }else{
+#if HAVE_LIBSDL2
+            if (h->display){
+                display_frame(h, oc, fd, out, frame_width, frame_height, !(pic==NULL)); /* frames may only be dropped while not flushing */
+            }
+#endif
+        }
+        oc->frame_number++;
+    }
+
+    return out;
+}
+/* Allocates a zeroed OutputContext with a bit buffer of at least 256 KiB or twice frame_width*frame_height bytes; allocation results are not checked. */
+OutputContext *get_output_context(H264Context *h){
+    const int frame_width=h->frame_width;
+    const int frame_height=h->frame_height;
+    const int frame_size = frame_width*frame_height;
+
+    OutputContext *oc = av_mallocz(sizeof(OutputContext));
+    oc->bit_buffer_size= FFMAX(1024*256, frame_size*2); // oversize a little bit to allow extra border write
+    oc->bit_buffer=  av_mallocz(oc->bit_buffer_size);
+
+    return oc;
+}
+/* Frees the bit buffer and the context allocated by get_output_context(). */
+void free_output_context(OutputContext *oc){
+
+    av_free(oc->bit_buffer);
+    av_free(oc);
+}
+/* Builds a super-macroblock grid of smb_width x smb_height MBs for the picture in h and returns it with refcount 1; allocation results are not checked. */
+SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height){
+    SuperMBContext *smbc = av_mallocz(sizeof(SuperMBContext));
+
+    smbc->smb_width = smb_width;
+    smbc->smb_height = smb_height;
+
+    smbc->nsmb_height = h->mb_height / smbc->smb_height +  (h->mb_height%smbc->smb_height ? 1:0);    //only need one extra if mb_height was not dividable
+    smbc->nsmb_width  = h->mb_width / smbc->smb_width;
+    while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width ) /* presumably extra columns for the one-MB left shift per row seen in draw_sb_border() — TODO confirm */
+        smbc->nsmb_width++;
+
+    smbc->nsmb_3dheight= smbc->nsmb_height - ((h->mb_height/2)/smbc->smb_height +1); //assuming max motion vector of half the height
+
+    smbc->smbs[0] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask));
+    smbc->smbs[1] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask));
+    for (int y=0, i=0; i<smbc->nsmb_height; i++, y+=smbc->smb_height){ /* both task arrays get the same MB coordinates */
+        for (int x=0, j=0; j<smbc->nsmb_width; j++, x+=smbc->smb_width){
+            smbc->smbs[0][i*smbc->nsmb_width +j].smb_y = y;
+            smbc->smbs[0][i*smbc->nsmb_width +j].smb_x = x;
+            smbc->smbs[1][i*smbc->nsmb_width +j].smb_y = y;
+            smbc->smbs[1][i*smbc->nsmb_width +j].smb_x = x;
+        }
+    }
+
+    smbc->refcount = 1; /* the creator's reference */
+
+    return smbc;
+}
+/* Frees both task arrays and the context itself; does not look at the reference count. */
+void freeSuperMBContext(SuperMBContext *smbc){
+    av_free(smbc->smbs[0]);
+    av_free(smbc->smbs[1]);
+    av_free(smbc);
+}
+/* Returns the current h->smbc with its reference count incremented under h->smb_lock; pair with release_smbc(). */
+SuperMBContext * acquire_smbc(H264Context *h ){
+    SuperMBContext *smbc;
+
+    pthread_mutex_lock (&h->smb_lock);
+    smbc = h->smbc;
+    smbc->refcount++;
+    pthread_mutex_unlock(&h->smb_lock);
+    return smbc;
+}
+/* Drops one reference to smbc under h->smb_lock and frees it when the count reaches zero. */
+void release_smbc(H264Context *h, SuperMBContext *smbc){
+    pthread_mutex_lock (&h->smb_lock);
+    smbc->refcount--;
+    if (smbc->refcount==0){
+        freeSuperMBContext(smbc);
+    }
+    pthread_mutex_unlock(&h->smb_lock);
+
+}
+
+
+#if HAVE_LIBSDL2
+/* Draws the outline of super-macroblock (smb_x, smb_y) into the RGBA map; each MB row of the block starts one MB further left (j begins at mb_x-k). */
+// #if OMPSS
+static void draw_sb_border(H264Context *h, uint32_t *rgba_pixels, int smb_x, int smb_y){
+    int mb_width = h->mb_width;
+    int mb_height = h->mb_height;
+    int width = h->frame_width; /* NOTE(review): also used as the row stride in pixels; assumes the texture pitch equals width*4 — confirm */
+    int height = h->frame_height;
+
+    int mb_x = smb_x * h->smb_width;
+    int mb_y = smb_y * h->smb_height;
+
+    uint32_t pix= 0x0000FFC0;
+
+    for (int k=0, i=mb_y; i< mb_y + h->smb_height; i++, k++){ /* k: MB row inside the block */
+        for (int l=0, j=mb_x -k ; j< mb_x - k + h->smb_width; j++, l++){ /* l: MB column inside the block */
+            //outside frame
+            if (i<0 || i>=mb_height || j<0 || j>=mb_width) {
+                continue;
+            }
+
+            //draw top
+            if (i==0 || k==0 || l==0){
+                int mx = j*16;
+                int my = i*16;
+                uint32_t *top = rgba_pixels + my*width + mx;
+                int endx = mx+16 < width? 16: width-mx; /* clip at the right frame edge */
+
+                for (int x = 0; x<endx; x++){
+                    top[x] = pix;
+                }
+            }
+
+            //draw bottom
+            if (i==mb_height-1 || k==h->smb_height-1 || l==h->smb_width-1){
+                int mx = j*16;
+                int my = i*16 + 15; my = my < height ? my: height-1;
+                uint32_t *bottom = rgba_pixels + my*width + mx;
+                int endx = mx+16 < width? 16: width-mx;
+
+                for (int x = 0; x<endx; x++){
+                    bottom[x] = pix;
+                }
+            }
+
+            //draw left
+            if (j==0 || l==0 ){
+                int mx = j*16;
+                int my = i*16;
+                uint32_t *left = rgba_pixels + my*width + mx;
+                int endy = my +16 < height ? 16: height - my; /* clip at the bottom frame edge */
+
+                for (int y = 0; y<endy; y++){
+                    left[y*width] = pix;
+                }
+            }
+
+            //draw right
+            if (j==mb_width -1 || l==h->smb_width-1 ){
+                int mx = j*16 + 15; mx = mx < width ? mx: width-1;
+                int my = i*16;
+                uint32_t *right = rgba_pixels + my*width + mx;
+                int endy = my +16 < height ? 16: height - my;
+
+                for (int y = 0; y<endy; y++){
+                    right[y*width] = pix;
+                }
+            }
+        }
+    }
+}
+/* Redraws the super-macroblock overlay: clears the map texture, then outlines every block of smbc. */
+static void draw_sbmap (H264Context *h, SuperMBContext *smbc, SDLContext *sdlc){
+    int pitch;
+    uint32_t *rgba_pixels;
+    SDL_Texture *sbmap= sdlc->sbmap_texture;
+
+    SDL_LockTexture( sbmap, NULL, (void **)&rgba_pixels, &pitch );
+    /* the texture is created with frame_height rows in get_SDL_context(), so clear exactly that many */
+    memset (rgba_pixels, 0, pitch * h->frame_height);
+    for (int i=0; i< smbc->nsmb_height; i++){
+        for (int j=0; j< smbc->nsmb_width; j++){
+            draw_sb_border(h, rgba_pixels, j, i);
+        }
+    }
+
+    SDL_UnlockTexture( sbmap );
+}
+// #endif
+
+// static void calc_sb_sizes (H264Context *h, SuperMBContext *smbc){
+//     smbc->smb_height = h->smb_height;
+//     smbc->smb_width = h->smb_width;
+//
+//     smbc->nsmb_height = h->mb_height / smbc->smb_height +  (h->mb_height%smbc->smb_height ? 1:0);    //only need one extra if mb_height was not dividable
+//     smbc->nsmb_width  = h->mb_width / smbc->smb_width;
+//     while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width )
+//         smbc->nsmb_width++;
+// }
+
+/* Key handling: Esc leaves fullscreen, Space toggles pause, LCtrl+F enters fullscreen, M toggles the block map, arrow keys resize the super-macroblocks while the map is shown. */
+static void handle_key_event(H264Context *h, SDLContext *sdlc, SDL_Keysym keysym){
+    int arrow=0;
+
+    switch (keysym.sym){
+        case SDLK_ESCAPE:
+            if (sdlc->fullscreen){
+                SDL_SetWindowFullscreen(sdlc->window, SDL_FALSE);
+                sdlc->fullscreen = 0;
+            }
+            break;
+        case SDLK_SPACE:
+            pthread_mutex_lock(&h->sdl_lock);
+            sdlc->pause = !sdlc->pause;
+            pthread_cond_signal(&h->sdl_cond); /* wakes sdl_thread(), which waits on this condition while paused */
+            pthread_mutex_unlock(&h->sdl_lock);
+            break;
+        case SDLK_f:
+            if (!sdlc->fullscreen){
+                if (keysym.mod == KMOD_LCTRL){ /* exact match: any additional modifier bit makes this false */
+//                     SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full);
+                    SDL_SetWindowFullscreen(sdlc->window, SDL_TRUE);
+
+                    sdlc->fullscreen = 1;
+                }
+            }
+            break;
+        case SDLK_m:
+            sdlc->showmap = !sdlc->showmap;
+            break;
+        case SDLK_UP: /* taller blocks, never taller than the picture or than the block width */
+            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height < h->mb_height && h->smb_height < h->smb_width){
+                h->smb_height++;
+                arrow =1;
+            }
+            break;
+        case SDLK_DOWN:
+            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height > 1 ){
+                h->smb_height--;
+                arrow =1;
+            }
+            break;
+        case SDLK_LEFT: /* narrower blocks, never narrower than the block height */
+            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width > 1 && h->smb_width > h->smb_height){
+                h->smb_width--;
+                arrow =1;
+            }
+            break;
+        case SDLK_RIGHT:
+            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width < h->mb_width){
+                h->smb_width++;
+                arrow =1;
+            }
+            break;
+    }
+
+    if (arrow){ /* block size changed: swap in a new grid and drop the reference held through h->smbc */
+        SuperMBContext *smbc = getSuperMBContext(h, h->smb_width, h->smb_height);
+        pthread_mutex_lock(&h->smb_lock);
+        h->smbc->refcount--;
+        if (h->smbc->refcount == 0)
+            freeSuperMBContext(h->smbc);
+        h->smbc = smbc;
+        sdlc->updatemap =1;
+        pthread_mutex_unlock(&h->smb_lock);
+    }
+}
+/* On a window resize, recomputes sdlc->win_rect so the video keeps its aspect ratio and is centred in the window. */
+void handle_window_event(H264Context *h, SDLContext *sdlc, SDL_WindowEvent winevent){
+    SDL_Rect nrect;
+    switch (winevent.event){
+        case SDL_WINDOWEVENT_RESIZED:
+
+            sdlc->win_w =  winevent.data1;
+            sdlc->win_h =  winevent.data2;
+
+            double aspect = (double) sdlc->win_w/ sdlc->win_h;
+            if ( aspect < sdlc->aspect){ /* window narrower than the video: use the full width, centre vertically */
+                double r = (double) sdlc->win_w / sdlc->rect.w;
+                double h = (double) sdlc->rect.h * r; /* note: this local shadows the parameter h */
+
+                nrect.y = lrint(( (double) sdlc->win_h - h)/2);
+                nrect.h = lrint(h);
+
+                nrect.x=0;
+                nrect.w= sdlc->win_w;
+
+            }else { /* window wider than the video: use the full height, centre horizontally */
+                double r = (double) sdlc->win_h / sdlc->rect.h;
+                double w = (double) sdlc->rect.w * r;
+
+                nrect.x = lrint(( (double) sdlc->win_w - w)/2);
+                nrect.w = lrint(w);
+
+                nrect.y=0;
+                nrect.h= sdlc->win_h;
+            }
+            //prob better to lock
+            sdlc->win_rect = nrect;
+            sdlc->resized=1; /* tells sdl_thread() to clear the renderer before the next frame */
+            break;
+    }
+}
+/* Thread entry: dispatches SDL key and window events until SDL_QUIT arrives (sets h->quit) or SDL_WaitEvent() returns 0. */
+void *sdl_event_listen_thread(void *arg){
+    H264Context *h = (H264Context *) arg;
+    SDLContext *sdlc = h->sdlc;
+    SDL_Event event;
+
+    while ( SDL_WaitEvent(&event) ) {
+        switch (event.type) {
+            case SDL_KEYDOWN:
+                handle_key_event(h, sdlc, event.key.keysym);
+                break;
+            case SDL_WINDOWEVENT:
+                handle_window_event(h, sdlc, event.window);
+                break;
+            case SDL_QUIT:
+                h->quit=1;
+                goto finish;
+        }
+    }
+finish:
+    pthread_exit(NULL);
+    return NULL;
+}
+
+//XInitThreads not called in SDL2 library, causes crash
+//remove in future when fixed ...
+#include <X11/Xlib.h>
+/* Initialises SDL video, creates the window, renderer, two streaming frame textures and the block-map texture, then starts the event-listener thread. */
+SDLContext *get_SDL_context(H264Context *h){
+    const int frame_width=h->frame_width;
+    const int frame_height=h->frame_height;
+
+    SDLContext *sdlc = av_mallocz(sizeof(SDLContext));
+    sdlc->display = h->display;
+    sdlc->fullscreen = h->fullscreen;
+
+    sdlc->aspect = (double) frame_width / (double) frame_height;
+    sdlc->rect.x =0;
+    sdlc->rect.y =0;
+    sdlc->rect.w =frame_width;
+    sdlc->rect.h =frame_height;
+
+    XInitThreads(); //workaround
+
+    // Initializes the video subsystem
+    if (SDL_Init(SDL_INIT_VIDEO) < 0) {
+        fprintf(stderr, "Unable to init SDL: %s\n", SDL_GetError());
+        #undef exit
+        exit(-1);
+    }
+    SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "best"); /* pass the macro: its value is the real hint name, the macro's spelling is not */
+    SDL_SetHint(SDL_HINT_RENDER_OPENGL_SHADERS, "1");
+
+    SDL_GetDesktopDisplayMode(0, &sdlc->full);
+    sdlc->full.format = SDL_PIXELFORMAT_IYUV;
+
+    sdlc->wind = sdlc->full; /* windowed mode: the desktop mode clamped to the frame size */
+    if (sdlc->wind.w > frame_width) sdlc->wind.w = frame_width;
+    if (sdlc->wind.h > frame_height) sdlc->wind.h = frame_height;
+
+    sdlc->win_rect.x =0;
+    sdlc->win_rect.y =0;
+    sdlc->win_rect.w =sdlc->wind.w;
+    sdlc->win_rect.h =sdlc->wind.h;
+
+    if (sdlc->fullscreen){
+        sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED,  SDL_WINDOWPOS_UNDEFINED, sdlc->full.w, sdlc->full.h, SDL_WINDOW_FULLSCREEN|SDL_WINDOW_SHOWN|SDL_WINDOW_RESIZABLE);
+        SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full);
+    } else {
+        sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED,  SDL_WINDOWPOS_UNDEFINED, sdlc->wind.w, sdlc->wind.h, SDL_WINDOW_RESIZABLE|SDL_WINDOW_SHOWN);
+        SDL_SetWindowDisplayMode (sdlc->window, &sdlc->wind);
+    }
+
+    sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_ACCELERATED);
+//     sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_SOFTWARE);
+
+    h->sdlq.queue[0] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height);
+    h->sdlq.queue[1] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height);
+
+    sdlc->sbmap_texture = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height);
+    SDL_SetTextureBlendMode(sdlc->sbmap_texture, SDL_BLENDMODE_BLEND);
+    sdlc->updatemap = 1; /* forces the first draw of the block map */
+
+#if HAVE_LIBSDL_TTF
+    //not working with SDL 2.0, try again in future when supported
+    if(TTF_Init()==-1) {
+        printf("TTF_Init: %s\n", TTF_GetError());
+        exit(2);
+    }
+
+    // Load a font
+    TTF_Font *font;
+    font = TTF_OpenFont("/usr/share/fonts/truetype/freefont/FreeSans.ttf", 24);
+    if (font == NULL)
+    {
+        printf("TTF_OpenFont() Failed: %s\n", TTF_GetError());
+        TTF_Quit();
+        exit(1);
+    }
+#endif
+
+    pthread_create(&sdlc->listen_thread, NULL, sdl_event_listen_thread, h);
+
+    return sdlc;
+
+}
+/* Waits for the event-listener thread, then destroys the textures, renderer and window and shuts SDL down. NOTE(review): the SDLContext allocation itself is not freed here. */
+void free_SDL_context(H264Context *h){
+    SDLContext *sdlc = h->sdlc;
+    pthread_join(sdlc->listen_thread, NULL); /* blocks until sdl_event_listen_thread() returns */
+
+#if HAVE_LIBSDL_TTF
+    TTF_Quit();
+#endif
+    SDL_DestroyTexture(h->sdlq.queue[0]);
+    SDL_DestroyTexture(h->sdlq.queue[1]);
+    SDL_DestroyTexture(sdlc->sbmap_texture);
+    SDL_DestroyRenderer(sdlc->renderer);
+    SDL_DestroyWindow(sdlc->window);
+    SDL_Quit();
+
+}
+/* Display thread entry: creates the SDL context, then renders each ready texture (plus the optional block map) until get_next_texture() returns NULL. */
+void *sdl_thread(void *arg){
+    H264Context *h = (H264Context *) arg;
+
+    SDLContext *sdlc = get_SDL_context(h);
+    h->sdlc = sdlc;
+
+    signal_texture(h, 0); /* NOTE(review): two decrements of sdlq.ready, presumably handing both textures to the sender — depends on the initial value set elsewhere */
+    signal_texture(h, 0);
+
+    SDL_Texture *texture;
+    for (;;){
+        pthread_mutex_lock(&h->sdl_lock);
+        while (sdlc->pause){ /* toggled by the Space key in handle_key_event() */
+            pthread_cond_wait(&h->sdl_cond, &h->sdl_lock);
+        }
+        pthread_mutex_unlock(&h->sdl_lock);
+
+        texture = get_next_texture(h, 0);
+        if (texture == NULL) /* exit requested and nothing left to show */
+            break;
+        
+        SDL_UnlockTexture(texture); /* display_frame() left it locked after filling it */
+
+        //clear if resized
+        if (sdlc->resized){
+            // KDE bug prob, reset viewport change after resize from max
+            SDL_RenderSetViewport(sdlc->renderer, NULL);
+            SDL_SetRenderDrawColor(sdlc->renderer, 0, 0, 0, 255);
+            SDL_RenderClear(sdlc->renderer);
+            sdlc->resized = 0;
+        }
+
+        SDL_RenderCopy(sdlc->renderer, texture, &sdlc->rect, &sdlc->win_rect);
+
+        if (sdlc->showmap){
+            if (sdlc->updatemap){ /* grid changed: redraw the overlay texture once */
+                SuperMBContext *smbc;
+                pthread_mutex_lock (&h->smb_lock);
+                smbc = h->smbc;
+                smbc->refcount++; /* hold a reference while drawing outside the lock */
+                sdlc->updatemap=0;
+                pthread_mutex_unlock(&h->smb_lock);
+
+                draw_sbmap(h, smbc, sdlc);
+
+                release_smbc(h, smbc);
+            }
+            SDL_RenderCopy(sdlc->renderer, sdlc->sbmap_texture, &sdlc->rect, &sdlc->win_rect);
+        }
+
+        SDL_RenderPresent(sdlc->renderer);
+        signal_texture(h, 0); /* the texture may now be refilled by the sender */
+    }
+
+    free_SDL_context(h);
+
+    pthread_exit(NULL);
+    return NULL;
+}
+#endif
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_misc.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_misc.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,52 @@
+#ifndef H264_MISC_H
+#define H264_MISC_H
+
+#include "avcodec.h"
+#include "h264_types.h"
+
+void start_timer(H264Context *h, int stage); /* per-stage wall-clock timing */
+void stop_timer(H264Context *h, int stage);
+
+void init_sb_entry(H264Context *h, SliceBufferEntry *sbe); /* slice-buffer entries */
+void free_sb_entry(SliceBufferEntry *sb);
+SliceBufferEntry *get_sb_entry(H264Context *h); /* blocks until an entry is free */
+void release_sb_entry(H264Context *h, SliceBufferEntry *sb);
+
+DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s); /* blocks until a DPB picture is free */
+void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode);
+
+void draw_edges(MBRecContext *d, H264Slice *s, int line);
+
+int ff_init_slice(NalContext *n, H264Slice *s); /* not defined in h264_misc.c */
+void free_picture(PictureInfo *pic); /* not defined in h264_misc.c */
+void free_dp(DecodedPicture *pic);
+
+void av_start_timer();
+int copyEDtoH264Slice(H264Slice *ms, H264Slice *es); /* not defined in h264_misc.c */
+void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose);
+
+int ff_alloc_picture_info(NalContext *n, H264Slice *s, PictureInfo *pic); /* not defined in h264_misc.c */
+DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height);
+OutputContext *get_output_context(H264Context *h);
+void free_output_context(OutputContext *oc);
+
+void freeSuperMBContext(SuperMBContext *smbc); /* super-macroblock grid, reference counted via acquire/release */
+SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height);
+void release_smbc(H264Context *h, SuperMBContext *smbc);
+SuperMBContext * acquire_smbc(H264Context *h );
+
+#if HAVE_LIBSDL2
+void signal_sdl_exit(H264Context *h);
+void *sdl_thread(void *arg);
+SDLContext *get_SDL_context(H264Context *h);
+void free_SDL_context(H264Context *h); /* takes the decoder context, matching the definition in h264_misc.c */
+#endif
+
+/**
+* gets the chroma qp: looks up qscale in row t of s->pps.chroma_qp_table.
+*/
+static inline int get_chroma_qp(H264Slice *s, int t, int qscale){
+    return s->pps.chroma_qp_table[t][qscale];
+}
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_nal.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_nal.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,628 @@
+#include "h264_types.h"
+#include "h264_data.h"
+
+#include "golomb.h"
+#include "h264_sei.h"
+#include "h264_refs.h"
+#include "h264_ps.h"
+#include "h264_pred_mode.h"
+#include "h264_misc.h"
+
+/* Returns the 1-based position (counted from the LSB) of the lowest set bit of *src, or 0 if the byte is 0. */
+static int ff_h264_decode_rbsp_trailing(const uint8_t *src){
+    int bits= *src;
+    int pos= 1;
+    while(pos < 9 && !(bits & 1)){
+        bits >>= 1;
+        pos++;
+    }
+    return pos < 9 ? pos : 0;
+}
+
+/* Parses the explicit prediction weight table from gb into s; out-of-range denominators are reset to 0. Returns 0. */
+static int pred_weight_table(H264Slice *s, GetBitContext *gb){
+    int luma_def, chroma_def;
+
+    s->use_weight= 0;
+    s->use_weight_chroma= 0;
+    s->luma_log2_weight_denom= get_ue_golomb(gb);
+    s->chroma_log2_weight_denom= get_ue_golomb(gb);
+    /* H.264 limits both denominators to 0..7; anything larger would make the shifts below undefined */
+    if((unsigned)s->luma_log2_weight_denom > 7U){
+        av_log(AV_LOG_ERROR, "luma_log2_weight_denom %d is out of range\n", s->luma_log2_weight_denom);
+        s->luma_log2_weight_denom= 0;
+    }
+    if((unsigned)s->chroma_log2_weight_denom > 7U){
+        av_log(AV_LOG_ERROR, "chroma_log2_weight_denom %d is out of range\n", s->chroma_log2_weight_denom);
+        s->chroma_log2_weight_denom= 0;
+    }
+    luma_def = 1<<s->luma_log2_weight_denom;
+    chroma_def = 1<<s->chroma_log2_weight_denom;
+
+    for(int list=0; list<2; list++){
+        for(int i=0; i<s->ref_count[list]; i++){
+            if(get_bits1(gb)){ /* luma_weight_flag */
+                s->luma_weight[i][list][0]= get_se_golomb(gb);
+                s->luma_weight[i][list][1]= get_se_golomb(gb);
+                if(s->luma_weight[i][list][0] != luma_def || s->luma_weight[i][list][1] != 0)
+                    s->use_weight= 1;
+            }else{
+                s->luma_weight[i][list][0]= luma_def;
+                s->luma_weight[i][list][1]= 0;
+            }
+
+            if(get_bits1(gb)){ /* chroma_weight_flag */
+                for(int j=0; j<2; j++){
+                    s->chroma_weight[i][list][j][0]= get_se_golomb(gb);
+                    s->chroma_weight[i][list][j][1]= get_se_golomb(gb);
+                    if(s->chroma_weight[i][list][j][0] != chroma_def || s->chroma_weight[i][list][j][1] != 0)
+                        s->use_weight_chroma= 1;
+                }
+            }else{
+                for(int j=0; j<2; j++){
+                    s->chroma_weight[i][list][j][0]= chroma_def;
+                    s->chroma_weight[i][list][j][1]= 0;
+                }
+            }
+        }
+        if(s->slice_type_nos != FF_B_TYPE) break; /* the second list is only parsed for B slices */
+    }
+    s->use_weight= s->use_weight || s->use_weight_chroma;
+    return 0;
+}
+
+/**
+* Derive the implicit weights of every list0/list1 reference pair from their POC distances.
+*/
+static void implicit_weight_table(H264Slice *s){
+    const int cur_poc = s->poc;
+    const int count0 = s->ref_count[0];
+    const int count1 = s->ref_count[1];
+
+    /* one reference per list whose POCs average to the current POC: weighting is switched off */
+    if(count0 == 1 && count1 == 1 && s->ref_list[0][0]->poc + s->ref_list[1][0]->poc == 2*cur_poc){
+        s->use_weight= 0;
+        s->use_weight_chroma= 0;
+        return;
+    }
+
+    s->use_weight= 2;
+    s->use_weight_chroma= 2;
+    s->luma_log2_weight_denom= 5;
+    s->chroma_log2_weight_denom= 5;
+
+    for(int a=0; a < count0; a++){
+        const int poc0 = s->ref_list[0][a]->poc;
+        for(int b=0; b < count1; b++){
+            const int poc1 = s->ref_list[1][b]->poc;
+            const int td = av_clip(poc1 - poc0, -128, 127);
+            int weight = 32; /* kept when td is 0 or the scale factor falls outside -64..128 */
+
+            if(td != 0){
+                const int tb = av_clip(cur_poc - poc0, -128, 127);
+                const int tx = (16384 + (FFABS(td) >> 1)) / td;
+                const int scale = (tb*tx + 32) >> 8;
+                if(scale >= -64 && scale <= 128)
+                    weight = 64 - scale;
+            }
+            s->implicit_weight[a][b][0]= weight;
+            s->implicit_weight[a][b][1]= weight;
+        }
+    }
+}
+
+/**
+* Instantaneous decoder refresh: calls ff_h264_remove_all_refs() and resets the frame_num/POC predictors.
+*/
+static void idr(NalContext *n, H264Slice *s){
+    ff_h264_remove_all_refs(n, s);
+    n->poc_offset += (n->prev_poc_msb<<16) + n->prev_poc_lsb; /* accumulate before the predictors are cleared */
+    n->prev_poc_lsb= 0;
+    n->prev_poc_msb= 0;
+    n->prev_frame_num_offset= 0;
+    n->prev_frame_num= 0;
+}
+
+/* Reads the POC syntax for the SPS poc_type from gb and stores the POC (plus n->poc_offset) and coded_pic_num in s; always returns 0. */
+static int init_poc(NalContext *n, H264Slice *s, GetBitContext *gb){
+    const int max_frame_num= 1<<n->sps.log2_max_frame_num;
+    int frame_poc;
+
+    if(n->sps.poc_type==0){
+        n->poc_lsb= get_bits(gb, n->sps.log2_max_poc_lsb);
+    }
+
+    if(n->sps.poc_type==1 && !n->sps.delta_pic_order_always_zero_flag){
+        n->delta_poc= get_se_golomb(gb);
+    }
+    /* frame_num smaller than the previous one is treated as one wrap of max_frame_num */
+    n->frame_num_offset= n->prev_frame_num_offset;
+    if(n->frame_num < n->prev_frame_num)
+        n->frame_num_offset += max_frame_num;
+
+    if(n->sps.poc_type==0){
+        const int max_poc_lsb= 1<<n->sps.log2_max_poc_lsb;
+        /* a jump of at least half the lsb range in either direction moves poc_msb by one full range */
+        if(n->poc_lsb < n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb >= max_poc_lsb/2)
+            n->poc_msb = n->prev_poc_msb + max_poc_lsb;
+        else if(n->poc_lsb > n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb < -max_poc_lsb/2)
+            n->poc_msb = n->prev_poc_msb - max_poc_lsb;
+        else
+            n->poc_msb = n->prev_poc_msb;
+
+        frame_poc = n->poc_msb + n->poc_lsb;
+    }else if(n->sps.poc_type==1){
+        int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
+        int i;
+
+        if(n->sps.poc_cycle_length != 0)
+            abs_frame_num = n->frame_num_offset + n->frame_num;
+        else
+            abs_frame_num = 0;
+
+        if(s->nal_ref_idc==0 && abs_frame_num > 0)
+            abs_frame_num--;
+
+        expected_delta_per_poc_cycle = 0;
+        for(i=0; i < n->sps.poc_cycle_length; i++)
+            expected_delta_per_poc_cycle += n->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
+        /* abs_frame_num > 0 implies poc_cycle_length != 0 (see above), so the divisions below are safe */
+        if(abs_frame_num > 0){
+            int poc_cycle_cnt          = (abs_frame_num - 1) / n->sps.poc_cycle_length;
+            int frame_num_in_poc_cycle = (abs_frame_num - 1) % n->sps.poc_cycle_length;
+
+            expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
+            for(i = 0; i <= frame_num_in_poc_cycle; i++)
+                expectedpoc = expectedpoc + n->sps.offset_for_ref_frame[ i ];
+        } else
+            expectedpoc = 0;
+        if(s->nal_ref_idc == 0)
+            expectedpoc = expectedpoc + n->sps.offset_for_non_ref_pic;
+        frame_poc = expectedpoc + n->delta_poc; /* NOTE(review): delta_poc keeps its previous value when it is not parsed above */
+    }else{
+        int poc= 2*(n->frame_num_offset + n->frame_num); /* any other poc_type: POC follows frame_num directly */
+        if(!s->nal_ref_idc)
+            poc--;
+        frame_poc= poc;
+    }
+    s->current_picture_info->poc= s->poc = frame_poc + n->poc_offset;
+    s->coded_pic_num = n->coded_pic_num++;
+    return 0;
+}
+
+/* For each active list, fills s->ref2frm[list]: slots 0/1 get -1, slot i+2 the n->short_ref[] index with the same cpn, else 15. */
+static void ref2frame(NalContext *n, H264Slice *s){
+    for(int list=0; list<s->list_count; list++){
+        int *map= s->ref2frm[list];
+        map[0]= -1;
+        map[1]= -1;
+        for(int i=0; i<s->ref_count[list]; i++){
+            const int cpn= s->ref_list[list][i]->cpn;
+            int slot= 15;
+            if(cpn >= 0){
+                for(int k=0; k<n->short_ref_count; k++){
+                    if(n->short_ref[k]->cpn == cpn){
+                        slot= k;
+                        break;
+                    }
+                }
+            }
+            map[i+2]= slot;
+        }
+    }
+}
+
+/**
+* Decodes a slice header from gb, updating the stream state in n and the slice state in s.
+*
+* @param n  NAL context: supplies the stored PPS/SPS and carries frame_num/POC state across slices
+* @param s  slice to fill in; s->nal_ref_idc, s->nal_unit_type and s->current_picture_info must already be set
+* @param gb bit reader positioned at the start of the slice header
+*
+* @return 0 if okay, -1 if a syntax element is out of range or refers to a PPS/SPS that was never received
+*/
+static int decode_slice_header(NalContext *n, H264Slice *s, GetBitContext *gb){
+    unsigned int first_mb_in_slice;
+    unsigned int pps_id;
+    int num_ref_idx_active_override_flag;
+    unsigned int slice_type, tmp;
+
+    first_mb_in_slice= get_ue_golomb(gb);
+    (void) first_mb_in_slice;
+
+    slice_type= get_ue_golomb_31(gb);
+    if(slice_type > 9){
+        av_log(AV_LOG_ERROR, "slice type too large (%u)\n", slice_type);
+        return -1;
+    }
+    if(slice_type > 4)
+        slice_type -= 5;
+
+    slice_type= golomb_to_pict_type[ slice_type ];
+
+    s->slice_type= slice_type;
+    s->slice_type_nos= slice_type & 3;
+    s->current_picture_info->slice_type_nos = s->slice_type_nos;
+    s->current_picture_info->reference= s->nal_ref_idc? 2:0;
+    s->key_frame = s->slice_type == FF_I_TYPE;
+
+    pps_id= get_ue_golomb(gb);
+
+    if(pps_id>=MAX_PPS_COUNT){
+        av_log(AV_LOG_ERROR, "pps_id out of range\n");
+        return -1;
+    }
+    if(!n->pps_buffers[pps_id]) {
+        av_log(AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
+        return -1;
+    }
+    s->pps= *n->pps_buffers[pps_id];
+
+    if(!n->sps_buffers[s->pps.sps_id]) {
+        av_log(AV_LOG_ERROR, "non-existing SPS %u referenced\n", s->pps.sps_id);
+        return -1;
+    }
+    n->sps = *n->sps_buffers[s->pps.sps_id];
+
+    n->mb_width= n->sps.mb_width;
+    n->mb_height= n->sps.mb_height;
+
+    int chroma444 = (n->sps.chroma_format_idc == 3);
+    n->width = 16*n->mb_width - (2>>chroma444)*FFMIN(n->sps.crop_right, (8<<chroma444)-1);
+    if(n->sps.frame_mbs_only_flag)
+        n->height= 16*n->mb_height - (2>>chroma444)*FFMIN(n->sps.crop_bottom, (8<<chroma444)-1);
+    else
+        n->height= 16*n->mb_height - (4>>chroma444)*FFMIN(n->sps.crop_bottom, (8<<chroma444)-1);
+
+    s->direct_8x8_inference_flag = n->sps.direct_8x8_inference_flag;
+    s->transform_bypass = n->sps.transform_bypass;
+
+    n->frame_num= get_bits(gb, n->sps.log2_max_frame_num);
+    if(n->frame_num !=  n->prev_frame_num && n->frame_num != (n->prev_frame_num+1)%(1<<n->sps.log2_max_frame_num)){
+        av_log(AV_LOG_ERROR, "unexpected frame_num \n");
+    }
+
+    s->current_picture_info->frame_num= n->frame_num; //FIXME frame_num cleanup
+    n->max_pic_num= 1<< n->sps.log2_max_frame_num;
+
+    if(s->nal_unit_type == NAL_IDR_SLICE){
+        get_ue_golomb(gb); /* idr_pic_id */
+    }
+
+    init_poc(n, s, gb);
+
+    if(s->pps.redundant_pic_cnt_present){
+        n->redundant_pic_count= get_ue_golomb(gb);
+    }
+
+    //set defaults, might be overridden a few lines later
+    s->ref_count[0]= s->pps.ref_count[0];
+    s->ref_count[1]= s->pps.ref_count[1];
+
+    if(s->slice_type_nos != FF_I_TYPE){
+        if(s->slice_type_nos == FF_B_TYPE){
+            s->direct_spatial_mv_pred= get_bits1(gb);
+        }
+        num_ref_idx_active_override_flag= get_bits1(gb);
+
+        if(num_ref_idx_active_override_flag){
+            s->ref_count[0]= get_ue_golomb(gb) + 1;
+            if(s->slice_type_nos==FF_B_TYPE)
+                s->ref_count[1]= get_ue_golomb(gb) + 1;
+
+            if(s->ref_count[0]-1 > 32-1 || s->ref_count[1]-1 > 32-1){
+                av_log(AV_LOG_ERROR, "reference overflow\n");
+                s->ref_count[0]= s->ref_count[1]= 1;
+                return -1;
+            }
+        }
+        if(s->slice_type_nos == FF_B_TYPE)
+            s->list_count= 2;
+        else
+            s->list_count= 1;
+    }else
+        s->list_count= 0;
+
+
+    if(s->slice_type_nos!=FF_I_TYPE){
+        ff_h264_fill_default_ref_list(n, s);
+        ff_h264_decode_ref_pic_list_reordering(n, s, gb);
+        ref2frame(n, s);
+
+        for(int i=0; i<2; i++){
+            for(int j=0; j<s->ref_count[i]; j++){
+                if (s->ref_list[i][j]==NULL || s->ref_list[i][j]->reference < 2) // Don't know why sometimes the ref_count=1 while there are no references
+                    s->ref_list_cpn[i][j] = -1;
+                else
+                    s->ref_list_cpn[i][j] = s->ref_list[i][j]->cpn;
+            }
+        }
+    }
+
+    if(   (s->pps.weighted_pred          && s->slice_type_nos == FF_P_TYPE )
+    ||  (s->pps.weighted_bipred_idc==1 && s->slice_type_nos== FF_B_TYPE ) ){
+        pred_weight_table(s, gb);
+    }
+    else if(s->pps.weighted_bipred_idc==2 && s->slice_type_nos== FF_B_TYPE){
+        implicit_weight_table( s);
+    }else {
+        s->use_weight = 0;
+    }
+
+    if(s->nal_ref_idc){
+        ff_h264_ref_pic_marking(n, s, gb);
+        n->prev_poc_msb= n->poc_msb;
+        n->prev_poc_lsb= n->poc_lsb;
+    }
+
+    n->prev_frame_num_offset= n->frame_num_offset;
+    n->prev_frame_num= n->frame_num;
+
+    if(s->slice_type_nos != FF_B_TYPE){
+        s->ip_id= n->ip_id++;
+    }
+
+    if(s->slice_type_nos==FF_B_TYPE && !s->direct_spatial_mv_pred){
+        ff_h264_direct_dist_scale_factor(s);
+    }
+    ff_h264_direct_ref_list_init(s);
+
+
+    if( s->slice_type_nos != FF_I_TYPE && s->pps.cabac ){
+        tmp = get_ue_golomb_31(gb);
+        if(tmp > 2){
+            av_log(AV_LOG_ERROR, "cabac_init_idc overflow\n");
+            return -1;
+        }
+        s->cabac_init_idc= tmp;
+    }
+
+    tmp = s->pps.init_qp + get_se_golomb(gb); /* tmp is unsigned, so a negative QP wraps and fails the check below */
+    if(tmp>51){
+        av_log(AV_LOG_ERROR, "QP %u out of range\n", tmp);
+        return -1;
+    }
+    s->qscale= tmp;
+
+    //FIXME qscale / qp ... stuff
+    if(s->slice_type == FF_SP_TYPE){
+        get_bits1(gb); /* sp_for_switch_flag */
+    }
+    if(s->slice_type==FF_SP_TYPE || s->slice_type == FF_SI_TYPE){
+        get_se_golomb(gb); /* slice_qs_delta */
+    }
+
+    s->slice_alpha_c0_offset = 52;
+    s->slice_beta_offset = 52;
+    if( s->pps.deblocking_filter_parameters_present ) {
+        tmp= get_ue_golomb_31(gb);
+        if(tmp > 1){
+            av_log(AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
+            return -1;
+        }
+
+        if(tmp < 2)
+            tmp^= 1; // 1<->0
+
+        if( tmp ) {
+            s->slice_alpha_c0_offset += get_se_golomb(gb) << 1;
+            s->slice_beta_offset     += get_se_golomb(gb) << 1;
+            if( (unsigned) s->slice_alpha_c0_offset > 104U
+            ||(unsigned) s->slice_beta_offset    > 104U){
+                av_log(AV_LOG_ERROR, "deblocking filter parameters %d %d out of range\n", s->slice_alpha_c0_offset, s->slice_beta_offset);
+                return -1;
+            }
+        }
+    }
+
+    s->qp_thresh= 15 + 52 - FFMIN(s->slice_alpha_c0_offset, s->slice_beta_offset) - FFMAX3(0, s->pps.chroma_qp_index_offset[0], s->pps.chroma_qp_index_offset[1]);
+
+    return 0;
+}
+
+/* Returns the first nc->picture[] slot whose reference field is 0, tagged with coded_pic_num, or NULL if none is free. */
+PictureInfo *get_pib_entry(NalContext *nc, int coded_pic_num){
+    for(int i=0; i<MAX_REF_PIC_COUNT+1; i++){
+        PictureInfo *pic = &nc->picture[i];
+
+        if(pic->reference==0){
+            pic->cpn = coded_pic_num;
+            return pic;
+        }
+    }
+    /* every slot is still referenced: report it instead of writing through a NULL pointer */
+    return NULL;
+}
+
+/**
+* Scans gb1->raw for 00 00 01 start codes, unescapes each NAL unit found and dispatches it by nal_unit_type.
+* @return the buffer index reached, or -1 when no picture slot or unescape buffer is available
+*/
+int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb1){
+    GetBitContext *gb = gb1;
+    uint8_t *buf = gb1->raw;
+    int buf_size = gb1->buf_size;
+    int next_avc = buf_size;
+    int buf_index=0;
+    uint8_t *dst=NULL;
+    s->release_cnt=0;
+    ff_h264_reset_sei(n);
+
+    s->current_picture_info = get_pib_entry(n, n->coded_pic_num);
+    if(!s->current_picture_info){
+        av_log(AV_LOG_ERROR, "no free picture info entry\n");
+        return -1;
+    }
+
+    for(;;){
+        int consumed;
+        int dst_length;
+        int bit_length;
+        const uint8_t *ptr;
+        int err;
+
+        if (buf_index >= buf_size){
+            break;
+        } else {
+            // start code prefix search
+            for(; buf_index + 3 < buf_size; buf_index++){
+                // This should always succeed in the first iteration.
+                if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
+                    break;
+            }
+            if(buf_index+3 >= buf_size) break;
+            buf_index+=3;
+        }
+
+        {
+            int length = next_avc - buf_index;
+            int i, si, di;
+            uint8_t *src= buf+buf_index;
+            //    src[0]&0x80;                //forbidden bit
+            s->nal_ref_idc= src[0]>>5;
+            s->nal_unit_type= src[0]&0x1F;
+
+            src++; length--;
+
+            /* look for the first 00 00 0x sequence: x==3 is an escape, x<3 the next start code */
+            for(i=0; i+1<length; i+=2){
+                if(src[i]) continue;
+                if(i>0 && src[i-1]==0) i--;
+                if(i+2<length && src[i+1]==0 && src[i+2]<=3){
+                    if(src[i+2]!=3){
+                        /* startcode, so we must be past the end */
+                        length=i;
+                    }
+                    break;
+                }
+            }
+
+            if(i>=length-1){ //no escaped 0
+                dst_length= length;
+                consumed= length+1; //+1 for the header
+                ptr=src;
+            }else{
+                av_fast_malloc(&gb->rbsp, &gb->rbsp_size, length+FF_INPUT_BUFFER_PADDING_SIZE);
+                dst = gb->rbsp;
+                if (dst == NULL){
+                    return -1;
+                }
+
+                memcpy(dst, src, i);
+                si=di=i;
+                while(si+2<length){
+                    //remove escapes (very rare 1:2^22)
+                    if(src[si+2]>3){
+                        dst[di++]= src[si++];
+                        dst[di++]= src[si++];
+                    }else if(src[si]==0 && src[si+1]==0){
+                        if(src[si+2]==3){ //escape
+                            dst[di++]= 0;
+                            dst[di++]= 0;
+                            si+=3;
+                            continue;
+                        }else //next start code
+                            goto nsc;
+                    }
+
+                    dst[di++]= src[si++];
+                }
+                while(si<length)
+                    dst[di++]= src[si++];
+                nsc:
+
+                memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
+
+                dst_length= di;
+                consumed= si + 1;//+1 for the header
+                //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
+                ptr=dst;
+            }
+        }
+        if (ptr==NULL || dst_length < 0){
+            return -1;
+        }
+
+        //strip trailing zero bytes; the length is tested first so ptr[-1] is never read
+        while(dst_length > 0 && ptr[dst_length - 1] == 0)
+            dst_length--;
+
+        bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(ptr + dst_length - 1));
+        buf_index += consumed;
+
+        err = 0;
+        init_get_bits(gb, ptr, bit_length);
+        switch(s->nal_unit_type){
+            case NAL_IDR_SLICE:
+                idr(n, s); //FIXME ensure we don't lose some frames if there is reordering
+                /* fallthrough: after the refresh an IDR slice is parsed like any other slice */
+            case NAL_SLICE:
+                if((err = decode_slice_header(n, s, gb)))
+                    break;
+                s->key_frame |= (s->nal_unit_type == NAL_IDR_SLICE) || (n->sei_recovery_frame_cnt >= 0);
+                break;
+            case NAL_DPA:
+            case NAL_DPB:
+            case NAL_DPC:
+                av_log(AV_LOG_ERROR,"no slices/data partitioning support\n");
+                break;
+            case NAL_SEI:
+                ff_h264_decode_sei(n, gb);
+                break;
+            case NAL_SPS:
+                ff_h264_decode_seq_parameter_set(n, gb);
+                break;
+            case NAL_PPS:
+                ff_h264_decode_picture_parameter_set(n, gb, bit_length);
+                break;
+            case NAL_AUD:
+            case NAL_END_SEQUENCE:
+            case NAL_END_STREAM:
+            case NAL_FILLER_DATA:
+            case NAL_SPS_EXT:
+            case NAL_AUXILIARY_SLICE:
+                break;
+            default:
+                av_log(AV_LOG_ERROR, "Unknown NAL code: %d (%d bits)\n", s->nal_unit_type, bit_length);
+        }
+        if (err < 0)
+            av_log(AV_LOG_ERROR, "decode_slice_header error\n");
+    }
+
+    return buf_index;
+}
+
+/* Allocates a zeroed NalContext sized for a width x height picture; returns NULL if the allocation fails. */
+NalContext *get_nal_context(int width, int height){
+    const int mb_height = (height + 15) / 16;
+    const int mb_width  = (width  + 15) / 16;
+    const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16
+    NalContext *nc = av_mallocz(sizeof(*nc));
+    if(!nc)
+        return NULL;
+    nc->width = width;
+    nc->height = height;
+    nc->mb_height = mb_height;
+    nc->mb_width  = mb_width;
+    nc->b4_stride = mb_width*4 + 1;
+    nc->mb_stride = mb_stride;
+    nc->outputed_poc = INT_MIN;
+    /* NOTE(review): get_pib_entry() scans MAX_REF_PIC_COUNT+1 entries; confirm that 16 covers all of them */
+    for(int i=0; i<16; i++)
+        nc->picture[i].cpn =-1;
+    return nc;
+}
+
+/* Frees every stored SPS/PPS buffer and then the context itself; a NULL nc is ignored. */
+void free_nal_context(NalContext *nc){
+    if(!nc)
+        return;
+
+    for(int i = 0; i < MAX_SPS_COUNT; i++){
+        av_free(nc->sps_buffers[i]); /* av_free(NULL) is a no-op, so empty slots need no guard */
+    }
+    for(int i = 0; i < MAX_PPS_COUNT; i++){
+        av_free(nc->pps_buffers[i]);
+    }
+    av_free(nc);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_nal.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_nal.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,11 @@
+#ifndef H264_NAL_H
+#define H264_NAL_H
+
+#include "avcodec.h"
+#include "h264_types.h"
+
+int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb);
+NalContext *get_nal_context(int width, int height);
+void free_nal_context(NalContext *nc);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_numa.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_numa.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,33 @@
+
+#include <pthread.h>
+#include "h264.h"
+#include "malloc.h"
+
+/*
+* Deprecated pthread pipeline: one thread each for read, NAL parse, entropy decode, MB decode and write.
+* NOTE(review): no affinity call is made in this function, and pthread_create()/decode_init results are unchecked.
+*/
+int av_transcode_pthread_affinity(int ifile, int ofile, int frame_width, int frame_height, h264_options *opts) {
+	H264Context *h;
+	pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr;
+
+	h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height, opts);
+	timer_start = av_gettime();
+
+	pthread_create(&read_thr, NULL, read_thread, h);
+	pthread_create(&parsenal_thr, NULL, parsenal_thread, h);
+	pthread_create(&entropy_thr, NULL, entropy_IPB_thread, h);
+	pthread_create(&mbdec_thr, NULL, mbdec_thread, h);
+	pthread_create(&write_thr, NULL, write_thread, h);
+
+	/* wait for all five stages to finish before tearing the context down */
+	pthread_join(read_thr, NULL);
+	pthread_join(parsenal_thr, NULL);
+	pthread_join(entropy_thr, NULL);
+	pthread_join(mbdec_thr, NULL);
+	pthread_join(write_thr, NULL);
+
+	/* finished ! */
+	ff_h264_decode_end(h);
+
+	return 0;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_ompss.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_ompss.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,401 @@
+/*
+* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include "h264_types.h"
+#include "h264_parser.h"
+#include "h264_nal.h"
+#include "h264_entropy.h"
+#include "h264_rec.h"
+#include "h264_pred_mode.h"
+#include "h264_misc.h"
+// #undef NDEBUG
+#include <assert.h>
+
+/* Task: fetch input into sbe->gb with av_read_frame_internal() and parse its NAL units into sbe->slice. */
+#pragma omp task inout(*pc, *nc) output(*sbe)
+static void parse_task(H264Context *h, ParserContext *pc, NalContext *nc, SliceBufferEntry *sbe){
+    H264Slice *slice = &sbe->slice;
+
+    /* set the slice buffer entry up the first time it is used */
+    if (!sbe->initialized){
+        init_sb_entry(h, sbe);
+        sbe->lines_total=h->mb_height;
+    }
+
+    av_read_frame_internal(pc, &sbe->gb);
+    decode_nal_units(nc, slice, &sbe->gb);
+}
+
+/* Task: CABAC-decode the slice held in sbe, one macroblock at a time, into the H264Mb entries of sbe->mbs. */
+#pragma omp task inout(*ec) inout(*sbe)
+static void decode_slice_entropy_task(H264Context *h, EntropyContext *ec, SliceBufferEntry *sbe){
+    H264Slice *slice = &sbe->slice;
+    GetBitContext *bits = &sbe->gb;
+    H264Mb *mbs = sbe->mbs;
+    CABACContext *cabac = &ec->c;
+
+    if( !slice->pps.cabac ){
+        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
+        return;
+    }
+
+    /* per-slice quantizer state */
+    init_dequant_tables(slice, ec);
+    ec->curr_qscale = slice->qscale;
+    ec->last_qscale_diff = 0;
+    for(int plane=0; plane<2; plane++)
+        ec->chroma_qp[plane] = get_chroma_qp(slice, plane, slice->qscale);
+
+    /* realign the bit reader, then start the CABAC decoder on the bytes that are left */
+    align_get_bits(bits);
+    ff_init_cabac_decoder(cabac, bits->buffer + get_bits_count(bits)/8, (get_bits_left(bits) + 7)/8);
+    ff_h264_init_cabac_states(ec, slice, cabac);
+
+    /* macroblocks are decoded row by row, left to right */
+    for(int row=0; row<ec->mb_height; row++){
+        init_entropy_buf(ec, slice, row);
+        for(int col=0; col<ec->mb_width; col++){
+            H264Mb *mb = &mbs[col + row*ec->mb_width];
+            int ret;
+
+            mb->mb_x= col;
+            mb->mb_y= row;
+            ec->m = mb;
+
+            ret = ff_h264_decode_mb_cabac(ec, slice, cabac);
+            (void) get_cabac_terminate(cabac); /* the terminate flag is read but not acted upon */
+            /* give up on a decode error or once the reader is more than 2 bytes past the end */
+            if( ret < 0 || cabac->bytestream > cabac->bytestream_end + 2) {
+                av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", mb->mb_x, mb->mb_y, cabac->bytestream_end - cabac->bytestream);
+                return;
+            }
+        }
+    }
+}
+
+/* Reconstructs one super-MB: smb_height rows of smb_width MBs, each row starting one MB further left than the row above. */
+static void decode_super_mb_block(MBRecContext *d, H264Slice *s, SuperMBContext *smbc, H264Mb *mbs, int smb_x, int smb_y){
+    MBRecState mrs;
+
+    for (int row=0; row < smbc->smb_height; row++){
+        const int mb_y = smb_y + row;
+        init_mbrec_context(d, &mrs, s, mb_y);
+        for (int mb_x= smb_x - row; mb_x < smb_x - row + smbc->smb_width; mb_x++){
+            if (mb_y < d->mb_height && mb_x >= 0 && mb_x < d->mb_width) /* skip positions outside the picture */
+                h264_decode_mb_internal (d, &mrs, s, &mbs[mb_y*d->mb_width+mb_x]);
+        }
+    }
+}
+
+/* Task: reconstruct super-MB m; ml and mur are named only in the dependency clause. */
+#pragma omp task input(*d, *sbe, *ml, *mur) inout(*m)
+static void decode_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc,
+                                 SuperMBTask *ml, SuperMBTask *mur, SuperMBTask *m){
+    /* ml/mur are never read here: they only order this task after the tasks that own them */
+    decode_super_mb_block(d, &sbe->slice, smbc, sbe->mbs, m->smb_x, m->smb_y);
+}
+
+/* Task: call draw_edges() for every MB row of super-MB line `line`, stopping at d->mb_height. */
+#pragma omp task input(*d, *sbe) inout(*sm)
+static void draw_edges_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *sm, int line){
+    for (int row=line*smbc->smb_height; row < (line+1)*smbc->smb_height && row < d->mb_height; row++)
+        draw_edges(d, &sbe->slice, row);
+}
+
+/* Queues one decode task per super-MB plus one edge task per super-MB row, then waits on the last super-MB. */
+static void decode_mb_in_slice(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){
+    SuperMBContext *smbc = acquire_smbc(h);
+    const int rows = smbc->nsmb_height;
+    const int cols = smbc->nsmb_width;
+    SuperMBTask *grid = smbc->smbs[0];
+    SuperMBTask *sm = NULL;
+
+    for(int j=0; j < rows; j++){
+        for(int i=0; i < cols; i++){
+            sm = grid + j*cols + i;
+            /* where there is no left / upper-right neighbour the dependency points at sm itself */
+            SuperMBTask *left     = sm - ((i > 0) ? 1 : 0);
+            SuperMBTask *up_right = sm + ((i < cols-1 && j > 0) ? 1 - cols : 0);
+            decode_super_mb_task(d, sbe, smbc, left, up_right, sm);
+        }
+        draw_edges_task(d, sbe, smbc, sm, j); /* sm is the last super-MB of row j here */
+    }
+    #pragma omp taskwait on(*sm)
+    release_smbc(h, smbc);
+}
+
+/* Task: resolve the slice's references in h->dpb, call get_dpb_entry(), reconstruct the MBs, then release queued DPB entries. */
+#pragma omp task inout(*d) inout(*sbe)
+static void decode_slice_mb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){
+    H264Slice *s = &sbe->slice;
+    /* map each reference cpn recorded by the parser to the DPB entry holding it (-1 means no reference) */
+    for (int i=0; i<2; i++){
+        for(int j=0; j< s->ref_count[i]; j++){
+            if (s->ref_list_cpn[i][j] ==-1)
+                continue;
+            int k;
+            for (k=0; k< h->max_dpb_cnt; k++){
+                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
+                    s->dp_ref_list[i][j] = &h->dpb[k];
+                    break;
+                }
+            }
+        }
+    }
+    /* the named critical section serialises get_dpb_entry() with the release calls below and in other tasks */
+    #pragma omp critical (dpb)
+    get_dpb_entry(h, s);
+
+    if (!h->no_mbd){
+        decode_mb_in_slice (h, d, sbe);
+    }
+    /* release, with mode 2, each DPB entry whose cpn is listed in s->release_ref_cpn */
+    for (int i=0; i<s->release_cnt; i++){
+        for(int j=0; j<h->max_dpb_cnt; j++){
+            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
+                #pragma omp critical (dpb)
+                release_dpb_entry(h, &h->dpb[j], 2);
+                break;
+            }
+        }
+    }
+    s->release_cnt=0;
+}
+
+// for static 3d wave
+/*-------------------------------------------------------------------------------*/
+/* Task: same work as decode_super_mb_task, with mprev as one more dependency-only input. */
+#pragma omp task input(*d, *sbe, *ml, *mur, *mprev) inout(*m)
+static void decode_3dwave_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml,
+SuperMBTask *mur, SuperMBTask *mprev, SuperMBTask *m){
+    H264Slice *slice = &sbe->slice;
+    /* ml, mur and mprev are never read here */
+    decode_super_mb_block(d, slice, smbc, sbe->mbs, m->smb_x, m->smb_y);
+}
+
+// int init_ref_count=0;
+/* Task: point s->dp_ref_list[][] at the DPB entries matching s->ref_list_cpn[][], then call get_dpb_entry(). */
+#pragma omp task inout(*d, *sbe, *init)
+static void init_ref_list_and_get_dpb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe, int *init){
+    H264Slice *s = &sbe->slice;
+
+    for (int list=0; list<2; list++){
+        for(int j=0; j< s->ref_count[list]; j++){
+            if (s->ref_list_cpn[list][j] != -1){ /* -1 marks a reference with nothing to look up */
+                for (int k=0; k<h->max_dpb_cnt; k++){
+                    if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[list][j]){
+                        s->dp_ref_list[list][j] = &h->dpb[k];
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    #pragma omp critical (dpb)
+    get_dpb_entry(h, s);
+}
+
+/* Queues the decode and edge tasks of one slice for the static 3D wave; returns the last super-MB task queued (NULL if none). */
+static SuperMBTask* add_decode_slice_3dwave_tasks(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc){
+    const int rows_3d = smbc->nsmb_3dheight;
+    const int rows = smbc->nsmb_height;
+    const int cols = smbc->nsmb_width;
+    const int row_shift = rows - rows_3d;
+    SuperMBTask *cur, *prev, *sm = NULL;
+
+    /* the two task grids alternate on every call: after the index rotates, the "next" grid is the previous one */
+    cur = smbc->smbs[smbc->index];
+    smbc->index = (smbc->index + 1) % 2;
+    prev = smbc->smbs[smbc->index];
+
+    /* rows below rows_3d also depend on a super-MB of the previous grid; the remaining rows use the plain task */
+    for(int j=0; j < rows_3d || j < rows; j++){
+        for(int i=0; i < cols; i++){
+            sm = cur + j*cols + i;
+            SuperMBTask *left     = sm - ((i > 0) ? 1 : 0);
+            SuperMBTask *up_right = sm + ((i < cols-1 && j > 0) ? 1 - cols : 0);
+
+            if(j < rows_3d){
+                /* last super-MB of row j + row_shift in the previous grid */
+                SuperMBTask *dep = prev + (j + row_shift + 1)*cols - 1;
+                decode_3dwave_super_mb_task(d, sbe, smbc, left, up_right, dep, sm);
+            }else{
+                decode_super_mb_task(d, sbe, smbc, left, up_right, sm);
+            }
+        }
+        draw_edges_task(d, sbe, smbc, sm, j);
+    }
+
+    return sm;
+}
+
+/* Task (ordered after *lastsmb): release, with mode 2, the DPB entries listed in s->release_ref_cpn, then hand smbc back. */
+#pragma omp task inout(*d, *sbe, *release) input (*lastsmb)
+static void release_ref_list_task(H264Context *h, SuperMBContext *smbc, MBRecContext *d, SliceBufferEntry *sbe, SuperMBTask *lastsmb, int *release){
+    H264Slice *s = &sbe->slice;
+    for (int i=0; i<s->release_cnt; i++){
+        for(int j=0; j<h->max_dpb_cnt; j++){
+            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
+                #pragma omp critical (dpb)
+                release_dpb_entry(h, &h->dpb[j], 2);
+                break;
+            }
+        }
+    }
+    s->release_cnt=0;
+
+    release_smbc(h, smbc);
+}
+
+// static void decode_mb_static_3dwave(H264Context *h, int mb_height, int mb_width, MBRecContext *d, H264Slice *s, H264Mb *mbs, SuperMBTask *smbs, SuperMBTask *smbs_prev){
+//
+// }
+/*-------------------------------------------------------------------------------*/
+//end for static 3d wave
+
+#pragma omp task inout (*oc) input(*sbe)
+static void output_task(H264Context *h, OutputContext *oc, SliceBufferEntry *sbe){ // Task: passes the slice's current picture to output_frame() and prints a progress report.
+    DecodedPicture* out =output_frame(h, oc, sbe->slice.curr_pic, h->ofile, h->frame_width, h->frame_height);
+    if (out){ // a non-NULL result is a picture whose DPB entry is released here
+        #pragma omp critical (dpb)
+        release_dpb_entry(h, out, 1);
+    }
+    print_report(oc->frame_number, oc->video_size, 0, h->verbose); // third argument is 0 for these per-frame reports
+}
+
+/*
+* The following code is the main loop of the file converter
+*/
+//Put VMS entry point here
+int h264_decode_ompss( H264Context *h) { // Top-level decode driver: creates the contexts, queues parse/entropy/reconstruct/output work per frame, flushes the output and frees everything. Always returns 0.
+    const int bufs = h->pipe_bufs; // number of slice buffer entries and entropy contexts
+
+    ParserContext *pc;
+    NalContext *nc;
+    EntropyContext *ec[bufs]; // variable-length array, one entropy context per buffer
+    MBRecContext *rc[2]; // two reconstruction contexts; the 3D path picks one with k%2
+    OutputContext *oc;
+    SliceBufferEntry *sbe;
+    SuperMBContext *smbc;
+
+    DecodedPicture *out;
+    int frames=0;
+
+#if HAVE_LIBSDL2
+    pthread_t sdl_thr;
+    if (h->display){
+        pthread_create(&sdl_thr, NULL, sdl_thread, h); // joined at the end of this function
+    }
+#endif
+    sbe= av_mallocz(sizeof(SliceBufferEntry) * bufs); // NOTE(review): the result is used without a NULL check
+
+
+    pc = get_parse_context(h->ifile);
+    nc = get_nal_context(h->width, h->height);
+
+    for(int i=0; i<bufs; i++){
+        ec[i] = get_entropy_context( h );
+    }
+
+    for(int i=0; i<2; i++){
+        rc[i] = get_mbrec_context(h);
+    }
+
+    oc = get_output_context( h );
+
+    av_start_timer();
+    int k=0; int init, release; // k is the running buffer index (used modulo bufs and modulo 2); init/release are never assigned, only their addresses are passed to the task functions
+    if (h->static_3d && bufs < h->num_frames ){ // 3D-wave path, taken only when more frames are requested than there are buffers
+        int num_pre_ed =0;
+        for (num_pre_ed=0; num_pre_ed< bufs -1 && !pc->final_frame; num_pre_ed++){ // prologue: parse + entropy-decode up to bufs-1 frames, no reconstruction yet
+            parse_task( h, pc, nc, &sbe[k%bufs] );
+            decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]);
+            #pragma omp taskwait on(*pc)
+            k++;
+        }
+
+        while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ // steady state: queue parse/entropy for buffer k, then (after k++) reconstruction and output for buffer k
+            parse_task( h, pc, nc, &sbe[k%bufs] );
+            decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]);
+
+            k++;
+
+            init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init);
+            smbc = acquire_smbc(h);
+            SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc);
+            release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release);
+
+            output_task (h, oc, &sbe[k%bufs]);
+            #pragma omp taskwait on(*pc)
+        }
+
+        for (int i=0; i< num_pre_ed; i++){ // drain: one reconstruction/output round per prologue iteration, with no further parsing
+            k++;
+            init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init);
+            smbc = acquire_smbc(h);
+            SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc);
+            release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release);
+
+            output_task (h, oc, &sbe[k%bufs]);
+        }
+
+    } else { // default path: all four stages are queued for the same buffer, reconstruction always uses rc[0]
+        while(!pc->final_frame && frames++ < h->num_frames && !h->quit){
+            parse_task( h, pc, nc, &sbe[k%bufs] );
+
+            decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]);
+
+            decode_slice_mb_task(h, rc[0], &sbe[k%bufs]);
+
+            output_task (h, oc, &sbe[k%bufs]);
+            #pragma omp taskwait on(*pc)
+            k++;
+        }
+    }
+    #pragma omp taskwait
+
+    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; // keep calling with a NULL picture until output_frame returns NULL
+
+    print_report(oc->frame_number, oc->video_size, 1, h->verbose); // final report (third argument 1)
+    h->num_frames = oc->frame_number; // replace the requested count with the count reported by the output context
+    /* finished ! */
+
+    free_parse_context(pc);
+    free_nal_context  (nc);
+    free_output_context(oc);
+    for (int i=0; i<bufs; i++){
+        free_sb_entry(&sbe[i]);
+        free_entropy_context(ec[i]);
+    }
+    av_free(sbe);
+
+    for (int i=0; i<2; i++){
+        free_mbrec_context(rc[i]);
+    }
+
+#if HAVE_LIBSDL2
+    if (h->display){
+        signal_sdl_exit(h);
+        pthread_join(sdl_thr, NULL);
+    }
+#endif
+
+    return 0;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_parser.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_parser.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,224 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... parser
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 parser.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include <unistd.h>
+
+#include "golomb.h"
+#include "libavutil/error.h"
+#include "h264_types.h"
+
+#undef NDEBUG
+#include <assert.h>
+
+#define END_NOT_FOUND (-100)
+
+static int ff_h264_find_frame_end(ParserContext *s, const uint8_t *buf, int buf_size) // Scans buf for the end of the current frame. Returns an offset relative to buf (can be negative) or END_NOT_FOUND; scan state is carried in s->state and s->frame_start_found.
+{
+    int i;
+    uint32_t state;
+
+    state= s->state;
+    if(state>13) // any saved value above 13 restarts the zero-byte search
+        state= 7;
+
+    for(i=0; i<buf_size; i++){
+        if(state==7){ // state 7: looking for the next zero byte
+        /* we check i<buf_size instead of i+3/7 because it's simpler
+         * and there should be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end
+         */
+            while(i<buf_size && !((~*(const uint64_t*)(buf+i) & (*(const uint64_t*)(buf+i) - 0x0101010101010101ULL)) & 0x8080808080808080ULL)) // skip 8 bytes at a time while the 64-bit word holds no zero byte; NOTE(review): buf+i is read as uint64_t with no alignment handling
+                i+=8;
+
+            for(; i<buf_size; i++){ // byte-wise search for the zero itself
+                if(!buf[i]){
+                    state=2;
+                    break;
+                }
+            }
+        }else if(state<=2){ // states 0..2: a run of zero bytes has been seen
+            if(buf[i]==1)   state^= 5; //2->7, 1->4, 0->5
+            else if(buf[i]) state = 7;
+            else            state>>=1; //2->1, 1->0, 0->0
+        }else if(state<=5){ // states 3..5 (4 and 5 are entered right after a 00 00 01 sequence)
+            int v= buf[i] & 0x1F; // low five bits of the byte after the start code
+            if(v==6 || v==7 || v==8 || v==9){ // these values end a frame that has already started
+                if(s->frame_start_found){
+                    i++;
+                    goto found;
+                }
+            }else if(v==1 || v==2 || v==5){ // these values start a frame, or move to states 12/13 when one is already open
+                if(s->frame_start_found){
+                    state+=8;
+                    continue;
+                }else
+                    s->frame_start_found = 1;
+            }
+            state= 7;
+        }else{ // remaining states (12/13 after the +8 above): byte following a value 1/2/5 header
+            if(buf[i] & 0x80) // top bit set: the open frame ends here
+                goto found;
+            state= 7;
+        }
+    }
+    s->state= state; // remember where the scan stopped for the next call
+    return END_NOT_FOUND;
+
+found:
+    s->state=7;
+    s->frame_start_found= 0;
+    return i-(state&5); // state&5 is 4 or 5 for states 4/5/12/13, so the offset backs up by that many bytes
+}
+
+static int ff_combine_frame(ParserContext *s, GetBitContext *gb, int next, uint8_t **buf, int *buf_size) // Accumulates input in gb->raw. Returns -1 while the frame is still incomplete, 0 once gb->raw holds a whole frame (*buf_size is then set to its length).
+{
+    int i;
+    /* Copy overread bytes from last frame into buffer. */
+    for(i =0; s->overread_cnt>0; s->overread_cnt--, i++){
+        gb->raw[s->index++]= s->overread[i];
+    }
+
+    /* EOF - END_NOT_FOUND means no next frame start is found in current partial read. If buf_size of the partial read is 0 we are at EOF */
+    if(!*buf_size && next == END_NOT_FOUND){
+        next= 0; // at EOF whatever is buffered is treated as a complete frame
+    }
+    s->last_index= s->index; // bytes buffered before this call's data; used below to locate overread bytes
+
+    /* copy into buffer and return */
+    if(next == END_NOT_FOUND){
+        gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, (*buf_size) + s->index + FF_INPUT_BUFFER_PADDING_SIZE); // NOTE(review): the result is not checked before the memcpy below
+        memcpy(&gb->raw[s->index], *buf, *buf_size);
+        s->index += *buf_size;
+        return -1;
+    }
+
+    ///end found
+    *buf_size=  s->index + next; // frame length: bytes already buffered plus the part of *buf before the frame end
+    /* append to buffer */
+
+    gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, next + s->index + FF_INPUT_BUFFER_PADDING_SIZE); // NOTE(review): the result is not checked before the memcpy below
+    memcpy(&gb->raw[s->index], *buf, next + FF_INPUT_BUFFER_PADDING_SIZE ); // also copies FF_INPUT_BUFFER_PADDING_SIZE bytes beyond the frame end
+    s->index = 0; // the next frame starts accumulating from offset 0
+
+    /* store overread bytes */
+    for(i=0; next < 0; next++, i++){ // negative next: the last |next| bytes buffered earlier lie past the frame end
+        s->state = (s->state<<8) | gb->raw[s->last_index + next];
+        s->overread[i] = gb->raw[s->last_index + next];
+        s->overread_cnt++;
+    }
+
+    return 0;
+}
+
+static int h264_parse(ParserContext *s, GetBitContext *gb,
+                      uint8_t *buf, int buf_size) // Feeds buf to the frame splitter. Returns buf_size when more data is needed (gb->buf_size set to 0), otherwise the frame-end offset within buf, which can be negative (gb->buf_size set to the frame length).
+{
+    int next;
+
+    next= ff_h264_find_frame_end(s, buf, buf_size);
+
+    if (ff_combine_frame(s, gb, next, &buf, &buf_size) < 0) { // frame still incomplete: everything in buf was buffered
+        gb->buf_size = 0;
+        return buf_size;
+    }
+
+    if(next<0 && next != END_NOT_FOUND){ // frame ended inside data buffered by an earlier call
+        assert(s->last_index + next >= 0 );
+        ff_h264_find_frame_end(s, &gb->raw[s->last_index + next], -next); //update state
+    }
+
+    gb->buf_size = buf_size; // ff_combine_frame replaced buf_size with the full frame length
+    return next;
+}
+
+static int ff_raw_read_partial_packet(ParserContext *pc)
+{
+    int len= -1;
+
+    if (!pc->eof_reached){
+        len = read( pc->ifile, pc->data, pc->buffer_size);
+//         printf("read task %d\t%d\n", pc->ifile, len); fflush(NULL);
+        if (len < pc->buffer_size) {
+            pc->eof_reached = 1;
+        }
+    }
+
+    return len;
+}
+
+void av_read_frame_internal(ParserContext *pc, GetBitContext *gb){ // Produces the next frame in gb (gb->raw / gb->buf_size), reading more input through pc as needed; sets pc->final_frame once no more input can be read.
+    int len;
+    uint8_t dummy_buf[FF_INPUT_BUFFER_PADDING_SIZE]={0}; // zeroed stand-in input for the final flush call
+    av_fast_malloc(&gb->raw, &gb->alloc_size, 2048+FF_INPUT_BUFFER_PADDING_SIZE);
+
+    //Parsing is performed before read, since there are usually leftovers from parsing the previous frame.
+    for(;;) {
+        if (pc->cur_len>0){
+            len = h264_parse(pc, gb, pc->cur_ptr, pc->cur_len);
+            if (len<0) // a negative frame-end offset consumes nothing from the current chunk
+                len =0;
+            /* increment read pointer */
+            pc->cur_ptr += len;
+            pc->cur_len -= len;
+
+            if (gb->buf_size) { // non-zero size: a complete frame is ready
+                break;
+            }
+        }
+
+        //check for ret and not parser->eof_reached as one "read" can contain more than 1 frame
+        pc->size= ff_raw_read_partial_packet(pc);
+        if (pc->size < 0) {
+            pc->final_frame =1;
+            /* return the last frames, if any */
+            h264_parse(pc, gb, dummy_buf, 0); // zero-length input flushes whatever is still buffered
+            break;
+        }
+        pc->cur_ptr = pc->data; // restart consumption at the beginning of the freshly read chunk
+        pc->cur_len = pc->size;
+    }
+
+    assert(gb->raw!=NULL);
+
+}
+
+ParserContext *get_parse_context(int ifile){ // Allocates a parser context that reads from descriptor ifile in 2048-byte chunks; release it with free_parse_context().
+    ParserContext *pc = av_mallocz(sizeof(ParserContext)); // NOTE(review): dereferenced below without a NULL check
+    pc->buffer_size = 2048; // maximum number of bytes requested per read
+    pc->final_frame = 0;
+    pc->cur_len= 0; // no unparsed input yet
+    pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE); // read buffer plus padding; NOTE(review): result not checked
+    pc->size = 2048;
+    pc->eof_reached =0;
+    pc->ifile = ifile;
+
+    return pc;
+}
+
+void free_parse_context(ParserContext *pc){ // Frees the read buffer and then the context itself; pc must not be NULL because pc->data is read.
+    av_free(pc->data);
+    av_free(pc);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_parser.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_parser.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,10 @@
+#ifndef H264_PARSER_H
+#define H264_PARSER_H
+
+#include "h264_types.h"
+
+void av_read_frame_internal(ParserContext *pc, GetBitContext *gb); /* fetch the next frame from pc's input into gb */
+ParserContext *get_parse_context(int ifile); /* allocate a parser context reading from ifile */
+void free_parse_context(ParserContext *pc); /* free a context obtained from get_parse_context() */
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pred.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_pred.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,945 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 prediction functions.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "avcodec.h"
+#include "h264_pred.h"
+//#include "dsputil.h"
+
+static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
+    /* Vertical 4x4 prediction: the four pixels above src are copied into each of the four rows. */
+    const uint32_t above = *(uint32_t*)(src - stride);
+    uint8_t *row = src;
+    int y;
+    (void) topright;
+    for(y = 0; y < 4; y++, row += stride) *(uint32_t*)row = above;
+}
+
+static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){ // Horizontal 4x4 prediction: each row is filled with the pixel immediately to its left; topright is ignored.
+	(void) topright;
+    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101U; // unsigned constant: a signed int multiply overflows for pixels >= 128
+    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101U;
+    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101U;
+    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101U;
+}
+
+static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){ // DC 4x4 prediction: fills the block with the rounded mean of the 4 pixels above and the 4 pixels to the left; topright is ignored.
+	(void) topright;
+    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
+                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
+
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; // unsigned constant: a signed int multiply overflows for dc >= 128
+}
+
+static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){ // DC 4x4 prediction from the left column only: fills the block with the rounded mean of the 4 left pixels; topright is ignored.
+	(void) topright;
+    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
+
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; // unsigned constant: a signed int multiply overflows for dc >= 128
+}
+
+static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){ // DC 4x4 prediction from the top row only: fills the block with the rounded mean of the 4 pixels above; topright is ignored.
+	(void) topright;
+    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
+
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; // unsigned constant: a signed int multiply overflows for dc >= 128
+}
+
+static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
+    /* Fills every pixel of the 4x4 block with 128; no neighbouring pixels are read. */
+    const uint32_t grey = 128U*0x01010101U;
+    int y;
+    (void) topright;
+    for(y = 0; y < 4; y++) *(uint32_t*)(src + y*stride) = grey;
+}
+
+
+#define LOAD_TOP_RIGHT_EDGE/* declares t4..t7 = topright[0..3] */\
+    const int av_unused t4= topright[0];\
+    const int av_unused t5= topright[1];\
+    const int av_unused t6= topright[2];\
+    const int av_unused t7= topright[3];\
+
+#define LOAD_DOWN_LEFT_EDGE/* declares l4..l7 = pixels left of src in rows 4..7 */\
+    const int av_unused l4= src[-1+4*stride];\
+    const int av_unused l5= src[-1+5*stride];\
+    const int av_unused l6= src[-1+6*stride];\
+    const int av_unused l7= src[-1+7*stride];\
+
+#define LOAD_LEFT_EDGE/* declares l0..l3 = pixels left of src in rows 0..3 */\
+    const int av_unused l0= src[-1+0*stride];\
+    const int av_unused l1= src[-1+1*stride];\
+    const int av_unused l2= src[-1+2*stride];\
+    const int av_unused l3= src[-1+3*stride];\
+
+#define LOAD_TOP_EDGE/* declares t0..t3 = the four pixels in the row above src */\
+    const int av_unused t0= src[ 0-1*stride];\
+    const int av_unused t1= src[ 1-1*stride];\
+    const int av_unused t2= src[ 2-1*stride];\
+    const int av_unused t3= src[ 3-1*stride];\
+
+static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){ // 4x4 diagonal down-right predictor built from the top row, the left column and the top-left pixel; topright is ignored.
+	(void) topright;
+    const int lt= src[-1-1*stride]; // top-left corner pixel
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+
+    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; // each chained group below is one diagonal sharing a rounded (1,2,1)/4 value
+    src[0+2*stride]=
+    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
+    src[0+1*stride]=
+    src[1+2*stride]=
+    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
+    src[0+0*stride]=
+    src[1+1*stride]=
+    src[2+2*stride]=
+    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; // main diagonal, centred on the corner pixel
+    src[1+0*stride]=
+    src[2+1*stride]=
+    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[2+0*stride]=
+    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+}
+
+static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){ // 4x4 diagonal down-left predictor built from the four pixels above (t0..t3) and topright[0..3] (t4..t7).
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+//    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; // each chained group below is one anti-diagonal sharing a rounded (1,2,1)/4 value
+    src[1+0*stride]=
+    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
+    src[2+0*stride]=
+    src[1+1*stride]=
+    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
+    src[3+0*stride]=
+    src[2+1*stride]=
+    src[1+2*stride]=
+    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+2*stride]=
+    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
+    src[3+2*stride]=
+    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
+    src[3+3*stride]=(t6 + 3*t7 + 2)>>2; // last pixel: t7 is weighted 3 because there is no t8
+}
+
+static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){ // 4x4 vertical-right predictor built from the top row, the left column and the top-left pixel; topright is ignored.
+	(void) topright;
+    const int lt= src[-1-1*stride]; // top-left corner pixel
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=
+    src[1+2*stride]=(lt + t0 + 1)>>1; // rows 0 and 2 use two-tap averages of the top edge
+    src[1+0*stride]=
+    src[2+2*stride]=(t0 + t1 + 1)>>1;
+    src[2+0*stride]=
+    src[3+2*stride]=(t1 + t2 + 1)>>1;
+    src[3+0*stride]=(t2 + t3 + 1)>>1;
+    src[0+1*stride]=
+    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; // rows 1 and 3 use three-tap (1,2,1)/4 values
+    src[1+1*stride]=
+    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[2+1*stride]=
+    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; // the two remaining first-column pixels come from the left edge
+    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+}
+
+static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){ // 4x4 vertical-left predictor built from the four pixels above (t0..t3) and topright (t4..t6 are used, t7 is loaded but unused).
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+
+    src[0+0*stride]=(t0 + t1 + 1)>>1; // rows 0 and 2 use two-tap averages
+    src[1+0*stride]=
+    src[0+2*stride]=(t1 + t2 + 1)>>1;
+    src[2+0*stride]=
+    src[1+2*stride]=(t2 + t3 + 1)>>1;
+    src[3+0*stride]=
+    src[2+2*stride]=(t3 + t4+ 1)>>1;
+    src[3+2*stride]=(t4 + t5+ 1)>>1;
+    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; // rows 1 and 3 use three-tap (1,2,1)/4 values
+    src[1+1*stride]=
+    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[2+1*stride]=
+    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
+    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
+}
+
+static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){ // 4x4 horizontal-up predictor built from the left column l0..l3 only; topright is ignored.
+	(void) topright;
+    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=(l0 + l1 + 1)>>1;
+    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+    src[2+0*stride]=
+    src[0+1*stride]=(l1 + l2 + 1)>>1;
+    src[3+0*stride]=
+    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
+    src[2+1*stride]=
+    src[0+2*stride]=(l2 + l3 + 1)>>1;
+    src[3+1*stride]=
+    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; // l3 appears twice because there is no l4
+    src[3+2*stride]=
+    src[1+3*stride]=
+    src[0+3*stride]=
+    src[2+2*stride]=
+    src[2+3*stride]=
+    src[3+3*stride]=l3; // the remaining six pixels are plain copies of l3
+}
+
+
+static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){ // 4x4 horizontal-down predictor built from the top row, the left column and the top-left pixel; topright is ignored.
+	(void) topright;
+    const int lt= src[-1-1*stride]; // top-left corner pixel
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=
+    src[2+1*stride]=(lt + l0 + 1)>>1; // two-tap averages and three-tap (1,2,1)/4 values alternate along each row
+    src[1+0*stride]=
+    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
+    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[0+1*stride]=
+    src[2+2*stride]=(l0 + l1 + 1)>>1;
+    src[1+1*stride]=
+    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
+    src[0+2*stride]=
+    src[2+3*stride]=(l1 + l2+ 1)>>1;
+    src[1+2*stride]=
+    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+    src[0+3*stride]=(l2 + l3 + 1)>>1;
+    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
+}
+
+static void pred16x16_vertical_c(uint8_t *src, int stride){
+    /* Vertical 16x16 prediction: the 16 pixels of the row above src are
+     * copied, as four 32-bit words, into each of the 16 rows. */
+    const uint32_t *above = (const uint32_t*)(src - stride);
+    const uint32_t w0 = above[0], w1 = above[1], w2 = above[2], w3 = above[3];
+    uint8_t *row = src;
+    int y;
+
+    for(y = 0; y < 16; y++, row += stride){
+        uint32_t *dst = (uint32_t*)row;
+        dst[0] = w0; dst[1] = w1;
+        dst[2] = w2; dst[3] = w3;
+    }
+}
+
+static void pred16x16_horizontal_c(uint8_t *src, int stride){ // Horizontal 16x16 prediction: each row is filled with the pixel immediately to its left.
+    int i;
+
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101U; // unsigned constant: a signed int multiply overflows for pixels >= 128
+    }
+}
+
+static void pred16x16_dc_c(uint8_t *src, int stride){ // DC 16x16 prediction: fills the block with the rounded mean of the 16 left and 16 top neighbours.
+    int i; uint32_t dc=0; // unsigned accumulator: the byte replication below overflows a signed int for means >= 128
+
+    for(i=0;i<16; i++){
+        dc+= src[-1+i*stride]; // left column
+    }
+
+    for(i=0;i<16; i++){
+        dc+= src[i-stride]; // top row
+    }
+
+    dc= 0x01010101U*((dc + 16)>>5); // mean of 32 samples, replicated into all four bytes
+
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= dc;
+    }
+}
+
+static void pred16x16_left_dc_c(uint8_t *src, int stride){ // DC 16x16 prediction from the left column only: fills the block with the rounded mean of the 16 left neighbours.
+    int i; uint32_t dc=0; // unsigned accumulator: the byte replication below overflows a signed int for means >= 128
+
+    for(i=0;i<16; i++){
+        dc+= src[-1+i*stride];
+    }
+
+    dc= 0x01010101U*((dc + 8)>>4); // mean of 16 samples, replicated into all four bytes
+
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= dc;
+    }
+}
+
+static void pred16x16_top_dc_c(uint8_t *src, int stride){ // DC 16x16 prediction from the top row only: fills the block with the rounded mean of the 16 pixels above.
+    int i; uint32_t dc=0; // unsigned accumulator: the byte replication below overflows a signed int for means >= 128
+
+    for(i=0;i<16; i++){
+        dc+= src[i-stride];
+    }
+    dc= 0x01010101U*((dc + 8)>>4); // mean of 16 samples, replicated into all four bytes
+
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= dc;
+    }
+}
+
+static void pred16x16_128_dc_c(uint8_t *src, int stride){
+    /* Fills all 16x16 pixels with 128; no neighbouring pixels are read. */
+    const uint32_t grey = 0x01010101U*128U;
+    int y, w;
+
+    for(y = 0; y < 16; y++){
+        uint32_t *dst = (uint32_t*)(src + y*stride);
+        for(w = 0; w < 4; w++) dst[w] = grey;
+    }
+}
+
+static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){ // 16x16 plane prediction: fits a linear ramp to the top row and left column and writes it through the ff_cropTbl lookup; svq3 / rv40 select alternative gradient scalings.
+  int i, j, k;
+  int a;
+  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; // lookup table offset so that negative indices are valid; presumably clamps to 0..255 - confirm against the table definition
+  const uint8_t * const src0 = src+7-stride; // centre of the top row
+  const uint8_t *src1 = src+8*stride-1; // left column, walking downwards
+  const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
+  int H = src0[1] - src0[-1];
+  int V = src1[0] - src2[ 0];
+  for(k=2; k<=8; ++k) { // distance-weighted differences of mirrored edge pixels
+    src1 += stride; src2 -= stride;
+    H += k*(src0[k] - src0[-k]);
+    V += k*(src1[0] - src2[ 0]);
+  }
+  if(svq3){
+    H = ( 5*(H/4) ) / 16;
+    V = ( 5*(V/4) ) / 16;
+
+    /* required for 100% accuracy */
+    i = H; H = V; V = i; // swaps the two gradients
+  }else if(rv40){
+    H = ( H + (H>>2) ) >> 4;
+    V = ( V + (V>>2) ) >> 4;
+  }else{
+    H = ( 5*H+32 ) >> 6;
+    V = ( 5*V+32 ) >> 6;
+  }
+
+  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); // after the loop src1[0] is the bottom-left and src2[16] the top-right neighbour
+  for(j=16; j>0; --j) { // one row per iteration; a advances by V per row
+    int b = a;
+    a += V;
+    for(i=-16; i<0; i+=4) { // four pixels per step; b advances by H per pixel
+      src[16+i] = cm[ (b    ) >> 5 ];
+      src[17+i] = cm[ (b+  H) >> 5 ];
+      src[18+i] = cm[ (b+2*H) >> 5 ];
+      src[19+i] = cm[ (b+3*H) >> 5 ];
+      b += 4*H;
+    }
+    src += stride;
+  }
+}
+
+static void pred16x16_plane_c(uint8_t *src, int stride){ // 16x16 plane prediction with both the svq3 and rv40 variants switched off.
+    pred16x16_plane_compat_c(src, stride, 0, 0);
+}
+
+
+static void pred8x8_vertical_c(uint8_t *src, int stride){
+    /* Vertical 8x8 prediction: the 8 pixels above src are copied into every row. */
+    const uint32_t *above = (const uint32_t*)(src - stride);
+    const uint32_t lo = above[0], hi = above[1];
+    int y;
+    for(y = 0; y < 8; y++){
+        uint32_t *dst = (uint32_t*)(src + y*stride);
+        dst[0] = lo; dst[1] = hi;
+    }
+}
+
+static void pred8x8_horizontal_c(uint8_t *src, int stride){ // Horizontal 8x8 prediction: each row is filled with the pixel immediately to its left.
+    int i;
+
+    for(i=0; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101U; // unsigned constant: a signed int multiply overflows for pixels >= 128
+    }
+}
+
+static void pred8x8_128_dc_c(uint8_t *src, int stride){
+    /* Fills all 8x8 pixels with 128; no neighbouring pixels are read. */
+    const uint32_t grey = 0x01010101U*128U;
+    int y;
+
+    for(y = 0; y < 8; y++)
+        ((uint32_t*)(src + y*stride))[0] = ((uint32_t*)(src + y*stride))[1] = grey;
+}
+
+static void pred8x8_left_dc_c(uint8_t *src, int stride){ // DC 8x8 prediction from the left column only: rows 0..3 get the mean of left pixels 0..3, rows 4..7 the mean of left pixels 4..7.
+    int i;
+    uint32_t dc0, dc2; // unsigned: the byte replication below overflows a signed int for means >= 128
+
+    dc0=dc2=0;
+    for(i=0;i<4; i++){
+        dc0+= src[-1+i*stride];
+        dc2+= src[-1+(i+4)*stride];
+    }
+    dc0= 0x01010101U*((dc0 + 2)>>2);
+    dc2= 0x01010101U*((dc2 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= dc0;
+    }
+    for(i=4; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= dc2;
+    }
+}
+
+
+static void pred8x8_top_dc_c(uint8_t *src, int stride){ // DC 8x8 prediction from the top row only: columns 0..3 get the mean of top pixels 0..3, columns 4..7 the mean of top pixels 4..7.
+    int i;
+    uint32_t dc0, dc1; // unsigned: the byte replication below overflows a signed int for means >= 128
+
+    dc0=dc1=0;
+    for(i=0;i<4; i++){
+        dc0+= src[i-stride];
+        dc1+= src[4+i-stride];
+    }
+    dc0= 0x01010101U*((dc0 + 2)>>2);
+    dc1= 0x01010101U*((dc1 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc1;
+    }
+    for(i=4; i<8; i++){ // the lower four rows receive the same two values as the upper four
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc1;
+    }
+}
+
+static void pred8x8_dc_c(uint8_t *src, int stride){ // DC 8x8 prediction: each 4x4 quadrant gets its own mean taken from the adjacent top and/or left neighbours.
+    int i;
+    uint32_t dc0, dc1, dc2, dc3; // unsigned: the byte replication below overflows a signed int for means >= 128
+
+    dc0=dc1=dc2=0;
+    for(i=0;i<4; i++){
+        dc0+= src[-1+i*stride] + src[i-stride]; // left rows 0..3 plus top columns 0..3
+        dc1+= src[4+i-stride]; // top columns 4..7
+        dc2+= src[-1+(i+4)*stride]; // left rows 4..7
+    }
+    dc3= 0x01010101U*((dc1 + dc2 + 4)>>3); // bottom-right quadrant: must be computed before dc1/dc2 are replaced by replicated values
+    dc0= 0x01010101U*((dc0 + 4)>>3);
+    dc1= 0x01010101U*((dc1 + 2)>>2);
+    dc2= 0x01010101U*((dc2 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc1;
+    }
+    for(i=4; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc2;
+        ((uint32_t*)(src+i*stride))[1]= dc3;
+    }
+}
+
+//the following 4 functions should not be optimized!
+static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){ // Runs the top-only 8x8 DC predictor, then overwrites the top-left 4x4 quadrant with the 4x4 DC predictor.
+    pred8x8_top_dc_c(src, stride);
+    pred4x4_dc_c(src, NULL, stride); // topright is unused by pred4x4_dc_c, so NULL is passed
+}
+
+static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){ // Runs the full 8x8 DC predictor, then overwrites the top-left 4x4 quadrant with the top-only 4x4 DC predictor.
+    pred8x8_dc_c(src, stride);
+    pred4x4_top_dc_c(src, NULL, stride); // topright is unused by pred4x4_top_dc_c, so NULL is passed
+}
+
+static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){ // Runs the left-only 8x8 DC predictor, then overwrites both lower 4x4 quadrants with 128.
+    pred8x8_left_dc_c(src, stride);
+    pred4x4_128_dc_c(src + 4*stride    , NULL, stride); // bottom-left quadrant
+    pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride); // bottom-right quadrant
+}
+
+static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){ // Runs the left-only 8x8 DC predictor, then overwrites both upper 4x4 quadrants with 128.
+    pred8x8_left_dc_c(src, stride);
+    pred4x4_128_dc_c(src    , NULL, stride); // top-left quadrant
+    pred4x4_128_dc_c(src + 4, NULL, stride); // top-right quadrant
+}
+
+static void pred8x8_plane_c(uint8_t *src, int stride){ // 8x8 plane prediction: fits a linear ramp to the top row and left column and writes it through the ff_cropTbl lookup.
+  int j, k;
+  int a;
+  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; // lookup table offset so that negative indices are valid; presumably clamps to 0..255 - confirm against the table definition
+  const uint8_t * const src0 = src+3-stride; // centre of the top row
+  const uint8_t *src1 = src+4*stride-1; // left column, walking downwards
+  const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
+  int H = src0[1] - src0[-1];
+  int V = src1[0] - src2[ 0];
+  for(k=2; k<=4; ++k) { // distance-weighted differences of mirrored edge pixels
+    src1 += stride; src2 -= stride;
+    H += k*(src0[k] - src0[-k]);
+    V += k*(src1[0] - src2[ 0]);
+  }
+  H = ( 17*H+16 ) >> 5;
+  V = ( 17*V+16 ) >> 5;
+
+  a = 16*(src1[0] + src2[8]+1) - 3*(V+H); // after the loop src1[0] is the bottom-left and src2[8] the top-right neighbour
+  for(j=8; j>0; --j) { // one row per iteration; a advances by V per row
+    int b = a;
+    a += V;
+    src[0] = cm[ (b    ) >> 5 ];
+    src[1] = cm[ (b+  H) >> 5 ];
+    src[2] = cm[ (b+2*H) >> 5 ];
+    src[3] = cm[ (b+3*H) >> 5 ];
+    src[4] = cm[ (b+4*H) >> 5 ];
+    src[5] = cm[ (b+5*H) >> 5 ];
+    src[6] = cm[ (b+6*H) >> 5 ];
+    src[7] = cm[ (b+7*H) >> 5 ];
+    src += stride;
+  }
+}
+
+#define SRC(x,y) src[(x)+(y)*stride] /* pixel at column x, row y relative to src; needs src and stride in scope */
+#define PL(y) \
+    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; /* (1,2,1)/4 filtered left neighbour of row y */
+#define PREDICT_8x8_LOAD_LEFT \
+    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
+                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
+    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
+    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 /* declares filtered l0..l7; needs has_topleft in scope */
+
+#define PT(x) \
+    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; /* (1,2,1)/4 filtered top neighbour of column x */
+#define PREDICT_8x8_LOAD_TOP \
+    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
+                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
+    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
+    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
+                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 /* declares filtered t0..t7; needs has_topleft and has_topright in scope */
+
+#define PTR(x) \
+    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; /* like PT(x) but assigns to an already declared variable */
+#define PREDICT_8x8_LOAD_TOPRIGHT \
+    int t8, t9, t10, t11, t12, t13, t14, t15; \
+    if(has_topright) { \
+        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
+        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
+    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); /* without a top-right block t8..t15 all copy the unfiltered pixel above column 7 */
+
+#define PREDICT_8x8_LOAD_TOPLEFT \
+    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 /* filtered top-left corner */
+
+#define PREDICT_8x8_DC(v) \
+    int y; \
+    for( y = 0; y < 8; y++ ) { \
+        ((uint32_t*)src)[0] = \
+        ((uint32_t*)src)[1] = v; \
+        src += stride; \
+    } /* writes the 32-bit value v to both halves of 8 rows; advances src as it goes */
+
+static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // Fills the 8x8 block with 128 (0x80 in every byte); both availability flags are ignored.
+	(void) has_topleft; (void) has_topright;
+    PREDICT_8x8_DC(0x80808080);
+}
+
+static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // 8x8 DC prediction from the filtered left column: fills the block with the rounded mean of l0..l7.
+	(void) has_topleft; (void) has_topright;
+    PREDICT_8x8_LOAD_LEFT;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101U; // unsigned constant: a signed int multiply overflows for means >= 128
+    PREDICT_8x8_DC(dc);
+}
+
+static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // 8x8 DC prediction from the filtered top row: fills the block with the rounded mean of t0..t7.
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101U; // unsigned constant: a signed int multiply overflows for means >= 128
+    PREDICT_8x8_DC(dc);
+}
+
+static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // 8x8 DC prediction: fills the block with the rounded mean of the filtered left column and filtered top row.
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
+                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101U; // unsigned constant: a signed int multiply overflows for means >= 128
+    PREDICT_8x8_DC(dc);
+}
+
+static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // 8x8 horizontal prediction: row y is filled with the filtered left neighbour l<y>.
+	(void) has_topleft; (void) has_topright;
+    PREDICT_8x8_LOAD_LEFT;
+#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
+               ((uint32_t*)(src+y*stride))[1] = 0x01010101U * l##y /* unsigned constant: a signed int multiply overflows for values >= 128 */
+    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
+#undef ROW
+}
+
+static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // 8x8 vertical prediction: the filtered top row t0..t7 is written to row 0 and then copied to rows 1..7.
+    int y;
+    PREDICT_8x8_LOAD_TOP;
+    src[0] = t0;
+    src[1] = t1;
+    src[2] = t2;
+    src[3] = t3;
+    src[4] = t4;
+    src[5] = t5;
+    src[6] = t6;
+    src[7] = t7;
+    for( y = 1; y < 8; y++ )
+        *(uint64_t*)(src+y*stride) = *(uint64_t*)src; // copies row 0 as one 64-bit word; NOTE(review): assumes 8-byte accesses at src+y*stride are acceptable on the target
+}
+
+static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // 8x8 diagonal down-left prediction from the filtered top row t0..t7 and top-right row t8..t15.
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_TOPRIGHT;
+    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; // each line below sets one anti-diagonal (x+y constant) to a rounded (1,2,1)/4 value
+    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
+    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
+    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
+    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
+    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
+    SRC(7,7)= (t14 + 3*t15 + 2) >> 2; // last pixel: t15 is weighted 3 because there is no t16
+}
+
+static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // 8x8 diagonal down-right prediction from the filtered top row, left column and top-left corner.
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; // each line below sets one diagonal (x-y constant) to a rounded (1,2,1)/4 value
+    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
+    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; // main diagonal, centred on the corner value
+    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+}
+
+static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){ /* 8x8 luma, vertical-right mode */
+    PREDICT_8x8_LOAD_TOP;     /* provides t0..t7 (macros defined earlier in the file) */
+    PREDICT_8x8_LOAD_LEFT;    /* provides l0..l7; l7 is not used by the formulas below */
+    PREDICT_8x8_LOAD_TOPLEFT; /* provides lt */
+    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; /* three-sample entries: rounded (1,2,1)/4 blend */
+    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; /* two-sample entries: rounded average */
+    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
+    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
+    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
+    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
+    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
+    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
+    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(7,0)= (t6 + t7 + 1) >> 1;
+}
+
+static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride){ /* 8x8 luma, horizontal-down mode */
+    PREDICT_8x8_LOAD_TOP;     /* provides t0..t7 (macros defined earlier in the file); t7 is not used by the formulas below */
+    PREDICT_8x8_LOAD_LEFT;    /* provides l0..l7 */
+    PREDICT_8x8_LOAD_TOPLEFT; /* provides lt */
+    SRC(0,7)= (l6 + l7 + 1) >> 1; /* two-sample entries: rounded average */
+    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; /* three-sample entries: rounded (1,2,1)/4 blend */
+    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
+    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
+    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
+    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
+    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
+    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
+    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
+    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
+    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; /* from here on only top samples (and lt) contribute */
+    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
+    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
+    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
+    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
+    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
+}
+
+static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ /* 8x8 luma, vertical-left mode */
+    PREDICT_8x8_LOAD_TOP;      /* provides t0..t7 (macros defined earlier in the file) */
+    PREDICT_8x8_LOAD_TOPRIGHT; /* provides t8..t15; only t8..t12 are used below */
+    SRC(0,0)= (t0 + t1 + 1) >> 1; /* two-sample entries: rounded average */
+    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; /* three-sample entries: rounded (1,2,1)/4 blend */
+    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
+    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
+    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
+    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
+    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
+    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
+    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
+    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
+    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
+    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
+    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(7,6)= (t10 + t11 + 1) >> 1;
+    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
+}
+
+static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride){ /* 8x8 luma, horizontal-up mode: built from l0..l7 only */
+	(void) has_topleft; (void) has_topright; /* not referenced by name in this body; the casts mark them as deliberately unused */
+    PREDICT_8x8_LOAD_LEFT; /* provides l0..l7 (macro defined earlier in the file) */
+    SRC(0,0)= (l0 + l1 + 1) >> 1; /* two-sample entries: rounded average */
+    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; /* three-sample entries: rounded (1,2,1)/4 blend */
+    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
+    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
+    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
+    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
+    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
+    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
+    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; /* l7 is weighted 3: no sample beyond l7 is read */
+    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= /* all remaining positions are a plain copy of l7 */
+    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
+    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
+    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
+}
+#undef PREDICT_8x8_LOAD_LEFT /* the helper macros used by the pred8x8l_* functions above are removed here so they cannot leak into later code */
+#undef PREDICT_8x8_LOAD_TOP
+#undef PREDICT_8x8_LOAD_TOPLEFT
+#undef PREDICT_8x8_LOAD_TOPRIGHT
+#undef PREDICT_8x8_DC
+#undef PTR
+#undef PT
+#undef PL
+#undef SRC
+
+/* 4x4 vertical "add" predictor: each column is a running 8-bit sum seeded by the pixel above the block, one residual added per row. */
+static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+    int col;
+    for(col=0; col<4; col++){
+        uint8_t *dst = pix + col;          /* row 0 of this column */
+        const DCTELEM *res = block + col;  /* this column's residuals, 4 entries apart */
+        uint8_t acc = dst[-stride];        /* seed: pixel directly above the block */
+        acc += res[0];  dst[0*stride]= acc;
+        acc += res[4];  dst[1*stride]= acc;
+        acc += res[8];  dst[2*stride]= acc;
+        acc += res[12]; dst[3*stride]= acc;
+    }
+}
+
+/* 4x4 horizontal "add" predictor: each row is a running 8-bit sum seeded by the
+ * pixel left of the block (pix[-1]), one residual added per column. */
+static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+    int row, col;
+    for(row=0; row<4; row++){
+        uint8_t acc = pix[-1];
+        for(col=0; col<4; col++)
+            pix[col]= acc += block[col];
+        block += 4;      /* next row of residuals */
+        pix   += stride; /* next row of pixels */
+    }
+}
+
+/**
+ * 8x8 vertical "add" predictor.
+ * Each of the 8 columns is a running 8-bit sum: it starts from the pixel
+ * directly above the block and adds one residual per row.
+ * Residuals of one column sit 8 entries apart in block.
+ */
+static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+    int col, row;
+    for(col=0; col<8; col++){
+        uint8_t acc = pix[col - stride]; /* seed: pixel above this column */
+        uint8_t *dst = pix + col;
+        for(row=0; row<8; row++){
+            acc += block[col + 8*row];
+            dst[row*stride]= acc;
+        }
+    }
+}
+
+/**
+ * 8x8 horizontal "add" predictor.
+ * Each of the 8 rows is a running 8-bit sum: it starts from the pixel
+ * left of the block (pix[-1]) and adds one residual per column.
+ * Residual rows are 8 entries long.
+ */
+static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+    int row, col;
+    for(row=0; row<8; row++){
+        uint8_t acc = pix[-1];
+        for(col=0; col<8; col++)
+            pix[col]= acc += block[col];
+        block += 8;      /* next row of residuals */
+        pix   += stride; /* next row of pixels */
+    }
+}
+
+static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    const int *off = block_offset, *const end = block_offset + 16; /* 16 sub-blocks of 4x4 */
+    for(; off < end; off++, block += 16) /* every sub-block owns 16 consecutive coefficients */
+        pred4x4_vertical_add_c(pix + *off, block, stride);
+}
+
+static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    const int *off = block_offset, *const end = block_offset + 16; /* 16 sub-blocks of 4x4 */
+    for(; off < end; off++, block += 16) /* every sub-block owns 16 consecutive coefficients */
+        pred4x4_horizontal_add_c(pix + *off, block, stride);
+}
+
+static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    const int *off = block_offset, *const end = block_offset + 4; /* 4 sub-blocks of 4x4 */
+    for(; off < end; off++, block += 16) /* every sub-block owns 16 consecutive coefficients */
+        pred4x4_vertical_add_c(pix + *off, block, stride);
+}
+
+static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    const int *off = block_offset, *const end = block_offset + 4; /* 4 sub-blocks of 4x4 */
+    for(; off < end; off++, block += 16) /* every sub-block owns 16 consecutive coefficients */
+        pred4x4_horizontal_add_c(pix + *off, block, stride);
+}
+
+ 
+/**
+ * Fills the function-pointer tables of @p h with the C intra predictors; when HAVE_NEON is set, ff_h264_pred_init_arm() runs afterwards.
+ */
+void ff_h264_pred_init(H264PredContext *h){
+
+    h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
+    h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
+    h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
+    h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
+    h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
+    h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
+    h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
+    h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
+    h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
+    h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
+    h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
+    h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
+
+    h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
+    h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
+    h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
+    h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
+    h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
+    h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
+    h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
+    h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
+    h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
+    h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
+    h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
+    h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
+
+    h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
+    h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
+    h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
+
+    h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
+    h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
+    h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
+    h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= pred8x8_mad_cow_dc_l0t;
+    h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= pred8x8_mad_cow_dc_0lt;
+    h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= pred8x8_mad_cow_dc_l00;
+    h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= pred8x8_mad_cow_dc_0l0;
+
+    h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
+
+    h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
+    h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
+    h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
+    h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
+
+    /* Remaining 16x16 DC variants (presumably the fallbacks for a missing
+     * left and/or top edge -- inferred from the names only). */
+    h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
+    h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
+    h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
+
+    //special lossless h/v prediction for h264
+    h->pred4x4_add  [VERT_PRED   ]= pred4x4_vertical_add_c;
+    h->pred4x4_add  [ HOR_PRED   ]= pred4x4_horizontal_add_c;
+    h->pred8x8l_add [VERT_PRED   ]= pred8x8l_vertical_add_c;
+    h->pred8x8l_add [ HOR_PRED   ]= pred8x8l_horizontal_add_c;
+    h->pred8x8_add  [VERT_PRED8x8]= pred8x8_vertical_add_c;
+    h->pred8x8_add  [ HOR_PRED8x8]= pred8x8_horizontal_add_c;
+    h->pred16x16_add[VERT_PRED8x8]= pred16x16_vertical_add_c;
+    h->pred16x16_add[ HOR_PRED8x8]= pred16x16_horizontal_add_c;
+
+    if (HAVE_NEON) ff_h264_pred_init_arm(h); /* called last, so it can replace the C pointers set above */
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pred.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_pred.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,90 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 prediction functions.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_H264PRED_H
+#define AVCODEC_H264PRED_H
+
+#include "libavutil/common.h"
+#include "dsputil.h"
+
+/**
+ * Prediction types: used as indices into the H264PredContext function-pointer tables.
+ */
+//@{
+#define VERT_PRED             0 // 0..11 index pred4x4[] and pred8x8l[]
+#define HOR_PRED              1
+#define DC_PRED               2
+#define DIAG_DOWN_LEFT_PRED   3
+#define DIAG_DOWN_RIGHT_PRED  4
+#define VERT_RIGHT_PRED       5
+#define HOR_DOWN_PRED         6
+#define VERT_LEFT_PRED        7
+#define HOR_UP_PRED           8
+
+#define LEFT_DC_PRED          9
+#define TOP_DC_PRED           10
+#define DC_128_PRED           11
+
+#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN   12 // 12..14 fit in pred4x4[9+3+3]; ff_h264_pred_init() in h264_pred.c does not assign them
+#define HOR_UP_PRED_RV40_NODOWN           13
+#define VERT_LEFT_PRED_RV40_NODOWN        14
+
+#define DC_PRED8x8            0 // *_PRED8x8 values index pred8x8[] and pred16x16[]; note the numbering differs from the set above (VERT is 2 here, 0 there)
+#define HOR_PRED8x8           1
+#define VERT_PRED8x8          2
+#define PLANE_PRED8x8         3
+
+#define LEFT_DC_PRED8x8       4
+#define TOP_DC_PRED8x8        5
+#define DC_128_PRED8x8        6
+
+#define ALZHEIMER_DC_L0T_PRED8x8 7 // 7..10 are only assigned in pred8x8[] (pred16x16[] has 4+3 slots)
+#define ALZHEIMER_DC_0LT_PRED8x8 8
+#define ALZHEIMER_DC_L00_PRED8x8 9
+#define ALZHEIMER_DC_0L0_PRED8x8 10
+//@}
+
+/**
+ * Context for storing H.264 prediction functions: one table of function pointers per block size, indexed by the prediction-type macros above.
+ */
+typedef struct H264PredContext{
+    void (*pred4x4  [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
+    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride); // indexed by VERT_PRED..DC_128_PRED
+    void (*pred8x8  [4+3+4])(uint8_t *src, int stride); // indexed by the *_PRED8x8 values, including the four ALZHEIMER_DC_* ones
+    void (*pred16x16[4+3])(uint8_t *src, int stride);   // indexed by DC_PRED8x8..DC_128_PRED8x8
+
+    void (*pred4x4_add  [2])(uint8_t *pix/*align  4*/, const DCTELEM *block/*align 16*/, int stride); // slots VERT_PRED (0) and HOR_PRED (1)
+    void (*pred8x8l_add [2])(uint8_t *pix/*align  8*/, const DCTELEM *block/*align 16*/, int stride); // slots VERT_PRED (0) and HOR_PRED (1)
+    void (*pred8x8_add  [3])(uint8_t *pix/*align  8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); // 3 slots because VERT_PRED8x8 is 2; slot DC_PRED8x8 (0) is not set by ff_h264_pred_init()
+    void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); // same layout as pred8x8_add
+}H264PredContext;
+
+void ff_h264_pred_init(H264PredContext *h);
+void ff_h264_pred_init_arm(H264PredContext *h);
+
+
+#endif /* AVCODEC_H264PRED_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pred_mode.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_pred_mode.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1013 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 direct mb/block decoding.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "dsputil.h"
+#include "avcodec.h"
+#include "h264_data.h"
+#include "h264.h"
+#include "rectangle.h"
+
+//#undef NDEBUG
+#include <assert.h>
+
+static const uint8_t left_block_options[4][16]={ // four alternative left-neighbour index layouts; fill_decode_caches_rec() below always selects row 0
+    {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
+    {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
+    {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
+    {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
+};
+
+
+// static void check_cache_copy(MBRecContext *mrc, H264Slice *s, H264Mb *m){
+//     for (int list=0; list<2; list++){
+//         for (int i=0; i<40; i++){
+//             assert (m->ref_cache[list][i] == m->ref_cache_copy[list][i]);
+//             assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy[list][i][0]);
+//             assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy[list][i][1]);
+//         }
+//     }
+// }
+
+// static void check_cache_copy2(MBRecContext *mrc, H264Slice *s, H264Mb *m){
+//     for (int list=0; list<2; list++){
+//         for (int i=0; i<40; i++){
+//             assert (m->ref_cache[list][i] == m->ref_cache_copy2[list][i]);
+//             assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy2[list][i][0]);
+//             assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy2[list][i][1]);
+//         }
+//     }
+// }
+
+static void fill_decode_caches_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
+    int topleft_type, top_type, topright_type, left_type;
+    const uint8_t * left_block= left_block_options[0];
+    const int mb_x = m->mb_x;
+    int i;
+    /* Loads the per-macroblock caches in mrs (non-zero counts, intra modes, sample availability, mv/ref neighbours) for macroblock m. */
+    mrs->top_type  = mrs->mb_type_top[mb_x  ];
+    mrs->left_type = mrs->mb_type    [mb_x-1]; /* NOTE(review): index is -1 for mb_x==0; assumes the arrays are padded -- confirm */
+
+    topleft_type = mrs->mb_type_top[mb_x-1];
+    top_type     = mrs->mb_type_top[mb_x  ];
+    topright_type= mrs->mb_type_top[mb_x+1];
+    left_type    = mrs->mb_type    [mb_x-1];
+
+    int type_mask= s->pps.constrained_intra_pred ? 1 : -1; /* 1: only bit 0 of a neighbour type counts; -1: any non-zero type counts */
+
+    if(!IS_SKIP(mb_type)){
+        /* rows 1..4 of the cache, starting at column 4, receive the first 16 counts of this macroblock */
+        AV_COPY32(&mrs->non_zero_count_cache[4+8*1], &m->non_zero_count[ 0]);
+        AV_COPY32(&mrs->non_zero_count_cache[4+8*2], &m->non_zero_count[ 4]);
+        AV_COPY32(&mrs->non_zero_count_cache[4+8*3], &m->non_zero_count[ 8]);
+        AV_COPY32(&mrs->non_zero_count_cache[4+8*4], &m->non_zero_count[12]);
+
+        for (i=0; i<2; i++) {
+            mrs->non_zero_count_cache[8*1 + 8*i + 1] = m->non_zero_count[16 + i*2   ];
+            mrs->non_zero_count_cache[8*1 + 8*i + 2] = m->non_zero_count[16 + i*2 +1];
+            mrs->non_zero_count_cache[8*4 + 8*i + 1] = m->non_zero_count[20 + i*2   ];
+            mrs->non_zero_count_cache[8*4 + 8*i + 2] = m->non_zero_count[20 + i*2 +1];
+        }
+
+        if(IS_INTRA(mb_type)){
+            /* start from "everything available", then clear bits for each neighbour that fails the type_mask test */
+
+            mrs->topleft_samples_available=
+            mrs->top_samples_available=
+            mrs->left_samples_available= 0xFFFF;
+            mrs->topright_samples_available= 0xEEEA;
+
+            if(!(top_type & type_mask)){
+                mrs->topleft_samples_available= 0xB3FF;
+                mrs->top_samples_available= 0x33FF;
+                mrs->topright_samples_available= 0x26EA;
+            }
+
+            if(!(left_type & type_mask)){
+                mrs->topleft_samples_available&= 0xDF5F;
+                mrs->left_samples_available&= 0x5F5F;
+            }
+
+            if(!(topleft_type & type_mask))
+                mrs->topleft_samples_available&= 0x7FFF;
+
+            if(!(topright_type & type_mask))
+                mrs->topright_samples_available&= 0xFBFF;
+
+            if(IS_INTRA4x4(mb_type)){
+                if(IS_INTRA4x4(top_type)){
+                    AV_COPY32(mrs->intra4x4_pred_mode_cache+4+8*0, &mrs->intra4x4_pred_mode_top[4*mb_x]);
+                }else{
+                    mrs->intra4x4_pred_mode_cache[4+8*0]=
+                    mrs->intra4x4_pred_mode_cache[5+8*0]=
+                    mrs->intra4x4_pred_mode_cache[6+8*0]=
+                    mrs->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask); /* 2 if the neighbour passes the mask, -1 otherwise */
+                }
+
+                if(IS_INTRA4x4(left_type)){
+#if OMPSS
+                    mrs->intra4x4_pred_mode_cache[3+8*1]= m->intra4x4_pred_mode_left[0];
+                    mrs->intra4x4_pred_mode_cache[3+8*2]= m->intra4x4_pred_mode_left[1];
+                    mrs->intra4x4_pred_mode_cache[3+8*3]= m->intra4x4_pred_mode_left[2];
+                    mrs->intra4x4_pred_mode_cache[3+8*4]= m->intra4x4_pred_mode_left[3];
+#else
+                    mrs->intra4x4_pred_mode_cache[3+8*1]= mrs->intra4x4_pred_mode_left[0];
+                    mrs->intra4x4_pred_mode_cache[3+8*2]= mrs->intra4x4_pred_mode_left[1];
+                    mrs->intra4x4_pred_mode_cache[3+8*3]= mrs->intra4x4_pred_mode_left[2];
+                    mrs->intra4x4_pred_mode_cache[3+8*4]= mrs->intra4x4_pred_mode_left[3];
+#endif
+                }else{
+                    mrs->intra4x4_pred_mode_cache[3+8*1]=
+                    mrs->intra4x4_pred_mode_cache[3+8*2]=
+                    mrs->intra4x4_pred_mode_cache[3+8*3]=
+                    mrs->intra4x4_pred_mode_cache[3+8*4]= 2 - 3*!(left_type & type_mask); /* 2 if the neighbour passes the mask, -1 otherwise */
+                }
+            }
+        }
+    }
+
+    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
+        int list;
+
+        /* mv_cache/ref_cache are not cleared first: only the neighbour entries written below are refreshed. */
+        /* The entries right of blocks 5, 7 and 13 are marked PART_NOT_AVAILABLE for both lists. */
+
+        mrs->ref_cache[0][scan8[5 ]+1] = mrs->ref_cache[0][scan8[7 ]+1] = mrs->ref_cache[0][scan8[13]+1] =
+        mrs->ref_cache[1][scan8[5 ]+1] = mrs->ref_cache[1][scan8[7 ]+1] = mrs->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
+
+        for(list=0; list<s->list_count; list++){
+            if(!USES_LIST(mb_type, list)){
+                continue;
+            }
+            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
+
+            if(USES_LIST(top_type, list)){
+                const int b_xy= 4*mb_x + 3*mrc->b_stride;
+                AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]);
+                    mrs->ref_cache[list][scan8[0] + 0 - 1*8]=
+                    mrs->ref_cache[list][scan8[0] + 1 - 1*8]= mrs->ref_index_top[list][4*mb_x + 2];
+                    mrs->ref_cache[list][scan8[0] + 2 - 1*8]=
+                    mrs->ref_cache[list][scan8[0] + 3 - 1*8]= mrs->ref_index_top[list][4*mb_x + 3];
+            }else{
+                AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]);
+                AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101U); /* unsigned multiply: replicating a byte >= 0x80 would overflow a signed int */
+            }
+
+            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
+                for(i=0; i<2; i++){
+                    int cache_idx = scan8[0] - 1 + i*2*8;
+                    if(USES_LIST(left_type, list)){
+                        const int b_xy= 4*(mb_x-1) + 3;
+                        const int b8_x= 4*(mb_x-1) + 1;
+                        AV_COPY32(mrs->mv_cache[list][cache_idx  ], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[0+i*2]]);
+                        AV_COPY32(mrs->mv_cache[list][cache_idx+8], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[1+i*2]]);
+                        mrs->ref_cache[list][cache_idx  ]= mrs->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
+                        mrs->ref_cache[list][cache_idx+8]= mrs->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
+                    }else{
+                        AV_ZERO32(mrs->mv_cache [list][cache_idx  ]);
+                        AV_ZERO32(mrs->mv_cache [list][cache_idx+8]);
+                        mrs->ref_cache[list][cache_idx  ]=
+                        mrs->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
+                    }
+                }
+            }else{
+                if(USES_LIST(left_type, list)){
+                    const int b_x = 4*(mb_x-1) + 3;
+                    const int b8_x= 4*(mb_x-1) + 1;
+                    AV_COPY32(mrs->mv_cache[list][scan8[0] - 1], mrs->motion_val[list][b_x + mrc->b_stride*left_block[0]]);
+                    mrs->ref_cache[list][scan8[0] - 1]= mrs->ref_index[list][b8_x + (left_block[0]&~1)];
+                }else{
+                    AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1]);
+                    mrs->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+                }
+            }
+
+            if(USES_LIST(topright_type, list)){
+                const int b_xy= 4*(mb_x+1) + 3*mrc->b_stride;
+                AV_COPY32(mrs->mv_cache[list][scan8[0] + 4 - 1*8], mrs->motion_val_top[list][b_xy]);
+                mrs->ref_cache[list][scan8[0] + 4 - 1*8]= mrs->ref_index_top[list][4*(mb_x+1) + 2];
+            }else{
+                AV_ZERO32(mrs->mv_cache [list][scan8[0] + 4 - 1*8]);
+                mrs->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+            }
+            if(mrs->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ /* the top-left entry is only loaded when the top-right one is negative */
+                int topleft_partition= -1;
+                if(USES_LIST(topleft_type, list)){
+                    const int b_xy = 4*(mb_x-1) + 3 + mrc->b_stride + (topleft_partition & 2*mrc->b_stride);
+                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
+                    AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 - 1*8], mrs->motion_val_top[list][b_xy]);
+                    mrs->ref_cache[list][scan8[0] - 1 - 1*8]= mrs->ref_index_top[list][b8_x];
+                }else{
+                    AV_ZERO32(mrs->mv_cache[list][scan8[0] - 1 - 1*8]);
+                    mrs->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+                }
+            }
+
+            if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)))
+                continue;
+
+            /* Only reached when neither MB_TYPE_SKIP nor MB_TYPE_DIRECT2 is set:
+             * blocks 4 and 12 start out as unavailable with a zero vector. */
+            mrs->ref_cache[list][scan8[4 ]] =
+            mrs->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
+            AV_ZERO32(mrs->mv_cache [list][scan8[4 ]]);
+            AV_ZERO32(mrs->mv_cache [list][scan8[12]]);
+        }
+    }
+}
+
+static inline void write_back_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
+    const int row_stride = mrc->b_stride; // distance between two motion-vector rows
+    const int first = 4*m->mb_x;          // first entry of this macroblock, in motion_val and ref_index alike
+    int list, y;
+
+    // a macroblock that does not use list 0 gets its four list-0 ref_index entries set to LIST_NOT_USED
+    if(!USES_LIST(mb_type, 0))
+        fill_rectangle(&mrs->ref_index[0][first], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
+
+    for(list=0; list<s->list_count; list++){
+        int16_t (*mv_dst)[2];
+        int16_t (*mv_src)[2];
+        int8_t *refs;
+
+        if(!USES_LIST(mb_type, list))
+            continue;
+
+        // four rows of motion vectors: cache rows are 8 entries apart, destination rows row_stride apart
+        mv_src = &mrs->mv_cache[list][scan8[0]];
+        mv_dst = &mrs->motion_val[list][first];
+        for(y=0; y<4; y++){
+            AV_COPY128(mv_dst + y*row_stride, mv_src + 8*y);
+        }
+
+        // one reference index is stored for each of the cache entries of blocks 0, 4, 8 and 12
+        refs = &mrs->ref_index[list][first];
+        refs[0]= mrs->ref_cache[list][scan8[0]];
+        refs[1]= mrs->ref_cache[list][scan8[4]];
+        refs[2]= mrs->ref_cache[list][scan8[8]];
+        refs[3]= mrs->ref_cache[list][scan8[12]];
+    }
+}
+
+
+/**
+* Checks each cached intra4x4 mode against the available top/left samples and swaps in the replacement mode from the tables below where one is given.
+* @return 0 on success, -1 (after logging) when a mode needs samples that are unavailable */
+static int check_intra4x4_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){
+    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0}; /* per mode: -1 = error, 0 = keep, otherwise the replacement mode */
+    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
+    int i;
+
+    if(!(mrs->top_samples_available&0x8000)){ /* top row missing: check the four blocks of the first cache row */
+        for(i=0; i<4; i++){
+            int status= top[ mrs->intra4x4_pred_mode_cache[scan8[0] + i] ];
+            if(status<0){
+                av_log(AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", mrs->intra4x4_pred_mode_cache[scan8[0] + i], m->mb_x, m->mb_y); /* log the mode itself; status is always negative here */
+                return -1;
+            } else if(status){
+                mrs->intra4x4_pred_mode_cache[scan8[0] + i]= status;
+            }
+        }
+    }
+
+    if((mrs->left_samples_available&0x8888)!=0x8888){
+        static const int mask[4]={0x8000,0x2000,0x80,0x20}; /* one availability bit per block of the first cache column */
+        for(i=0; i<4; i++){
+            if(!(mrs->left_samples_available&mask[i])){
+                int status= left[ mrs->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
+                if(status<0){
+                    av_log(AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", mrs->intra4x4_pred_mode_cache[scan8[0] + 8*i], m->mb_x, m->mb_y); /* log the mode itself; status is always negative here */
+                    return -1;
+                } else if(status){
+                    mrs->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+* Validates an intra prediction mode against the available top/left samples and maps it to a replacement mode where the tables below give one.
+* @return the mode to use, or -1 (after logging) when mode is outside 0..6 or needs samples that are unavailable */
+static int check_intra_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mode){
+    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
+    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
+
+    if((unsigned)mode > 6) { /* unsigned compare also rejects negative values, which would index before the tables */
+        av_log(AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y);
+        return -1;
+    }
+
+    if(!(mrs->top_samples_available&0x8000)){
+        mode= top[ mode ];
+        if(mode<0){
+            av_log(AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y);
+            return -1;
+        }
+    }
+
+    if((mrs->left_samples_available&0x8080) != 0x8080){
+        mode= left[ mode ];
+        if(mrs->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
+            mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(mrs->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
+        }
+        if(mode<0){
+            av_log(AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y);
+            return -1;
+        }
+    }
+    return mode;
+}
+
+/**
+ * Predicts the intra4x4 mode of block n: the smaller of the cached modes to
+ * its left and above it, or DC_PRED when that minimum is negative.
+ */
+static inline int pred_intra_mode(MBRecContext *mrc, MBRecState *mrs, int n){
+    const int pos = scan8[n];
+    const int left_mode = mrs->intra4x4_pred_mode_cache[pos - 1];
+    const int top_mode  = mrs->intra4x4_pred_mode_cache[pos - 8];
+    const int lowest = left_mode < top_mode ? left_mode : top_mode;
+
+    return lowest < 0 ? DC_PRED : lowest;
+}
+
+/* Saves the intra4x4 modes that later macroblocks read as neighbours:
+ * cache row 4 (columns 4..7) goes to intra4x4_pred_mode[4*mb_x..],
+ * cache column 7 (rows 1..4) goes to the "left" array. */
+static void write_back_intra_pred_mode_rec(MBRecContext *mrc, MBRecState *mrs, H264Mb *m, int mb_x){
+    int8_t *dst = &mrs->intra4x4_pred_mode[4*mb_x];
+    int row;
+
+    AV_COPY32(dst, mrs->intra4x4_pred_mode_cache + 4 + 8*4);
+
+#if OMPSS
+    // the column is stored in the next H264Mb; the last macroblock of a row has none to store into
+    if (m->mb_x >= mrc->mb_width-1)
+        return;
+    dst = (m+1)->intra4x4_pred_mode_left;
+#else
+    dst = mrs->intra4x4_pred_mode_left;
+#endif
+    // cache rows 1..4 of column 7 -> entries 0..3
+    for (row=1; row<=4; row++)
+        dst[row-1]= mrs->intra4x4_pred_mode_cache[7+8*row];
+}
+/** Spatial direct prediction: picks per-list ref/mv from the cached neighbours, fills mrs->ref_cache / mrs->mv_cache and updates *mb_type (and m->sub_mb_type). */
+static void pred_spatial_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){
+    int b4_stride = mrc->b_stride;
+    const int mb_x = m->mb_x;
+    int mb_type_col[2];
+    const int16_t (*l1mv0)[2], (*l1mv1)[2];
+    const int8_t *l1ref0, *l1ref1;
+    const int is_b8x8 = IS_8X8(*mb_type);
+    unsigned int sub_mb_type= MB_TYPE_L0L1;
+    int i8, i4;
+    int ref[2];
+    int mv[2];
+    int list;
+
+    //assert(h->ref_list[1][0].reference&3);
+    /* the macro defined next is not #undef'd; pred_temp_direct_motion_rec relies on it as well */
+#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
+
+    /* ref = min(neighbors) */
+    for(list=0; list<2; list++){
+        int left_ref = mrs->ref_cache[list][scan8[0] - 1];
+        int top_ref  = mrs->ref_cache[list][scan8[0] - 8];
+        int refc = mrs->ref_cache[list][scan8[0] - 8 + 4];
+        const int16_t *C= mrs->mv_cache[list][ scan8[0] - 8 + 4];
+        if(refc == PART_NOT_AVAILABLE){
+            refc = mrs->ref_cache[list][scan8[0] - 8 - 1];
+            C    = mrs->mv_cache[list][scan8[0] - 8 - 1];
+        }
+        ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); // unsigned compare: negative refs become huge, so any non-negative ref wins
+        if(ref[list] >= 0){
+            //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
+            const int16_t * const A= mrs->mv_cache[list][ scan8[0] - 1 ];
+            const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ];
+
+            int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
+            if(match_count > 1){ //most common
+                mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]),
+                                     mid_pred(A[1], B[1], C[1]) );
+            }else {
+                assert(match_count==1);
+                if(left_ref==ref[list]){
+                    mv[list]= AV_RN32A(A);
+                }else if(top_ref==ref[list]){
+                    mv[list]= AV_RN32A(B);
+                }else{
+                    mv[list]= AV_RN32A(C);
+                }
+            }
+        }else{
+            int mask= ~(MB_TYPE_L0 << (2*list)); // all three neighbour refs are negative: drop this list from the type flags
+            mv[list] = 0;
+            ref[list] = -1;
+            if(!is_b8x8)
+                *mb_type &= mask;
+            sub_mb_type &= mask;
+        }
+    }
+
+    if(ref[0] < 0 && ref[1] < 0){ // neither list has a usable ref: use ref 0 on both lists
+        ref[0] = ref[1] = 0;
+        if(!is_b8x8)
+            *mb_type |= MB_TYPE_L0L1;
+        sub_mb_type |= MB_TYPE_L0L1;
+    }
+
+    if(!(is_b8x8|mv[0]|mv[1])){ // not 8x8 and both packed mvs are zero: fill the whole MB and finish early
+        fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
+        fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
+        fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
+        fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+        *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
+        return;
+    }
+
+    mb_type_col[0] =
+    mb_type_col[1] = mrs->list1_mb_type[mb_x];
+
+    sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
+        *mb_type   |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */
+    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
+        *mb_type   |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
+    }else{
+        if(!s->direct_8x8_inference_flag){
+            /* FIXME save sub mb types from previous frames (or derive from MVs)
+            * so we know exactly what block size to use */
+            sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */
+        }
+        *mb_type   |= MB_TYPE_8x8;
+    }
+
+    l1mv0  = (void *) &mrs->list1_motion_val[0][4*mb_x];
+    l1mv1  = (void *) &mrs->list1_motion_val[1][4*mb_x];
+    l1ref0 = &mrs->list1_ref_index [0][4*mb_x];
+    l1ref1 = &mrs->list1_ref_index [1][4*mb_x];
+//     if(!b8_stride){
+//         if(m->mb_y&1){
+//             l1ref0 += 2;
+//             l1ref1 += 2;
+//             l1mv0  +=  2*b4_stride;
+//             l1mv1  +=  2*b4_stride;
+//         }
+//     }
+
+    if(IS_16X16(*mb_type)){
+        int a,b;
+
+        fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
+        fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
+        if(!IS_INTRA(mb_type_col[0]) && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
+            || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
+            ))){
+            a=b=0; // a list keeps its predicted mv only when its ref is > 0
+            if(ref[0] > 0)
+                a= mv[0];
+            if(ref[1] > 0)
+                b= mv[1];
+        }else{
+            a= mv[0];
+            b= mv[1];
+        }
+        fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
+        fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
+    }else{
+        int n=0; // counts 4x4 blocks whose co-located vector passed the |mv| <= 1 test
+        for(i8=0; i8<4; i8++){
+            const int x8 = i8&1;
+            const int y8 = i8>>1;
+
+            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
+                continue;
+            m->sub_mb_type[i8] = sub_mb_type;
+
+            fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4);
+            fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4);
+            fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
+            fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
+
+            /* col_zero_flag */
+            if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 ))
+                ){
+                const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1;
+                if(IS_SUB_8X8(sub_mb_type)){
+                    const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; // one vector per 8x8 block, taken at column/row offset 0 or 3
+                    if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
+                        if(ref[0] == 0)
+                            fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
+                        if(ref[1] == 0)
+                            fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
+                        n+=4;
+                    }
+                }else{
+                    int k=0;
+                    for(i4=0; i4<4; i4++){
+                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
+                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
+                            if(ref[0] == 0)
+                                AV_ZERO32(mrs->mv_cache[0][scan8[i8*4+i4]]);
+                            if(ref[1] == 0)
+                                AV_ZERO32(mrs->mv_cache[1][scan8[i8*4+i4]]);
+                            k++;
+                        }
+                    }
+                    if(!(k&3)) // k is 0 or 4 here, i.e. all four 4x4 blocks behaved alike
+                        m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8;
+                    n+=k;
+                }
+            }
+        }
+        if(!is_b8x8 && !(n&15)){ // n is 0 or 16: the whole MB behaved alike, so retag it as 16x16 direct
+            *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
+        }
+    }
+}
+/** Temporal direct prediction: scales the vectors read from mrs->list1_motion_val by s->dist_scale_factor, fills mrs->ref_cache / mrs->mv_cache and updates *mb_type (and m->sub_mb_type). */
+static void pred_temp_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){
+    const int mb_x = m->mb_x;
+    int b4_stride = mrc->b_stride;
+    int mb_type_col[2];
+    const int16_t (*l1mv0)[2], (*l1mv1)[2];
+    const int8_t *l1ref0, *l1ref1;
+    const int is_b8x8 = IS_8X8(*mb_type);
+    unsigned int sub_mb_type;
+    int i8, i4;
+    const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]};
+    const int *dist_scale_factor = s->dist_scale_factor;
+
+    mb_type_col[0] =
+    mb_type_col[1] = mrs->list1_mb_type[mb_x];
+
+    sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ // macro comes from pred_spatial_direct_motion_rec
+        *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
+    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
+        *mb_type   |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
+    }else{
+        if(!s->direct_8x8_inference_flag){
+            /* FIXME save sub mb types from previous frames (or derive from MVs)
+            * so we know exactly what block size to use */
+            sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
+        }
+        *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
+    }
+
+    l1mv0  = (void *) &mrs->list1_motion_val[0][4*mb_x];
+    l1mv1  = (void *) &mrs->list1_motion_val[1][4*mb_x];
+    l1ref0 = &mrs->list1_ref_index [0][4*mb_x];
+    l1ref1 = &mrs->list1_ref_index [1][4*mb_x];
+
+    /* one-to-one mv scaling */
+    if(IS_16X16(*mb_type)){
+        int ref, mv0, mv1;
+
+        fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
+        if(IS_INTRA(mb_type_col[0])){
+            ref=mv0=mv1=0;
+        }else{
+            const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
+            : map_col_to_list0[1][l1ref1[0]];
+            const int scale = dist_scale_factor[ref0];
+            const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
+            int mv_l0[2];
+            mv_l0[0] = (scale * mv_col[0] + 128) >> 8; // scale carries 8 fractional bits: +128 rounds, >>8 drops them
+            mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
+            ref= ref0;
+            mv0= pack16to32(mv_l0[0],mv_l0[1]);
+            mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]); // list 1 vector = scaled list 0 vector minus the source vector
+        }
+        fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
+        fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
+        fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
+    }else{
+        for(i8=0; i8<4; i8++){
+            const int x8 = i8&1;
+            const int y8 = i8>>1;
+            int ref0, scale;
+            const int16_t (*l1mv)[2]= l1mv0;
+
+            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
+                continue;
+            m->sub_mb_type[i8] = sub_mb_type;
+            fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
+            if(IS_INTRA(mb_type_col[0])){
+                fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
+                fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
+                fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
+                continue;
+            }
+
+            ref0 = l1ref0[i8];
+            if(ref0 >= 0)
+                ref0 = map_col_to_list0[0][ref0 ];
+            else{
+                ref0 = map_col_to_list0[1][l1ref1[i8]]; // l1ref0 entry is negative: use the l1ref1 / l1mv1 data instead
+                l1mv= l1mv1;
+            }
+            scale = dist_scale_factor[ref0];
+
+            fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
+            if(IS_SUB_8X8(sub_mb_type)){
+                const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; // one vector per 8x8 block, taken at column/row offset 0 or 3
+                int mx = (scale * mv_col[0] + 128) >> 8;
+                int my = (scale * mv_col[1] + 128) >> 8;
+                fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
+                fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
+            }else
+            for(i4=0; i4<4; i4++){ // body of the else above: one source vector per 4x4 block
+                const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
+                int16_t *mv_l0 = mrs->mv_cache[0][scan8[i8*4+i4]];
+                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
+                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
+                AV_WN32A(mrs->mv_cache[1][scan8[i8*4+i4]],
+                    pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]));
+            }
+        }
+    }
+}
+
+void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){
+    if(!s->direct_spatial_mv_pred){ /* flag clear: temporal variant */
+        pred_temp_direct_motion_rec(mrc, mrs, s, m, mb_type);
+        return;
+    }
+    pred_spatial_direct_motion_rec(mrc, mrs, s, m, mb_type); /* flag set: spatial variant */
+}
+
+static inline int fetch_diagonal_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, const int16_t **C, int i, int list, int part_width){
+    /*
+     * Start with the cache entry one row up and part_width columns to the right of i;
+     * when its ref is PART_NOT_AVAILABLE use the entry one row up, one column left.
+     */
+    int diag= i - 8 + part_width;
+    if(mrs->ref_cache[list][ diag ] == PART_NOT_AVAILABLE)
+        diag= i - 8 - 1;
+    *C= mrs->mv_cache[list][ diag ];
+    return mrs->ref_cache[list][ diag ];
+}
+
+/**
+ * Computes the motion vector predictor for block n.
+ * Neighbours are read from mrs->ref_cache / mrs->mv_cache of the given list.
+ * @param n the block index
+ * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
+ * @param ref reference index the neighbours are matched against
+ * @param mx receives the x component of the predicted motion vector
+ * @param my receives the y component of the predicted motion vector
+ */
+static inline void pred_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int part_width, int list, int ref, int * const mx, int * const my){
+    const int pos       = scan8[n];
+    const int ref_left  = mrs->ref_cache[list][ pos - 1 ];
+    const int ref_above = mrs->ref_cache[list][ pos - 8 ];
+    const int16_t * const A= mrs->mv_cache[list][ pos - 1 ];
+    const int16_t * const B= mrs->mv_cache[list][ pos - 8 ];
+    const int16_t * C;
+    const int16_t * only= 0; /* set when a single neighbour vector is taken as-is */
+    int ref_diag, hits;
+
+    assert(part_width==1 || part_width==2 || part_width==4);
+
+/* mv_cache
+  B . . A T T T T
+  U . . L . . , .
+  U . . L . . . .
+  U . . L . . , .
+  . . . L . . . .
+*/
+
+    ref_diag= fetch_diagonal_mv(mrc, mrs, s, &C, pos, list, part_width);
+    hits= (ref_diag==ref) + (ref_above==ref) + (ref_left==ref);
+
+    if(hits==1){
+        /* exactly one neighbour uses the wanted reference: copy its vector */
+        if(ref_left==ref)       only= A;
+        else if(ref_above==ref) only= B;
+        else                    only= C;
+    }else if(hits==0 && ref_above == PART_NOT_AVAILABLE
+                    && ref_diag  == PART_NOT_AVAILABLE
+                    && ref_left  != PART_NOT_AVAILABLE){
+        /* nothing matches and only the left neighbour is available */
+        only= A;
+    }
+
+    if(only){
+        *mx= only[0];
+        *my= only[1];
+    }else{
+        /* several matches, or no shortcut applies: component-wise median */
+        *mx= mid_pred(A[0], B[0], C[0]);
+        *my= mid_pred(A[1], B[1], C[1]);
+    }
+
+}
+
+/**
+ * Directional MV prediction for a 16x8 partition.
+ * Falls back to pred_motion() with part_width 4 when the preferred neighbour does not match.
+ * @param n the block index (0 selects the first case, any other value the second)
+ * @param list reference list whose caches are consulted
+ * @param ref reference index the neighbour is matched against
+ * @param mx the x component of the predicted motion vector
+ * @param my the y component of the predicted motion vector
+ */
+static inline void pred_16x8_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){
+    /*
+     * The preferred neighbour is the cache entry above block 0 when n==0 and the
+     * entry left of block 8 otherwise.
+     */
+    const int nb_pos= (n==0) ? scan8[0] - 8 : scan8[8] - 1;
+    const int nb_ref= mrs->ref_cache[list][ nb_pos ];
+
+    if(nb_ref == ref){
+        /* neighbour uses the same reference: take its vector directly */
+        const int16_t * const nb_mv= mrs->mv_cache[list][ nb_pos ];
+
+        *mx= nb_mv[0];
+        *my= nb_mv[1];
+        return;
+    }
+
+    /* no direct hit: use the general predictor */
+    //RARE
+    pred_motion(mrc, mrs, s, n, 4, list, ref, mx, my);
+}
+
+/**
+ * Directional MV prediction for an 8x16 partition.
+ * Falls back to pred_motion() with part_width 2 when the preferred neighbour does not match.
+ * @param n the block index (0 selects the first case, any other value the second)
+ * @param list reference list whose caches are consulted
+ * @param ref reference index the neighbour is matched against
+ * @param mx the x component of the predicted motion vector
+ * @param my the y component of the predicted motion vector
+ */
+static inline void pred_8x16_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){
+    const int16_t * cand_mv;
+    int cand_ref;
+
+    if(n==0){
+        /* prefer the cache entry left of block 0 */
+        cand_ref= mrs->ref_cache[list][ scan8[0] - 1 ];
+        cand_mv = mrs->mv_cache[list][ scan8[0] - 1 ];
+    }else{
+        /* prefer the diagonal neighbour of block 4 (partition width 2) */
+        cand_ref= fetch_diagonal_mv(mrc, mrs, s, &cand_mv, scan8[4], list, 2);
+    }
+
+    if(cand_ref == ref){
+        *mx= cand_mv[0];
+        *my= cand_mv[1];
+        return;
+    }
+
+    //RARE
+    pred_motion(mrc, mrs, s, n, 2, list, ref, mx, my);
+}
+
+static inline void pred_pskip_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb * m, int * const mx, int * const my){
+    const int ref_above= mrs->ref_cache[0][ scan8[0] - 8 ];
+    const int ref_left = mrs->ref_cache[0][ scan8[0] - 1 ];
+
+    /*
+     * Run the regular predictor (list 0, ref 0, part_width 4) only when both neighbours
+     * are available and neither combines ref 0 with an all-zero motion vector.
+     */
+    if(ref_above != PART_NOT_AVAILABLE && ref_left != PART_NOT_AVAILABLE
+       && (ref_above | AV_RN32A(mrs->mv_cache[0][ scan8[0] - 8 ]))
+       && (ref_left  | AV_RN32A(mrs->mv_cache[0][ scan8[0] - 1 ]))){
+        pred_motion(mrc, mrs, s, 0, 4, 0, 0, mx, my);
+        return;
+    }
+    *mx = *my = 0; /* every other case yields the zero vector */
+}
+/* ADD_MVD(list): adds m->mvd[list][mp] to mx/my and advances mp; needs m, mp, mx, my in the caller's scope and is invoked without a trailing ';'. */
+#define ADD_MVD(list) \
+{ \
+    mx += m->mvd[list][mp][0]; \
+    my += m->mvd[list][mp][1]; \
+    mp++; \
+}
+/** Fills the decode caches for macroblock m, then derives its prediction data according to mb_type; returns 0, or -1 when an intra prediction mode check fails. */
+int pred_motion_mb_rec (MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){
+    int mp=0; // running index into m->mvd[list][], advanced by ADD_MVD
+    int mb_type = m->mb_type;
+    const int mb_x = m->mb_x;
+
+//     mrc->m =m;
+
+    fill_decode_caches_rec(mrc, mrs, s, m, mb_type);
+    if (IS_SKIP(mb_type)){
+        mb_type=0; // the type of a skipped MB is rebuilt from scratch below
+
+        if( s->slice_type_nos == FF_B_TYPE )
+        {
+            mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
+            ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type);
+        }
+        else
+        {
+            int mx, my;
+
+            mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; //FIXME check required
+            pred_pskip_motion(mrc, mrs, s, m, &mx, &my);
+            fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
+            fill_rectangle(mrs->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
+        }
+
+        write_back_motion_rec(mrc, mrs, s, m, mb_type);
+        m->mb_type = mrs->mb_type[mb_x]= mb_type;
+        return 0;
+    }
+
+
+    if (IS_INTRA_PCM(mb_type)){
+        mrs->mb_type[mb_x] =  mb_type; // PCM: only the type is recorded, no prediction data is derived
+        return 0;
+    }
+    else if (IS_INTRA(mb_type)){
+        int i, pred_mode;
+
+        if( IS_INTRA4x4( mb_type ) ) {
+            if ( IS_8x8DCT(mb_type) ) {
+                for( i = 0; i < 16; i+=4 ) { // one mode per group of four blocks, written to a 2x2 cache area
+                    int pred = pred_intra_mode(mrc, mrs, i );
+                    int mode = m->intra4x4_pred_mode[i];
+
+                    mode = mode < 0 ?  pred : mode + ( mode >= pred ); // negative stored value: use the prediction; otherwise step over the predicted value
+                    fill_rectangle( &mrs->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
+                }
+            } else {
+                for( i = 0; i < 16; i++ ) {
+                    int pred = pred_intra_mode(mrc, mrs, i );
+                    int mode = m->intra4x4_pred_mode[i];
+                    mode = mode < 0 ?  pred : mode + ( mode >= pred );
+                    mrs->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
+                }
+            }
+            write_back_intra_pred_mode_rec(mrc, mrs, m, mb_x);
+            if( check_intra4x4_pred_mode(mrc, mrs, s, m) < 0 ) return -1;
+        } else {
+            m->intra16x16_pred_mode= check_intra_pred_mode(mrc, mrs, s, m, m->intra16x16_pred_mode );
+            if( m->intra16x16_pred_mode < 0 ) return -1;
+        }
+
+        pred_mode = m->chroma_pred_mode;
+        pred_mode= check_intra_pred_mode( mrc, mrs, s, m, pred_mode );
+        if( pred_mode < 0 ) return -1;
+        m->chroma_pred_mode= pred_mode;
+
+    }
+    else if (IS_8X8(mb_type)){
+        int i, j, list;
+
+        if( s->slice_type_nos == FF_B_TYPE ) {
+            if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] |
+                            m->sub_mb_type[2] | m->sub_mb_type[3]) ) {
+                ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type);
+                mrs->ref_cache[0][scan8[4]] =
+                mrs->ref_cache[1][scan8[4]] =
+                mrs->ref_cache[0][scan8[12]] =
+                mrs->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
+            }
+        }
+
+        for(list=0; list<s->list_count; list++){
+            for(i=0; i<4; i++){
+                if(IS_DIRECT(m->sub_mb_type[i])){
+                    mrs->ref_cache[list][ scan8[4*i]   ]=mrs->ref_cache[list][ scan8[4*i]+1 ];
+                    continue;
+                } else {
+                    mrs->ref_cache[list][ scan8[4*i]   ]=mrs->ref_cache[list][ scan8[4*i]+1 ]=
+                    mrs->ref_cache[list][ scan8[4*i]+8 ]=mrs->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i];
+
+                    if(IS_DIR(m->sub_mb_type[i], 0, list) ){
+                        const int sub_mb_type= m->sub_mb_type[i];
+                        const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
+
+                        int sub_partition_count = IS_SUB_8X8(sub_mb_type) ? 1 : (IS_SUB_4X4(sub_mb_type)? 4 :2);
+                        for(j=0; j<sub_partition_count; j++){
+                            int mx, my;
+                            const int index= 4*i + block_width*j;
+                            int16_t (* mv_cache)[2]= &mrs->mv_cache[list][ scan8[index]];
+                            pred_motion(mrc, mrs, s, index, block_width, list, mrs->ref_cache[list][ scan8[index] ], &mx, &my);
+
+                            ADD_MVD(list)
+
+                            if(IS_SUB_8X8(sub_mb_type)){ // replicate the vector over the cache cells this sub-partition covers (+1 right, +8 below)
+                                mv_cache[ 1 ][0]=
+                                mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
+                                mv_cache[ 1 ][1]=
+                                mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
+                            }else if(IS_SUB_8X4(sub_mb_type)){
+                                mv_cache[ 1 ][0]= mx;
+                                mv_cache[ 1 ][1]= my;
+                            }else if(IS_SUB_4X8(sub_mb_type)){
+                                mv_cache[ 8 ][0]= mx;
+                                mv_cache[ 8 ][1]= my;
+                            }
+                            mv_cache[ 0 ][0]= mx;
+                            mv_cache[ 0 ][1]= my;
+                        }
+                    }else{
+                        fill_rectangle(mrs->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4);
+                    }
+                }
+            }
+        }
+    } else if( IS_DIRECT(mb_type) ) {
+        mb_type &= ~MB_TYPE_16x16;  //FIXME not nice
+        ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type);
+    }
+    else {
+        int list, i;
+        if(IS_16X16(mb_type)){
+            for(list=0; list<s->list_count; list++){
+                if(IS_DIR(mb_type, 0, list)){
+                    int ref;
+                    int mx,my;
+
+                    ref = m->ref_index[list][0];
+                    fill_rectangle(&mrs->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
+                    pred_motion(mrc, mrs, s, 0, 4, list, mrs->ref_cache[list][ scan8[0] ], &mx, &my);
+                    ADD_MVD(list)
+                    fill_rectangle(mrs->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
+                }
+            }
+        }
+        else if(IS_16X8(mb_type)){
+            for(list=0; list<s->list_count; list++){
+                for(i=0; i<2; i++){ // partition i covers cache rows starting at scan8[0] + 16*i
+                    if(IS_DIR(mb_type, i, list)){
+                        int ref;
+                        int mx,my;
+                        ref = m->ref_index[list][i];
+                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
+
+                        pred_16x8_motion(mrc, mrs, s, 8*i, list, mrs->ref_cache[list][scan8[0] + 16*i], &mx, &my);
+                        ADD_MVD(list)
+
+                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
+                    }else{
+                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
+                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
+                    }
+                }
+            }
+
+        }else{
+            assert(IS_8X16(mb_type));
+
+            for(list=0; list<s->list_count; list++){
+                for(i=0; i<2; i++){ // partition i covers cache columns starting at scan8[0] + 2*i
+                    if(IS_DIR(mb_type, i, list)){ //FIXME optimize
+                        int ref;
+                        int mx,my;
+                        ref = m->ref_index[list][i];
+                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
+                        pred_8x16_motion(mrc, mrs, s, i*4, list, mrs->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
+                        ADD_MVD(list)
+                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
+                    }else{
+                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
+                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
+                    }
+                }
+            }
+        }
+    }
+
+    if (IS_INTER(mb_type)||(IS_DIRECT(mb_type)))
+        write_back_motion_rec(mrc, mrs, s, m, mb_type);
+    m->mb_type = mrs->mb_type[mb_x]= mb_type; // publish the possibly updated type to both the MB record and the per-row state
+
+    return 0;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pred_mode.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_pred_mode.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,10 @@
+#ifndef H264_PRED_MODE_H
+#define H264_PRED_MODE_H
+
+#include "h264_types.h"
+
+/* Direct-mode motion prediction for macroblock m; may update *mb_type. */
+void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type);
+/* Per-macroblock prediction entry point; returns 0 on success, -1 when an intra mode check fails. */
+int pred_motion_mb_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m);
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_ps.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_ps.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,462 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... parameter set decoding
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 parameter set decoding.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "dsputil.h"
+#include "avcodec.h"
+#include "h264_types.h"
+#include "h264_data.h"
+#include "golomb.h"
+
+
+//#undef NDEBUG
+#include <assert.h>
+
+static const int pixel_aspect[17][2]={ /* row count bounds aspect_ratio_idc in decode_vui_parameters(); rows are presumably {numerator, denominator} -- TODO confirm */
+ {0, 1},
+ {1, 1},
+ {12, 11},
+ {10, 11},
+ {16, 11},
+ {40, 33},
+ {24, 11},
+ {20, 11},
+ {32, 11},
+ {80, 33},
+ {18, 11},
+ {15, 11},
+ {64, 33},
+ {160,99},
+ {4, 3},
+ {3, 2},
+ {2, 1},
+};
+
+const uint8_t ff_h264_chroma_qp[52]={ /* lookup used by build_qp_table(): indexed with a value clipped to 0..51, yields the chroma QP table entry */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,
+   12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,
+   28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37,
+   37,38,38,38,39,39,39,39
+};
+
+static const uint8_t default_scaling4[2][16]={ /* preset 4x4 lists: decode_scaling_matrices() passes [0] for the Intra lists and [1] for the Inter lists */
+{   6,13,20,28,
+   13,20,28,32,
+   20,28,32,37,
+   28,32,37,42
+},{
+   10,14,20,24,
+   14,20,24,27,
+   20,24,27,30,
+   24,27,30,34
+}};
+
+static const uint8_t default_scaling8[2][64]={ /* preset 8x8 lists: decode_scaling_matrices() passes [0] for the Intra list and [1] for the Inter list */
+{   6,10,13,16,18,23,25,27,
+   10,11,16,18,23,25,27,29,
+   13,16,18,23,25,27,29,31,
+   16,18,23,25,27,29,31,33,
+   18,23,25,27,29,31,33,36,
+   23,25,27,29,31,33,36,38,
+   25,27,29,31,33,36,38,40,
+   27,29,31,33,36,38,40,42
+},{
+    9,13,15,17,19,21,22,24,
+   13,13,17,19,21,22,24,25,
+   15,17,19,21,22,24,25,27,
+   17,19,21,22,24,25,27,28,
+   19,21,22,24,25,27,28,30,
+   21,22,24,25,27,28,30,32,
+   22,24,25,27,28,30,32,33,
+   24,25,27,28,30,32,33,35
+}};
+
+static inline int decode_hrd_parameters(GetBitContext *gb, SPS *sps){
+    int cpb_count, i;
+    cpb_count = get_ue_golomb_31(gb) + 1;
+    /* Reads one HRD parameter block from gb; only the four length fields and cpb_cnt are stored in sps. Returns 0, or -1 when cpb_count exceeds 32. */
+    if(cpb_count > 32){
+        av_log(AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
+        return -1;
+    }
+
+    get_bits(gb, 4); /* bit_rate_scale */
+    get_bits(gb, 4); /* cpb_size_scale */
+    for(i=0; i<cpb_count; i++){ /* per-CPB values are consumed to keep the bit position right, but not kept */
+        get_ue_golomb(gb); /* bit_rate_value_minus1 */
+        get_ue_golomb(gb); /* cpb_size_value_minus1 */
+        get_bits1(gb);     /* cbr_flag */
+    }
+    sps->initial_cpb_removal_delay_length = get_bits(gb, 5) + 1;
+    sps->cpb_removal_delay_length = get_bits(gb, 5) + 1;
+    sps->dpb_output_delay_length = get_bits(gb, 5) + 1;
+    sps->time_offset_length = get_bits(gb, 5);
+    sps->cpb_cnt = cpb_count;
+    return 0;
+}
+
+static inline int decode_vui_parameters(GetBitContext *gb, SPS *sps){
+    int aspect_ratio_info_present_flag;
+    unsigned int aspect_ratio_idc;
+    /* Parses the VUI block of an SPS from gb into sps; fields that are not stored are read and discarded. Returns 0, or -1 on invalid timing info, HRD data or num_reorder_frames. */
+    aspect_ratio_info_present_flag= get_bits1(gb);
+    if( aspect_ratio_info_present_flag ) {
+        aspect_ratio_idc= get_bits(gb, 8);
+        if( aspect_ratio_idc == EXTENDED_SAR ) {
+            sps->num= get_bits(gb, 16);
+            sps->den= get_bits(gb, 16);
+        }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(int[2])){
+            sps->num= pixel_aspect[aspect_ratio_idc][0]; /* predefined ratio: take it from the table instead of keeping stale values */
+            sps->den= pixel_aspect[aspect_ratio_idc][1];
+        }else{
+            av_log( AV_LOG_ERROR, "illegal aspect ratio idc %d\n", aspect_ratio_idc);
+            /* deliberately not fatal: parsing continues and num/den are left untouched */
+        }
+    }else{
+        sps->num=
+        sps->den= 0;
+    }
+
+    if(get_bits1(gb)){      /* overscan_info_present_flag */
+        get_bits1(gb);      /* overscan_appropriate_flag */
+    }
+
+    sps->video_signal_type_present_flag = get_bits1(gb);
+    if(sps->video_signal_type_present_flag){
+        get_bits(gb, 3);    /* video_format */
+        sps->full_range = get_bits1(gb); /* video_full_range_flag */
+
+        sps->colour_description_present_flag = get_bits1(gb);
+        if(sps->colour_description_present_flag){
+            sps->color_primaries = get_bits(gb, 8); /* colour_primaries */
+            sps->color_trc       = get_bits(gb, 8); /* transfer_characteristics */
+            sps->colorspace      = get_bits(gb, 8); /* matrix_coefficients */
+            if (sps->color_primaries >= AVCOL_PRI_NB) /* values outside the known enums degrade to UNSPECIFIED */
+                sps->color_primaries  = AVCOL_PRI_UNSPECIFIED;
+            if (sps->color_trc >= AVCOL_TRC_NB)
+                sps->color_trc  = AVCOL_TRC_UNSPECIFIED;
+            if (sps->colorspace >= AVCOL_SPC_NB)
+                sps->colorspace  = AVCOL_SPC_UNSPECIFIED;
+        }
+    }
+
+    if(get_bits1(gb)){      /* chroma_location_info_present_flag */
+        av_log(AV_LOG_ERROR, "chroma_location_info_present_flag found, but not supported\n");
+        (void) (get_ue_golomb(gb)+1);  /* chroma_sample_location_type_top_field */
+        (void) get_ue_golomb(gb);  /* chroma_sample_location_type_bottom_field */
+    }
+
+    sps->timing_info_present_flag = get_bits1(gb);
+    if(sps->timing_info_present_flag){
+        sps->num_units_in_tick = get_bits_long(gb, 32);
+        sps->time_scale = get_bits_long(gb, 32);
+        if(!sps->num_units_in_tick || !sps->time_scale){
+            av_log(AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick);
+            return -1;
+        }
+        sps->fixed_frame_rate_flag = get_bits1(gb);
+    }
+
+    sps->nal_hrd_parameters_present_flag = get_bits1(gb);
+    if(sps->nal_hrd_parameters_present_flag)
+        if(decode_hrd_parameters(gb, sps) < 0)
+            return -1;
+    sps->vcl_hrd_parameters_present_flag = get_bits1(gb);
+    if(sps->vcl_hrd_parameters_present_flag)
+        if(decode_hrd_parameters(gb, sps) < 0) /* writes the same sps fields as the NAL HRD call above */
+            return -1;
+    if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
+        get_bits1(gb);     /* low_delay_hrd_flag */
+    sps->pic_struct_present_flag = get_bits1(gb);
+
+    sps->bitstream_restriction_flag = get_bits1(gb);
+    if(sps->bitstream_restriction_flag){
+        get_bits1(gb);     /* motion_vectors_over_pic_boundaries_flag */
+        get_ue_golomb(gb); /* max_bytes_per_pic_denom */
+        get_ue_golomb(gb); /* max_bits_per_mb_denom */
+        get_ue_golomb(gb); /* log2_max_mv_length_horizontal */
+        get_ue_golomb(gb); /* log2_max_mv_length_vertical */
+        sps->num_reorder_frames= get_ue_golomb(gb);
+        get_ue_golomb(gb); /*max_dec_frame_buffering*/
+        /* unsigned compare so a value that wrapped negative in a signed field is rejected too */
+        if((unsigned)sps->num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
+            av_log(AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static void decode_scaling_list(GetBitContext *gb, uint8_t *factors, int size, const uint8_t *jvt_list, const uint8_t *fallback_list){
+    int i, last = 8, next = 8; /* Fills factors[0..size-1] from gb: a copy of fallback_list, a copy of jvt_list, or delta-coded values. */
+    const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct; /* decoded values are stored through the scan table chosen by list size */
+    if(!get_bits1(gb)) /* matrix not written, we use the predicted one */
+        memcpy(factors, fallback_list, size*sizeof(uint8_t));
+    else
+    for(i=0;i<size;i++){
+        if(next) /* once next becomes 0 no further deltas are read and `last` is repeated for the remaining entries */
+            next = (last + get_se_golomb(gb)) & 0xff;
+        if(!i && !next){ /* matrix not written, we use the preset one */
+            memcpy(factors, jvt_list, size*sizeof(uint8_t));
+            break;
+        }
+        last = factors[scan[i]] = next ? next : last;
+    }
+}
+
+static void decode_scaling_matrices(GetBitContext *gb, SPS *sps, PPS *pps, int is_sps,
+                                   uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){ /* Reads the optional scaling lists of an SPS (is_sps=1) or PPS (is_sps=0) into the given arrays. */
+    int fallback_sps = !is_sps && sps->scaling_matrix_present; /* a PPS falls back to the SPS lists when the SPS carried some, otherwise to the presets */
+    const uint8_t *fallback[4] = {
+        fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
+        fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
+        fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
+        fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
+    };
+    if(get_bits1(gb)){ /* when this flag is 0 the output arrays are left exactly as the caller prepared them */
+        sps->scaling_matrix_present |= is_sps;
+        decode_scaling_list(gb, scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
+        decode_scaling_list(gb, scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
+        decode_scaling_list(gb, scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
+        decode_scaling_list(gb, scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
+        decode_scaling_list(gb, scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
+        decode_scaling_list(gb, scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
+        if(is_sps || pps->transform_8x8_mode){ /* pps is NULL in the SPS call; is_sps short-circuits before it is dereferenced */
+            decode_scaling_list(gb, scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
+            decode_scaling_list(gb, scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
+        }
+    }
+}
+
+int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb){
+    int profile_idc, level_idc;
+    unsigned int sps_id;
+    int i;
+    SPS *sps;
+    /* Parses a sequence parameter set from gb into n->sps_buffers[sps_id] (allocated on first use) and copies it to n->sps. Returns 0 on success, -1 on error. */
+    profile_idc= get_bits(gb, 8);
+    get_bits1(gb);   //constraint_set0_flag
+    get_bits1(gb);   //constraint_set1_flag
+    get_bits1(gb);   //constraint_set2_flag
+    get_bits1(gb);   //constraint_set3_flag
+    get_bits(gb, 4); // reserved
+    level_idc= get_bits(gb, 8);
+    sps_id= get_ue_golomb_31(gb);
+
+    if(sps_id >= MAX_SPS_COUNT) {
+        av_log(AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
+        return -1;
+    }
+    if (!n->sps_buffers[sps_id])
+        n->sps_buffers[sps_id]= av_mallocz(sizeof(SPS));
+
+    sps = n->sps_buffers[sps_id];
+    if(sps == NULL)
+        return -1;
+
+    sps->profile_idc= profile_idc;
+    sps->level_idc= level_idc;
+
+    memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4)); /* flat value 16 everywhere unless lists are decoded below */
+    memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
+    sps->scaling_matrix_present = 0;
+
+    if(sps->profile_idc >= 100){ //high profile
+        sps->chroma_format_idc= get_ue_golomb_31(gb);
+        if(sps->chroma_format_idc == 3)
+            sps->residual_color_transform_flag = get_bits1(gb);
+        sps->bit_depth_luma   = get_ue_golomb(gb) + 8;
+        sps->bit_depth_chroma = get_ue_golomb(gb) + 8;
+        sps->transform_bypass = get_bits1(gb);
+        decode_scaling_matrices(gb, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
+    }else{
+        sps->chroma_format_idc= 1;
+        sps->bit_depth_luma   = 8;
+        sps->bit_depth_chroma = 8;
+    }
+
+    sps->log2_max_frame_num= get_ue_golomb(gb) + 4;
+    sps->poc_type= get_ue_golomb_31(gb);
+
+    if(sps->poc_type == 0){ //FIXME #define
+        sps->log2_max_poc_lsb= get_ue_golomb(gb) + 4;
+    } else if(sps->poc_type == 1){//FIXME #define
+        sps->delta_pic_order_always_zero_flag= get_bits1(gb);
+        sps->offset_for_non_ref_pic= get_se_golomb(gb);
+        sps->offset_for_top_to_bottom_field= get_se_golomb(gb);
+        sps->poc_cycle_length                = get_ue_golomb(gb);
+
+        if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
+            av_log(AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
+            goto fail;
+        }
+
+        for(i=0; i<sps->poc_cycle_length; i++)
+            sps->offset_for_ref_frame[i]= get_se_golomb(gb);
+    }else if(sps->poc_type != 2){
+        av_log(AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
+        goto fail;
+    }
+
+    sps->ref_frame_count= get_ue_golomb_31(gb);
+    if(sps->ref_frame_count >= 32){
+        av_log(AV_LOG_ERROR, "too many reference frames\n");
+        goto fail;
+    }
+    sps->gaps_in_frame_num_allowed_flag= get_bits1(gb);
+    sps->mb_width = get_ue_golomb(gb) + 1;
+    sps->mb_height= get_ue_golomb(gb) + 1;
+
+    sps->frame_mbs_only_flag= get_bits1(gb);
+    if(!sps->frame_mbs_only_flag){
+        av_log(AV_LOG_ERROR, "MBAFF support not included\n");
+        get_bits1(gb); /* one bit is consumed and discarded here; sps->mb_aff is not updated on this path */
+    }else
+        sps->mb_aff= 0;
+
+    sps->direct_8x8_inference_flag= get_bits1(gb);
+    if(!sps->frame_mbs_only_flag && !sps->direct_8x8_inference_flag){
+        av_log(AV_LOG_ERROR, "This stream was generated by a broken encoder, invalid 8x8 inference\n");
+        goto fail;
+    }
+
+    sps->crop= get_bits1(gb);
+    if(sps->crop){
+        sps->crop_left = get_ue_golomb(gb);
+        sps->crop_right = get_ue_golomb(gb);
+        sps->crop_top = get_ue_golomb(gb);
+        sps->crop_bottom= get_ue_golomb(gb);
+        if(sps->crop_left || sps->crop_top){ /* unusual cropping is only reported, never rejected */
+            av_log( AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
+        }
+        if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
+            av_log( AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
+        }
+    }else {
+        /* no cropping signalled */
+        sps->crop_left  =
+        sps->crop_right =
+        sps->crop_top   =
+        sps->crop_bottom= 0;
+    }
+
+    sps->vui_parameters_present_flag= get_bits1(gb);
+    if( sps->vui_parameters_present_flag )
+        if (decode_vui_parameters(gb, sps) < 0)
+            goto fail;
+    /* keep a by-value copy of the parsed set in the NAL context */
+    n->sps = *sps;
+
+    if( sps->bitstream_restriction_flag){
+        n->has_b_frames = sps->num_reorder_frames;
+    }
+    else
+        n->has_b_frames= MAX_DELAYED_PIC_COUNT;
+
+    return 0;
+fail:
+    /* Clear the slot too: later SPS/PPS parsing tests and dereferences sps_buffers[], so it must not point at freed memory. */
+    av_free(sps);
+    n->sps_buffers[sps_id]= NULL;
+    return -1;
+}
+
+static void
+build_qp_table(PPS *pps, int t, int index)
+{
+    /* Row t of the chroma QP table: entry qp is the ff_h264_chroma_qp value at qp+index clipped to 0..51. */
+    for(int qp = 0; qp < 52; qp++)
+        pps->chroma_qp_table[t][qp] = ff_h264_chroma_qp[av_clip(qp + index, 0, 51)];
+}
+
+int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length){
+    unsigned int pps_id= get_ue_golomb(gb);
+    PPS *pps;
+    /* Parses a picture parameter set from gb into n->pps_buffers[pps_id] (allocated on first use). bit_length bounds the optional trailing fields. Returns 0 on success, -1 on error. */
+    if(pps_id >= MAX_PPS_COUNT) {
+        av_log(AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
+        return -1;
+    }
+    if (!n->pps_buffers[pps_id])
+        n->pps_buffers[pps_id]= av_mallocz(sizeof(PPS));
+    pps = n->pps_buffers[pps_id];
+    if(pps == NULL)
+        return -1;
+    pps->sps_id= get_ue_golomb_31(gb);
+    if((unsigned)pps->sps_id>=MAX_SPS_COUNT || n->sps_buffers[pps->sps_id] == NULL){
+        av_log(AV_LOG_ERROR, "sps_id out of range\n");
+        goto fail;
+    }
+
+    pps->cabac= get_bits1(gb);
+    pps->pic_order_present= get_bits1(gb);
+    if(pps->pic_order_present){
+        av_log(AV_LOG_ERROR, "no interlaces support\n");
+    }
+    pps->slice_group_count= get_ue_golomb(gb) + 1;
+    if(pps->slice_group_count > 1 ){
+        pps->mb_slice_group_map_type= get_ue_golomb(gb);
+        av_log(AV_LOG_ERROR, "multiple slices not supported\n");
+    }
+    pps->ref_count[0]= get_ue_golomb(gb) + 1;
+    pps->ref_count[1]= get_ue_golomb(gb) + 1;
+    if(pps->ref_count[0]> 32 || pps->ref_count[1]> 32){
+        av_log(AV_LOG_ERROR, "reference overflow (pps)\n");
+        goto fail;
+    }
+
+    pps->weighted_pred= get_bits1(gb);
+    pps->weighted_bipred_idc= get_bits(gb, 2);
+    pps->init_qp= get_se_golomb(gb) + 26;
+    pps->init_qs= get_se_golomb(gb) + 26;
+    pps->chroma_qp_index_offset[0]= get_se_golomb(gb);
+    pps->deblocking_filter_parameters_present= get_bits1(gb);
+    pps->constrained_intra_pred= get_bits1(gb);
+    pps->redundant_pic_cnt_present = get_bits1(gb);
+
+    pps->transform_8x8_mode= 0;
+    memcpy(pps->scaling_matrix4, n->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4)); /* start from the SPS lists; a PPS may override them below */
+    memcpy(pps->scaling_matrix8, n->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
+
+    if(get_bits_count(gb) < bit_length){ /* optional trailing fields are parsed only while unread bits remain */
+        pps->transform_8x8_mode= get_bits1(gb);
+        decode_scaling_matrices(gb, n->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
+        pps->chroma_qp_index_offset[1]= get_se_golomb(gb); //second_chroma_qp_index_offset
+    } else {
+        pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
+    }
+    build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
+    build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
+    /* assigned in both directions: the buffer is reused per pps_id, so a stale 1 must not survive */
+    pps->chroma_qp_diff= pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1];
+
+    return 0;
+fail:
+    av_free(pps);
+    n->pps_buffers[pps_id]= NULL; /* the slot must not keep pointing at freed memory */
+    return -1;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_ps.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_ps.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,9 @@
+#ifndef H264_PS_H
+#define H264_PS_H
+
+#include "h264_types.h"
+/* Parameter set parsers from h264_ps.c; both return 0 on success and -1 on error. bit_length limits how far the optional PPS tail is read. */
+int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb);
+int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pthread.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_pthread.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,604 @@
+#include "config.h"
+
+#include "h264_types.h"
+#include "h264_parser.h"
+#include "h264_nal.h"
+#include "h264_entropy.h"
+#include "h264_rec.h"
+#include "h264_misc.h"
+// #undef NDEBUG
+#include <assert.h>
+#include <pthread.h>
+
+#define XOANON 1 /* selects the 40/80-entry tables below instead of the small fallback ones */
+/* CPU ids used by create_ed_rec_threads(), indexed by worker number; ed_rec_smt_aff is chosen when h->smt is set. */
+#ifdef XOANON
+static int ed_rec_affinity[40] = { 0,  4,  8, 12, 16, 20, 24, 28, 32, 36,
+                                   1,  5,  9, 13, 17, 21, 25, 29, 33, 37,
+                                   2,  6, 10, 14, 18, 22, 26, 30, 34, 38,
+                                   3,  7, 11, 15, 19, 23, 27, 31, 35, 39 };
+static int ed_rec_smt_aff[80]  = { 0,  40,  4, 44,  8, 48, 12, 52, 16, 56, 20, 60, 24, 64, 28, 68, 32, 72, 36, 76,
+                                   1,  41,  5, 45,  9, 49, 13, 53, 17, 57, 21, 61, 25, 65, 29, 69, 33, 73, 37, 77,
+                                   2,  42,  6, 46, 10, 50, 14, 54, 18, 58, 22, 62, 26, 66, 30, 70, 34, 74, 38, 78,
+                                   3,  43,  7, 47, 11, 51, 15, 55, 19, 59, 23, 63, 27, 67, 31, 71, 35, 75, 39, 79 };
+#else
+static int ed_rec_affinity[10] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9};
+static int ed_rec_smt_aff[20] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, }; /* NOTE(review): only 10 of 20 entries are initialised, so entries 10..19 are 0 -- confirm this is intended */
+#endif
+
+static int frames=0; /* incremented by parse_thread() for every loop test and compared against h->num_frames */
+
+static void notify_one_worker(H264Context *h){ /* Signals h->task_cond under h->task_lock; pop_next_task() is the waiter on that condition. */
+    pthread_mutex_lock(&h->task_lock);
+    pthread_cond_signal(&h->task_cond);
+    pthread_mutex_unlock(&h->task_lock);
+}
+
+static void notify_all_workers(H264Context *h){ /* Broadcasts h->task_cond under h->task_lock so every thread waiting in pop_next_task() re-checks the queues. */
+    pthread_mutex_lock(&h->task_lock);
+    pthread_cond_broadcast(&h->task_cond);
+    pthread_mutex_unlock(&h->task_lock);
+}
+
+static void push_sbe (SliceBufferQueue *sbq, SliceBufferEntry *sbe, int notify ){ /* Appends sbe to the ring queue, blocking while it is full; signals sbq->cond afterwards only if notify is non-zero. */
+    pthread_mutex_lock(&sbq->lock);
+    while (sbq->cnt >= sbq->size) /* full: wait until pop_sbe()/pop_rle() signal that a slot was freed */
+        pthread_cond_wait(&sbq->cond, &sbq->lock);
+    sbq->queue[sbq->fi] = sbe;
+    sbq->cnt++;
+    sbq->fi++; sbq->fi %= sbq->size; /* fi is the insert index, wrapped to the ring size */
+    if (notify)
+        pthread_cond_signal(&sbq->cond);
+    pthread_mutex_unlock(&sbq->lock);
+}
+
+static SliceBufferEntry* pop_sbe (SliceBufferQueue *sbq, int block){
+    SliceBufferEntry *sbe=NULL;
+    /* Removes and returns the oldest entry. With block set it waits for one; otherwise it returns NULL when the queue is empty. */
+    pthread_mutex_lock(&sbq->lock);
+    if (block){
+        while (sbq->cnt <= 0)
+            pthread_cond_wait(&sbq->cond, &sbq->lock);
+    }else {
+        if (sbq->cnt <= 0)
+            goto nonblock;
+    }
+    sbe = sbq->queue[sbq->fo];
+    sbq->cnt--;
+    sbq->fo++; sbq->fo %= sbq->size; /* fo is the removal index, wrapped to the ring size */
+    pthread_cond_signal(&sbq->cond); /* push_sbe() waits on this same condition when the queue is full */
+nonblock:
+    pthread_mutex_unlock(&sbq->lock);
+
+    return sbe;
+}
+
+// static void push_rle (RingLineQueue *rlq, SliceBufferEntry *sbe, int line, int notify){
+//
+//     //check for free slots
+//     pthread_mutex_lock(&rlq->wslock);
+//     while (rlq->free <= 0){
+//         pthread_cond_wait(&rlq->wscond, &rlq->wslock);
+//     }
+//     //free slot is available, decrement one in this lock
+//     rlq->free--;
+//     pthread_mutex_unlock(&rlq->wslock);
+//
+//     pthread_mutex_lock(&rlq->swlock);
+//     rlq->queue[rlq->fi]->sbe=sbe;
+//     rlq->queue[rlq->fi]->line=line;
+//     rlq->queue[rlq->fi]->mb_cnt=0;
+//     rlq->fi++; rlq->fi %= rlq->size;
+//     rlq->ready++;
+//     if(notify)
+//         pthread_cond_signal(&rlq->swcond);
+//     pthread_mutex_unlock(&rlq->swlock);
+// }
+
+// static RingLineEntry* pop_rle (RingLineQueue *rlq, int block){
+//     RingLineEntry *rle=NULL;
+//
+//     pthread_mutex_lock(&rlq->swlock);
+//     if (block){
+//         while (rlq->ready <= 0)
+//             pthread_cond_wait(&rlq->swcond, &rlq->swlock);
+//     }else {
+//         if (rlq->ready <= 0)
+//             goto nonblock;
+//     }
+//     rle = rlq->queue[rlq->fo];
+//     rlq->fo++; rlq->fo %= rlq->size;
+//     rlq->ready--;
+// nonblock:
+//     pthread_mutex_unlock(&rlq->swlock);
+//
+//     return rle;
+// }
+//
+// static void rel_rle (RingLineQueue *rlq){
+//     pthread_mutex_lock(&rlq->wslock);
+//     rlq->free++;
+//     pthread_cond_signal(&rlq->wscond);
+//     pthread_mutex_unlock(&rlq->wslock);
+// }
+
+static RingLineEntry* pop_rle (SliceBufferQueue *sbq, RingLineQueue *rlq, int *has_token){
+    RingLineEntry *rle=NULL;
+    SliceBufferEntry *sbe=NULL;
+    int line=-1;
+    /* Hands out the next untaken line of the slice at the head of sbq as a ring line entry. Returns NULL when sbq is empty or no token is available. */
+    pthread_mutex_lock(&sbq->lock);
+    if (sbq->cnt <= 0)
+        goto unlock;
+    sbe = sbq->queue[sbq->fo]; /* peek only: the slice stays queued until all its lines are taken */
+    line = sbe->lines_taken;
+
+    /* lock order here is sbq->lock first, then rlq->swlock */
+    pthread_mutex_lock(&rlq->swlock);
+    if (!*has_token){ /* a caller without a token must take one from rlq->free; a caller that already holds one keeps it */
+        if (rlq->free <= 0)
+            goto unlock2;
+        rlq->free--;
+        *has_token=1;
+    }
+    rle = rlq->queue[rlq->fo];
+    rlq->fo++; rlq->fo %= rlq->size;
+    rle->sbe=sbe;
+    rle->line = line;
+    rle->mb_cnt =0;
+    if (++sbe->lines_taken >= sbe->lines_total){ /* last line handed out: remove the slice from sbq and wake a blocked producer */
+        sbq->cnt--;
+        sbq->fo++; sbq->fo %= sbq->size;
+        pthread_cond_signal(&sbq->cond);
+    }
+unlock2:
+    pthread_mutex_unlock(&rlq->swlock);
+unlock:
+    pthread_mutex_unlock(&sbq->lock);
+
+
+    return rle;
+}
+
+static void rel_rle (RingLineQueue *rlq, int *rec_token){ /* Returns the caller's token: increments rlq->free and clears *rec_token, both under rlq->swlock. */
+    pthread_mutex_lock(&rlq->swlock);
+    rlq->free++;
+    *rec_token=0;
+//     pthread_cond_signal(&rlq->swcond);
+    pthread_mutex_unlock(&rlq->swlock);
+
+}
+
+// Get either an entropy task (*psbe) or a line reconstruction task (*prle), sleeping on h->task_cond until one exists.
+static void pop_next_task(H264Context *h, SliceBufferEntry **psbe, RingLineEntry **prle, int *rec_token){
+    /* Entropy work is tried first. *prle is not written when an entropy task is returned, so callers must test *psbe first. */
+    pthread_mutex_lock(&h->task_lock);
+
+    for(;;){
+        if ( (*psbe = pop_sbe(&h->sb_q[ENTROPY], 0)) ){
+            if (*rec_token){ /* taking entropy work while holding a token: give it back and wake another worker */
+                rel_rle(&h->rl_q, rec_token);
+                pthread_cond_signal(&h->task_cond);
+            }
+            break;
+        }
+        else if ( (*prle = pop_rle(&h->sb_q[MBDEC], &h->rl_q, rec_token)) )
+            break;
+        pthread_cond_wait(&h->task_cond, &h->task_lock);
+    }
+
+    pthread_mutex_unlock(&h->task_lock);
+}
+
+void *parse_thread(void *arg){
+    H264Context *h = (H264Context *) arg;
+    ParserContext *pc = get_parse_context(h->ifile);
+    NalContext *nc = get_nal_context(h->width, h->height);
+    H264Slice *s;
+    SliceBufferEntry *sbe = NULL;
+    /* Producer thread: reads frames, decodes their NAL units into a slice buffer entry and queues it for entropy decoding. arg is the H264Context. */
+    while(!pc->final_frame && frames++ <h->num_frames && !h->quit){
+        sbe = get_sb_entry(h);
+
+        av_read_frame_internal(pc, &sbe->gb);
+        s = &sbe->slice;
+
+        decode_nal_units(nc, s, &sbe->gb);
+
+        push_sbe(&h->sb_q[ENTROPY], sbe, 0);
+        notify_one_worker(h);
+    }
+    /* End of input: queue sentinel entries (state = -1) that tell the downstream threads to stop. */
+    if (!h->no_mbd){
+        sbe = get_sb_entry(h);
+        sbe->state=-1;
+        sbe->slice.coded_pic_num=nc->coded_pic_num;
+        sbe->lines_total=h->threads; /* pop_rle() hands out lines_total lines, i.e. one sentinel line per worker thread */
+
+        push_sbe(&h->sb_q[REORDER], sbe, 1);
+    }else{
+        for (int i=0; i<h->threads; i++){ /* without MB decoding each worker gets its own sentinel through the ENTROPY queue */
+            sbe = get_sb_entry(h);
+            sbe->state=-1;
+            push_sbe(&h->sb_q[ENTROPY], sbe, 1);
+            notify_one_worker(h);
+        }
+    }
+    free_nal_context(nc);
+    free_parse_context(pc);
+
+    pthread_exit(NULL);
+    return NULL;
+}
+
+int decode_slice_entropy(EntropyContext *ec, SliceBufferEntry *sbe){
+    int i,j;
+    H264Slice *s = &sbe->slice;
+    GetBitContext *gb = &sbe->gb;
+    CABACContext *c = &ec->c;
+    H264Mb *mbs = sbe->mbs;
+    /* CABAC-decodes every macroblock of the slice in sbe into sbe->mbs, in raster order. Returns 0 on success, -1 for non-CABAC streams or on a decode error. */
+    if( !s->pps.cabac ){
+        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
+        return -1;
+    }
+
+    init_dequant_tables(s, ec);
+    ec->curr_qscale = s->qscale;
+    ec->last_qscale_diff = 0;
+    ec->chroma_qp[0] = get_chroma_qp( s, 0, s->qscale);
+    ec->chroma_qp[1] = get_chroma_qp( s, 1, s->qscale);
+
+    /* realign */
+    align_get_bits( gb );
+    /* init cabac */
+    ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8);
+
+    ff_h264_init_cabac_states(ec, s, c);
+
+    for(j=0; j<ec->mb_height; j++){
+        init_entropy_buf(ec, s, j);
+        for(i=0; i<ec->mb_width; i++){
+            int eos,ret;
+            H264Mb *m = &mbs[i + j*ec->mb_width];
+            //memset(m, 0, sizeof(H264Mb));
+            m->mb_x=i;
+            m->mb_y=j;
+            ec->m = m;
+
+            ret = ff_h264_decode_mb_cabac(ec, s, c);
+            eos = get_cabac_terminate( c); (void) eos; /* the terminate bin is consumed; its value is not used to end the loop */
+
+            if( ret < 0 || c->bytestream > c->bytestream_end + 2) { /* also fails when the decoder has read more than 2 bytes past the buffer end */
+                av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream);
+                return -1;
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int decode_slice_mb(MBRecContext *d, RingLineEntry *rle, int frames){ /* Reconstructs macroblock line rle->line of rle->sbe; always returns 0. frames is the caller's picture number (0 for the first picture) and shadows the file-level counter. */
+    SliceBufferEntry *sbe= rle->sbe;
+    H264Slice *s = &sbe->slice;
+    H264Mb *mbs = sbe->mbs;
+
+    int mb_width= d->mb_width;
+    int i;
+    const int line = rle->line;
+
+    init_mbrec_context(d, d->mrs, s, line);
+
+    H264Mb *m = &mbs[line*mb_width];
+    d->top=rle->prev_line->top;
+    d->top_next=rle->top;
+
+//     assert(rle->mb_cnt ==0);
+    for(i=0; i< mb_width; i++){
+        if (frames || line>0){ /* only line 0 of picture 0 skips the wait on the previous ring line */
+            while (rle->mb_cnt >= rle->prev_line->mb_cnt -1); /* spin until prev_line->mb_cnt is at least two ahead of ours. NOTE(review): plain busy-wait; whether mb_cnt is volatile/atomic is not visible here */
+        }
+        h264_decode_mb_internal( d, d->mrs, s, &m[i]);
+        rle->mb_cnt++;
+    }
+    draw_edges(d, s, line);
+
+    return 0;
+}
+
+// static int decode_slice_mb_static(MBRecContext *d, H264Slice *s, RLThreadContext *r, RLThreadContext *rp,  int frames){
+//     int mb_height= d->mb_height;
+//     int mb_width= d->mb_width;
+//     int thread_num = r->thread_num;
+//     int thread_total = r->thread_total;
+//     int i;
+//     int j = thread_num;
+//
+//     r->mb_cnt=frames* mb_height*mb_width;
+//     for(; j<mb_height; j+=thread_total){
+//         H264Mb *m = &s->mbs[j*mb_width];
+//         for(i=0; i< mb_width; i++){
+//             if (j>0){
+//                 while (r->mb_cnt- (thread_num? 0:mb_width) >= rp->mb_cnt-1);
+//             }
+//             h264_decode_mb_internal(d, s, m++);
+//             r->mb_cnt++;
+//         }
+//         draw_edges(d, s, j);
+//     }
+//     return 0;
+// }
+
+static void *ed_rec_thread(void *arg){
+    H264Context *h =  (H264Context*) arg;
+    EntropyContext *ec=NULL;
+    MBRecContext *mrc=NULL;
+    /* Worker thread: repeatedly takes either a slice to entropy-decode or a macroblock line to reconstruct, until it sees a sentinel (state < 0). */
+    RingLineEntry *rle=NULL;
+    SliceBufferEntry *sbe=NULL;
+    H264Slice *s;
+    int rec_token=0;
+
+    if (!h->no_mbd){
+        mrc = get_mbrec_context(h);
+    }
+    ec = get_entropy_context(h);
+
+    for(;;){
+        pop_next_task(h, &sbe, &rle, &rec_token);
+        if (sbe){ /* sbe is tested first because pop_next_task() leaves rle untouched when it returns an entropy task */
+            if (h->no_mbd && sbe->state<0){
+                break;
+            }
+            if (!sbe->initialized){
+                init_sb_entry(h, sbe);
+            }
+            decode_slice_entropy(ec, sbe); /* NOTE(review): the return value is ignored, so a failed slice is still passed on */
+
+            if (h->no_mbd){
+                release_sb_entry(h, sbe);
+                sbe=NULL;
+            } else {
+                push_sbe(&h->sb_q[REORDER], sbe, 1);
+            }
+        } else if (rle){
+            if (rle->sbe->state<0)
+                break;
+            s = &rle->sbe->slice;
+
+            decode_slice_mb(mrc, rle, s->coded_pic_num);
+
+            if (rle->line == h->mb_height-1){ /* the worker that finishes the last line forwards the picture to the output stage */
+                push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1);
+            }
+            rle->mb_cnt++; /* one extra increment: mb_cnt ends above mb_width, which the spin loops on prev_line->mb_cnt rely on */
+        }
+    }
+
+    //make sure threads quit in order of rle assignment
+    if (!h->no_mbd){
+        while (rle->prev_line->mb_cnt <= h->mb_width); /* busy-wait until the previous ring line is marked complete */
+        rel_rle(&h->rl_q, &rec_token);
+        notify_one_worker(h);
+        rle->mb_cnt = h->mb_width +1; /* mark our own sentinel line complete for the next waiter */
+        if (rle->line == h->threads-1){ /* the sentinel slice has h->threads lines; the last one forwards it to OUTPUT */
+            push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1);
+        }
+
+        free_mbrec_context(mrc);
+    }
+
+    free_entropy_context(ec);
+
+    pthread_exit(NULL);
+    return NULL;
+}
+
+static void *reorder_thread(void *arg){
+    H264Context *h = (H264Context *) arg;
+    int i;
+    SliceBufferEntry *reorder[h->sb_size];
+    SliceBufferEntry *sbe, *next_sbe;
+    H264Slice *s;
+    int reorder_cnt=0;
+    unsigned next_pic_num=0;
+    /* Takes entropy-decoded slices from the REORDER queue and forwards them to MBDEC strictly in coded_pic_num order, resolving reference pictures first. */
+    for(;;){
+
+        sbe = pop_sbe(&h->sb_q[REORDER], 1);
+
+        s = &sbe->slice;
+        for(i=reorder_cnt; i>0; i--){ /* insertion sort: reorder[] is kept in descending coded_pic_num, smallest at index reorder_cnt */
+            if (s->coded_pic_num < reorder[i-1]->slice.coded_pic_num)
+                break;
+            reorder[i]=reorder[i-1];
+        }
+        reorder[i]=sbe;
+
+        while(reorder_cnt>=0){ /* release every buffered slice that is next in sequence */
+            if (next_pic_num!=reorder[reorder_cnt]->slice.coded_pic_num){
+                break;
+            }
+            next_sbe = reorder[reorder_cnt];
+            H264Slice *es = &next_sbe->slice;
+
+            if (next_sbe->state<0) /* sentinel from parse_thread(): stop after forwarding it */
+                goto end;
+
+            for (int i=0; i<2; i++){
+                for(int j=0; j< es->ref_count[i]; j++){
+                    if (es->ref_list_cpn[i][j] ==-1)
+                        continue;
+                    int k;
+                    for (k=0; k<h->max_dpb_cnt; k++){ /* map the referenced picture number to its decoded picture buffer entry */
+                        if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == es->ref_list_cpn[i][j]){
+                            es->dp_ref_list[i][j] = &h->dpb[k];
+                            break;
+                        }
+                    }
+                }
+            }
+            next_sbe->dp = get_dpb_entry(h, es);
+
+            push_sbe(&h->sb_q[MBDEC], next_sbe, 0);
+            notify_all_workers(h);
+
+//             for (int i=0; i< h->mb_height; i++){
+//                 push_rle(&h->rl_q, next_sbe, i, 0);
+//                 notify_one_worker(h);
+//             }
+
+
+            next_pic_num++;
+            reorder_cnt--;
+        }
+        reorder_cnt++; /* becomes the number of entries still buffered */
+    }
+
+end:
+    {
+        push_sbe(&h->sb_q[MBDEC], next_sbe, 0);
+        notify_all_workers(h);
+        if (h->no_mbd){ /* NOTE(review): h264_decode_pthread() only starts this thread when !h->no_mbd, so this branch looks unreachable -- confirm */
+            push_sbe(&h->sb_q[OUTPUT], next_sbe, 1);
+        }
+//         for (int i=0; i< h->threads; i++){
+//             push_rle(&h->rl_q, next_sbe, i, 0);
+//             notify_one_worker(h);
+//         }
+    }
+
+    pthread_exit(NULL);
+    return NULL;
+}
+
+void create_ed_rec_threads(H264Context *h){
+    cpu_set_t cpuset;
+    /* Starts h->threads ed_rec_thread workers; with h->setaff each one is pinned to a CPU from the affinity tables (SMT table when h->smt). */
+    if (h->setaff){
+        int *aff = h->smt ? ed_rec_smt_aff : ed_rec_affinity;
+        int aff_len = h->smt ? (int)(sizeof(ed_rec_smt_aff)/sizeof(ed_rec_smt_aff[0])) : (int)(sizeof(ed_rec_affinity)/sizeof(ed_rec_affinity[0]));
+        for (int i=0; i<h->threads; i++){
+            pthread_attr_init(&h->ed_rec_attr[i]);
+            CPU_ZERO(&cpuset);
+            CPU_SET(aff[i % aff_len], &cpuset); /* wrap so more threads than table entries cannot read past the table */
+            pthread_attr_setaffinity_np(&h->ed_rec_attr[i], sizeof(cpu_set_t), &cpuset);
+            pthread_create(&h->ed_rec_thr[i], &h->ed_rec_attr[i], ed_rec_thread, h);
+        }
+    } else {
+        for (int i=0; i<h->threads; i++){
+            pthread_create(&h->ed_rec_thr[i], NULL, ed_rec_thread, h);
+        }
+    }
+}
+
+void join_ed_rec_threads(H264Context *h){
+    /* Waits for each worker started by create_ed_rec_threads() to exit, in creation order. */
+    for (int worker = 0; worker < h->threads; ++worker)
+        pthread_join(h->ed_rec_thr[worker], NULL);
+}
+
+void *output_thread(void *arg){
+    H264Context *h = (H264Context *) arg;
+    /* Consumer thread: takes finished pictures from the OUTPUT queue, releases their buffers and writes frames until a sentinel (state < 0) arrives. */
+    OutputContext *oc = get_output_context( h );
+
+    SliceBufferEntry *sbe = NULL;
+    H264Slice *s=NULL;
+    for(;;) {
+        DecodedPicture *out, *dp;
+        sbe = pop_sbe(&h->sb_q[OUTPUT], 1);
+
+        if (sbe->state <0)
+            break;
+
+        s = &sbe->slice;
+        for (int i=0; i<s->release_cnt; i++){ /* drop the references this slice lists in release_ref_cpn */
+            for(int j=0; j<h->max_dpb_cnt; j++){
+                if(h->dpb[j].cpn== s->release_ref_cpn[i]){
+                    release_dpb_entry(h, &h->dpb[j], 2);
+                    break;
+                }
+            }
+        }
+
+        dp=sbe->dp; /* saved before the slice buffer entry is released */
+        release_sb_entry(h, sbe);
+
+        out =output_frame(h, oc, dp, h->ofile, h->frame_width, h->frame_height);
+        if (out){
+            release_dpb_entry(h, out, 1);
+        }
+
+        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
+
+    }
+    /* at the end of stream, we must flush the decoder buffers */
+    while (output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height));
+    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
+
+    free_output_context(oc);
+
+    pthread_exit(NULL);
+    return NULL;
+}
+
+/*
+* Runs the threaded decode pipeline: starts the parse thread, the reorder and output threads (unless h->no_mbd), the SDL thread (if built in and h->display) and the workers, then joins them all. Always returns 0.
+*/
+int h264_decode_pthread(H264Context *h) {
+    pthread_t parse_thr, reorder_thr, output_thr;
+
+    av_start_timer();
+
+    pthread_create(&parse_thr, NULL, parse_thread, h);
+    if (!h->no_mbd){
+        pthread_create(&reorder_thr, NULL, reorder_thread, h);
+        pthread_create(&output_thr, NULL, output_thread, h);
+    }
+#if HAVE_LIBSDL2
+    pthread_t sdl_thr;
+    if (h->display){
+        pthread_create(&sdl_thr, NULL, sdl_thread, h);
+    }
+#endif
+    create_ed_rec_threads(h);
+
+    /* Start-up handshake in four steps: wait for init_threads, raise touch_start, wait for touch_done, raise start. The counterpart code is outside this function. */
+    if (h->rl_side_touch){
+        pthread_mutex_lock(&h->ilock);
+        while (h->init_threads< h->threads)
+            pthread_cond_wait(&h->icond, &h->ilock);
+        pthread_mutex_unlock(&h->ilock);
+
+        pthread_mutex_lock(&h->tlock);
+        h->touch_start =1;
+        pthread_cond_broadcast(&h->tcond);
+        pthread_mutex_unlock(&h->tlock);
+
+        pthread_mutex_lock(&h->tdlock);
+        while (h->touch_done < h->threads)
+            pthread_cond_wait(&h->tdcond, &h->tdlock);
+        pthread_mutex_unlock(&h->tdlock);
+
+        pthread_mutex_lock(&h->slock);
+        h->start =1;
+        pthread_cond_broadcast(&h->scond);
+        pthread_mutex_unlock(&h->slock);
+    }
+    join_ed_rec_threads(h);
+    pthread_join(parse_thr, NULL);
+    if (!h->no_mbd){
+        pthread_join(reorder_thr, NULL);
+        pthread_join(output_thr, NULL);
+    }
+#if HAVE_LIBSDL2
+    if (h->display){ /* sdl_thr exists only when h->display was set, so the join must be guarded as well */
+        signal_sdl_exit(h);
+        pthread_join(sdl_thr, NULL);
+    }
+#endif
+
+    return 0;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pthread.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_pthread.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,14 @@
+#ifndef H264_PTHREAD_H
+#define H264_PTHREAD_H
+
+#include "h264_types.h"
+
+int decode_B_slice_entropy(EntropyContext *ec, EDSlice *s, EDThreadContext *eb, EDThreadContext *eb_prev);
+int decode_slice_entropy(EntropyContext *hc, EDSlice *s);
+
+void *read_thread(void *arg);
+void *parsenal_thread(void *arg);
+void *mbrec_thread(void *arg);
+void *write_thread(void *arg);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_rec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_rec.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,412 @@
+#include "config.h"
+
+#include "dsputil.h"
+#include "h264_types.h"
+#include "h264_data.h"
+#include "h264_mc.h"
+#include "h264_deblock.h"
+#include "h264_pred_mode.h"
+//#undef NDEBUG
+#include <assert.h>
+
+void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line){
+    /* Aim the per-line state at macroblock row `line` of the current picture's
+     * side-data arrays; the *_top members get row `line - 1`. */
+    DecodedPicture *cur = s->curr_pic;
+    const int stride = mrc->mb_stride;
+    const int width  = mrc->mb_width;
+    const int above  = line - 1;
+
+    mrs->mb_type_top = cur->mb_type + above*stride;
+    mrs->mb_type     = cur->mb_type + line*stride;
+    for (int l = 0; l < 2; l++){
+        mrs->ref_index_top[l]  = cur->ref_index[l]  + 4*above*stride;
+        mrs->ref_index[l]      = cur->ref_index[l]  + 4*line*stride;
+        mrs->motion_val_top[l] = cur->motion_val[l] + 4*width*4*above;
+        mrs->motion_val[l]     = cur->motion_val[l] + 4*width*4*line;
+    }
+
+    mrs->intra4x4_pred_mode_top = cur->intra4x4_pred_mode + 4*width*above;
+    mrs->intra4x4_pred_mode     = cur->intra4x4_pred_mode + 4*width*line;
+    mrs->non_zero_count_top     = cur->non_zero_count + 8*width*above;
+    mrs->non_zero_count         = cur->non_zero_count + 8*width*line;
+
+    /* B slices also read the same row of the first list-1 reference picture. */
+    if (s->slice_type_nos == FF_B_TYPE){
+        mrs->list1_mb_type = s->dp_ref_list[1][0]->mb_type + line*stride;
+        for (int l = 0; l < 2; l++){
+            mrs->list1_ref_index[l]  = s->dp_ref_list[1][0]->ref_index[l]  + 4*line*stride;
+            mrs->list1_motion_val[l] = s->dp_ref_list[1][0]->motion_val[l] + 4*width*4*line;
+        }
+    }
+}
+
+#if OMPSS
+static void backup_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
+    int i;
+    uint8_t * top_border_y1 = m->top_border;
+    uint8_t * top_border_y2 = m->top_border + 8;
+    uint8_t * top_border_cb = m->top_border + 16;
+    uint8_t * top_border_cr = m->top_border + 24;
+    uint8_t * top_border_next = m->top_border_next;
+    /* Save this macroblock's right-most column into left_border and its last row into top_border (luma, then Cb, then Cr). */
+    src_y  -=   linesize;
+    src_cb -= uvlinesize;
+    src_cr -= uvlinesize;
+    /* after stepping back one row, row i (1..16) of the pointers is row i-1 of the macroblock */
+    m->left_border[0]= m->top_border[15];
+    for(i=1; i<17 ; i++){
+        m->left_border[i]= src_y[15 + i*linesize];
+    }
+    /* last luma row: the first 8 bytes go to both top_border and top_border_next, the next 8 to top_border+8 */
+    *(uint64_t*)(top_border_y1)   = *(uint64_t*)(src_y +  16*linesize);
+    *(uint64_t*)(top_border_next) = *(uint64_t*)(src_y +  16*linesize);
+    *(uint64_t*)(top_border_y2)   = *(uint64_t*)(src_y +8+16*linesize);
+    /* chroma: left_border[17..25] holds Cb, left_border[26..34] holds Cr; slot 0 of each comes from the old top border */
+    m->left_border[17]= m->top_border[16+7];
+    m->left_border[17+9]= m->top_border[24+7];
+    for(i=1; i<9; i++){
+        m->left_border[17  +i]= src_cb[7+i*uvlinesize];
+        m->left_border[17+9+i]= src_cr[7+i*uvlinesize];
+    }
+    *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize);
+    *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize);
+}
+
+static void xchg_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
+    int temp8, i;
+    uint64_t temp64;
+    /* Exchange the samples stored in m->left_border / m->top_border with the picture column left of and row above the macroblock. */
+    uint8_t * top_border_y1 = m->top_border;
+    uint8_t * top_border_y2 = m->top_border + 8;
+    uint8_t * top_border_cb = m->top_border + 16;
+    uint8_t * top_border_cr = m->top_border + 24;
+    uint8_t * top_border_next = m->top_border_next;
+
+    int deblock_left;
+    int deblock_top;
+
+    deblock_left = (m->mb_x > 0);
+    deblock_top =  (m->mb_y > 0);
+
+    src_y  -= (  linesize + 1);
+    src_cb -= (uvlinesize + 1);
+    src_cr -= (uvlinesize + 1);
+    /* XCHG(a,b,t,xchg): with xchg set a and b are swapped; otherwise b just receives a and a is left alone */
+    #define XCHG(a,b,t,xchg)\
+    t= a;\
+    if(xchg)\
+        a= b;\
+    b= t;
+
+    if(deblock_left){
+        for(i = !deblock_top; i<16; i++){ /* index 0 is the corner above-left, skipped when there is no row above */
+            XCHG(m->left_border[i], src_y [i*  linesize], temp8, xchg);
+        }
+        XCHG(m->left_border[i], src_y [i*  linesize], temp8, 1);
+
+        for(i = !deblock_top; i<8; i++){
+            XCHG(m->left_border[17  +i], src_cb[i*uvlinesize], temp8, xchg);
+            XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, xchg);
+        }
+        XCHG(m->left_border[17  +i], src_cb[i*uvlinesize], temp8, 1);
+        XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, 1);
+    }
+
+    if(deblock_top){
+        XCHG(*(uint64_t*)(top_border_y1)  , *(uint64_t*)(src_y +1), temp64, xchg);
+        XCHG(*(uint64_t*)(top_border_y2)  , *(uint64_t*)(src_y +9), temp64, 1);
+        XCHG(*(uint64_t*)(top_border_next), *(uint64_t*)(src_y +17), temp64, 1); /* NOTE(review): unlike the non-OMPSS variant there is no mb_x+1 < mb_width check here - confirm intended */
+
+        XCHG(*(uint64_t*)(top_border_cb)  , *(uint64_t*)(src_cb+1), temp64, 1);
+        XCHG(*(uint64_t*)(top_border_cr)  , *(uint64_t*)(src_cr+1), temp64, 1);
+    }
+}
+#else
+
+static void backup_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
+    int i;
+    uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y;
+    uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb;
+    uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr;
+
+    uint8_t* left_border_y = d->left.unfiltered_y;
+    uint8_t* left_border_cb = d->left.unfiltered_cb;
+    uint8_t* left_border_cr = d->left.unfiltered_cr;
+    /* after stepping back one row, row i (1..16) of the pointers is row i-1 of the macroblock */
+    src_y  -=   linesize;
+    src_cb -= uvlinesize;
+    src_cr -= uvlinesize;
+
+    // Save the macroblock's right-most column into d->left and its last row into
+    // d->top[mb_x]; slot 0 of each left border is the last sample of the old top border.
+    left_border_y[0] = top_border_y[15];
+    for(i=1; i<17; i++){
+        left_border_y[i] = src_y[15+i*  linesize];
+    }
+    *(uint64_t*)(top_border_y   )   = *(uint64_t*)(src_y +  16*linesize);
+    *(uint64_t*)(top_border_y +8)   = *(uint64_t*)(src_y +8+16*linesize);
+    /* same for the two 8x8 chroma planes */
+    left_border_cb[0] = top_border_cb[7];
+    left_border_cr[0] = top_border_cr[7];
+    for(i=1; i<9; i++){
+        left_border_cb[i] = src_cb[7+i*uvlinesize];
+        left_border_cr[i] = src_cr[7+i*uvlinesize];
+    }
+    *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize);
+    *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize);
+}
+
+static void xchg_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
+    /* Exchange the samples stored in d->left / d->top[] with the picture column left of and row above the macroblock. */
+    int temp8, i;
+    uint64_t temp64;
+    int deblock_left;
+    int deblock_top;
+
+    uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y;
+    uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb;
+    uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr;
+    uint8_t* top_border_y_next = d->top[m->mb_x +1].unfiltered_y; /* only dereferenced when mb_x+1 < mb_width, see below */
+
+    uint8_t* left_border_y = d->left.unfiltered_y;
+    uint8_t* left_border_cb = d->left.unfiltered_cb;
+    uint8_t* left_border_cr = d->left.unfiltered_cr;
+
+    deblock_left = (m->mb_x > 0);
+    deblock_top =  (m->mb_y > 0);
+
+    src_y  -= (  linesize + 1);
+    src_cb -= (uvlinesize + 1);
+    src_cr -= (uvlinesize + 1);
+    /* XCHG(a,b,t,xchg): with xchg set a and b are swapped; otherwise b just receives a and a is left alone */
+    #define XCHG(a,b,t,xchg)\
+    t= a;\
+    if(xchg)\
+        a= b;\
+    b= t;
+
+    if(deblock_left){
+        for(i = !deblock_top; i<16; i++){ /* index 0 is the corner above-left, skipped when there is no row above */
+            XCHG(left_border_y[i], src_y [i*  linesize], temp8, xchg);
+        }
+        XCHG(left_border_y[i], src_y [i*  linesize], temp8, 1);
+
+        for(i = !deblock_top; i<8; i++){
+            XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg);
+            XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg);
+        }
+        XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1);
+        XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1);
+    }
+
+    if(deblock_top){
+        XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg);
+        XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1);
+        if(m->mb_x+1 < d->mb_width){
+            XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1);
+        }
+        XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1);
+        XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1);
+    }
+}
+
+#endif
+
+void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m){
+    int i;
+    const int mb_x= m->mb_x;
+    const int mb_y= m->mb_y;
+    int *block_offset = d->block_offset;
+    /* Reconstruct one macroblock: predict (intra or motion), add the residual, back up the borders, then call the deblocking filter. */
+    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+
+    int linesize   = d->linesize;
+    int uvlinesize = d->uvlinesize;
+    /* top-left sample of this macroblock in each plane of the current picture */
+    uint8_t *dest_y  = s->curr_pic->data[0] + (mb_x + mb_y * linesize  ) * 16;
+    uint8_t *dest_cb = s->curr_pic->data[1] + (mb_x + mb_y * uvlinesize) * 8;
+    uint8_t *dest_cr = s->curr_pic->data[2] + (mb_x + mb_y * uvlinesize) * 8;
+
+    pred_motion_mb_rec (d, mrs, s, m);
+
+    const int mb_type= m->mb_type;
+
+    d->dsp.prefetch(dest_y + (m->mb_x&3)*4*linesize + 64, d->linesize, 4);
+    d->dsp.prefetch(dest_cb + (m->mb_x&7)*uvlinesize + 64, dest_cr - dest_cb, 2);
+
+    if(IS_INTRA(mb_type)){
+#if OMPSS
+        xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
+#else
+        xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
+#endif
+        /* the saved border samples are now in the picture for intra prediction; the xchg=0 call further down puts the picture's own samples back */
+        d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cb, uvlinesize);
+        d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cr, uvlinesize);
+
+        if(IS_INTRA4x4(mb_type)){
+            if(IS_8x8DCT(mb_type)){
+                idct_dc_add = d->hdsp.h264_idct8_dc_add;
+                idct_add    = d->hdsp.h264_idct8_add;
+
+                for(i=0; i<16; i+=4){
+                    uint8_t * const ptr= dest_y + block_offset[i];
+                    const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ];
+
+                    const int nnz = mrs->non_zero_count_cache[ scan8[i] ];
+                    d->hpc.pred8x8l[ dir ](ptr, (mrs->topleft_samples_available<<i)&0x8000,
+                                                (mrs->topright_samples_available<<i)&0x4000, linesize);
+                    if(nnz){
+                        if(nnz == 1 && m->mb[i*16])
+                            idct_dc_add(ptr, m->mb + i*16, linesize);
+                        else
+                            idct_add   (ptr, m->mb + i*16, linesize);
+                    }
+                }
+            }else{
+                idct_dc_add = d->hdsp.h264_idct_dc_add;
+                idct_add    = d->hdsp.h264_idct_add;
+
+                for(i=0; i<16; i++){
+                    uint8_t * const ptr= dest_y + block_offset[i];
+                    const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ];
+                    uint8_t *topright;
+                    int nnz, tr;
+                    if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
+                        const int topright_avail= (mrs->topright_samples_available<<i)&0x8000;
+                        assert(mb_y || linesize <= block_offset[i]);
+                        if(!topright_avail){
+                            tr= ptr[3 - linesize]*0x01010101; /* replicate the last sample of the row above into four bytes */
+                            topright= (uint8_t*) &tr;
+                        }else
+                            topright= ptr + 4 - linesize;
+                    }else
+                        topright= NULL;
+
+                    d->hpc.pred4x4[ dir ](ptr, topright, linesize);
+                    nnz = mrs->non_zero_count_cache[ scan8[i] ];
+                    if(nnz){
+                        if(nnz == 1 && m->mb[i*16])
+                            idct_dc_add(ptr, m->mb + i*16, linesize);
+                        else
+                            idct_add   (ptr, m->mb + i*16, linesize);
+                    }
+                }
+            }
+        }else{
+            d->hpc.pred16x16[ m->intra16x16_pred_mode ](dest_y , linesize);
+        }
+#if OMPSS
+        xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
+#else
+        xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
+#endif
+    }else {
+        hl_motion(d, mrs, s, m, dest_y, dest_cb, dest_cr,
+                    d->hdsp.qpel_put, d->dsp.put_h264_chroma_pixels_tab,
+                    d->hdsp.qpel_avg, d->dsp.avg_h264_chroma_pixels_tab,
+                    d->hdsp.weight_h264_pixels_tab, d->hdsp.biweight_h264_pixels_tab);
+    }
+    /* luma residual for every type except IS_INTRA4x4, whose residual was already added block by block above */
+    if(!IS_INTRA4x4(mb_type)){
+
+        if(IS_INTRA16x16(mb_type)){
+
+            d->hdsp.h264_idct_add16intra(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache);
+
+        }else if(m->cbp&15){
+
+            if(IS_8x8DCT(mb_type)){
+                d->hdsp.h264_idct8_add4(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache);
+            }else{
+                d->hdsp.h264_idct_add16(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache);
+            }
+        }
+    }
+    /* chroma residual, processed when either cbp bit in 0x30 is set */
+    if(m->cbp&0x30){
+        uint8_t *dest[2] = {dest_cb, dest_cr};
+
+        idct_add = d->hdsp.h264_idct_add;
+        idct_dc_add = d->hdsp.h264_idct_dc_add;
+        for(i=16; i<16+8; i++){ /* blocks 16..19 go to dest[0] (Cb), 20..23 to dest[1] (Cr) */
+            if(mrs->non_zero_count_cache[ scan8[i] ])
+                idct_add   (dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize);
+            else if(m->mb[i*16])
+                idct_dc_add(dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize);
+        }
+    }
+    /* save this macroblock's right column and bottom row before the ff_h264_filter_mb call below modifies them */
+#if OMPSS
+    backup_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
+    if (mb_x+1 <d->mb_width){
+        H264Mb *mr = m+1;
+        memcpy(mr->left_border, m->left_border, sizeof(m->left_border));
+    }
+    if (mb_y +1 <d->mb_height){
+        H264Mb *md = m + d->mb_width;
+        memcpy(md->top_border, m->top_border, sizeof(m->top_border));
+        if (mb_x>0){
+            H264Mb *mdl = m + d->mb_width -1;
+            memcpy(mdl->top_border_next, m->top_border_next, sizeof(m->top_border_next));
+        }
+    }
+#else
+    backup_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
+    if (mb_y +1 <d->mb_height && d->top_next != d->top){
+        memcpy(&d->top_next[mb_x],&d->top[mb_x], sizeof(TopBorder));
+    }
+#endif
+
+    ff_h264_filter_mb(d, mrs, s, m, dest_y, dest_cb, dest_cr);
+}
+
+MBRecContext *get_mbrec_context(H264Context *h){
+    /* Allocate a reconstruction context, set up its DSP/prediction function tables and
+     * copy the picture geometry from h. Release it with free_mbrec_context(). */
+    MBRecContext *d = av_mallocz(sizeof(MBRecContext));
+
+    ff_h264dsp_init(&d->hdsp);
+    ff_h264_pred_init(&d->hpc);
+    dsputil_init(&d->dsp);
+#if !OMPSS
+    d->mrs = av_mallocz(sizeof(MBRecState));
+#endif
+    d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab;
+    d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab;
+    d->mb_height = h->mb_height;
+    d->mb_width  = h->mb_width;
+    d->mb_stride  = h->mb_stride;
+    d->b_stride  = h->b_stride;
+    d->height = h->height;
+    d->width  = h->width;
+    d->linesize = h->width + EDGE_WIDTH*2;
+    d->uvlinesize = d->linesize>>1;
+
+    d->scratchpad_y = av_malloc(d->linesize*16*sizeof(uint8_t));
+    d->scratchpad_cb= av_malloc(d->uvlinesize*8*sizeof(uint8_t));
+    d->scratchpad_cr= av_malloc(d->uvlinesize*8*sizeof(uint8_t));
+    for (int blk=0; blk<16; blk++){ /* byte offset of each luma 4x4 block inside the macroblock */
+        const int rel = scan8[blk] - scan8[0];
+        d->block_offset[blk]= 4*(rel&7) + 4*d->linesize*(rel>>3);
+    }
+    for (int blk=0; blk<4; blk++){ /* the two chroma planes share one set of offsets */
+        const int rel = scan8[blk] - scan8[0];
+        const int off = 4*(rel&7) + 4*d->uvlinesize*(rel>>3);
+        d->block_offset[16+blk]= off;
+        d->block_offset[20+blk]= off;
+    }
+    return d;
+}
+
+void free_mbrec_context(MBRecContext *d){
+    av_free(d->scratchpad_cr);   /* buffers owned by the context are released first ... */
+    av_free(d->scratchpad_cb);
+    av_free(d->scratchpad_y);
+#if !OMPSS
+    av_free(d->mrs);
+#endif
+    av_free(d);                  /* ... then the context itself */
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_rec.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_rec.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,12 @@
+#ifndef H264_REC_H
+#define H264_REC_H
+
+#include "h264_types.h"
+
+MBRecContext *get_mbrec_context(H264Context *h);
+void free_mbrec_context( MBRecContext *d);
+void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m);
+
+void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_refs.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_refs.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,461 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... reference picture handling
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10  reference picture handling.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "dsputil.h"
+#include "h264_types.h"
+#include "golomb.h"
+
+//#undef NDEBUG
+#include <assert.h>
+
+static int build_def_list(PictureInfo **def, PictureInfo **in, int len, int is_long){
+    /*
+     * Append to def[] every entry of in[0..len) that is non-NULL and has a
+     * non-zero reference field, keeping the input order.
+     *
+     * Each appended picture also gets its pic_id set: the slot index when
+     * is_long is non-zero, its frame_num otherwise.
+     *
+     * Returns the number of entries written to def[].
+     */
+    int count = 0;
+    for(int pos = 0; pos < len; pos++){
+        PictureInfo *pic = in[pos];
+
+        if(!pic || !pic->reference)
+            continue;
+        pic->pic_id = is_long ? pos : pic->frame_num;
+        def[count++] = pic;
+    }
+    return count;
+}
+
+static int add_sorted(PictureInfo **sorted, PictureInfo **src, int len, int limit, int dir){
+    int i, best_poc;
+    int out_i= 0;
+    /* dir=0: emit src entries with poc > limit in ascending poc order; dir=1: entries with poc <= limit in descending order. Returns the number written to sorted[]. */
+    for(;;){
+        best_poc= dir ? INT_MIN : INT_MAX; /* sentinel: still unchanged after the scan means nothing qualified */
+        /* one round: pick the qualifying entry whose poc is closest to limit */
+        for(i=0; i<len; i++){
+            const int poc= src[i]->poc;
+            if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
+                best_poc= poc;
+                sorted[out_i]= src[i];
+            }
+        }
+        if(best_poc == (dir ? INT_MIN : INT_MAX))
+            break;
+        limit= sorted[out_i++]->poc - dir; /* move the limit so the picked poc no longer qualifies */
+    }
+    return out_i;
+}
+
+int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s){
+    int i,len;
+    /* Build the initial s->ref_list[] from the short- and long-term references; slots up to ref_count that stay unused are set to NULL. Always returns 0. */
+    if(s->slice_type_nos==FF_B_TYPE){
+        PictureInfo *sorted[32];
+        int cur_poc, list;
+        int lens[2];
+
+        cur_poc= s->poc;
+        /* list 0: POCs at or below the current one (descending) first, then the later ones (ascending); list 1: the other way round */
+        for(list= 0; list<2; list++){
+            len= add_sorted(sorted, n->short_ref, n->short_ref_count, cur_poc, !list);
+            len+=add_sorted(sorted+len, n->short_ref, n->short_ref_count, cur_poc, list);
+            assert(len<=32);
+            len= build_def_list(s->ref_list[list], sorted, len, 0);
+            len+=build_def_list(s->ref_list[list] +len, n->long_ref, 16 , 1);
+            assert(len<=32);
+
+            for(int k=len; k<s->ref_count[list]; k++)
+                s->ref_list[list][k] = NULL;
+
+            lens[list]= len;
+        }
+        /* if both lists came out identical (more than one entry), swap the first two entries of list 1 */
+        if(lens[0] == lens[1] && lens[1] > 1){
+            /* test the bound before dereferencing: the slot at lens[0] may be NULL or unset */
+            for(i=0; i<lens[0] && s->ref_list[0][i]->poc == s->ref_list[1][i]->poc; i++);
+
+            if(i == lens[0])
+                FFSWAP(PictureInfo *, s->ref_list[1][0], s->ref_list[1][1]);
+        }
+    }else{
+        len = build_def_list(s->ref_list[0], n->short_ref, n->short_ref_count, 0);
+        len+= build_def_list(s->ref_list[0] +len, n->long_ref, 16, 1);
+        assert(len <= 32);
+        for(i=len; i<s->ref_count[0]; i++)
+            s->ref_list[0][i] = NULL;
+    }
+    return 0;
+}
+
+/**
+* Log index, frame_num, poc and reference of every short-term entry at debug level.
+*/
+static void print_short_term(NalContext *n) {
+    PictureInfo **list = n->short_ref;
+    av_log(AV_LOG_DEBUG, "short term list:\n");
+    for(int idx=0; idx<n->short_ref_count; idx++){
+        av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d ref:%d \n", idx, list[idx]->frame_num, list[idx]->poc, list[idx]->reference);
+    }
+}
+
+/**
+* Log the occupied slots of the 16 long-term reference slots at debug level.
+*/
+static void print_long_term(NalContext *n) {
+    int i; /* int, so that it matches the %d conversion used below */
+
+    av_log(AV_LOG_DEBUG, "long term list:\n");
+    for(i = 0; i < 16; i++){
+        PictureInfo *pic= n->long_ref[i];
+        if (pic) {
+            av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d\n", i, pic->frame_num, pic->poc);
+        }
+    }
+}
+
+int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb){
+    /*
+     * Read the reference list reordering commands from gb and apply them to
+     * s->ref_list[]. For each list whose leading flag bit is set, every command
+     * names one picture - short-term by frame_num difference (idc 0 subtracts,
+     * idc 1 adds) or long-term by index (idc 2) - and moves it to position
+     * `index`, shifting the entries in between down by one. idc 3 ends the
+     * command sequence for that list.
+     *
+     * Returns 0 on success and -1 on a malformed command or when the named
+     * picture is not in the reference lists.
+     */
+    int list, index;
+
+    print_short_term(n);
+    print_long_term(n);
+
+    for(list=0; list<s->list_count; list++){
+        int frame_num = n->frame_num; /* predictor, updated by every short-term command */
+
+        if(!get_bits1(gb)) /* flag clear: keep the default order of this list */
+            continue;
+
+        for(index=0; ; index++){
+            unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(gb);
+            PictureInfo *ref = NULL; /* stays NULL until a usable picture is found */
+            int i;
+
+            if(reordering_of_pic_nums_idc==3)
+                break;
+            if(index >= s->ref_count[list]){
+                av_log(AV_LOG_ERROR, "reference count overflow\n");
+                return -1;
+            }
+            if(reordering_of_pic_nums_idc>2){
+                av_log(AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
+                return -1;
+            }
+
+            if(reordering_of_pic_nums_idc<2){
+                unsigned int abs_diff_pic_num= get_ue_golomb(gb) + 1;
+
+                if(abs_diff_pic_num > (unsigned) n->max_pic_num){
+                    av_log(AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
+                    return -1;
+                }
+
+                /* step the predictor and mask it with max_pic_num - 1 */
+                if(reordering_of_pic_nums_idc == 0)
+                    frame_num-= abs_diff_pic_num;
+                else
+                    frame_num+= abs_diff_pic_num;
+                frame_num &= n->max_pic_num - 1;
+
+                /* Only touch pic_id of a picture that actually matches; a failed
+                 * search (or an empty short-term list) must leave ref NULL. */
+                for(i= 0 ; i<n->short_ref_count; i++){
+                    PictureInfo *cand = n->short_ref[i];
+                    if(cand->frame_num == frame_num && cand->reference){
+                        ref = cand;
+                        ref->pic_id= frame_num;
+                        break;
+                    }
+                }
+            }else{
+                int long_idx= get_ue_golomb(gb); //long_term_pic_idx
+
+                /* NOTE(review): the rest of this file treats long_ref as 16 slots;
+                 * confirm the array really has 32 entries before trusting this bound. */
+                if(long_idx<0 || long_idx>31){
+                    av_log(AV_LOG_ERROR, "long_term_pic_idx overflow\n");
+                    return -1;
+                }
+                ref = n->long_ref[long_idx];
+                assert(!(ref && !ref->reference));
+                if(ref && (ref->reference)){
+                    ref->pic_id= long_idx;
+                    assert(ref->long_ref);
+                }else{
+                    ref = NULL; /* reported by the common check below */
+                }
+            }
+
+            /* Success is decided by ref alone, independent of how many
+             * short-term references exist. */
+            if(!ref){
+                av_log(AV_LOG_ERROR, "reference picture missing during reorder\n");
+                return -1;
+            }
+
+            /* Find where ref currently sits at or after `index` (or stop at the
+             * last slot), shift that span down by one and put ref at `index`. */
+            for(i=index; i+1 <s->ref_count[list]; i++){
+                PictureInfo *cur = s->ref_list[list][i];
+                ///there is probably no need for a separate pic_id and frame_num
+                if(cur && ref->long_ref == cur->long_ref && ref->pic_id == cur->pic_id)
+                    break;
+            }
+            for(; i > index; i--){
+                s->ref_list[list][i]= s->ref_list[list][i-1];
+            }
+            s->ref_list[list][index]= ref;
+        }
+    }
+
+    return 0;
+}
+
+static PictureInfo *find_short(NalContext *n, int frame_num){
+    /* Return the first short-term reference with this frame_num, or NULL if none. */
+    PictureInfo *hit = NULL;
+    for(int idx = 0; idx < n->short_ref_count && !hit; idx++){
+        if(n->short_ref[idx]->frame_num == frame_num)
+            hit = n->short_ref[idx];
+    }
+    return hit;
+}
+
+static int remove_short(NalContext *n, H264Slice *s, int frame_num, int release){
+    /* Remove the first short-term entry with this frame_num and close the gap; with release set, also
+     * queue its cpn in s->release_ref_cpn and clear the 2 bit of its reference field. Returns 0, or -1 if absent. */
+    int pos = 0;
+    while (pos < n->short_ref_count && n->short_ref[pos]->frame_num != frame_num)
+        pos++;
+    if (pos == n->short_ref_count)
+        return -1;
+    if (release){
+        s->release_ref_cpn[s->release_cnt++] = n->short_ref[pos]->cpn;
+        n->short_ref[pos]->reference &= ~2;
+    }
+    n->short_ref[pos] = NULL;
+    if (--n->short_ref_count)
+        memmove(&n->short_ref[pos], &n->short_ref[pos+1], (n->short_ref_count - pos)*sizeof(PictureInfo *));
+    return 0;
+}
+
+static void remove_long(NalContext *n, H264Slice *s, int i){
+    /* Empty long-term slot i (no-op when it is already empty), queueing the picture's cpn for release. */
+    PictureInfo *pic = n->long_ref[i];
+    if (!pic) return;
+    s->release_ref_cpn[s->release_cnt++] = pic->cpn;
+    pic->reference &= ~2;
+    pic->long_ref = 0;
+    n->long_ref_count--;
+    n->long_ref[i] = NULL;
+}
+
+void ff_h264_remove_all_refs(NalContext *n, H264Slice *s){
+    /* Release every short-term reference, head first, then all 16 long-term slots. */
+    int slot = 0;
+
+    while (n->short_ref[0])
+        remove_short(n, s, n->short_ref[0]->frame_num, 1);
+    while (slot < 16)
+        remove_long(n, s, slot++);
+
+    assert(n->short_ref_count==0);
+    assert(n->long_ref_count==0);
+}
+
+int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb){
+    /* Parse the reference marking syntax, update the short-/long-term lists, then insert the current picture at the head of short_ref. Returns 0, or -1 on an illegal operation. */
+    if(s->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
+        get_bits1(gb); //get_bits1(gb) -1; //broken link
+        if(get_bits1(gb)){
+            av_log(AV_LOG_ERROR, "MMCO_LONG reference management not supported\n");
+        }
+    }else{
+        if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
+            int i,j;
+            for(i= 0; i<MAX_MMCO_COUNT; i++) {
+                PictureInfo *pic;
+                int short_pic_num=0;
+                unsigned int long_arg=0;
+                MMCOOpcode opcode= get_ue_golomb_31(gb);
+
+                if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
+                    short_pic_num= (n->frame_num - get_ue_golomb(gb) - 1) & (n->max_pic_num - 1);
+                }
+                if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
+                    long_arg= get_ue_golomb_31(gb);
+                    if(long_arg >= 16){
+                        av_log(AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
+                        return -1;
+                    }
+                }
+
+                if(opcode > (unsigned)MMCO_LONG){
+                    av_log(AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
+                    return -1;
+                }
+                if(opcode == MMCO_END)
+                    break;
+
+                switch (opcode){
+                    case MMCO_SHORT2UNUSED:
+                        remove_short(n, s, short_pic_num, 1);
+                        break;
+                    case MMCO_SHORT2LONG:
+                        pic = find_short(n, short_pic_num);
+                        if (n->long_ref[long_arg] != pic)
+                            remove_long(n, s, long_arg);
+                        remove_short(n, s, short_pic_num, 0);
+                        n->long_ref[long_arg]= pic;
+                        if (pic){
+                            pic->long_ref=1;
+                            n->long_ref[long_arg]= pic;
+                            n->long_ref_count++;
+                        }
+                        break;
+                    case MMCO_LONG2UNUSED:
+                        assert(n->long_ref[long_arg]);
+                        remove_long(n, s, long_arg);
+                        break;
+                    case MMCO_SET_MAX_LONG:
+                        for(j=long_arg; j<16; j++)
+                            remove_long(n, s, j);
+                        break;
+                    case MMCO_RESET:
+                        while(n->short_ref_count)
+                            remove_short(n, s, n->short_ref[0]->frame_num, 1);
+
+                        for(j=0; j < 16; j++)
+                            remove_long(n, s, j);
+
+                        s->current_picture_info->poc=
+                        s->poc =
+                        n->poc_lsb=
+                        n->poc_msb=
+                        n->frame_num=
+                        s->current_picture_info->frame_num= 0;
+                        break;
+                    case MMCO_END:
+                    case MMCO_LONG:
+                        break;
+                }
+            }
+        }else{// sliding window ref picture marking
+            /* NOTE(review): H.264 8.2.5.3 counts long-term references in this test as well - confirm whether n->long_ref_count should be added. */
+            if(n->short_ref_count && n->short_ref_count == n->sps.ref_frame_count) { /* never index short_ref[-1] when the list is empty */
+                s->release_ref_cpn[s->release_cnt++] = n->short_ref[n->short_ref_count - 1]->cpn;
+                n->short_ref[n->short_ref_count - 1]->reference &= ~2;
+                n->short_ref[ n->short_ref_count - 1 ] =NULL;
+                n->short_ref_count--;
+            }
+        }
+    }
+    if(n->short_ref_count)
+        memmove(&n->short_ref[1], &n->short_ref[0], n->short_ref_count*sizeof(PictureInfo *));
+
+    n->short_ref[0]= s->current_picture_info;
+    n->short_ref_count++;
+
+    return 0;
+}
+
+static int get_scale_factor(H264Slice *s, int poc, int poc1, int i){
+    /* Scale factor for list-0 entry i: 256 if the clipped POC distance td is 0 or the entry is long-term, else the clipped (tb*tx + 32) >> 6. */
+    PictureInfo * const ref0 = s->ref_list[0][i];
+    const int td = av_clip(poc1 - ref0->poc, -128, 127);
+    int tb, tx;
+    if(td == 0 || ref0->long_ref)
+        return 256;
+    tb = av_clip(poc - ref0->poc, -128, 127);
+    tx = (16384 + (FFABS(td) >> 1)) / td;
+    return av_clip((tb*tx + 32) >> 6, -1024, 1023);
+}
+
+void ff_h264_direct_dist_scale_factor(H264Slice *s){
+    /* Fill s->dist_scale_factor[] for every list-0 entry, measured against list-1 entry 0. */
+    const int cur_poc = s->current_picture_info->poc;
+    const int col_poc = s->ref_list[1][0]->poc;
+    for(int ref=0; ref<s->ref_count[0]; ref++){
+        s->dist_scale_factor[ref] = get_scale_factor(s, cur_poc, col_poc, ref);
+    }
+}
+
+static void fill_colmap(H264Slice *s, int map[2][16], int list){
+    /*
+     * For each reference POC recorded by the first list-1 picture (ref1) in its
+     * own list `list`, store in map[list][] the index of the current list-0
+     * entry with the same POC. Entries without a match stay 0.
+     */
+    PictureInfo * const ref1 = s->ref_list[1][0];
+    /* bogus; fills in for missing frames */
+    memset(map[list], 0, sizeof(map[list]));
+    for(int old_ref=0; old_ref < ref1->ref_count[list]; old_ref++){
+        const int poc = ref1->ref_poc[list][old_ref];
+        for(int j=0; j<s->ref_count[0]; j++){
+            /* unused list-0 slots can be NULL, so check before reading poc */
+            if(s->ref_list[0][j] && s->ref_list[0][j]->poc == poc){
+                map[list][old_ref] = j;
+                break;
+            }
+        }
+    }
+}
+
+void ff_h264_direct_ref_list_init(H264Slice *s){
+    /* Record both lists' sizes and POCs (0 for empty slots) in the current picture, then
+     * build both column maps for B slices that do not use direct_spatial_mv_pred. */
+    PictureInfo * const cur = s->current_picture_info;
+
+    for(int l=0; l<2; l++){
+        cur->ref_count[l] = s->ref_count[l];
+        for(int j=0; j<s->ref_count[l]; j++){
+            PictureInfo *r = s->ref_list[l][j];
+            cur->ref_poc[l][j] = r ? r->poc : 0;
+        }
+    }
+
+    if(s->slice_type_nos == FF_B_TYPE && !s->direct_spatial_mv_pred){
+        fill_colmap(s, s->map_col_to_list0, 0);
+        fill_colmap(s, s->map_col_to_list0, 1);
+    }
+}
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_refs.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_refs.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,14 @@
+#ifndef H264_REFS_H
+#define H264_REFS_H
+
+#include "avcodec.h"
+#include "h264_types.h"
+/* Prototypes of the H.264 reference picture functions. */
+int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s);
+int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb);
+void ff_h264_remove_all_refs(NalContext *n, H264Slice *s);
+int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb);
+void ff_h264_direct_ref_list_init(H264Slice *s);        /* stores ref counts/POCs in the current picture; may fill s->map_col_to_list0 */
+void ff_h264_direct_dist_scale_factor(H264Slice *s);    /* fills s->dist_scale_factor[] for the list-0 references */
+
+#endif /* H264_REFS_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_sei.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_sei.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,191 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... sei decoding
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 sei decoding.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "avcodec.h"
+#include "h264_types.h"
+#include "golomb.h"
+
+//#undef NDEBUG
+#include <assert.h>
+
+static const uint8_t sei_num_clock_ts_table[9]={   /* clock timestamp count, indexed by sei_pic_struct */
+    1,  1,  1,  2,  2,  3,  3,  2,  3
+};
+
+void ff_h264_reset_sei(NalContext *n) {   /* put the SEI-derived fields of n back to their reset values */
+    n->sei_buffering_period_present =  0;
+    n->sei_cpb_removal_delay        = -1;
+    n->sei_dpb_output_delay         =  0;
+    n->sei_recovery_frame_cnt       = -1;
+}
+
+static int decode_picture_timing(NalContext *n, GetBitContext *gb){   /* picture timing SEI: returns 0, or -1 on an invalid pic_struct */
+    if(n->sps.nal_hrd_parameters_present_flag || n->sps.vcl_hrd_parameters_present_flag){
+        n->sei_cpb_removal_delay = get_bits(gb, n->sps.cpb_removal_delay_length);   /* field widths are taken from the current SPS */
+        n->sei_dpb_output_delay = get_bits(gb, n->sps.dpb_output_delay_length);
+    }
+    if(n->sps.pic_struct_present_flag){
+        unsigned int i, num_clock_ts;
+        n->sei_pic_struct = get_bits(gb, 4);
+        n->sei_ct_type    = 0;
+        /* values above FRAME_TRIPLING are rejected before the table lookup below */
+        if (n->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
+            return -1;
+
+        num_clock_ts = sei_num_clock_ts_table[n->sei_pic_struct];
+
+        for (i = 0 ; i < num_clock_ts ; i++){
+            if(get_bits(gb, 1)){                  /* clock_timestamp_flag */
+                unsigned int full_timestamp_flag;
+                n->sei_ct_type |= 1<<get_bits(gb, 2);   /* the 2-bit value read here selects the bit to set */
+                skip_bits(gb, 1);                 /* nuit_field_based_flag */
+                skip_bits(gb, 5);                 /* counting_type */
+                full_timestamp_flag = get_bits(gb, 1);
+                skip_bits(gb, 1);                 /* discontinuity_flag */
+                skip_bits(gb, 1);                 /* cnt_dropped_flag */
+                skip_bits(gb, 8);                 /* n_frames */
+                if(full_timestamp_flag){
+                    skip_bits(gb, 6);             /* seconds_value 0..59 */
+                    skip_bits(gb, 6);             /* minutes_value 0..59 */
+                    skip_bits(gb, 5);             /* hours_value 0..23 */
+                }else{
+                    if(get_bits(gb, 1)){          /* seconds_flag */
+                        skip_bits(gb, 6);         /* seconds_value range 0..59 */
+                        if(get_bits(gb, 1)){      /* minutes_flag */
+                            skip_bits(gb, 6);     /* minutes_value 0..59 */
+                            if(get_bits(gb, 1))   /* hours_flag */
+                                skip_bits(gb, 5); /* hours_value 0..23 */
+                        }
+                    }
+                }
+                if(n->sps.time_offset_length > 0)
+                    skip_bits(gb, n->sps.time_offset_length); /* time_offset */
+            }
+        }
+    }
+    return 0;
+}
+
+/* Consume an unregistered user data SEI payload of `size` bytes.
+ * Payloads shorter than 16 bytes (the leading UUID) are rejected with -1;
+ * otherwise exactly `size` bytes are skipped and 0 is returned.
+ * The former "x264 - core %d" sscanf probe was removed: its result and the
+ * parsed build number were never used, so copying the bytes into a local
+ * buffer served no purpose. */
+static int decode_unregistered_user_data(GetBitContext *gb, int size){
+    int i;
+
+    if(size<16)
+        return -1;
+
+    /* advance one byte per step, as the original read and skip loops did */
+    for(i=0; i<size; i++){
+        skip_bits(gb, 8);
+    }
+
+    return 0;
+}
+
+static int decode_recovery_point(NalContext *n, GetBitContext *gb){
+    /* store recovery_frame_cnt (read with get_ue_golomb), then drop the 4 flag bits that follow; always returns 0 */
+    n->sei_recovery_frame_cnt = get_ue_golomb(gb);
+    skip_bits(gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
+
+    return 0;
+}
+
+static int decode_buffering_period(NalContext *n, GetBitContext *gb){
+    unsigned int sps_id;
+    int sched_sel_idx;
+    SPS *sps;
+    /* Buffering period SEI: look up the referenced SPS, read the initial CPB removal delays; 0 on success, -1 on unknown SPS. */
+    sps_id = get_ue_golomb_31(gb);
+    if(sps_id > 31 || !n->sps_buffers[sps_id]) {
+        av_log(AV_LOG_ERROR, "non-existing SPS %u referenced in buffering period\n", sps_id); /* %u: sps_id is unsigned */
+        return -1;
+    }
+    sps = n->sps_buffers[sps_id];
+
+    // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
+    if (sps->nal_hrd_parameters_present_flag) {
+        for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
+            n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length);
+            skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
+        }
+    }
+    if (sps->vcl_hrd_parameters_present_flag) {   /* when both flags are set, these values overwrite the NAL ones */
+        for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
+            n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length);
+            skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
+        }
+    }
+
+    n->sei_buffering_period_present = 1;
+    return 0;
+}
+
+/* Parse every SEI message in gb into n; returns 0, or -1 on a truncated message or a failing handler. */
+int ff_h264_decode_sei(NalContext *n, GetBitContext *gb){
+    while(get_bits_count(gb) + 16 < gb->size_in_bits){
+        int type = 0, size = 0, next;
+        do{ /* type and size are byte sums: a 255 byte means another byte follows */
+            if(get_bits_left(gb) < 8) return -1;
+            type+= show_bits(gb, 8);
+        }while(get_bits(gb, 8) == 255);
+        do{
+            if(get_bits_left(gb) < 8) return -1;
+            size+= show_bits(gb, 8);
+        }while(get_bits(gb, 8) == 255);
+        if(size > get_bits_left(gb)/8) return -1;   /* payload would run past the available data */
+        next = get_bits_count(gb) + 8*size;         /* bit position just behind this payload */
+        switch(type){
+        case SEI_TYPE_PIC_TIMING: // Picture timing SEI
+            if(decode_picture_timing(n, gb) < 0)
+                return -1;
+            break;
+        case SEI_TYPE_USER_DATA_UNREGISTERED:
+            if(decode_unregistered_user_data(gb, size) < 0)
+                return -1;
+            break;
+        case SEI_TYPE_RECOVERY_POINT:
+            if(decode_recovery_point(n, gb) < 0)
+                return -1;
+            break;
+        case SEI_BUFFERING_PERIOD:
+            if(decode_buffering_period(n, gb) < 0)
+                return -1;
+            break;
+        default:
+            skip_bits(gb, 8*size);
+        }
+        /* skip payload bits the handler left unread so the next header is read in sync */
+        if(get_bits_count(gb) < next)
+            skip_bits(gb, next - get_bits_count(gb));
+        align_get_bits(gb);
+    }
+    return 0;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_sei.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_sei.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,7 @@
+#ifndef H264_SEI_H
+#define H264_SEI_H
+#include "h264_types.h"   /* NalContext, GetBitContext: makes this header self-contained, like h264_refs.h */
+int ff_h264_decode_sei(NalContext *n, GetBitContext *gb);   /* parse the SEI messages in gb into n; 0 or -1 */
+void ff_h264_reset_sei(NalContext *n);                      /* reset the SEI-derived fields of n */
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_seq.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_seq.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,220 @@
+/*
+* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include "h264_types.h"
+#include "h264_parser.h"
+#include "h264_nal.h"
+#include "h264_entropy.h"
+#include "h264_rec.h"
+#include "h264_pred_mode.h"
+#include "h264_misc.h"
+// #undef NDEBUG
+#include <assert.h>
+
+static int decode_slice_entropy_seq(H264Context *h, EntropyContext *ec, H264Slice *s, GetBitContext *gb, H264Mb *mbs){
+    int i,j;
+    /* CABAC-decodes every macroblock of slice s into mbs[], row by row; 0 on success, -1 on error (h is unused here) */
+    CABACContext *c = &ec->c;
+
+    if( !s->pps.cabac ){
+        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
+        return -1;
+    }
+
+    init_dequant_tables(s, ec);
+    ec->curr_qscale = s->qscale;
+    ec->last_qscale_diff = 0;
+    ec->chroma_qp[0] = get_chroma_qp((H264Slice *) s, 0, s->qscale);
+    ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale);
+
+    /* realign */
+    align_get_bits( gb );
+    /* init cabac: it is handed the bytes from the current (aligned) position to the end of gb */
+    ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8);
+
+    ff_h264_init_cabac_states(ec, s, c);
+
+    for(j=0; j<ec->mb_height; j++){
+        init_entropy_buf(ec, s, j);
+        for(i=0; i<ec->mb_width; i++){
+            int eos,ret;
+            H264Mb *m = &mbs[i + j*ec->mb_width];
+            //memset(m, 0, sizeof(H264Mb));
+            m->mb_x=i;
+            m->mb_y=j;
+            ec->m = m;
+
+            ret = ff_h264_decode_mb_cabac(ec, s, c);
+            eos = get_cabac_terminate( c);   /* the terminate result is read but not acted on */
+            (void) eos;
+            if( ret < 0 || c->bytestream > c->bytestream_end + 2) {   /* decode error, or read more than 2 bytes past the end */
+                av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream);
+                return -1;
+            }
+        }
+    }
+
+//     av_freep(&s->gb.raw);
+//     if (s->gb.rbsp)
+//         av_freep(&s->gb.rbsp);
+
+    return 0;
+}
+
+
+
+/**
+*   Sequential version: links the slice's references to DPB entries, processes every macroblock in mbs[], then releases the pictures listed in release_ref_cpn.
+*/
+static void decode_slice_mb_seq(H264Context *h, MBRecContext *d, H264Slice *s2, H264Mb *mbs){
+    /* resolve each reference, identified by its cpn, to the DPB entry holding it; cpn == -1 entries are skipped */
+    for (int i=0; i<2; i++){
+        for(int j=0; j< s2->ref_count[i]; j++){
+            if (s2->ref_list_cpn[i][j] ==-1)
+                continue;
+            int k;
+            for (k=0; k<h->max_dpb_cnt; k++){
+                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s2->ref_list_cpn[i][j]){
+                    s2->dp_ref_list[i][j] = &h->dpb[k];
+                    break;
+                }
+            }   /* NOTE(review): when no entry matches, dp_ref_list[i][j] keeps its previous value — confirm */
+        }
+    }
+
+    get_dpb_entry(h, s2);   /* presumably obtains the picture this slice is decoded into — confirm against its definition */
+
+    if (!h->no_mbd){   /* no_mbd set: the macroblock pass is skipped entirely */
+        for(int j=0; j<d->mb_height; j++){
+            init_mbrec_context(d, d->mrs, s2, j);
+            if (h->profile) printf("\n[MBREC LINE %d ", j);
+            for(int i=0; i<d->mb_width; i++){
+
+                if ((i & 0x7) == 0) start_timer(h, REC);   /* NOTE(review): started every 8th MB but stopped after every MB — confirm intended */
+                H264Mb *m = &mbs[i + j*d->mb_width];
+                if (h->profile==2)   /* profile level 2 runs pred_motion_mb_rec instead of the full MB decode */
+                    pred_motion_mb_rec (d, d->mrs, s2, m);
+                else{
+                    h264_decode_mb_internal(d, d->mrs, s2, m);
+                }
+                stop_timer(h, REC);
+            }
+            draw_edges(d, s2, j);
+
+        }
+    }
+    /* release, with argument 2, every DPB entry whose cpn is listed in release_ref_cpn */
+    for (int i=0; i<s2->release_cnt; i++){
+        for(int j=0; j<h->max_dpb_cnt; j++){
+            if(h->dpb[j].cpn== s2->release_ref_cpn[i]){
+                release_dpb_entry(h, &h->dpb[j], 2);
+                break;
+            }
+        }
+    }
+    s2->release_cnt=0;
+}
+
+/*
+* Sequential decoder driver: for each frame it reads and parses the NAL units, entropy-decodes, reconstructs and outputs. Returns 0, or -1 if a work buffer could not be allocated.
+*/
+int h264_decode_seq( H264Context *h) {
+    ParserContext *pc;
+    NalContext *nc;
+    EntropyContext *ec;
+    MBRecContext *rc;
+    OutputContext *oc;
+    GetBitContext gb = {0,};
+    H264Slice slice, *s=&slice;
+    H264Mb *mbs;
+    DecodedPicture *out;
+    int frames=0, ret=0;
+
+#if HAVE_LIBSDL2
+    pthread_t sdl_thr;
+    if (h->display){
+        pthread_create(&sdl_thr, NULL, sdl_thread, h);
+    }
+#endif
+
+    pc = get_parse_context(h->ifile);
+    nc = get_nal_context(h->width, h->height);
+
+    memset(s, 0, sizeof(H264Slice));
+    mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb));
+    ec = get_entropy_context( h );
+    rc = get_mbrec_context(h);
+    rc->top_next = rc->top = av_malloc( h->mb_width * sizeof(TopBorder));
+    oc = get_output_context( h );
+    if (!mbs || !rc->top){   /* allocation failed: decode nothing, but still run the cleanup below */
+        av_log(AV_LOG_ERROR, "h264_decode_seq: out of memory\n");
+        ret = -1;
+    }
+    av_start_timer();
+    while(!ret && !pc->final_frame && frames++ < h->num_frames && !h->quit){
+        if (h->profile) start_timer(h, FRONT);
+        av_read_frame_internal(pc, &gb);
+        decode_nal_units(nc, s, &gb);
+        if (h->profile) stop_timer(h, FRONT);
+//         memset(s->mbs, 0, sizeof(H264Mb)*ec->mb_width*ec->mb_height);
+        if (h->profile) start_timer(h, ED);
+        decode_slice_entropy_seq(h, ec, s, &gb, mbs);   /* NOTE(review): a -1 result is ignored and reconstruction still runs — confirm intended */
+        if (h->profile) stop_timer(h, ED);
+
+        if (h->profile) start_timer(h, REC);
+        decode_slice_mb_seq(h, rc, s, mbs);
+        if (h->profile) stop_timer(h, REC);
+
+        out =output_frame(h, oc, s->curr_pic, h->ofile, h->frame_width, h->frame_height);
+        if (out){
+            release_dpb_entry(h, out, 1);
+        }
+
+        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
+        if (h->profile == 3){
+            printf("[ENTROPY %.3fms] [MBREC %.3fms]\n", h->last_time[ED] , h->last_time[REC]);
+        }
+    }
+    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ;   /* drain: call with no new picture until NULL is returned */
+
+    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
+    h->num_frames = oc->frame_number;
+    /* finished ! */
+    av_freep(&mbs);
+    av_freep(&gb.raw);
+    if (gb.rbsp)
+        av_freep(&gb.rbsp);
+    av_freep(&rc->top);
+
+    free_parse_context(pc);
+    free_nal_context  (nc);
+    free_entropy_context(ec);
+    free_mbrec_context(rc);
+    free_output_context(oc);
+
+#if HAVE_LIBSDL2
+    if (h->display){
+        signal_sdl_exit(h);
+        pthread_join(sdl_thr, NULL);
+    }
+#endif
+
+    return ret;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_types.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_types.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,658 @@
+#ifndef H264_TYPES_H
+#define H264_TYPES_H
+
+#include "config.h"
+#ifdef HAVE_LIBSDL2
+#include <SDL2/SDL.h>
+#endif
+
+#include <pthread.h>
+#include "avcodec.h"
+#include "cabac.h"
+#include "h264_dsp.h"
+#include "h264_pred.h"
+#include "get_bits.h"
+
+
+#define MAX_REF_PIC_COUNT 16
+#define MAX_DELAYED_PIC_COUNT 16
+
+#define MAX_THREADS 80   /* size of the ed_rec_attr[] / ed_rec_thr[] arrays in H264Context */
+
+//#define MAX_PIC_COUNT (4*(MAX_REF_PIC_COUNT+MAX_DELAYED_PIC_COUNT))
+
+#define DPB_SIZE 33      /* capacity of OutputContext.delayed_pic[] */
+
+
+//potsdam machine 8xX7560 without HT
+// static int edb_affinity [16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+// static int edip_affinity[8] =  {16, 17, 18, 19, 20, 21, 22, 23};
+//
+// static int mbd_affinity[8][5] = {	{24, 32, 40, 48, 56},
+// 							{25, 33, 41, 49, 57},
+// 							{26, 34, 42, 50, 58},
+// 							{27, 35, 43, 51, 59},
+// 							{28, 36, 44, 52, 60},
+// 							{29, 37, 45, 53, 61},
+// 							{30, 38, 46, 54, 62},
+// 							{31, 39, 47, 55, 63}, };
+
+// static int edb_affinity [22] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 58, 59, 60, 61 ,62, 63};
+// static int edip_affinity[10] =  {16, 17, 18, 19, 20, 21, 22, 23, 56, 57 };
+//
+// static int mbd_affinity[8][5] = {	{24, 32, 40, 48, 56},
+// 							{25, 33, 41, 49, 57},
+// 							{26, 34, 42, 50, 58},
+// 							{27, 35, 43, 51, 59},
+// 							{28, 36, 44, 52, 60},
+// 							{29, 37, 45, 53, 61},
+// 							{30, 38, 46, 54, 62},
+// 							{31, 39, 47, 55, 63}, };
+// //4 socket
+// static int edip_affinity[5] = {0, 1, 2, 3, 56};
+// static int edb_affinity [12] = {8, 9, 10, 11, 16, 17, 18, 19, 59, 58, 57, 51};
+//
+// static int mbd_affinity[4][5] = { {24, 32, 40, 48, 56},
+// {25, 33, 41, 49, 57},
+// {26, 34, 42, 50, 58},
+// {27, 35, 43, 51, 59}, };
+
+// static int edip_affinity[3] = {0, 1, 49};
+// static int edb_affinity [6] = {8, 9, 16, 17, 56, 57};
+//
+// static int mbd_affinity[2][5] = { {24, 32, 40, 48, 56},
+// {25, 33, 41, 49, 57}};
+
+// static int edip_affinity[2] = {0, 8};
+// static int edb_affinity [3] = {16, 24, 56};
+//
+// static int mbd_affinity[1][4] = { {32, 40, 48, 56},
+// };
+
+/// for ducks_take_off_2160p
+// static int edip_affinity[2] = {0, 8};
+// static int edb_affinity [3] = {16, 24, 32};
+//
+// static int mbd_affinity[1][4] = {{ 40, 48, 56, 32}};
+
+// static int edip_affinity[3] = {0, 1, 57};
+// static int edb_affinity [7] = {8, 9, 16, 17, 24, 25, 56};
+//
+// static int mbd_affinity[2][4] = { {32, 40, 48, 56},
+// {33, 41, 49, 57}};
+
+//4 socket
+// static int edip_affinity[6]  = {0, 1, 2, 3, 59};
+// static int edb_affinity [14] = {8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 58, 57};
+//
+// static int mbd_affinity[4][4] = { {32, 40, 48, 56},
+// {33, 41, 49, 57},
+// {34, 42, 50, 58},
+// {35, 43, 51, 59}, };
+
+
+// static int edb_affinity [29] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 59, 60, 61, 62, 63};
+// static int edip_affinity[11] =  {24, 25, 26, 27, 28, 29, 30, 31, 63, 62, 61};
+//
+// static int mbd_affinity[8][4] = {{32, 40, 48, 56},
+// 							{33, 41, 49, 57},
+// 							{34, 42, 50, 58},
+// 							{35, 43, 51, 59},
+// 							{36, 44, 52, 60},
+// 							{37, 45, 53, 61},
+// 							{38, 46, 54, 62},
+// 							{39, 47, 55, 63}, };
+
+//potsdam machine 4xX7550 with HT
+// int edip_affinity[16] = {0, 8, 16, 24, 	1, 9, 17, 25, 	2, 10, 18, 26,	3, 11, 19, 27 };
+// int edb_affinity [16] = {1, 9, 17, 25, 	2, 10, 18, 26, 	6, 14, 22, 30,	7, 15, 23, 31 };
+// int edip_affinity[16] = {58, 50, 42, 34, 	1, 9, 17, 25, 	2, 10, 18, 26,	3, 11, 19, 27 };
+// int edb_affinity [16] = {57, 49, 41, 33, 	56, 48, 40, 32, 	6, 14, 22, 30,	7, 15, 23, 31 };
+// //int edb_affinity [16] = {4, 12, 20, 28, 5, 13, 21, 29, 	6, 14, 22, 30,	7, 15, 23, 31 };
+// //mb threads affinity on logical cores moving back to keep interference with ed threads low
+// int mbd_affinity[4][8] = {	{63, 62, 61, 60, 59, 58, 57, 56},
+// 							{55, 54, 53, 52, 51, 50, 49, 48},
+// 							{47, 46, 45, 44, 43, 42, 41, 40},
+// 							{39, 38, 37, 36, 35, 34, 33, 32},
+// 							};
+
+
+// static int edip_affinity[2] = {0, 2};
+// static int edb_affinity [4] = {1, 3, 2, 5};
+//
+// static int mbd_affinity[1][4] = {{ 4, 6, 7, 5}};
+
+enum{   /* stage ids; they index H264Context.sb_q[], lock[] and cond[], which are all sized STAGES */
+    PARSE=0,
+    ENTROPY,
+    REORDER,
+    REORDER2,   //second mutex-cond pair used in reorder_thread
+    MBDEC,
+    OUTPUT,
+    STAGES      //number of stage ids, not a stage itself
+};
+
+//ad hoc timer ids for profiling; they index the *_time[PROFILE_STAGES] arrays of H264Context
+enum{
+    TOTAL=0,
+    FRONT,          //timed around frame reading + NAL decoding in h264_decode_seq
+    ED,             //timed around entropy decoding
+    REC,            //timed around macroblock reconstruction
+    PROFILE_STAGES  //number of timer ids
+};
+
+/* bit input */
+/* buffer, buffer_end and size_in_bits must be present and used by every reader */
+
+/* frame parsing: state of the input parser (h264_decode_seq obtains one from get_parse_context) */
+typedef struct ParserContext {
+    //int64_t offset;      ///< byte offset from starting packet start
+    int ifile;
+    int ofile;
+    int buffer_size;
+    int eof_reached;
+
+    uint8_t *data;
+    int   size;
+    uint8_t *cur_ptr;
+    int cur_len;
+
+    int64_t frame_offset; /* offset of the current frame */
+    int64_t cur_offset; /* current offset (incremented by each av_parser_parse()) */
+    int64_t next_frame_offset; /* offset of the next frame */
+    int pict_type;
+    int repeat_pict;     //frame_duration = (1 + repeat_pict) * time_base. It is used by codecs like H.264 to display telecined material.
+    int key_frame;  //Set by parser to 1 for key frames and 0 for non-key frames.
+    int64_t pos;     // Byte position of currently parsed frame in stream.
+    int64_t last_pos;  //Previous frame byte position.
+    int final_frame;   // h264_decode_seq leaves its frame loop once this is non-zero
+
+    uint8_t overread[5];
+    int overread_cnt;           ///< the number of bytes which were irreversibly read from the next frame
+    int index;
+    int last_index;
+    int frame_start_found;
+    uint32_t state;             ///< contains the last few bytes in MSB order
+} ParserContext;
+
+typedef struct NalContext {
+    /* holds the parameter sets, reference picture bookkeeping, POC state, picture dimensions and parsed SEI values */
+    SPS *sps_buffers[MAX_SPS_COUNT];   // looked up by sps_id (see decode_buffering_period)
+    PPS *pps_buffers[MAX_PPS_COUNT];
+    SPS sps; ///< current sps
+
+    PictureInfo picture[16 + 1];  ///< Ref pic buffer used for deriving lists. Later linked with pic in dpb.
+    PictureInfo *release_ref[MAX_MMCO_COUNT];
+    PictureInfo *short_ref[32];
+    PictureInfo *long_ref[32];
+    int long_ref_count;  ///< number of actual long term references
+    int short_ref_count; ///< number of actual short term references
+
+    //POC stuff
+    uint32_t coded_pic_num;
+    int poc_lsb;
+    int poc_msb;
+    uint32_t poc_offset;
+    int delta_poc;
+    int frame_num;
+    int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
+    int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
+    int frame_num_offset;         ///< for POC type 2
+    int prev_frame_num_offset;    ///< for POC type 2
+    int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
+
+    int max_pic_num;
+    int redundant_pic_count;
+    int outputed_poc;
+    int ip_id;
+//   int b8_stride;             ///< 2*mb_width+1 used for some 8x8 block arrays to allow simple addressing
+    int b4_stride;             ///< 4*mb_width+1 used for some 4x4 block arrays to allow simple addressing
+    int mb_stride;             ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11
+    int mb_width;
+    int mb_height;
+    int width;
+    int height;
+
+    int has_b_frames;
+    //pic_struct in picture timing SEI message
+    SEI_PicStructType sei_pic_struct;
+    // Bit set of clock types for fields/frames in picture timing SEI message. For each found ct_type, appropriate bit is set (e.g., bit 1 for interlaced).
+    int sei_ct_type;
+    // dpb_output_delay in picture timing SEI message, see H.264 C.2.2
+    int sei_dpb_output_delay;
+    //cpb_removal_delay in picture timing SEI message, see H.264 C.1.2
+    int sei_cpb_removal_delay;
+    //recovery_frame_cnt from SEI message
+    int sei_recovery_frame_cnt;
+    // Timestamp stuff
+    int sei_buffering_period_present;  ///< Buffering period SEI flag
+    int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs
+
+} NalContext;
+
+typedef struct EntropyContext{   /* working state used while CABAC-decoding macroblocks (see decode_slice_entropy_seq) */
+    CABACContext c;
+
+    H264Mb *m;   // macroblock currently being decoded; set per MB by decode_slice_entropy_seq
+    int top_cbp;
+    int left_cbp;
+    int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct
+
+    uint32_t top_type;
+    uint32_t left_type;
+    uint32_t topright_type;
+    uint32_t topleft_type;
+
+    int curr_qscale;        // initialised from the slice's qscale at slice start
+    int chroma_qp[2]; //QPc
+    int last_qscale_diff;   // reset to 0 at slice start
+
+    uint32_t dequant4_buffer[6][52][16];
+    uint32_t dequant8_buffer[2][52][64];
+    uint32_t (*dequant4_coeff[6])[16];
+    uint32_t (*dequant8_coeff[2])[64];
+
+//     uint8_t (*non_zero_count_top)[32];
+//     uint8_t (*non_zero_count)[32];
+//     uint8_t (*non_zero_count_row[2])[32];
+
+    uint8_t (*non_zero_count_top)[8];
+    uint8_t (*non_zero_count)[8];
+    uint8_t (*non_zero_count_row[2])[8];
+    DECLARE_ALIGNED(8, uint8_t, non_zero_count_left[8]);
+
+    uint8_t (*mvd_top[2])[2];
+    uint8_t (*mvd[2])[2];
+    uint8_t (*mvd_table[2][2])[2];
+
+    uint8_t *direct_top;
+    uint8_t *direct;
+    uint8_t *direct_table[2];
+
+    uint8_t *chroma_pred_mode_top;
+    uint8_t *chroma_pred_mode;
+    uint8_t *chroma_pred_mode_table[2];
+
+    uint16_t *cbp_top;
+    uint16_t *cbp;
+    uint16_t *cbp_table[2];
+
+    int8_t *qscale_top;
+    int8_t *qscale;
+    int8_t *qscale_table[2];
+
+    int8_t *ref_index_top[2];
+    int8_t *ref_index[2];
+    int8_t *ref_index_table[2][2];
+
+    uint32_t *mb_type_top;
+    uint32_t *mb_type;
+    uint32_t *mb_type_table[2];
+
+    int b_stride;
+    int mb_stride;
+    int mb_width;    // macroblocks per row; inner loop bound of the entropy pass
+    int mb_height;   // macroblock rows; outer loop bound of the entropy pass
+
+    uint8_t *zigzag_scan;
+    uint8_t *zigzag_scan8x8;
+    uint8_t direct_cache[5*8];
+
+    DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]);
+    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2];
+    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8];
+    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
+    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2];
+
+} EntropyContext;
+
+typedef struct H264Slice {   /* slice-level parameters and reference lists; passed to NAL decoding, entropy decoding and reconstruction */
+    PPS pps;                   ///< current pps
+    PictureInfo* current_picture_info;
+    DecodedPicture* curr_pic;
+    int slice_num;
+
+    int release_ref_cpn[MAX_MMCO_COUNT];   // cpn values of DPB entries released after reconstruction (decode_slice_mb_seq)
+    int release_cnt;                       // number of valid entries in release_ref_cpn
+
+    int qp_thresh;      ///< QP threshold to skip loopfilter
+    int use_weight;
+    int use_weight_chroma;
+    int luma_log2_weight_denom;
+    int chroma_log2_weight_denom;
+
+    int16_t luma_weight[16][2][2];
+    int16_t chroma_weight[16][2][2][2];
+    int16_t implicit_weight[16][16][2];
+
+    //poc number of ref_list int ref_poc[2][16]
+    //In edslice this must become Picture Info
+    int ref_list_cpn[2][16];              // cpn of each reference; -1 entries are skipped when linking to the DPB
+    PictureInfo *ref_list[2][16];         ///Reordered version of default_ref_list according to picture reordering in slice header
+    DecodedPicture *dp_ref_list[2][16];   // DPB entries matching ref_list_cpn, filled in decode_slice_mb_seq
+    int ref_count[2];   ///< counts frames or fields, depending on current mb mode
+
+    int slice_type;
+    int slice_type_nos;
+    int slice_alpha_c0_offset;
+    int slice_beta_offset;
+    int direct_8x8_inference_flag;
+
+    uint8_t list_count;
+    uint32_t coded_pic_num;
+
+    int poc;
+    int key_frame;
+    int mmco_reset; //FIXME not used?
+
+    ///stuff only needed for nal/entropy decoding
+//     H264Mb *m;
+//     GetBitContext *gb;
+    int ip_id;
+    int transform_bypass;
+    int direct_spatial_mv_pred;
+    int map_col_to_list0[2][16];   // filled by fill_colmap via ff_h264_direct_ref_list_init
+    int dist_scale_factor[16];     // filled by ff_h264_direct_dist_scale_factor
+
+    int cabac_init_idc;
+    int nal_ref_idc;
+    int nal_unit_type;
+
+    int ref2frm[2][64];  ///< reference to frame number lists, the first 2 are for -2,-1
+
+    int qscale;
+
+} H264Slice;
+
+typedef struct {   /* one slice buffer entry: a slice header, its macroblock array, its picture and its bit reader */
+    H264Slice slice;
+    H264Mb *mbs;
+    DecodedPicture *dp;
+    GetBitContext gb;
+
+    int lines_taken;
+    int lines_total;
+    int state;       // 0 free, 1 in use //1 wait for entropy, 2 wait for reconstruct.
+    int initialized;
+} SliceBufferEntry;
+
+typedef struct RingLineEntry{   /* one line entry of the ring line queue (see RingLineQueue) */
+    union{   /* pad[] is 16 * 4 = 64 bytes, so the 64-byte-aligned counter has that block to itself */
+    DECLARE_ALIGNED(64, volatile int32_t, mb_cnt);
+    DECLARE_ALIGNED(64, int32_t, pad[16]);
+    };
+    SliceBufferEntry *sbe;
+    int id;
+    int line;
+    TopBorder *top;
+    struct RingLineEntry *prev_line;
+
+} RingLineEntry;
+
+// #if OMPSS  (guard is commented out: both structs below are always declared)
+typedef struct SuperMBTask{   /* coordinates of one super macroblock */
+    int smb_x;
+    int smb_y;
+} SuperMBTask;
+
+typedef struct SuperMBContext{
+    int nsmb_width;             //number of super macroblocks in picture width
+    int nsmb_height;            //number of super macroblocks in picture height
+    int nsmb_3dheight;          //number of super macroblocks in picture height - max motion vertical vector
+    int smb_width;              //width of a super macroblock
+    int smb_height;             //height of a super macroblock
+    int refcount;
+    int index;
+    SuperMBTask *smbs[2];
+} SuperMBContext;
+// #endif
+
+//scratchpad for decoding a macroblock (passed as d->mrs to the reconstruction calls in decode_slice_mb_seq)
+typedef struct MBRecState{
+    int8_t *ref_index_top[2];
+    int8_t *ref_index[2];
+    int16_t (*motion_val_top[2])[2];
+    int16_t (*motion_val[2])[2];
+    uint32_t *mb_type_top;
+    uint32_t *mb_type;
+
+    int8_t *list1_ref_index[2];
+    int16_t (*list1_motion_val[2])[2];
+    uint32_t *list1_mb_type;
+
+    int8_t *intra4x4_pred_mode_top;
+    int8_t *intra4x4_pred_mode;
+#if !OMPSS
+    int8_t intra4x4_pred_mode_left[4];   // only present in non-OMPSS builds
+#endif
+    int8_t *non_zero_count_top;
+    int8_t *non_zero_count;
+//     int8_t non_zero_count_left[8];
+
+
+    unsigned int topleft_samples_available;
+    unsigned int topright_samples_available;
+    unsigned int top_samples_available;
+    unsigned int left_samples_available;
+
+    int top_type;
+    int left_type;
+
+    DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]);
+    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2];
+    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8];
+    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
+    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2];
+
+    DECLARE_ALIGNED(8, int16_t, bS)[2][4][4];
+    uint8_t edges[2];
+
+}MBRecState ;
+
+typedef struct MBRecContext{   /* macroblock reconstruction context (h264_decode_seq obtains one from get_mbrec_context) */
+    DSPContext dsp;             ///< pointers for accelerated dsp functions
+    H264DSPContext hdsp;
+    H264PredContext hpc;
+
+    MBRecState *mrs;
+    RingLineEntry *rle;         //debug
+
+    uint8_t *scratchpad_y;      ///implemented different on Cell
+    uint8_t *scratchpad_cb;     ///implemented different on Cell
+    uint8_t *scratchpad_cr;     ///implemented different on Cell
+
+    int linesize;
+    int uvlinesize;
+    int mb_width;    // macroblocks per row; inner loop bound in decode_slice_mb_seq
+    int mb_height;   // macroblock rows; outer loop bound in decode_slice_mb_seq
+    int mb_stride;
+    int b_stride;
+    int width;
+    int height;
+
+#if !OMPSS   // not used in OMPSS
+    LeftBorder left;
+    TopBorder *top;             // h264_decode_seq allocates mb_width entries and frees them at exit
+    TopBorder *top_next; 	// next line top border
+#endif
+    /*
+    .UU.YYYY
+    .UU.YYYY
+    .vv.YYYY
+    .VV.YYYY
+    */
+
+    // block_offset[ 0..23] for frame macroblocks
+    int block_offset[16+8];
+
+} MBRecContext;
+
+#ifdef HAVE_LIBSDL2
+typedef struct SDLContext{   /* SDL display state; only declared when HAVE_LIBSDL2 is defined */
+    int display;
+    int fullscreen;
+    pthread_t listen_thread;
+
+    SDL_DisplayMode full;
+    SDL_DisplayMode wind;
+
+    /* SDL renderer and window, with their rectangles and size */
+    SDL_Renderer *renderer;
+    SDL_Rect rect;
+    SDL_Rect win_rect;
+    SDL_Window *window;
+    double aspect;
+    int win_w;
+    int win_h;
+    int resized;
+    
+    SDL_Texture *sbmap_texture;
+    int showmap;
+    int updatemap;
+    int pause;
+    
+} SDLContext;
+#endif
+
+typedef struct OutputContext {   /* output-side state (h264_decode_seq obtains one from get_output_context) */
+    int bit_buffer_size;
+    uint8_t *bit_buffer;
+    uint64_t video_size;     // passed to print_report by h264_decode_seq
+    int frame_number;        // passed to print_report; copied into h->num_frames when decoding ends
+    DecodedPicture *delayed_pic[DPB_SIZE];
+    int dp_cnt;
+
+} OutputContext;
+
+typedef struct {   /* queue of SliceBufferEntry pointers with its own mutex and condition variable */
+    pthread_mutex_t lock;
+    pthread_cond_t cond;
+    SliceBufferEntry **queue;
+    int size;
+    int cnt;
+    int fi;   // presumably the insert index — confirm against the queue code
+    int fo;   // presumably the remove index — confirm against the queue code
+} SliceBufferQueue;
+
+typedef struct {   /* queue of RingLineEntry pointers with two mutex/condition pairs (ws* and sw*) */
+    pthread_mutex_t wslock;
+    pthread_cond_t wscond;
+    pthread_mutex_t swlock;
+    pthread_cond_t swcond;
+    RingLineEntry **queue;
+    int size;
+    int ready;
+    int free;
+    int fi;   // presumably the insert index — confirm against the queue code
+    int fo;   // presumably the remove index — confirm against the queue code
+} RingLineQueue;
+
+#ifdef HAVE_LIBSDL2   /* same test as the other SDL sections of this header, including the sdlq member that uses this type */
+typedef struct {      /* queue of SDL textures with its own mutex and condition variable */
+    pthread_mutex_t sdl_lock;
+    pthread_cond_t sdl_cond;
+    SDL_Texture **queue;
+    int size;
+    int ready;
+    int fi;
+    int fo;
+    int exit;
+} SDLTextureQueue;
+#endif
+/**
+* H264Context: decoder-wide state (stage queues, synchronisation objects, options, picture buffers, dimensions, timers)
+*/
+typedef struct H264Context{
+    SliceBufferQueue sb_q[STAGES];
+    RingLineQueue rl_q;
+
+    pthread_mutex_t lock[STAGES];
+    pthread_cond_t cond[STAGES];
+
+    pthread_mutex_t task_lock;
+    pthread_cond_t task_cond;
+
+    pthread_attr_t ed_rec_attr[MAX_THREADS];
+    pthread_t ed_rec_thr[MAX_THREADS];
+
+    int init_threads;
+    pthread_mutex_t ilock;
+    pthread_cond_t icond;
+
+    const char *file_name;
+    int profile;   // non-zero enables the stage timers in h264_decode_seq; 2 selects pred_motion_mb_rec; 3 prints per-frame times
+    int start;
+    int touch_start;
+    int setaff;
+    int touch_done;
+    int rl_side_touch;
+    int statmbd;
+    pthread_mutex_t slock;
+    pthread_cond_t scond;
+    pthread_mutex_t tlock;
+    pthread_cond_t tcond;
+    pthread_mutex_t tdlock;
+    pthread_cond_t tdcond;
+
+    int ed_ppe_threads;
+    int threads;
+    int smt;
+
+    int acdpb_cnt;  //debug
+    int reldpb_cnt;
+    
+    int sb_size;
+    SliceBufferEntry *sb;               ///< Slice Syntax Buffer
+    int free_sb_cnt;
+    int slice_bufs;
+
+    int max_dpb_cnt;           // number of dpb[] entries scanned when resolving or releasing references
+    DecodedPicture *dpb;       ///< Decoded Picture Buffer
+    int free_dpb_cnt;
+
+    int ifile;
+    int ofile;
+    int frame_width;
+    int frame_height;
+    int num_frames;         // frame limit of the decode loop; overwritten with the output frame count when decoding ends
+    int width;
+    int height;
+    int mb_width;
+    int mb_height;
+    int mb_stride;          ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11
+    int b4_stride;
+    int b_stride;
+
+    int smb_height;
+    int smb_width;
+    pthread_mutex_t smb_lock;
+    pthread_cond_t sdl_cond;
+    pthread_mutex_t sdl_lock;
+    SuperMBContext *smbc;
+    
+    int wave_order;
+    int static_3d;
+    int pipe_bufs;
+
+    //shared tables used in entropy decoding
+    uint8_t zigzag_scan[16];
+    uint8_t zigzag_scan8x8[64];
+
+    int verbose;
+    int no_mbd;       // non-zero skips the macroblock pass in decode_slice_mb_seq
+    int display;      // non-zero makes h264_decode_seq start the SDL thread (HAVE_LIBSDL2 builds)
+    int fullscreen;
+    int quit;         // non-zero ends the frame loop of h264_decode_seq
+#ifdef HAVE_LIBSDL2
+    SDLTextureQueue sdlq;
+    SDLContext *sdlc;
+#endif
+     
+    struct timespec start_time[PROFILE_STAGES];
+    struct timespec end_time[PROFILE_STAGES];
+    double last_time[PROFILE_STAGES];    // printed as milliseconds by h264_decode_seq at profile level 3
+    double total_time[PROFILE_STAGES];
+
+}H264Context;
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/mathops.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/mathops.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,145 @@
+/*
+ * simple math operations
+ * Copyright (c) 2001, 2002 Fabrice Bellard
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_MATHOPS_H
+#define AVCODEC_MATHOPS_H
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+
+#if   ARCH_ARM
+#   include "arm/mathops.h"
+#elif ARCH_PPC
+#   include "ppc/mathops.h"
+#elif ARCH_X86
+#   include "x86/mathops.h"
+#endif
+
+/* generic implementation: each helper is skipped when a macro of the same name already exists (presumably from the arch header included above) */
+
+#ifndef MULL
+#   define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s)) /* 64-bit product of a and b, shifted right by s */
+#endif
+
+#ifndef MULH
+// MULH is an inline function rather than the equivalent one-line macro
+// (64-bit product >> 32): gcc 3.4 creates an incredibly bloated mess out of the macro form.
+
+static av_always_inline int MULH(int a, int b){
+    return ((int64_t)(a) * (int64_t)(b))>>32; /* signed 64-bit product shifted right by 32 */
+}
+#endif
+
+#ifndef UMULH
+static av_always_inline unsigned UMULH(unsigned a, unsigned b){
+    return ((uint64_t)(a) * (uint64_t)(b))>>32; /* unsigned 64-bit product shifted right by 32 */
+}
+#endif
+
+#ifndef MUL64
+#   define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) /* full signed 64-bit product */
+#endif
+
+#ifndef MAC64
+#   define MAC64(d, a, b) ((d) += MUL64(a, b)) /* d += a * b, product computed in 64 bits */
+#endif
+
+#ifndef MLS64
+#   define MLS64(d, a, b) ((d) -= MUL64(a, b)) /* d -= a * b, product computed in 64 bits */
+#endif
+
+/* signed 16x16 -> 32 multiply add accumulate: rt += ra * rb (fully parenthesized, like MLS16, so it is safe inside larger expressions) */
+#ifndef MAC16
+#   define MAC16(rt, ra, rb) ((rt) += (ra) * (rb))
+#endif
+
+/* signed 16x16 -> 32 multiply */
+#ifndef MUL16
+#   define MUL16(ra, rb) ((ra) * (rb))
+#endif
+
+#ifndef MLS16
+#   define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb))
+#endif
+
+/* median of 3: returns whichever of a, b, c is neither the smallest nor the largest */
+#ifndef mid_pred
+#define mid_pred mid_pred
+static inline av_const int mid_pred(int a, int b, int c)
+{
+#if 0
+    int t= (a-b)&((a-b)>>31);
+    a-=t;
+    b+=t;
+    b-= (b-c)&((b-c)>>31);
+    b+= (a-b)&((a-b)>>31);
+
+    return b;
+#else
+    /* b is the candidate result; it is replaced only when it is not
+     * already the middle value. */
+    if (a > b) {
+        /* b is below a: if it is below c as well, the median is min(a, c) */
+        if (c > b)
+            b = (c > a) ? a : c;
+    } else if (b > c) {
+        /* b is the largest of the three: the median is max(a, c) */
+        b = (c > a) ? c : a;
+    }
+
+    return b;
+#endif
+}
+#endif
+
+#ifndef sign_extend /* sign-extend the low 'bits' bits of val to a full int */
+static inline av_const int sign_extend(int val, unsigned bits)
+{   /* the left shift is done as unsigned: shifting a signed int into or past its sign bit is undefined */
+    return (int)((unsigned)val << (INT_BIT - bits)) >> (INT_BIT - bits); /* the signed right shift then copies the new top bit down */
+}
+#endif
+
+#ifndef zero_extend /* keep the low 'bits' bits of val and clear everything above them */
+static inline av_const unsigned zero_extend(unsigned val, unsigned bits)
+{
+    return (val << (INT_BIT - bits)) >> (INT_BIT - bits); /* push the unwanted high bits out, then shift zeros back in */
+}
+#endif
+
+#ifndef COPY3_IF_LT /* if y < x then x = y, a = b, c = d; expands to a bare if-block and evaluates x and y more than once */
+#define COPY3_IF_LT(x, y, a, b, c, d)\
+if ((y) < (x)) {\
+    (x) = (y);\
+    (a) = (b);\
+    (c) = (d);\
+}
+#endif
+
+#ifndef NEG_SSR32 /* right shift of a, taken as int32_t, by (32 - s) */
+#   define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s)))
+#endif
+
+#ifndef NEG_USR32 /* right shift of a, taken as uint32_t, by (32 - s) */
+#   define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s)))
+#endif
+
+#endif /* AVCODEC_MATHOPS_H */
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/dsputil_altivec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/dsputil_altivec.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,619 @@
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+#include "libavcodec/dsputil.h"
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
+#include "types_altivec.h"
+#include "dsputil_altivec.h"
+
+
+static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
+{   /* Widen 8 rows of 8 bytes (rows line_size bytes apart) into 16-bit values stored consecutively in block. */
+    int i;
+    vector unsigned char perm, bytes;
+    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
+    vector signed short shorts;
+
+    for (i = 0; i < 8; i++) {
+        // Read potentially unaligned pixels.
+        // Only the first 8 bytes of the permuted result are wanted;
+        // the upper 8 are don't-care and are dropped by vec_mergeh below.
+        perm = vec_lvsl(0, pixels);
+        // second load at +7: it reaches the next 16-byte line only when the 8 wanted bytes really spill into it
+        bytes = vec_perm(vec_ld(0, pixels), vec_ld(7, pixels), perm);
+
+        // convert the bytes into shorts
+        shorts = (vector signed short)vec_mergeh(zero, bytes);
+
+        // save the data to the block, we assume the block is 16-byte aligned
+        vec_st(shorts, i*16, (vector signed short*)block);
+
+        pixels += line_size;
+    }
+}
+
+static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
+        const uint8_t *s2, int stride)
+{   /* block = s1 - s2 for 8 rows of 8 pixels, as 16-bit values; rows of s1/s2 are stride bytes apart. */
+    int i;
+    vector unsigned char perm, bytes;
+    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
+    vector signed short shorts1, shorts2;
+
+    for (i = 0; i < 4; i++) {
+        // Read potentially unaligned pixels
+        // Only the first 8 bytes of the permuted result are wanted;
+        // the upper 8 are don't-care and are dropped by vec_mergeh below.
+        perm = vec_lvsl(0, s1);
+        // second load at +7: it reaches the next 16-byte line only when the 8 wanted bytes really spill into it
+        bytes = vec_perm(vec_ld(0, s1), vec_ld(7, s1), perm);
+
+        // convert the bytes into shorts
+        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
+
+        // Do the same for the second block of pixels
+        perm = vec_lvsl(0, s2);
+        // same +7 second load as for s1
+        bytes = vec_perm(vec_ld(0, s2), vec_ld(7, s2), perm);
+
+        // convert the bytes into shorts
+        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
+
+        // Do the subtraction
+        shorts1 = vec_sub(shorts1, shorts2);
+
+        // save the data to the block, we assume the block is 16-byte aligned
+        vec_st(shorts1, 0, (vector signed short*)block);
+
+        s1 += stride;
+        s2 += stride;
+        block += 8;
+
+
+        // The code below is a copy of the code above... This is a manual
+        // unroll, so each loop iteration handles two rows.
+
+        // Read potentially unaligned pixels
+        // Only the first 8 bytes of the permuted result are wanted;
+        // the upper 8 are don't-care and are dropped by vec_mergeh below.
+        perm = vec_lvsl(0, s1);
+        // second load at +7, as in the first half of the loop body
+        bytes = vec_perm(vec_ld(0, s1), vec_ld(7, s1), perm);
+
+        // convert the bytes into shorts
+        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
+
+        // Do the same for the second block of pixels
+        perm = vec_lvsl(0, s2);
+        // same +7 second load as for s1
+        bytes = vec_perm(vec_ld(0, s2), vec_ld(7, s2), perm);
+
+        // convert the bytes into shorts
+        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
+
+        // Do the subtraction
+        shorts1 = vec_sub(shorts1, shorts2);
+
+        // save the data to the block, we assume the block is 16-byte aligned
+        vec_st(shorts1, 0, (vector signed short*)block);
+
+        s1 += stride;
+        s2 += stride;
+        block += 8;
+    }
+}
+
+
+static void clear_block_altivec(DCTELEM *block) { /* store zero_s16v at byte offsets 0..112 of block: eight vector stores, 16 bytes apart */
+    LOAD_ZERO; /* project macro; presumably sets up the zero vector behind zero_s16v (defined outside this file) */
+    vec_st(zero_s16v,   0, block);
+    vec_st(zero_s16v,  16, block);
+    vec_st(zero_s16v,  32, block);
+    vec_st(zero_s16v,  48, block);
+    vec_st(zero_s16v,  64, block);
+    vec_st(zero_s16v,  80, block);
+    vec_st(zero_s16v,  96, block);
+    vec_st(zero_s16v, 112, block);
+}
+
+
+
+/* copy h rows of 16 pixels from (possibly unaligned) pixels to block; next one assumes that ((line_size % 16) == 0) */
+void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
+    register vector unsigned char pixelsv1, pixelsv2;
+    register vector unsigned char pixelsv1B, pixelsv2B;
+    register vector unsigned char pixelsv1C, pixelsv2C;
+    register vector unsigned char pixelsv1D, pixelsv2D;
+
+    register vector unsigned char perm = vec_lvsl(0, pixels); /* computed once and reused for every row, hence the line_size % 16 requirement */
+    int i;
+    register int line_size_2 = line_size << 1;
+    register int line_size_3 = line_size + line_size_2;
+    register int line_size_4 = line_size << 2;
+
+POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
+// hand-unrolling the loop by 4 gains about 15%
+// minimum execution time goes from 74 to 60 cycles
+// it's faster than -funroll-loops, but using
+// -funroll-loops w/ this is bad - 74 cycles again.
+// all this is on a 7450, tuning for the 7450
+#if 0
+    for (i = 0; i < h; i++) {
+        pixelsv1 = vec_ld(0, pixels);
+        pixelsv2 = vec_ld(16, pixels);
+        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
+               0, block);
+        pixels+=line_size;
+        block +=line_size;
+    }
+#else
+    for (i = 0; i < h; i += 4) { /* four rows per iteration, so h is effectively rounded up to a multiple of 4 */
+        pixelsv1  = vec_ld( 0, pixels);
+        pixelsv2  = vec_ld(15, pixels); /* offset 15, not 16: an aligned row reloads the same line instead of touching the next one */
+        pixelsv1B = vec_ld(line_size, pixels);
+        pixelsv2B = vec_ld(15 + line_size, pixels);
+        pixelsv1C = vec_ld(line_size_2, pixels);
+        pixelsv2C = vec_ld(15 + line_size_2, pixels);
+        pixelsv1D = vec_ld(line_size_3, pixels);
+        pixelsv2D = vec_ld(15 + line_size_3, pixels);
+        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
+               0, (unsigned char*)block);
+        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
+               line_size, (unsigned char*)block);
+        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
+               line_size_2, (unsigned char*)block);
+        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
+               line_size_3, (unsigned char*)block);
+        pixels+=line_size_4;
+        block +=line_size_4;
+    }
+#endif
+POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
+}
+
+/* avg_pixels16_altivec assumes that ((line_size % 16) == 0) */
+#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) /* not referenced anywhere else in this file */
+void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{   /* block = vec_avg(block, pixels) for h rows of 16 pixels; pixels may be unaligned. */
+POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
+    register vector unsigned char perm = vec_lvsl(0, pixels); /* computed once and reused for every row, hence the line_size % 16 requirement */
+    int i;
+
+POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
+
+    for (i = 0; i < h; i++) {
+        pixelsv1 = vec_ld( 0, pixels);
+        pixelsv2 = vec_ld(15, pixels); /* offset 15 as in put_pixels16_altivec: an aligned row never touches the following 16-byte line */
+        blockv = vec_ld(0, block);
+        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
+        blockv = vec_avg(blockv,pixelsv);
+        vec_st(blockv, 0, (unsigned char*)block);
+        pixels+=line_size;
+        block +=line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
+}
+
+/* block = vec_avg(block, pixels) for h rows of 8 pixels; next one assumes that ((line_size % 8) == 0) */
+static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
+    int i;
+
+POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
+
+    for (i = 0; i < h; i++) {
+        /* block is 8 bytes-aligned, so we're either in the
+           left block (16 bytes-aligned) or in the right block (not) */
+        int rightside = ((unsigned long)block & 0x0000000F);
+
+        blockv = vec_ld(0, block);
+        pixelsv1 = vec_ld( 0, pixels);
+        pixelsv2 = vec_ld( 7, pixels); /* only 8 source bytes are used below, so +7 is enough and avoids reading a line that holds none of them */
+        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
+
+        if (rightside) { /* put the 8 source bytes next to the untouched half of blockv */
+            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
+        } else {
+            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
+        }
+
+        blockv = vec_avg(blockv, pixelsv); /* the half copied from blockv averages with itself and stays unchanged */
+
+        vec_st(blockv, 0, block);
+
+        pixels += line_size;
+        block += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
+}
+
+/* 8-wide put of the 2x2 neighbourhood average with rounding, (a + b + c + d + 2) >> 2; next one assumes that ((line_size % 8) == 0) */
+static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
+    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2; /* pixels + 1 is 16-byte aligned here, so the shifted row is exactly temp2 */
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
+    }
+    pixelsv1 = vec_mergeh(vczero, pixelsv1); /* interleave with zero bytes: first 8 pixels widened to 16 bits */
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vctwo); /* horizontal pair sums of the first row, plus the rounding term 2 */
+
+POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
+    for (i = 0; i < h ; i++) {
+        int rightside = ((unsigned long)block & 0x0000000F); /* non-zero when block is not 16-byte aligned */
+        blockv = vec_ld(0, block);
+
+        temp1 = vec_ld(line_size, pixels); /* the row below the current one: h + 1 source rows are read in total */
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp3 = vec_add(pixelssum1, pixelssum2); /* sum of the four neighbours plus 2 */
+        temp3 = vec_sra(temp3, vctwo); /* divide by 4 */
+        pixelssum1 = vec_add(pixelssum2, vctwo); /* the lower row becomes the upper row of the next iteration */
+        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); /* back to bytes; the 8 results end up in the first 8 bytes */
+
+        if (rightside) { /* merge the 8 result bytes into the half of the 16-byte line that block points at */
+            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+        } else {
+            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+        }
+
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
+}
+
+/* 8-wide put of the 2x2 neighbourhood average, no-rounding variant, (a + b + c + d + 1) >> 2; next one assumes that ((line_size % 8) == 0) */
+static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
+    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
+    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2; /* pixels + 1 is 16-byte aligned here, so the shifted row is exactly temp2 */
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
+    }
+    pixelsv1 = vec_mergeh(vczero, pixelsv1); /* interleave with zero bytes: first 8 pixels widened to 16 bits */
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vcone); /* horizontal pair sums of the first row, plus 1 (the rounded variant adds 2) */
+
+POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
+    for (i = 0; i < h ; i++) {
+        int rightside = ((unsigned long)block & 0x0000000F); /* non-zero when block is not 16-byte aligned */
+        blockv = vec_ld(0, block);
+
+        temp1 = vec_ld(line_size, pixels); /* the row below the current one: h + 1 source rows are read in total */
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp3 = vec_add(pixelssum1, pixelssum2); /* sum of the four neighbours plus 1 */
+        temp3 = vec_sra(temp3, vctwo); /* divide by 4 */
+        pixelssum1 = vec_add(pixelssum2, vcone); /* the lower row becomes the upper row of the next iteration */
+        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); /* back to bytes; the 8 results end up in the first 8 bytes */
+
+        if (rightside) { /* merge the 8 result bytes into the half of the 16-byte line that block points at */
+            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+        } else {
+            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+        }
+
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
+}
+
+/* 16-wide put of the 2x2 neighbourhood average with rounding, (a + b + c + d + 2) >> 2; next one assumes that ((line_size % 16) == 0) */
+static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
+    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2; /* pixels + 1 is 16-byte aligned here, so the shifted row is exactly temp2 */
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
+    }
+    pixelsv3 = vec_mergel(vczero, pixelsv1); /* last 8 pixels widened to 16 bits */
+    pixelsv4 = vec_mergel(vczero, pixelsv2);
+    pixelsv1 = vec_mergeh(vczero, pixelsv1); /* first 8 pixels widened to 16 bits */
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
+                         (vector unsigned short)pixelsv4);
+    pixelssum3 = vec_add(pixelssum3, vctwo); /* pair sums of the first row (last 8 pixels) plus the rounding term 2 */
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vctwo); /* same for the first 8 pixels */
+
+    for (i = 0; i < h ; i++) {
+        blockv = vec_ld(0, block); /* NOTE(review): this value is overwritten by vec_packsu below without being read */
+
+        temp1 = vec_ld(line_size, pixels); /* the row below the current one: h + 1 source rows are read in total */
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv3 = vec_mergel(vczero, pixelsv1);
+        pixelsv4 = vec_mergel(vczero, pixelsv2);
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+
+        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
+                             (vector unsigned short)pixelsv4);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp4 = vec_add(pixelssum3, pixelssum4); /* four-neighbour sums plus 2, last 8 pixels */
+        temp4 = vec_sra(temp4, vctwo); /* divide by 4 */
+        temp3 = vec_add(pixelssum1, pixelssum2); /* four-neighbour sums plus 2, first 8 pixels */
+        temp3 = vec_sra(temp3, vctwo);
+
+        pixelssum3 = vec_add(pixelssum4, vctwo); /* the lower row becomes the upper row of the next iteration */
+        pixelssum1 = vec_add(pixelssum2, vctwo);
+
+        blockv = vec_packsu(temp3, temp4); /* pack both halves back into 16 bytes */
+
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
+}
+
+/* 16-wide put of the 2x2 neighbourhood average, no-rounding variant, (a + b + c + d + 1) >> 2; next one assumes that ((line_size % 16) == 0) */
+static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
+    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
+    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2; /* pixels + 1 is 16-byte aligned here, so the shifted row is exactly temp2 */
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
+    }
+    pixelsv3 = vec_mergel(vczero, pixelsv1); /* last 8 pixels widened to 16 bits */
+    pixelsv4 = vec_mergel(vczero, pixelsv2);
+    pixelsv1 = vec_mergeh(vczero, pixelsv1); /* first 8 pixels widened to 16 bits */
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
+                         (vector unsigned short)pixelsv4);
+    pixelssum3 = vec_add(pixelssum3, vcone); /* pair sums of the first row (last 8 pixels) plus 1 (the rounded variant adds 2) */
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vcone); /* same for the first 8 pixels */
+
+    for (i = 0; i < h ; i++) {
+        blockv = vec_ld(0, block); /* NOTE(review): this value is overwritten by vec_packsu below without being read */
+
+        temp1 = vec_ld(line_size, pixels); /* the row below the current one: h + 1 source rows are read in total */
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv3 = vec_mergel(vczero, pixelsv1);
+        pixelsv4 = vec_mergel(vczero, pixelsv2);
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+
+        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
+                             (vector unsigned short)pixelsv4);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp4 = vec_add(pixelssum3, pixelssum4); /* four-neighbour sums plus 1, last 8 pixels */
+        temp4 = vec_sra(temp4, vctwo); /* divide by 4 */
+        temp3 = vec_add(pixelssum1, pixelssum2); /* four-neighbour sums plus 1, first 8 pixels */
+        temp3 = vec_sra(temp3, vctwo);
+
+        pixelssum3 = vec_add(pixelssum4, vcone); /* the lower row becomes the upper row of the next iteration */
+        pixelssum1 = vec_add(pixelssum2, vcone);
+
+        blockv = vec_packsu(temp3, temp4); /* pack both halves back into 16 bytes */
+
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
+}
+
+/* 8-wide: averages block with the rounded 2x2 neighbourhood average (a + b + c + d + 2) >> 2; next one assumes that ((line_size % 8) == 0) */
+static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2, blocktemp;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
+
+    register const vector unsigned char vczero = (const vector unsigned char)
+                                        vec_splat_u8(0);
+    register const vector unsigned short vctwo = (const vector unsigned short)
+                                        vec_splat_u16(2);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2; /* pixels + 1 is 16-byte aligned here, so the shifted row is exactly temp2 */
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
+    }
+    pixelsv1 = vec_mergeh(vczero, pixelsv1); /* interleave with zero bytes: first 8 pixels widened to 16 bits */
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vctwo); /* horizontal pair sums of the first row, plus the rounding term 2 */
+
+POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
+    for (i = 0; i < h ; i++) {
+        int rightside = ((unsigned long)block & 0x0000000F); /* non-zero when block is not 16-byte aligned */
+        blockv = vec_ld(0, block);
+
+        temp1 = vec_ld(line_size, pixels); /* the row below the current one: h + 1 source rows are read in total */
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp3 = vec_add(pixelssum1, pixelssum2); /* sum of the four neighbours plus 2 */
+        temp3 = vec_sra(temp3, vctwo); /* divide by 4 */
+        pixelssum1 = vec_add(pixelssum2, vctwo); /* the lower row becomes the upper row of the next iteration */
+        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); /* back to bytes; the 8 results end up in the first 8 bytes */
+
+        if (rightside) { /* place the 8 result bytes beside the untouched half of blockv */
+            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+        } else {
+            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+        }
+
+        blockv = vec_avg(blocktemp, blockv); /* the half copied from blockv averages with itself and stays unchanged */
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
+}
+
+void dsputil_init_altivec(DSPContext* c)
+{
+    /* whole-block helpers */
+    c->get_pixels  = get_pixels_altivec;
+    c->diff_pixels = diff_pixels_altivec;
+    c->clear_block = clear_block_altivec;
+    /* table entries served by the 16-pixel-wide routines (first index 0) */
+    c->put_pixels_tab[0][0]        = put_pixels16_altivec;
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; /* the two functions do the same thing, so use the same code */
+    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
+    c->avg_pixels_tab[0][0]        = avg_pixels16_altivec;
+    /* table entries served by the 8-pixel-wide routines (first index 1) */
+    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
+    c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
+    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/dsputil_altivec.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/dsputil_altivec.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H
+#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H
+
+#include <stdint.h>
+#include "libavcodec/dsputil.h"
+
+void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+int has_altivec(void);
+
+void fdct_altivec(int16_t *block);
+void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
+                  int x16, int y16, int rounder);
+void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
+void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
+
+void ff_vp3_idct_altivec(DCTELEM *block);
+void ff_vp3_idct_put_altivec(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_add_altivec(uint8_t *dest, int line_size, DCTELEM *block);
+
+void dsputil_h264_init_ppc(DSPContext* c);
+
+void dsputil_init_altivec(DSPContext* c);
+//void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
+//void float_init_altivec(DSPContext* c, AVCodecContext *avctx);
+//void int_init_altivec(DSPContext* c, AVCodecContext *avctx);
+
+#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/dsputil_ppc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/dsputil_ppc.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#include "dsputil_ppc.h"
+#include "dsputil_altivec.h"
+
+static void prefetch_ppc(void *mem, int stride, int h)
+{   /* Issue one dcbt hint per row: h rows, stride bytes apart, starting at mem. */
+    const uint8_t *p = mem;
+    /* h is tested before the first hint, so h <= 0 prefetches nothing */
+    for (; h > 0; h--, p += stride) {
+        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
+    }
+}
+
+void dsputil_init_ppc(DSPContext* c)
+{
+    /* installed regardless of HAVE_ALTIVEC */
+    c->prefetch = prefetch_ppc;
+#if HAVE_ALTIVEC
+    /* AltiVec builds: install the vector routines (call order kept as-is,
+     * since both initialisers write into the same context) */
+    dsputil_h264_init_ppc(c);
+    dsputil_init_altivec(c);
+    c->idct_put = idct_put_altivec;
+    c->idct_add = idct_add_altivec;
+#endif /* HAVE_ALTIVEC */
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/dsputil_ppc.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/dsputil_ppc.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PPC_DSPUTIL_PPC_H
+#define AVCODEC_PPC_DSPUTIL_PPC_H
+
+#include "config.h"
+
+#if CONFIG_POWERPC_PERF
+void powerpc_display_perf_report(void);
+/* the 604* have 2 PMCs, the G3* have 4, the G4s have 6,
+   and the G5 are completely different (they MUST use
+   ARCH_PPC64, and let's hope all future 64-bit PPC
+   will use the same PMCs...) */
+#define POWERPC_NUM_PMC_ENABLED 6
+/* if you add to the enum below, also add to the perfname array in dsputil_ppc.c
+   (NOTE(review): this changeset's dsputil_ppc.c defines no such array -- confirm) */
+enum powerpc_perf_index {
+    altivec_fft_num = 0,
+    altivec_gmc1_num,
+    altivec_dct_unquantize_h263_num,
+    altivec_fdct,
+    altivec_idct_add_num,
+    altivec_idct_put_num,
+    altivec_put_pixels16_num,
+    altivec_avg_pixels16_num,
+    altivec_avg_pixels8_num,
+    altivec_put_pixels8_xy2_num,
+    altivec_put_no_rnd_pixels8_xy2_num,
+    altivec_put_pixels16_xy2_num,
+    altivec_put_no_rnd_pixels16_xy2_num,
+    altivec_hadamard8_diff8x8_num,
+    altivec_hadamard8_diff16_num,
+    altivec_avg_pixels8_xy2_num,
+    powerpc_clear_blocks_dcbz32,
+    powerpc_clear_blocks_dcbz128,
+    altivec_put_h264_chroma_mc8_num,
+    altivec_avg_h264_chroma_mc8_num,
+    altivec_put_h264_qpel16_h_lowpass_num,
+    altivec_avg_h264_qpel16_h_lowpass_num,
+    altivec_put_h264_qpel16_v_lowpass_num,
+    altivec_avg_h264_qpel16_v_lowpass_num,
+    altivec_put_h264_qpel16_hv_lowpass_num,
+    altivec_avg_h264_qpel16_hv_lowpass_num,
+    powerpc_perf_total
+};
+enum powerpc_data_index {
+    powerpc_data_min = 0,
+    powerpc_data_max,
+    powerpc_data_sum,
+    powerpc_data_num,
+    powerpc_data_total
+};
+extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
+
+#if !ARCH_PPC64
+#define POWERP_PMC_DATATYPE unsigned long
+#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a))
+#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a))
+#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0)
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a))
+#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0)
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#else /* ARCH_PPC64 */
+#define POWERP_PMC_DATATYPE unsigned long long
+#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a))
+#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a))
+#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0)
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a))
+#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0)
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#endif /* ARCH_PPC64 */
+#define POWERPC_PERF_DECLARE(a, cond)       \
+    POWERP_PMC_DATATYPE                     \
+        pmc_start[POWERPC_NUM_PMC_ENABLED], \
+        pmc_stop[POWERPC_NUM_PMC_ENABLED],  \
+        pmc_loop_index; /* already ';'-terminated, unlike the !CONFIG_POWERPC_PERF variant */
+#define POWERPC_PERF_START_COUNT(a, cond) do { \
+    POWERPC_GET_PMC6(pmc_start[5]); \
+    POWERPC_GET_PMC5(pmc_start[4]); \
+    POWERPC_GET_PMC4(pmc_start[3]); \
+    POWERPC_GET_PMC3(pmc_start[2]); \
+    POWERPC_GET_PMC2(pmc_start[1]); \
+    POWERPC_GET_PMC1(pmc_start[0]); \
+    } while (0)
+#define POWERPC_PERF_STOP_COUNT(a, cond) do { \
+    POWERPC_GET_PMC1(pmc_stop[0]);            \
+    POWERPC_GET_PMC2(pmc_stop[1]);            \
+    POWERPC_GET_PMC3(pmc_stop[2]);            \
+    POWERPC_GET_PMC4(pmc_stop[3]);            \
+    POWERPC_GET_PMC5(pmc_stop[4]);            \
+    POWERPC_GET_PMC6(pmc_stop[5]);            \
+    if (cond) {                               \
+        for(pmc_loop_index = 0;               \
+            pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
+            pmc_loop_index++) {               \
+            if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) {  \
+                POWERP_PMC_DATATYPE diff =                                \
+                  pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index];   \
+                if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
+                    perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
+                if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \
+                    perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \
+                perfdata[pmc_loop_index][a][powerpc_data_sum] += diff;    \
+                perfdata[pmc_loop_index][a][powerpc_data_num] ++;         \
+            }                                 \
+        }                                     \
+    }                                         \
+} while (0)
+#else /* CONFIG_POWERPC_PERF */
+// those are needed to avoid empty statements.
+#define POWERPC_PERF_DECLARE(a, cond)        int altivec_placeholder __attribute__ ((unused))
+#define POWERPC_PERF_START_COUNT(a, cond)    do {} while (0)
+#define POWERPC_PERF_STOP_COUNT(a, cond)     do {} while (0)
+#endif /* CONFIG_POWERPC_PERF */
+
+#endif /*  AVCODEC_PPC_DSPUTIL_PPC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/h264_altivec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/h264_altivec.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1021 @@
+/*
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#include "libavcodec/h264_data.h"
+#include "libavcodec/h264_dsp.h"
+
+#include "dsputil_ppc.h"
+#include "dsputil_altivec.h"
+#include "util_altivec.h"
+#include "types_altivec.h"
+
+#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
+#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
+
+#define OP_U8_ALTIVEC                          PUT_OP_U8_ALTIVEC
+#define PREFIX_h264_chroma_mc8_altivec         put_h264_chroma_mc8_altivec
+#define PREFIX_no_rnd_vc1_chroma_mc8_altivec   put_no_rnd_vc1_chroma_mc8_altivec
+#define PREFIX_h264_chroma_mc8_num             altivec_put_h264_chroma_mc8_num
+#define PREFIX_h264_qpel16_h_lowpass_altivec   put_h264_qpel16_h_lowpass_altivec
+#define PREFIX_h264_qpel16_h_lowpass_num       altivec_put_h264_qpel16_h_lowpass_num
+#define PREFIX_h264_qpel16_v_lowpass_altivec   put_h264_qpel16_v_lowpass_altivec
+#define PREFIX_h264_qpel16_v_lowpass_num       altivec_put_h264_qpel16_v_lowpass_num
+#define PREFIX_h264_qpel16_hv_lowpass_altivec  put_h264_qpel16_hv_lowpass_altivec
+#define PREFIX_h264_qpel16_hv_lowpass_num      altivec_put_h264_qpel16_hv_lowpass_num
+#include "h264_template_altivec.c"
+#undef OP_U8_ALTIVEC
+#undef PREFIX_h264_chroma_mc8_altivec
+#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
+#undef PREFIX_h264_chroma_mc8_num
+#undef PREFIX_h264_qpel16_h_lowpass_altivec
+#undef PREFIX_h264_qpel16_h_lowpass_num
+#undef PREFIX_h264_qpel16_v_lowpass_altivec
+#undef PREFIX_h264_qpel16_v_lowpass_num
+#undef PREFIX_h264_qpel16_hv_lowpass_altivec
+#undef PREFIX_h264_qpel16_hv_lowpass_num
+
+#define OP_U8_ALTIVEC                          AVG_OP_U8_ALTIVEC
+#define PREFIX_h264_chroma_mc8_altivec         avg_h264_chroma_mc8_altivec
+#define PREFIX_no_rnd_vc1_chroma_mc8_altivec   avg_no_rnd_vc1_chroma_mc8_altivec
+#define PREFIX_h264_chroma_mc8_num             altivec_avg_h264_chroma_mc8_num
+#define PREFIX_h264_qpel16_h_lowpass_altivec   avg_h264_qpel16_h_lowpass_altivec
+#define PREFIX_h264_qpel16_h_lowpass_num       altivec_avg_h264_qpel16_h_lowpass_num
+#define PREFIX_h264_qpel16_v_lowpass_altivec   avg_h264_qpel16_v_lowpass_altivec
+#define PREFIX_h264_qpel16_v_lowpass_num       altivec_avg_h264_qpel16_v_lowpass_num
+#define PREFIX_h264_qpel16_hv_lowpass_altivec  avg_h264_qpel16_hv_lowpass_altivec
+#define PREFIX_h264_qpel16_hv_lowpass_num      altivec_avg_h264_qpel16_hv_lowpass_num
+#include "h264_template_altivec.c"
+#undef OP_U8_ALTIVEC
+#undef PREFIX_h264_chroma_mc8_altivec
+#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
+#undef PREFIX_h264_chroma_mc8_num
+#undef PREFIX_h264_qpel16_h_lowpass_altivec
+#undef PREFIX_h264_qpel16_h_lowpass_num
+#undef PREFIX_h264_qpel16_v_lowpass_altivec
+#undef PREFIX_h264_qpel16_v_lowpass_num
+#undef PREFIX_h264_qpel16_hv_lowpass_altivec
+#undef PREFIX_h264_qpel16_hv_lowpass_num
+
+#define H264_MC(OPNAME, SIZE, CODETYPE) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
+    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
+    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
+}\
+
+static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
+                                    const uint8_t * src2, int dst_stride,
+                                    int src_stride1, int h)
+{
+    int i;
+    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    /* dst row i = vec_avg(src1 row i, src2 row i), 16 bytes per row, h rows */
+    mask_ = vec_lvsl(0, src2);
+    /* src2 rows are read 16 bytes apart, so one alignment mask serves all rows */
+    for (i = 0; i < h; i++) {
+        /* a = 16 bytes at src1 + i*src_stride1, assembled from two aligned loads */
+        tmp1 = vec_ld(i * src_stride1, src1);
+        mask = vec_lvsl(i * src_stride1, src1);
+        tmp2 = vec_ld(i * src_stride1 + 15, src1);
+
+        a = vec_perm(tmp1, tmp2, mask);
+        /* b = 16 bytes at src2 + i*16 */
+        tmp1 = vec_ld(i * 16, src2);
+        tmp2 = vec_ld(i * 16 + 15, src2);
+
+        b = vec_perm(tmp1, tmp2, mask_);
+        /* fetch the two aligned vectors that the 16 destination bytes straddle */
+        tmp1 = vec_ld(0, dst);
+        mask = vec_lvsl(0, dst);
+        tmp2 = vec_ld(15, dst);
+
+        d = vec_avg(a, b);
+        /* edges = bytes of those two vectors lying outside dst[0..15] (when dst is unaligned) */
+        edges = vec_perm(tmp2, tmp1, mask);
+
+        align = vec_lvsr(0, dst);
+        /* merge d with the preserved edge bytes, back in aligned layout */
+        tmp2 = vec_perm(d, edges, align);
+        tmp1 = vec_perm(edges, d, align);
+        /* offset-15 vector first: for aligned dst both stores hit the same vector and tmp1 (== d) must land last */
+        vec_st(tmp2, 15, dst);
+        vec_st(tmp1, 0 , dst);
+
+        dst += dst_stride;
+    }
+}
+
+static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
+                                    const uint8_t * src2, int dst_stride,
+                                    int src_stride1, int h)
+{
+    int i;
+    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    /* dst row i = vec_avg(old dst row i, vec_avg(src1 row i, src2 row i)), 16 bytes per row, h rows */
+    mask_ = vec_lvsl(0, src2);
+    /* src2 rows are read 16 bytes apart, so one alignment mask serves all rows */
+    for (i = 0; i < h; i++) {
+        /* a = 16 bytes at src1 + i*src_stride1, assembled from two aligned loads */
+        tmp1 = vec_ld(i * src_stride1, src1);
+        mask = vec_lvsl(i * src_stride1, src1);
+        tmp2 = vec_ld(i * src_stride1 + 15, src1);
+
+        a = vec_perm(tmp1, tmp2, mask);
+        /* b = 16 bytes at src2 + i*16 */
+        tmp1 = vec_ld(i * 16, src2);
+        tmp2 = vec_ld(i * 16 + 15, src2);
+
+        b = vec_perm(tmp1, tmp2, mask_);
+        /* fetch the two aligned vectors that the 16 destination bytes straddle */
+        tmp1 = vec_ld(0, dst);
+        mask = vec_lvsl(0, dst);
+        tmp2 = vec_ld(15, dst);
+        /* vec_perm(tmp1, tmp2, mask) is the current 16 destination bytes */
+        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
+        /* edges = bytes of those two vectors lying outside dst[0..15] (when dst is unaligned) */
+        edges = vec_perm(tmp2, tmp1, mask);
+
+        align = vec_lvsr(0, dst);
+        /* merge d with the preserved edge bytes, back in aligned layout */
+        tmp2 = vec_perm(d, edges, align);
+        tmp1 = vec_perm(edges, d, align);
+        /* offset-15 vector first: for aligned dst both stores hit the same vector and tmp1 (== d) must land last */
+        vec_st(tmp2, 15, dst);
+        vec_st(tmp1, 0 , dst);
+
+        dst += dst_stride;
+    }
+}
+
+/* Implemented but could be faster
+#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
+#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
+ */
+
+H264_MC(put_, 16, altivec)
+H264_MC(avg_, 16, altivec)
+
+
+/****************************************************************************
+ * IDCT transform:
+ ****************************************************************************/
+
+#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)               \
+    /* 1st stage */                                               \
+    vz0 = vec_add(vb0,vb2);       /* temp[0] = Y[0] + Y[2] */     \
+    vz1 = vec_sub(vb0,vb2);       /* temp[1] = Y[0] - Y[2] */     \
+    vz2 = vec_sra(vb1,vec_splat_u16(1));                          \
+    vz2 = vec_sub(vz2,vb3);       /* temp[2] = Y[1].1/2 - Y[3] */ \
+    vz3 = vec_sra(vb3,vec_splat_u16(1));                          \
+    vz3 = vec_add(vb1,vz3);       /* temp[3] = Y[1] + Y[3].1/2 */ \
+    /* 2nd stage: output */                                       \
+    va0 = vec_add(vz0,vz3);       /* x[0] = temp[0] + temp[3] */  \
+    va1 = vec_add(vz1,vz2);       /* x[1] = temp[1] + temp[2] */  \
+    va2 = vec_sub(vz1,vz2);       /* x[2] = temp[1] - temp[2] */  \
+    va3 = vec_sub(vz0,vz3)        /* x[3] = temp[0] - temp[3] */
+
+#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
+    b0 = vec_mergeh( a0, a0 ); \
+    b1 = vec_mergeh( a1, a0 ); \
+    b2 = vec_mergeh( a2, a0 ); \
+    b3 = vec_mergeh( a3, a0 ); \
+    a0 = vec_mergeh( b0, b2 ); \
+    a1 = vec_mergel( b0, b2 ); \
+    a2 = vec_mergeh( b1, b3 ); \
+    a3 = vec_mergel( b1, b3 ); \
+    b0 = vec_mergeh( a0, a2 ); \
+    b1 = vec_mergel( a0, a2 ); \
+    b2 = vec_mergeh( a1, a3 ); \
+    b3 = vec_mergel( a1, a3 )
+
+#define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                      \
+    vdst_orig = vec_ld(0, dst);                               \
+    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);          \
+    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst);         \
+    va = vec_add(va, vdst_ss);                                \
+    va_u8 = vec_packsu(va, zero_s16v);                        \
+    va_u32 = vec_splat((vec_u32)va_u8, 0);                  \
+    vec_ste(va_u32, element, (uint32_t*)dst);
+
+static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+    vec_s16 va0, va1, va2, va3;
+    vec_s16 vz0, vz1, vz2, vz3;
+    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
+    vec_u8 va_u8;
+    vec_u32 va_u32;
+    vec_s16 vdst_ss;
+    const vec_u16 v6us = vec_splat_u16(6);
+    vec_u8 vdst, vdst_orig;
+    vec_u8 vdst_mask = vec_lvsl(0, dst);
+    int element = ((unsigned long)dst & 0xf) >> 2; /* index of the 32-bit word dst points at inside its aligned 16 bytes */
+    LOAD_ZERO;
+    /* 4x4 inverse transform of block, result >> 6 added to four rows of dst; note that block[0] is modified */
+    block[0] += 32;  /* add 32 as a DC-level for rounding */
+    /* vtmp0/vtmp2 = 16 bytes at block+0 / block+16; vtmp1/vtmp3 = the same vectors rotated by 8 bytes */
+    vtmp0 = vec_ld(0,block);
+    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
+    vtmp2 = vec_ld(16,block);
+    vtmp3 = vec_sld(vtmp2, vtmp2, 8);
+    /* two 1-D passes with a transpose in between */
+    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
+    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
+    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
+    /* arithmetic shift right by 6; the +32 added to block[0] above makes it round */
+    va0 = vec_sra(va0,v6us);
+    va1 = vec_sra(va1,v6us);
+    va2 = vec_sra(va2,v6us);
+    va3 = vec_sra(va3,v6us);
+    /* NOTE(review): vdst_mask/element come from the initial dst only, so every row presumably shares its alignment -- confirm */
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
+    dst += stride;
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
+    dst += stride;
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
+    dst += stride;
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
+}
+
+#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7) {\
+    /*        a0  = SRC(0) + SRC(4); */ \
+    vec_s16 a0v = vec_add(s0, s4);    \
+    /*        a2  = SRC(0) - SRC(4); */ \
+    vec_s16 a2v = vec_sub(s0, s4);    \
+    /*        a4  =           (SRC(2)>>1) - SRC(6); */ \
+    vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6);    \
+    /*        a6  =           (SRC(6)>>1) + SRC(2); */ \
+    vec_s16 a6v = vec_add(vec_sra(s6, onev), s2);    \
+    /*        b0  =         a0 + a6; */ \
+    vec_s16 b0v = vec_add(a0v, a6v);  \
+    /*        b2  =         a2 + a4; */ \
+    vec_s16 b2v = vec_add(a2v, a4v);  \
+    /*        b4  =         a2 - a4; */ \
+    vec_s16 b4v = vec_sub(a2v, a4v);  \
+    /*        b6  =         a0 - a6; */ \
+    vec_s16 b6v = vec_sub(a0v, a6v);  \
+    /* a1 =  SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
+    /*        a1 =             (SRC(5)-SRC(3)) -  (SRC(7)  +  (SRC(7)>>1)); */ \
+    vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
+    /* a3 =  SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
+    /*        a3 =             (SRC(7)+SRC(1)) -  (SRC(3)  +  (SRC(3)>>1)); */ \
+    vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
+    /* a5 =  SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
+    /*        a5 =             (SRC(7)-SRC(1)) +   SRC(5) +   (SRC(5)>>1); */ \
+    vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
+    /*        a7 =                SRC(5)+SRC(3) +  SRC(1) +   (SRC(1)>>1); */ \
+    vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
+    /*        b1 =                  (a7>>2)  +  a1; */ \
+    vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \
+    /*        b3 =          a3 +        (a5>>2); */ \
+    vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \
+    /*        b5 =                  (a3>>2)  -   a5; */ \
+    vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \
+    /*        b7 =           a7 -        (a1>>2); */ \
+    vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
+    /* DST(0,    b0 + b7); */ \
+    d0 = vec_add(b0v, b7v); \
+    /* DST(1,    b2 + b5); */ \
+    d1 = vec_add(b2v, b5v); \
+    /* DST(2,    b4 + b3); */ \
+    d2 = vec_add(b4v, b3v); \
+    /* DST(3,    b6 + b1); */ \
+    d3 = vec_add(b6v, b1v); \
+    /* DST(4,    b6 - b1); */ \
+    d4 = vec_sub(b6v, b1v); \
+    /* DST(5,    b4 - b3); */ \
+    d5 = vec_sub(b4v, b3v); \
+    /* DST(6,    b2 - b5); */ \
+    d6 = vec_sub(b2v, b5v); \
+    /* DST(7,    b0 - b7); */ \
+    d7 = vec_sub(b0v, b7v); \
+}
+
+#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
+    /* unaligned load */                                       \
+    vec_u8 hv = vec_ld( 0, dest );                           \
+    vec_u8 lv = vec_ld( 7, dest );                           \
+    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );  \
+    vec_s16 idct_sh6 = vec_sra(idctv, sixv);                 \
+    vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv);   \
+    vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16);  \
+    vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum);        \
+    vec_u8 edgehv;                                           \
+    /* unaligned store */                                      \
+    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
+    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
+    lv    = vec_sel( lv, bodyv, edgelv );                      \
+    vec_st( lv, 7, dest );                                     \
+    hv    = vec_ld( 0, dest );                                 \
+    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
+    hv    = vec_sel( hv, bodyv, edgehv );                      \
+    vec_st( hv, 0, dest );                                     \
+ }
+
+static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
+    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
+    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
+    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
+    /* 8x8 inverse transform of dct, added to eight rows of dst by ALTIVEC_STORE_SUM_CLIP; note that dct[0] is modified */
+    vec_u8 perm_ldv = vec_lvsl(0, dst);
+    vec_u8 perm_stv = vec_lvsr(8, dst);
+    /* NOTE(review): both permute masks come from dst only and are reused for every row -- presumably rows share alignment; confirm */
+    const vec_u16 onev = vec_splat_u16(1);
+    const vec_u16 twov = vec_splat_u16(2);
+    const vec_u16 sixv = vec_splat_u16(6);
+    /* sel: first 8 bytes 0, last 8 bytes all-ones; ALTIVEC_STORE_SUM_CLIP derives its store masks from it */
+    const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
+    LOAD_ZERO;
+
+    dct[0] += 32; // rounding for the >>6 at the end
+    /* load eight 16-byte vectors of coefficients at byte offsets 0x00..0x70 */
+    s0 = vec_ld(0x00, (int16_t*)dct);
+    s1 = vec_ld(0x10, (int16_t*)dct);
+    s2 = vec_ld(0x20, (int16_t*)dct);
+    s3 = vec_ld(0x30, (int16_t*)dct);
+    s4 = vec_ld(0x40, (int16_t*)dct);
+    s5 = vec_ld(0x50, (int16_t*)dct);
+    s6 = vec_ld(0x60, (int16_t*)dct);
+    s7 = vec_ld(0x70, (int16_t*)dct);
+    /* first 1-D pass */
+    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
+                     d0, d1, d2, d3, d4, d5, d6, d7);
+    /* transpose in place so the second pass runs along the other dimension */
+    TRANSPOSE8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7 );
+    /* second 1-D pass */
+    IDCT8_1D_ALTIVEC(d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
+                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
+    /* per row: >> 6, saturating add to the dst bytes, pack to u8, masked store */
+    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
+}
+
+static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size)
+{
+    vec_s16 dc16;
+    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
+    LOAD_ZERO;
+    DECLARE_ALIGNED(16, int, dc);
+    int i;
+    /* add dc = (block[0] + 32) >> 6 to `size` rows of dst with unsigned saturation; callers here pass size 4 or 8 */
+    dc = (block[0] + 32) >> 6;
+    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); /* splat halfword 1 of the loaded int -- presumably the low 16 bits on big-endian; confirm */
+    /* size 4: keep dc in the first four 16-bit lanes only, zero the rest */
+    if (size == 4)
+        dc16 = vec_sld(dc16, zero_s16v, 8);
+    dcplus = vec_packsu(dc16, zero_s16v);
+    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
+    /* dcplus holds dc where dc > 0, dcminus holds -dc where dc < 0 (saturated bytes); rotate both to dst's offset in its aligned vector */
+    aligner = vec_lvsr(0, dst);
+    dcplus = vec_perm(dcplus, dcplus, aligner);
+    dcminus = vec_perm(dcminus, dcminus, aligner);
+    /* four rows per iteration, each handled as one whole aligned vector */
+    for (i = 0; i < size; i += 4) {
+        v0 = vec_ld(0, dst+0*stride);
+        v1 = vec_ld(0, dst+1*stride);
+        v2 = vec_ld(0, dst+2*stride);
+        v3 = vec_ld(0, dst+3*stride);
+        /* saturating add of the positive part ... */
+        v0 = vec_adds(v0, dcplus);
+        v1 = vec_adds(v1, dcplus);
+        v2 = vec_adds(v2, dcplus);
+        v3 = vec_adds(v3, dcplus);
+        /* ... then saturating subtract of the negative part */
+        v0 = vec_subs(v0, dcminus);
+        v1 = vec_subs(v1, dcminus);
+        v2 = vec_subs(v2, dcminus);
+        v3 = vec_subs(v3, dcminus);
+        /* write back the full vectors; bytes where dcplus/dcminus are zero are unchanged */
+        vec_st(v0, 0, dst+0*stride);
+        vec_st(v1, 0, dst+1*stride);
+        vec_st(v2, 0, dst+2*stride);
+        vec_st(v3, 0, dst+3*stride);
+
+        dst += 4*stride;
+    }
+}
+
+static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+    h264_idct_dc_add_internal(dst, block, stride, 4); /* DC-only add with size 4 */
+}
+
+static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+    h264_idct_dc_add_internal(dst, block, stride, 8); /* DC-only add with size 8 */
+}
+
+/* For i in 0..15: skip if nnzc[scan8[i]] is 0; DC-only add if it is 1 and block[i*16] is nonzero; else full IDCT add. */
+static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for (i = 0; i < 16; i++) {
+        const int nnz = nnzc[scan8[i]];
+        if (!nnz) continue;
+        if (nnz == 1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
+        else                         ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
+    }
+}
+
+static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i; /* for each of 16 blocks: full IDCT add if nnzc[scan8[i]] is nonzero, else DC-only add if block[i*16] is nonzero */
+    for(i=0; i<16; i++){
+        if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
+        else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
+    }
+}
+
+/* For i = 0, 4, 8, 12: skip if nnzc[scan8[i]] is 0; 8x8 DC-only add if it is 1 and block[i*16] is nonzero; else 8x8 IDCT add. */
+static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for (i = 0; i < 16; i += 4) {
+        const int nnz = nnzc[scan8[i]];
+        if (!nnz) continue;
+        if (nnz == 1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
+        else                         ff_h264_idct8_add_altivec   (dst + block_offset[i], block + i*16, stride);
+    }
+}
+
+/* For i in 16..23, on dest[0] (i = 16..19) or dest[1] (i = 20..23): full IDCT add if nnzc[scan8[i]], else DC-only add if block[i*16]. */
+static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for (i = 16; i < 16 + 8; i++) {
+        const int plane = (i & 4) >> 2;
+        if (nnzc[scan8[i]])   ff_h264_idct_add_altivec(dest[plane] + block_offset[i], block + i*16, stride);
+        else if (block[i*16]) h264_idct_dc_add_altivec(dest[plane] + block_offset[i], block + i*16, stride);
+    }
+}
+
+#define transpose4x16(r0, r1, r2, r3) { /* transposes r0..r3 in place; r4..r7 are scratch */ \
+    register vec_u8 r4;                    \
+    register vec_u8 r5;                    \
+    register vec_u8 r6;                    \
+    register vec_u8 r7;                    \
+    /* round 1: interleave r0 with r2 and r1 with r3 (high halves, then low halves) */ \
+    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
+    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
+    r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/ \
+    r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/ \
+    /* round 2: interleave the round-1 results, so every group of four output bytes holds one byte from each input */ \
+    r0 = vec_mergeh(r4, r6);  /*all set 0*/  \
+    r1 = vec_mergel(r4, r6);  /*all set 1*/  \
+    r2 = vec_mergeh(r5, r7);  /*all set 2*/  \
+    r3 = vec_mergel(r5, r7);  /*all set 3*/  \
+}
+
+static inline void write16x4(uint8_t *dst, int dst_stride,
+                             register vec_u8 r0, register vec_u8 r1,
+                             register vec_u8 r2, register vec_u8 r3) {
+    /*
+     * Spill the four vectors into a 64-byte aligned scratch buffer, then
+     * copy that buffer out as sixteen 32-bit words, one word per row of
+     * dst: word k of the buffer goes to row k.
+     *
+     * dst        - address of the first row to write
+     * dst_stride - distance between rows in bytes; it is divided by 4 to
+     *              index through a uint32_t pointer, so a stride that is
+     *              not a multiple of 4 is truncated
+     * r0..r3     - vectors stored at scratch offsets 0, 16, 32 and 48
+     *
+     * FIXME: there has to be a better way!!!!
+     */
+    DECLARE_ALIGNED(16, unsigned char, scratch)[64];
+    uint32_t *words = (uint32_t *)scratch;
+    uint32_t *rows  = (uint32_t *)dst;
+    const int words_per_row = dst_stride/4;
+    int k;
+
+    vec_st(r0,  0, scratch);
+    vec_st(r1, 16, scratch);
+    vec_st(r2, 32, scratch);
+    vec_st(r3, 48, scratch);
+    for (k = 0; k < 16; k++)
+        rows[k*words_per_row] = words[k];
+}
+
+/** \brief loads 16 rows (src_stride bytes apart, starting at src) with unaligned_load(), transposes them,
+    and leaves byte columns 0..5 of those rows in r8..r13, which must be assignable vec_u8 variables.
+    \todo FIXME: see if we can spare some vec_lvsl() by factoring them out of unaligned_load() */
+#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
+    register vec_u8 r0  = unaligned_load(0,             src);            \
+    register vec_u8 r1  = unaligned_load(   src_stride, src);            \
+    register vec_u8 r2  = unaligned_load(2* src_stride, src);            \
+    register vec_u8 r3  = unaligned_load(3* src_stride, src);            \
+    register vec_u8 r4  = unaligned_load(4* src_stride, src);            \
+    register vec_u8 r5  = unaligned_load(5* src_stride, src);            \
+    register vec_u8 r6  = unaligned_load(6* src_stride, src);            \
+    register vec_u8 r7  = unaligned_load(7* src_stride, src);            \
+    register vec_u8 r14 = unaligned_load(14*src_stride, src);            \
+    register vec_u8 r15 = unaligned_load(15*src_stride, src);            \
+    /* rows 8..13 are loaded straight into the caller's output variables */ \
+    r8  = unaligned_load( 8*src_stride, src);                              \
+    r9  = unaligned_load( 9*src_stride, src);                              \
+    r10 = unaligned_load(10*src_stride, src);                              \
+    r11 = unaligned_load(11*src_stride, src);                              \
+    r12 = unaligned_load(12*src_stride, src);                              \
+    r13 = unaligned_load(13*src_stride, src);                              \
+    /* only vec_mergeh is used here: the low 8 bytes of every row are dropped */ \
+    /*Merge first pairs*/                                                  \
+    r0 = vec_mergeh(r0, r8);    /*0, 8*/                                   \
+    r1 = vec_mergeh(r1, r9);    /*1, 9*/                                   \
+    r2 = vec_mergeh(r2, r10);   /*2,10*/                                   \
+    r3 = vec_mergeh(r3, r11);   /*3,11*/                                   \
+    r4 = vec_mergeh(r4, r12);   /*4,12*/                                   \
+    r5 = vec_mergeh(r5, r13);   /*5,13*/                                   \
+    r6 = vec_mergeh(r6, r14);   /*6,14*/                                   \
+    r7 = vec_mergeh(r7, r15);   /*7,15*/                                   \
+    /* r8..r15 are reused as scratch from here on */                       \
+    /*Merge second pairs*/                                                 \
+    r8  = vec_mergeh(r0, r4);   /*0,4, 8,12 set 0*/                        \
+    r9  = vec_mergel(r0, r4);   /*0,4, 8,12 set 1*/                        \
+    r10 = vec_mergeh(r1, r5);   /*1,5, 9,13 set 0*/                        \
+    r11 = vec_mergel(r1, r5);   /*1,5, 9,13 set 1*/                        \
+    r12 = vec_mergeh(r2, r6);   /*2,6,10,14 set 0*/                        \
+    r13 = vec_mergel(r2, r6);   /*2,6,10,14 set 1*/                        \
+    r14 = vec_mergeh(r3, r7);   /*3,7,11,15 set 0*/                        \
+    r15 = vec_mergel(r3, r7);   /*3,7,11,15 set 1*/                        \
+    /* even rows end up in r0..r2, odd rows in r4..r6 */                   \
+    /*Third merge*/                                                        \
+    r0 = vec_mergeh(r8,  r12);  /*0,2,4,6,8,10,12,14 set 0*/               \
+    r1 = vec_mergel(r8,  r12);  /*0,2,4,6,8,10,12,14 set 1*/               \
+    r2 = vec_mergeh(r9,  r13);  /*0,2,4,6,8,10,12,14 set 2*/               \
+    r4 = vec_mergeh(r10, r14);  /*1,3,5,7,9,11,13,15 set 0*/               \
+    r5 = vec_mergel(r10, r14);  /*1,3,5,7,9,11,13,15 set 1*/               \
+    r6 = vec_mergeh(r11, r15);  /*1,3,5,7,9,11,13,15 set 2*/               \
+    /* Don't need to compute 3 and 7*/                                     \
+    /* interleaving even and odd rows gives one full column per output */  \
+    /*Final merge*/                                                        \
+    r8  = vec_mergeh(r0, r4);   /*all set 0*/                              \
+    r9  = vec_mergel(r0, r4);   /*all set 1*/                              \
+    r10 = vec_mergeh(r1, r5);   /*all set 2*/                              \
+    r11 = vec_mergel(r1, r5);   /*all set 3*/                              \
+    r12 = vec_mergeh(r2, r6);   /*all set 4*/                              \
+    r13 = vec_mergel(r2, r6);   /*all set 5*/                              \
+    /* The local r14 and r15 are not recomputed: only six columns are produced */ \
+                                                                           \
+}
+
+// out: per-byte vec_cmplt() mask for |x-y| < a
+static inline vec_u8 diff_lt_altivec ( register vec_u8 x,
+                                         register vec_u8 y,
+                                         register vec_u8 a) {
+    /* The two vec_subs() results are OR-ed to form |x-y|; this assumes vec_u8 is */
+    /* unsigned bytes, so that the subtraction in the wrong direction clamps to 0. */
+    const vec_u8 x_minus_y = vec_subs(x, y);
+    const vec_u8 y_minus_x = vec_subs(y, x);
+    const vec_u8 abs_diff  = vec_or(x_minus_y, y_minus_x);
+    return (vec_u8)vec_cmplt(abs_diff, a);
+}
+
+static inline vec_u8 h264_deblock_mask ( register vec_u8 p0,
+                                           register vec_u8 p1,
+                                           register vec_u8 q0,
+                                           register vec_u8 q1,
+                                           register vec_u8 alpha,
+                                           register vec_u8 beta) {
+    /*
+     * Returns the AND of three per-byte tests:
+     *   |p0 - q0| < alpha,  |p1 - p0| < beta,  |q1 - q0| < beta
+     * Each test comes from diff_lt_altivec(), so a byte of the result is
+     * set only where all three comparisons hold.
+     */
+    const vec_u8 edge_ok = diff_lt_altivec(p0, q0, alpha);
+    const vec_u8 p_ok    = diff_lt_altivec(p1, p0, beta);
+    const vec_u8 q_ok    = diff_lt_altivec(q1, q0, beta);
+
+    return vec_and(vec_and(edge_ok, p_ok), q_ok);
+}
+
+// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
+static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
+                                       register vec_u8 p1,
+                                       register vec_u8 p2,
+                                       register vec_u8 q0,
+                                       register vec_u8 tc0) {
+    /*
+     * The luma filter macro calls this with (p0, p1, p2, q0) for the p
+     * side and with (p0, q1, q2, q0) for the q side, so p1/p2 here stand
+     * for either side's samples.
+     * tc0 is the per-byte half-width of the clipping window around p1.
+     *
+     * vec_avg() rounds up, but the outer average in the formula above
+     * rounds down. The bit that the second vec_avg() rounds up by is
+     * (avg ^ p2) & 1, so it is subtracted again afterwards.
+     */
+    const vec_u8 pq_avg    = vec_avg(p0, q0);            /* (p0 + q0 + 1) >> 1 */
+    const vec_u8 round_bit = vec_and(vec_xor(pq_avg, p2), vec_splat_u8(1));
+    const vec_u8 unclipped = vec_subs(vec_avg(pq_avg, p2), round_bit);
+    /* clipping window: both bounds come from saturating vec_subs()/vec_adds() */
+    const vec_u8 lower = vec_subs(p1, tc0);
+    const vec_u8 upper = vec_adds(p1, tc0);
+
+    /* max with the lower bound first, then min with the upper bound */
+    return vec_min(upper, vec_max(lower, unclipped));
+}
+
+#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                           \
+    /* Updates p0 and q0 in place by a delta limited to tc0masked; p1 and q1 are only read. */    \
+    const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                               \
+    /* A0v = 10 << 4 = 160: the bias that the comments below track through stage2 */             \
+    register vec_u8 pq0bit = vec_xor(p0,q0);                                                    \
+    register vec_u8 q1minus;                                                                    \
+    register vec_u8 p0minus;                                                                    \
+    register vec_u8 stage1;                                                                     \
+    register vec_u8 stage2;                                                                     \
+    register vec_u8 vec160;                                                                     \
+    register vec_u8 delta;                                                                      \
+    register vec_u8 deltaneg;                                                                   \
+    /* vec_nor(x, x) is ~x, i.e. 255 - x for a byte, so the vec_avg() calls form biased differences */ \
+    q1minus = vec_nor(q1, q1);                 /* 255 - q1 */                                     \
+    stage1 = vec_avg(p1, q1minus);             /* (p1 - q1 + 256)>>1 */                           \
+    stage2 = vec_sr(stage1, vec_splat_u8(1));  /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */     \
+    p0minus = vec_nor(p0, p0);                 /* 255 - p0 */                                     \
+    stage1 = vec_avg(q0, p0minus);             /* (q0 - p0 + 256)>>1 */                           \
+    pq0bit = vec_and(pq0bit, vec_splat_u8(1)); /* (p0 ^ q0) & 1 */                                \
+    stage2 = vec_avg(stage2, pq0bit);          /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
+    stage2 = vec_adds(stage2, stage1);         /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */  \
+    vec160 = vec_ld(0, &A0v);                  /* reload the 160 bias */                          \
+    deltaneg = vec_subs(vec160, stage2);       /* -d */                                           \
+    delta = vec_subs(stage2, vec160);          /* d */                                            \
+    deltaneg = vec_min(tc0masked, deltaneg);   /* limit -d to tc0masked */                        \
+    delta = vec_min(tc0masked, delta);         /* limit d to tc0masked */                         \
+    p0 = vec_subs(p0, deltaneg);               /* p0 - min(tc, -d), saturating */                 \
+    q0 = vec_subs(q0, delta);                  /* q0 - min(tc, d), saturating */                  \
+    p0 = vec_adds(p0, delta);                  /* then + min(tc, d), saturating */                \
+    q0 = vec_adds(q0, deltaneg);               /* then + min(tc, -d), saturating */               \
+}
+
+#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {            \
+    DECLARE_ALIGNED(16, unsigned char, temp)[16];                                             \
+    register vec_u8 alphavec;                                                              \
+    register vec_u8 betavec;                                                               \
+    register vec_u8 mask;                                                                  \
+    register vec_u8 p1mask;                                                                \
+    register vec_u8 q1mask;                                                                \
+    register vector signed   char tc0vec;                                                    \
+    register vec_u8 finaltc0;                                                              \
+    register vec_u8 tc0masked;                                                             \
+    register vec_u8 newp1;                                                                 \
+    register vec_u8 newq1;                                                                 \
+    /* Overwrites p1, p0, q0, q1; p2 and q2 are only read. alpha/beta go through temp[0..1] and are splatted. */ \
+    temp[0] = alpha;                                                                         \
+    temp[1] = beta;                                                                          \
+    alphavec = vec_ld(0, temp);                                                              \
+    betavec = vec_splat(alphavec, 0x1);                                                      \
+    alphavec = vec_splat(alphavec, 0x0);                                                     \
+    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */            \
+    /* the four tc0 bytes are copied into temp; two vec_mergeh() passes replicate each of them four times */ \
+    *((int *)temp) = *((int *)tc0);                                                          \
+    tc0vec = vec_ld(0, (signed char*)temp);                                                  \
+    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
+    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
+    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */         \
+    finaltc0 = vec_and((vec_u8)tc0vec, mask);     /* tc = tc0 */                           \
+    /* p side: where |p2 - p0| < beta, tc grows by one (subtracting the mask value) and newp1 is clipped with tc0 */ \
+    p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
+    p1mask = vec_and(p1mask, mask);                             /* if ( |p2 - p0| < beta) */ \
+    tc0masked = vec_and(p1mask, (vec_u8)tc0vec);                                           \
+    finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
+    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
+    /*end if*/                                                                               \
+    /* q side: same steps with q1/q2 in place of p1/p2 */                                    \
+    q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
+    q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */\
+    tc0masked = vec_and(q1mask, (vec_u8)tc0vec);                                           \
+    finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                   \
+    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                      \
+    /*end if*/                                                                               \
+    /* p0/q0 are updated first, while p1/q1 still hold their old values; p1/q1 are replaced afterwards */ \
+    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);                                            \
+    p1 = newp1;                                                                              \
+    q1 = newq1;                                                                              \
+}
+
+static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
+    register vec_u8 p2, p1, p0, q0, q1, q2; /* rows at pix - 3*stride .. pix + 2*stride, fetched with vec_ld */
+    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) /* the AND is negative only when all four tc0 values are */
+        return;
+    p2 = vec_ld(-3*stride, pix);
+    p1 = vec_ld(-2*stride, pix);
+    p0 = vec_ld(-1*stride, pix);
+    q0 = vec_ld(0, pix);
+    q1 = vec_ld(stride, pix);
+    q2 = vec_ld(2*stride, pix);
+    h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
+    vec_st(p1, -2*stride, pix); /* only the four rows the filter macro assigns to are written back */
+    vec_st(p0, -1*stride, pix);
+    vec_st(q0, 0, pix);
+    vec_st(q1, stride, pix);
+}
+
+static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
+    /* load and transpose from pix-3, run the shared luma filter, transpose the four middle vectors back, store at pix-2 */
+    register vec_u8 line0, line1, line2, line3, line4, line5;
+    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { /* skipped when all four tc0 values are negative */
+        readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
+        h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
+        transpose4x16(line1, line2, line3, line4);
+        write16x4(pix-2, stride, line1, line2, line3, line4);
+    }
+}
+
+static av_always_inline
+void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
+{ /* in place, for h rows: byte = saturate((byte * weight + offset') >> log2_denom), offset' as computed below */
+    int y, aligned;
+    vec_u8 vblock;
+    vec_s16 vtemp, vweight, voffset, v0, v1;
+    vec_u16 vlog2_denom;
+    DECLARE_ALIGNED(16, int32_t, temp)[4];
+    LOAD_ZERO;
+    /* offset' = offset * 2^log2_denom, plus 2^(log2_denom-1) when log2_denom is nonzero */
+    offset <<= log2_denom;
+    if(log2_denom) offset += 1<<(log2_denom-1);
+    temp[0] = log2_denom;
+    temp[1] = weight;
+    temp[2] = offset;
+    /* temp[3] stays unset; only 16-bit elements 1, 3, 5 are splatted (presumably the low halves of temp[0..2] on big-endian - TODO confirm) */
+    vtemp = (vec_s16)vec_ld(0, temp);
+    vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
+    vweight = vec_splat(vtemp, 3);
+    voffset = vec_splat(vtemp, 5);
+    aligned = !((unsigned long)block & 0xf);
+    /* one 16-byte vector per row; vec_ld/vec_st ignore the low four address bits */
+    for (y=0; y<h; y++) {
+        vblock = vec_ld(0, block);
+        /* widen the 16 bytes to two vectors of 16-bit values by merging with zero bytes */
+        v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
+        v1 = (vec_s16)vec_mergel(zero_u8v, vblock);
+        /* for w != 16 only one half is weighted: v0 when block is 16-byte aligned, v1 otherwise */
+        if (w == 16 || aligned) {
+            v0 = vec_mladd(v0, vweight, zero_s16v);
+            v0 = vec_adds(v0, voffset);
+            v0 = vec_sra(v0, vlog2_denom);
+        }
+        if (w == 16 || !aligned) {
+            v1 = vec_mladd(v1, vweight, zero_s16v);
+            v1 = vec_adds(v1, voffset);
+            v1 = vec_sra(v1, vlog2_denom);
+        }
+        vblock = vec_packsu(v0, v1);
+        vec_st(vblock, 0, block);
+        /* the half that was not weighted is stored back as the widened-and-repacked original bytes */
+        block += stride;
+    }
+}
+
+static av_always_inline
+void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
+                               int weightd, int weights, int offset, int w, int h)
+{ /* in place on dst, for h rows: saturate((dst*weightd + src*weights + offset') >> (log2_denom + 1)) */
+    int y, dst_aligned, src_aligned;
+    vec_u8 vsrc, vdst;
+    vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3;
+    vec_u16 vlog2_denom;
+    DECLARE_ALIGNED(16, int32_t, temp)[4];
+    LOAD_ZERO;
+    /* offset' = ((offset + 1) | 1) << log2_denom; the final shift count is log2_denom + 1 */
+    offset = ((offset + 1) | 1) << log2_denom;
+    temp[0] = log2_denom+1;
+    temp[1] = weights;
+    temp[2] = weightd;
+    temp[3] = offset;
+    /* 16-bit elements 1, 3, 5, 7 are splatted (presumably the low halves of temp[0..3] on big-endian - TODO confirm) */
+    vtemp = (vec_s16)vec_ld(0, temp);
+    vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
+    vweights = vec_splat(vtemp, 3);
+    vweightd = vec_splat(vtemp, 5);
+    voffset = vec_splat(vtemp, 7);
+    dst_aligned = !((unsigned long)dst & 0xf);
+    src_aligned = !((unsigned long)src & 0xf);
+    /* one 16-byte vector per row from each of dst and src; vec_ld ignores the low four address bits */
+    for (y=0; y<h; y++) {
+        vdst = vec_ld(0, dst);
+        vsrc = vec_ld(0, src);
+        /* widen both inputs to 16-bit values by merging with zero bytes */
+        v0 = (vec_s16)vec_mergeh(zero_u8v, vdst);
+        v1 = (vec_s16)vec_mergel(zero_u8v, vdst);
+        v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc);
+        v3 = (vec_s16)vec_mergel(zero_u8v, vsrc);
+        /* w == 8: copy the src half selected by src alignment into both v2 and v3, so either dst half can pair with it */
+        if (w == 8) {
+            if (src_aligned)
+                v3 = v2;
+            else
+                v2 = v3;
+        }
+        /* for w != 16 only one dst half is processed: v0 when dst is 16-byte aligned, v1 otherwise */
+        if (w == 16 || dst_aligned) {
+            v0 = vec_mladd(v0, vweightd, zero_s16v);
+            v2 = vec_mladd(v2, vweights, zero_s16v);
+
+            v0 = vec_adds(v0, voffset);
+            v0 = vec_adds(v0, v2);
+            v0 = vec_sra(v0, vlog2_denom);
+        }
+        if (w == 16 || !dst_aligned) {
+            v1 = vec_mladd(v1, vweightd, zero_s16v);
+            v3 = vec_mladd(v3, vweights, zero_s16v);
+
+            v1 = vec_adds(v1, voffset);
+            v1 = vec_adds(v1, v3);
+            v1 = vec_sra(v1, vlog2_denom);
+        }
+        vdst = vec_packsu(v0, v1);
+        vec_st(vdst, 0, dst);
+        /* dst and src advance by the same stride */
+        dst += stride;
+        src += stride;
+    }
+}
+
+#define H264_WEIGHT(W,H) /* defines the fixed-size weight and biweight wrappers for a W x H block */ \
+static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
+    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
+}\
+static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+}
+/* one instantiation per block size that ff_h264dsp_init_ppc() installs */
+H264_WEIGHT(16,16)
+H264_WEIGHT(16, 8)
+H264_WEIGHT( 8,16)
+H264_WEIGHT( 8, 8)
+H264_WEIGHT( 8, 4)
+
+void dsputil_h264_init_ppc(DSPContext* c) {
+	/*
+	 * Install the 8-wide chroma functions in slot 0 of the chroma tables,
+	 * then fill row 0 of the put/avg quarter-pel tables with the 16-wide
+	 * functions. Table slot x + 4*y receives the _mcXY_ function, so each
+	 * dspfunc_row() use below covers the four x values of one y.
+	 */
+	c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
+	c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
+
+#define dspfunc_row(PFX, IDX, NUM, Y) \
+	c->PFX ## _pixels_tab[IDX][4*Y + 0] = PFX ## NUM ## _mc0 ## Y ## _altivec; \
+	c->PFX ## _pixels_tab[IDX][4*Y + 1] = PFX ## NUM ## _mc1 ## Y ## _altivec; \
+	c->PFX ## _pixels_tab[IDX][4*Y + 2] = PFX ## NUM ## _mc2 ## Y ## _altivec; \
+	c->PFX ## _pixels_tab[IDX][4*Y + 3] = PFX ## NUM ## _mc3 ## Y ## _altivec
+#define dspfunc(PFX, IDX, NUM) \
+	dspfunc_row(PFX, IDX, NUM, 0); \
+	dspfunc_row(PFX, IDX, NUM, 1); \
+	dspfunc_row(PFX, IDX, NUM, 2); \
+	dspfunc_row(PFX, IDX, NUM, 3)
+
+	dspfunc(put_h264_qpel, 0, 16);
+	dspfunc(avg_h264_qpel, 0, 16);
+#undef dspfunc
+#undef dspfunc_row
+}
+
+void ff_h264dsp_init_ppc(H264DSPContext *c){
+	/* inverse-transform entry points: the h264_idct_* group first, then the h264_idct8_* group */
+	c->h264_idct_add        = ff_h264_idct_add_altivec;
+	c->h264_idct_dc_add     = h264_idct_dc_add_altivec;
+	c->h264_idct_add16      = ff_h264_idct_add16_altivec;
+	c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec;
+	c->h264_idct_add8       = ff_h264_idct_add8_altivec;
+	c->h264_idct8_add       = ff_h264_idct8_add_altivec;
+	c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_altivec;
+	c->h264_idct8_add4      = ff_h264_idct8_add4_altivec;
+	c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_altivec; /* luma loop filters */
+	c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_altivec;
+	/* wrappers generated by H264_WEIGHT(); slots 0..4 hold 16x16, 16x8, 8x16, 8x8, 8x4 */
+	c->weight_h264_pixels_tab[0]   = ff_weight_h264_pixels16x16_altivec;
+	c->weight_h264_pixels_tab[1]   = ff_weight_h264_pixels16x8_altivec;
+	c->weight_h264_pixels_tab[2]   = ff_weight_h264_pixels8x16_altivec;
+	c->weight_h264_pixels_tab[3]   = ff_weight_h264_pixels8x8_altivec;
+	c->weight_h264_pixels_tab[4]   = ff_weight_h264_pixels8x4_altivec;
+	c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
+	c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
+	c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
+	c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
+	c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/h264_template_altivec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/h264_template_altivec.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,783 @@
+/*
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+//#define DEBUG_ALIGNMENT /* uncomment to make ASSERT_ALIGNED() check pointers at run time */
+#ifdef DEBUG_ALIGNMENT
+#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F)); /* fails unless ptr is 16-byte aligned */
+#else
+#define ASSERT_ALIGNED(ptr) ; /* checks compiled out: expands to an empty statement */
+#endif
+
+/* this code assumes that stride % 16 == 0 */
+
+#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) /* one output row of the four-tap case; expects the caller's locals */ \
+        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
+        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
+        /* psum = BIAS2(BIAS1 + A*src0 + B*src1 + C*src2 + D*src3) >> 6 */\
+        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
+        psum = vec_mladd(vB, vsrc1ssH, psum);\
+        psum = vec_mladd(vC, vsrc2ssH, psum);\
+        psum = vec_mladd(vD, vsrc3ssH, psum);\
+        psum = BIAS2(psum);\
+        psum = vec_sr(psum, v6us);\
+        /* ppsum carries the 8 results in both halves; fperm decides which half of vdst they replace */\
+        vdst = vec_ld(0, dst);\
+        ppsum = (vec_u8)vec_pack(psum, psum);\
+        vfdst = vec_perm(vdst, ppsum, fperm);\
+        /* OP_U8_ALTIVEC is not defined in this file; NOTE(review): presumably the put/avg store operation - confirm */\
+        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
+\
+        vec_st(fsum, 0, dst);\
+        /* the second-row operands become the first-row operands of the next iteration */\
+        vsrc0ssH = vsrc2ssH;\
+        vsrc1ssH = vsrc3ssH;\
+\
+        dst += stride;\
+        src += stride;
+
+#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
+        /* one output row of the two-tap case: psum = (32 + A*src0 + E*src1) >> 6 */\
+        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
+        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
+\
+        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
+        psum = vec_mladd(vE, vsrc1ssH, psum);\
+        psum = vec_sr(psum, v6us);\
+        /* ppsum carries the 8 results in both halves; fperm decides which half of vdst they replace */\
+        vdst = vec_ld(0, dst);\
+        ppsum = (vec_u8)vec_pack(psum, psum);\
+        vfdst = vec_perm(vdst, ppsum, fperm);\
+        /* OP_U8_ALTIVEC is not defined in this file; NOTE(review): presumably the put/avg store operation - confirm */\
+        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
+\
+        vec_st(fsum, 0, dst);\
+\
+        dst += stride;\
+        src += stride;
+
+#define noop(a) a /* BIAS2 argument for CHROMA_MC8_ALTIVEC_CORE when nothing is added after the sum */
+#define add28(a) vec_add(v28ss, a) /* BIAS2 argument that adds the caller's v28ss to the sum */
+
+static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
+                                    int stride, int h, int x, int y) { /* writes 8 bytes per row for h rows; x and y set the four weights below */
+  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
+    DECLARE_ALIGNED(16, signed int, ABCD)[4] = /* bilinear weights A, B, C, D; they sum to 64 */
+                        {((8 - x) * (8 - y)),
+                         ((    x) * (8 - y)),
+                         ((8 - x) * (    y)),
+                         ((    x) * (    y))};
+    register int i;
+    vec_u8 fperm;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); /* 16-bit elements 1, 3, 5, 7: presumably the low halves of ABCD[0..3] on big-endian - TODO confirm */
+    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
+    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
+    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
+    LOAD_ZERO;
+    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); /* 1 << 5 = 32, added before the >> 6 */
+    const vec_u16 v6us = vec_splat_u16(6);
+    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; /* set when src % 16 >= 8: a second 16-byte vector is loaded per row */
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; /* src + 1 is 16-byte aligned: the second vector is used as is */
+
+    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrc0uc, vsrc1uc;
+    vec_s16 vsrc0ssH, vsrc1ssH;
+    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_s16 vsrc2ssH, vsrc3ssH, psum;
+    vec_u8 vdst, ppsum, vfdst, fsum;
+
+  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
+    /* fperm: results replace the first 8 bytes of the dst vector when dst is 16-byte aligned, the last 8 otherwise */
+    if (((unsigned long)dst) % 16 == 0) {
+        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
+                         0x14, 0x15, 0x16, 0x17,
+                         0x08, 0x09, 0x0A, 0x0B,
+                         0x0C, 0x0D, 0x0E, 0x0F};
+    } else {
+        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
+                         0x04, 0x05, 0x06, 0x07,
+                         0x18, 0x19, 0x1A, 0x1B,
+                         0x1C, 0x1D, 0x1E, 0x1F};
+    }
+    /* first source row: bytes at src (vsrc0uc) and at src + 1 (vsrc1uc) */
+    vsrcAuc = vec_ld(0, src);
+    /* vsrcBuc is deliberately left unset when loadSecond is 0 (see av_uninit) although it is still passed to vec_perm */
+    if (loadSecond)
+        vsrcBuc = vec_ld(16, src);
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+
+    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
+    if (reallyBadAlign)
+        vsrc1uc = vsrcBuc;
+    else
+        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+    /* widen the first 8 bytes of each to 16-bit values */
+    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
+    /* D = x*y != 0: all four taps are needed */
+    if (ABCD[3]) {
+        if (!loadSecond) {// -> !reallyBadAlign
+            for (i = 0 ; i < h ; i++) {
+                vsrcCuc = vec_ld(stride + 0, src);
+                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+
+                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
+            }
+        } else {
+            vec_u8 vsrcDuc;
+            for (i = 0 ; i < h ; i++) {
+                vsrcCuc = vec_ld(stride + 0, src);
+                vsrcDuc = vec_ld(stride + 16, src);
+                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+                if (reallyBadAlign)
+                    vsrc3uc = vsrcDuc;
+                else
+                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
+
+                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
+            }
+        }
+    } else {
+        const vec_s16 vE = vec_add(vB, vC); /* D == 0 here; per the branch comments below one of B, C is 0 as well, so E is the other */
+        if (ABCD[2]) { // x == 0 B == 0
+            if (!loadSecond) {// -> !reallyBadAlign
+                for (i = 0 ; i < h ; i++) {
+                    vsrcCuc = vec_ld(stride + 0, src);
+                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+                    /* the next row becomes the current row */
+                    vsrc0uc = vsrc1uc;
+                }
+            } else {
+                vec_u8 vsrcDuc;
+                for (i = 0 ; i < h ; i++) {
+                    vsrcCuc = vec_ld(stride + 0, src);
+                    vsrcDuc = vec_ld(stride + 15, src); /* NOTE(review): +15 here vs. +16 in the four-tap branch; vec_ld drops the low four address bits, so with src % 16 >= 8 both address the same vector */
+                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+                    /* the next row becomes the current row */
+                    vsrc0uc = vsrc1uc;
+                }
+            }
+        } else { // y == 0 C == 0
+            if (!loadSecond) {// -> !reallyBadAlign
+                for (i = 0 ; i < h ; i++) {
+                    vsrcCuc = vec_ld(0, src);
+                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+
+                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+                }
+            } else {
+                vec_u8 vsrcDuc;
+                for (i = 0 ; i < h ; i++) {
+                    vsrcCuc = vec_ld(0, src);
+                    vsrcDuc = vec_ld(15, src);
+                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+                    if (reallyBadAlign)
+                        vsrc1uc = vsrcDuc;
+                    else
+                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
+
+                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+                }
+            }
+        }
+    }
+    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
+}
+
+/* this code assumes that stride % 16 == 0 */
+static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { /* always four-tap; 28 is added after the sum instead of 32 before it */
+   DECLARE_ALIGNED(16, signed int, ABCD)[4] = /* bilinear weights A, B, C, D; they sum to 64 */
+                        {((8 - x) * (8 - y)),
+                         ((    x) * (8 - y)),
+                         ((8 - x) * (    y)),
+                         ((    x) * (    y))};
+    register int i;
+    vec_u8 fperm;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); /* 16-bit elements 1, 3, 5, 7: presumably the low halves of ABCD[0..3] on big-endian - TODO confirm */
+    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
+    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
+    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
+    LOAD_ZERO;
+    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); /* (1 << 5) - 4 = 28, used by add28() */
+    const vec_u16 v6us  = vec_splat_u16(6);
+    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1; /* set when src % 16 >= 8: a second 16-byte vector is loaded per row */
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; /* src + 1 is 16-byte aligned: the second vector is used as is */
+
+    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrc0uc, vsrc1uc;
+    vec_s16 vsrc0ssH, vsrc1ssH;
+    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_s16 vsrc2ssH, vsrc3ssH, psum;
+    vec_u8 vdst, ppsum, vfdst, fsum;
+    /* fperm: results replace the first 8 bytes of the dst vector when dst is 16-byte aligned, the last 8 otherwise */
+    if (((unsigned long)dst) % 16 == 0) {
+        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
+                         0x14, 0x15, 0x16, 0x17,
+                         0x08, 0x09, 0x0A, 0x0B,
+                         0x0C, 0x0D, 0x0E, 0x0F};
+    } else {
+        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
+                         0x04, 0x05, 0x06, 0x07,
+                         0x18, 0x19, 0x1A, 0x1B,
+                         0x1C, 0x1D, 0x1E, 0x1F};
+    }
+    /* first source row: bytes at src (vsrc0uc) and at src + 1 (vsrc1uc) */
+    vsrcAuc = vec_ld(0, src);
+    /* vsrcBuc is deliberately left unset when loadSecond is 0 (see av_uninit) although it is still passed to vec_perm */
+    if (loadSecond)
+        vsrcBuc = vec_ld(16, src);
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+
+    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
+    if (reallyBadAlign)
+        vsrc1uc = vsrcBuc;
+    else
+        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+    /* widen the first 8 bytes of each to 16-bit values */
+    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
+    /* each iteration loads the next row; the core macro advances dst and src by stride */
+    if (!loadSecond) {// -> !reallyBadAlign
+        for (i = 0 ; i < h ; i++) {
+
+
+            vsrcCuc = vec_ld(stride + 0, src);
+
+            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+
+            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
+        }
+    } else {
+        vec_u8 vsrcDuc;
+        for (i = 0 ; i < h ; i++) {
+            vsrcCuc = vec_ld(stride + 0, src);
+            vsrcDuc = vec_ld(stride + 16, src);
+
+            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+            if (reallyBadAlign)
+                vsrc3uc = vsrcDuc;
+            else
+                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
+
+            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
+        }
+    }
+}
+
+#undef noop
+#undef add28
+#undef CHROMA_MC8_ALTIVEC_CORE
+
+/* 16x16 horizontal 6-tap lowpass: per row (20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5, saturated to u8 and combined with dst through OP_U8_ALTIVEC; this code assumes stride % 16 == 0 */
+static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
+    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
+    register int i;
+
+    LOAD_ZERO;
+    const vec_u8 permM2 = vec_lvsl(-2, src); /* permN: permute pattern selecting the 16 bytes that start at src + N */
+    const vec_u8 permM1 = vec_lvsl(-1, src);
+    const vec_u8 permP0 = vec_lvsl(+0, src);
+    const vec_u8 permP1 = vec_lvsl(+1, src);
+    const vec_u8 permP2 = vec_lvsl(+2, src);
+    const vec_u8 permP3 = vec_lvsl(+3, src);
+    const vec_s16 v5ss = vec_splat_s16(5);
+    const vec_u16 v5us = vec_splat_u16(5);
+    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); /* 5 << 2 = 20 */
+    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); /* 1 << 4 = 16, added before the >> 5 */
+
+    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+
+    register int align = ((((unsigned long)src) - 2) % 16); /* offset of src - 2 inside its 16-byte line */
+
+    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
+              srcP2A, srcP2B, srcP3A, srcP3B,
+              srcM1A, srcM1B, srcM2A, srcM2B,
+              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
+              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
+              psumA, psumB, sumA, sumB;
+
+    vec_u8 sum, vdst, fsum;
+
+    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
+
+    for (i = 0 ; i < 16 ; i ++) {
+        vec_u8 srcR1 = vec_ld(-2, src); /* aligned line containing src - 2 */
+        vec_u8 srcR2 = vec_ld(14, src); /* aligned line containing src + 14 */
+
+        switch (align) { /* cases 11..15: one window starts exactly on srcR2, so its lvsl pattern would pick the wrong operand */
+        default: { /* align 0..10: all six windows fit inside srcR1:srcR2 */
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = vec_perm(srcR1, srcR2, permP0);
+            srcP1 = vec_perm(srcR1, srcR2, permP1);
+            srcP2 = vec_perm(srcR1, srcR2, permP2);
+            srcP3 = vec_perm(srcR1, srcR2, permP3);
+        } break;
+        case 11: { /* src + 3 is 16-byte aligned: that window is srcR2 itself */
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = vec_perm(srcR1, srcR2, permP0);
+            srcP1 = vec_perm(srcR1, srcR2, permP1);
+            srcP2 = vec_perm(srcR1, srcR2, permP2);
+            srcP3 = srcR2;
+        } break;
+        case 12: { /* src + 2 is aligned; later windows spill into a third line */
+            vec_u8 srcR3 = vec_ld(30, src);
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = vec_perm(srcR1, srcR2, permP0);
+            srcP1 = vec_perm(srcR1, srcR2, permP1);
+            srcP2 = srcR2;
+            srcP3 = vec_perm(srcR2, srcR3, permP3);
+        } break;
+        case 13: { /* src + 1 is aligned */
+            vec_u8 srcR3 = vec_ld(30, src);
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = vec_perm(srcR1, srcR2, permP0);
+            srcP1 = srcR2;
+            srcP2 = vec_perm(srcR2, srcR3, permP2);
+            srcP3 = vec_perm(srcR2, srcR3, permP3);
+        } break;
+        case 14: { /* src itself is aligned */
+            vec_u8 srcR3 = vec_ld(30, src);
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = srcR2;
+            srcP1 = vec_perm(srcR2, srcR3, permP1);
+            srcP2 = vec_perm(srcR2, srcR3, permP2);
+            srcP3 = vec_perm(srcR2, srcR3, permP3);
+        } break;
+        case 15: { /* src - 1 is aligned */
+            vec_u8 srcR3 = vec_ld(30, src);
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = srcR2;
+            srcP0 = vec_perm(srcR2, srcR3, permP0);
+            srcP1 = vec_perm(srcR2, srcR3, permP1);
+            srcP2 = vec_perm(srcR2, srcR3, permP2);
+            srcP3 = vec_perm(srcR2, srcR3, permP3);
+        } break;
+        }
+
+        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); /* A = first 8 pixels, B = last 8, each widened by interleaving zero bytes */
+        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
+        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
+        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
+
+        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
+        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
+        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
+        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
+
+        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
+        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
+        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
+        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
+
+        sum1A = vec_adds(srcP0A, srcP1A); /* inner tap pair  (weight 20) */
+        sum1B = vec_adds(srcP0B, srcP1B);
+        sum2A = vec_adds(srcM1A, srcP2A); /* middle tap pair (weight -5) */
+        sum2B = vec_adds(srcM1B, srcP2B);
+        sum3A = vec_adds(srcM2A, srcP3A); /* outer tap pair  (weight 1) */
+        sum3B = vec_adds(srcM2B, srcP3B);
+
+        pp1A = vec_mladd(sum1A, v20ss, v16ss); /* 20 * sum1 + 16 */
+        pp1B = vec_mladd(sum1B, v20ss, v16ss);
+
+        pp2A = vec_mladd(sum2A, v5ss, zero_s16v); /* 5 * sum2 */
+        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
+
+        pp3A = vec_add(sum3A, pp1A);
+        pp3B = vec_add(sum3B, pp1B);
+
+        psumA = vec_sub(pp3A, pp2A);
+        psumB = vec_sub(pp3B, pp2B);
+
+        sumA = vec_sra(psumA, v5us); /* arithmetic >> 5 */
+        sumB = vec_sra(psumB, v5us);
+
+        sum = vec_packsu(sumA, sumB); /* back to 16 bytes with unsigned saturation */
+
+        ASSERT_ALIGNED(dst);
+        vdst = vec_ld(0, dst);
+
+        OP_U8_ALTIVEC(fsum, sum, vdst); /* defined by the including file; produces fsum from the filtered row and the current dst row */
+
+        vec_st(fsum, 0, dst);
+
+        src += srcStride;
+        dst += dstStride;
+    }
+    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
+}
+
+/* 16x16 vertical 6-tap lowpass: (20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5 down each column, saturated to u8 and combined with dst through OP_U8_ALTIVEC; this code assumes stride % 16 == 0 */
+static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
+    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
+
+    register int i;
+
+    LOAD_ZERO;
+    const vec_u8 perm = vec_lvsl(0, src); /* one pattern serves every row; relies on the stride assumption above */
+    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); /* 5 << 2 = 20 */
+    const vec_u16 v5us = vec_splat_u16(5);
+    const vec_s16 v5ss = vec_splat_s16(5);
+    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); /* 1 << 4 = 16, added before the >> 5 */
+
+    uint8_t *srcbis = src - (srcStride * 2); /* start two rows above src */
+
+    const vec_u8 srcM2a = vec_ld(0, srcbis);
+    const vec_u8 srcM2b = vec_ld(16, srcbis);
+    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
+    // next: row src - srcStride (srcbis is advanced inside the vec_ld call)
+    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
+    const vec_u8 srcM1b = vec_ld(16, srcbis);
+    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
+    // next: row src
+    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
+    const vec_u8 srcP0b = vec_ld(16, srcbis);
+    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
+    // next: row src + srcStride
+    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
+    const vec_u8 srcP1b = vec_ld(16, srcbis);
+    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
+    // next: row src + 2 * srcStride
+    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
+    const vec_u8 srcP2b = vec_ld(16, srcbis);
+    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
+    // srcbis is left on row src + 2 * srcStride; the loop pre-increments it
+
+    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2); /* A = first 8 pixels, B = last 8, widened by interleaving zero bytes */
+    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
+    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
+    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
+    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
+    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
+    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
+    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
+    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
+    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
+
+    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
+              psumA, psumB, sumA, sumB,
+              srcP3ssA, srcP3ssB,
+              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
+
+    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
+
+    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
+
+    for (i = 0 ; i < 16 ; i++) {
+        srcP3a = vec_ld(0, srcbis += srcStride); /* fetch the one new row needed for this output row */
+        srcP3b = vec_ld(16, srcbis);
+        srcP3 = vec_perm(srcP3a, srcP3b, perm);
+        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
+        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
+        // (srcbis was already advanced by the vec_ld above)
+
+        sum1A = vec_adds(srcP0ssA, srcP1ssA); /* inner tap pair  (weight 20) */
+        sum1B = vec_adds(srcP0ssB, srcP1ssB);
+        sum2A = vec_adds(srcM1ssA, srcP2ssA); /* middle tap pair (weight -5) */
+        sum2B = vec_adds(srcM1ssB, srcP2ssB);
+        sum3A = vec_adds(srcM2ssA, srcP3ssA); /* outer tap pair  (weight 1) */
+        sum3B = vec_adds(srcM2ssB, srcP3ssB);
+
+        srcM2ssA = srcM1ssA; /* slide the six-row window down by one row */
+        srcM2ssB = srcM1ssB;
+        srcM1ssA = srcP0ssA;
+        srcM1ssB = srcP0ssB;
+        srcP0ssA = srcP1ssA;
+        srcP0ssB = srcP1ssB;
+        srcP1ssA = srcP2ssA;
+        srcP1ssB = srcP2ssB;
+        srcP2ssA = srcP3ssA;
+        srcP2ssB = srcP3ssB;
+
+        pp1A = vec_mladd(sum1A, v20ss, v16ss); /* 20 * sum1 + 16 */
+        pp1B = vec_mladd(sum1B, v20ss, v16ss);
+
+        pp2A = vec_mladd(sum2A, v5ss, zero_s16v); /* 5 * sum2 */
+        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
+
+        pp3A = vec_add(sum3A, pp1A);
+        pp3B = vec_add(sum3B, pp1B);
+
+        psumA = vec_sub(pp3A, pp2A);
+        psumB = vec_sub(pp3B, pp2B);
+
+        sumA = vec_sra(psumA, v5us); /* arithmetic >> 5 */
+        sumB = vec_sra(psumB, v5us);
+
+        sum = vec_packsu(sumA, sumB); /* back to 16 bytes with unsigned saturation */
+
+        ASSERT_ALIGNED(dst);
+        vdst = vec_ld(0, dst);
+
+        OP_U8_ALTIVEC(fsum, sum, vdst); /* defined by the including file; produces fsum from the filtered row and the current dst row */
+
+        vec_st(fsum, 0, dst);
+
+        dst += dstStride;
+    }
+    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
+}
+
+/* 16x16 two-pass (hv) 6-tap lowpass: pass 1 filters 21 rows horizontally into tmp as 16-bit values, pass 2 filters tmp vertically in 32-bit, adds 512 and shifts right by 10; this code assumes stride % 16 == 0 *and* tmp is properly aligned */
+static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
+    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
+    register int i;
+    LOAD_ZERO;
+    const vec_u8 permM2 = vec_lvsl(-2, src); /* permN: permute pattern selecting the 16 bytes that start at src + N */
+    const vec_u8 permM1 = vec_lvsl(-1, src);
+    const vec_u8 permP0 = vec_lvsl(+0, src);
+    const vec_u8 permP1 = vec_lvsl(+1, src);
+    const vec_u8 permP2 = vec_lvsl(+2, src);
+    const vec_u8 permP3 = vec_lvsl(+3, src);
+    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); /* 5 << 2 = 20 */
+    const vec_u32 v10ui = vec_splat_u32(10);
+    const vec_s16 v5ss = vec_splat_s16(5);
+    const vec_s16 v1ss = vec_splat_s16(1);
+    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); /* 1 << 9 = 512, added before the >> 10 */
+    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); /* 1 << 4 = 16, used as a shift count */
+
+    register int align = ((((unsigned long)src) - 2) % 16); /* offset of src - 2 inside its 16-byte line */
+
+    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
+              srcP2A, srcP2B, srcP3A, srcP3B,
+              srcM1A, srcM1B, srcM2A, srcM2B,
+              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
+              pp1A, pp1B, pp2A, pp2B, psumA, psumB;
+
+    const vec_u8 mperm = (const vec_u8) /* interleaves bytes 0-7 with bytes 8-15: 0,8,1,9,... */
+        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
+         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
+    int16_t *tmpbis = tmp; /* remembers the start of tmp for the second pass */
+
+    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
+              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
+              tmpP2ssA, tmpP2ssB;
+
+    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
+              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
+              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
+              ssumAe, ssumAo, ssumBe, ssumBo;
+    vec_u8 fsum, sumv, sum, vdst;
+    vec_s16 ssume, ssumo;
+
+    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
+    src -= (2 * srcStride); /* pass 1 starts two rows above the block */
+    for (i = 0 ; i < 21 ; i ++) { /* 21 rows = 16 output rows + 5 extra rows for the vertical taps */
+        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+        vec_u8 srcR1 = vec_ld(-2, src); /* aligned line containing src - 2 */
+        vec_u8 srcR2 = vec_ld(14, src); /* aligned line containing src + 14 */
+
+        switch (align) { /* cases 11..15: one window starts exactly on srcR2, so its lvsl pattern would pick the wrong operand */
+        default: { /* align 0..10: all six windows fit inside srcR1:srcR2 */
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = vec_perm(srcR1, srcR2, permP0);
+            srcP1 = vec_perm(srcR1, srcR2, permP1);
+            srcP2 = vec_perm(srcR1, srcR2, permP2);
+            srcP3 = vec_perm(srcR1, srcR2, permP3);
+        } break;
+        case 11: { /* src + 3 is 16-byte aligned: that window is srcR2 itself */
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = vec_perm(srcR1, srcR2, permP0);
+            srcP1 = vec_perm(srcR1, srcR2, permP1);
+            srcP2 = vec_perm(srcR1, srcR2, permP2);
+            srcP3 = srcR2;
+        } break;
+        case 12: { /* src + 2 is aligned; later windows spill into a third line */
+            vec_u8 srcR3 = vec_ld(30, src);
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = vec_perm(srcR1, srcR2, permP0);
+            srcP1 = vec_perm(srcR1, srcR2, permP1);
+            srcP2 = srcR2;
+            srcP3 = vec_perm(srcR2, srcR3, permP3);
+        } break;
+        case 13: { /* src + 1 is aligned */
+            vec_u8 srcR3 = vec_ld(30, src);
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = vec_perm(srcR1, srcR2, permP0);
+            srcP1 = srcR2;
+            srcP2 = vec_perm(srcR2, srcR3, permP2);
+            srcP3 = vec_perm(srcR2, srcR3, permP3);
+        } break;
+        case 14: { /* src itself is aligned */
+            vec_u8 srcR3 = vec_ld(30, src);
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = vec_perm(srcR1, srcR2, permM1);
+            srcP0 = srcR2;
+            srcP1 = vec_perm(srcR2, srcR3, permP1);
+            srcP2 = vec_perm(srcR2, srcR3, permP2);
+            srcP3 = vec_perm(srcR2, srcR3, permP3);
+        } break;
+        case 15: { /* src - 1 is aligned */
+            vec_u8 srcR3 = vec_ld(30, src);
+            srcM2 = vec_perm(srcR1, srcR2, permM2);
+            srcM1 = srcR2;
+            srcP0 = vec_perm(srcR2, srcR3, permP0);
+            srcP1 = vec_perm(srcR2, srcR3, permP1);
+            srcP2 = vec_perm(srcR2, srcR3, permP2);
+            srcP3 = vec_perm(srcR2, srcR3, permP3);
+        } break;
+        }
+
+        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); /* A = first 8 pixels, B = last 8, widened by interleaving zero bytes */
+        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
+        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
+        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
+
+        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
+        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
+        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
+        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
+
+        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
+        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
+        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
+        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
+
+        sum1A = vec_adds(srcP0A, srcP1A); /* inner tap pair  (weight 20) */
+        sum1B = vec_adds(srcP0B, srcP1B);
+        sum2A = vec_adds(srcM1A, srcP2A); /* middle tap pair (weight -5) */
+        sum2B = vec_adds(srcM1B, srcP2B);
+        sum3A = vec_adds(srcM2A, srcP3A); /* outer tap pair  (weight 1) */
+        sum3B = vec_adds(srcM2B, srcP3B);
+
+        pp1A = vec_mladd(sum1A, v20ss, sum3A); /* 20 * sum1 + sum3; no rounding term or shift in this pass */
+        pp1B = vec_mladd(sum1B, v20ss, sum3B);
+
+        pp2A = vec_mladd(sum2A, v5ss, zero_s16v); /* 5 * sum2 */
+        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
+
+        psumA = vec_sub(pp1A, pp2A);
+        psumB = vec_sub(pp1B, pp2B);
+
+        vec_st(psumA, 0, tmp); /* 16 intermediate values = 32 bytes per tmp row */
+        vec_st(psumB, 16, tmp);
+
+        src += srcStride;
+        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
+    }
+
+    tmpM2ssA = vec_ld(0, tmpbis); /* pass 2: preload the first five intermediate rows */
+    tmpM2ssB = vec_ld(16, tmpbis);
+    tmpbis += tmpStride;
+    tmpM1ssA = vec_ld(0, tmpbis);
+    tmpM1ssB = vec_ld(16, tmpbis);
+    tmpbis += tmpStride;
+    tmpP0ssA = vec_ld(0, tmpbis);
+    tmpP0ssB = vec_ld(16, tmpbis);
+    tmpbis += tmpStride;
+    tmpP1ssA = vec_ld(0, tmpbis);
+    tmpP1ssB = vec_ld(16, tmpbis);
+    tmpbis += tmpStride;
+    tmpP2ssA = vec_ld(0, tmpbis);
+    tmpP2ssB = vec_ld(16, tmpbis);
+    tmpbis += tmpStride;
+
+    for (i = 0 ; i < 16 ; i++) {
+        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis); /* the sixth row of the current vertical window */
+        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
+
+        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA); /* these locals shadow the function-scope sum1A..sum3B */
+        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
+        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
+        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
+        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
+        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
+
+        tmpbis += tmpStride;
+
+        tmpM2ssA = tmpM1ssA; /* slide the six-row window down by one row */
+        tmpM2ssB = tmpM1ssB;
+        tmpM1ssA = tmpP0ssA;
+        tmpM1ssB = tmpP0ssB;
+        tmpP0ssA = tmpP1ssA;
+        tmpP0ssB = tmpP1ssB;
+        tmpP1ssA = tmpP2ssA;
+        tmpP1ssB = tmpP2ssB;
+        tmpP2ssA = tmpP3ssA;
+        tmpP2ssB = tmpP3ssB;
+
+        pp1Ae = vec_mule(sum1A, v20ss); /* 32-bit products: "e" = even halfword lanes, "o" = odd lanes */
+        pp1Ao = vec_mulo(sum1A, v20ss);
+        pp1Be = vec_mule(sum1B, v20ss);
+        pp1Bo = vec_mulo(sum1B, v20ss);
+
+        pp2Ae = vec_mule(sum2A, v5ss);
+        pp2Ao = vec_mulo(sum2A, v5ss);
+        pp2Be = vec_mule(sum2B, v5ss);
+        pp2Bo = vec_mulo(sum2B, v5ss);
+
+        pp3Ae = vec_sra((vec_s32)sum3A, v16ui); /* arithmetic >> 16 of each word keeps its upper halfword sign-extended; NOTE(review): that is the even lane only with big-endian element order */
+        pp3Ao = vec_mulo(sum3A, v1ss); /* multiply by 1: sign-extends the odd lanes to 32 bits */
+        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
+        pp3Bo = vec_mulo(sum3B, v1ss);
+
+        pp1cAe = vec_add(pp1Ae, v512si); /* + 512 rounding term */
+        pp1cAo = vec_add(pp1Ao, v512si);
+        pp1cBe = vec_add(pp1Be, v512si);
+        pp1cBo = vec_add(pp1Bo, v512si);
+
+        pp32Ae = vec_sub(pp3Ae, pp2Ae); /* sum3 - 5 * sum2 */
+        pp32Ao = vec_sub(pp3Ao, pp2Ao);
+        pp32Be = vec_sub(pp3Be, pp2Be);
+        pp32Bo = vec_sub(pp3Bo, pp2Bo);
+
+        sumAe = vec_add(pp1cAe, pp32Ae);
+        sumAo = vec_add(pp1cAo, pp32Ao);
+        sumBe = vec_add(pp1cBe, pp32Be);
+        sumBo = vec_add(pp1cBo, pp32Bo);
+
+        ssumAe = vec_sra(sumAe, v10ui); /* arithmetic >> 10 */
+        ssumAo = vec_sra(sumAo, v10ui);
+        ssumBe = vec_sra(sumBe, v10ui);
+        ssumBo = vec_sra(sumBo, v10ui);
+
+        ssume = vec_packs(ssumAe, ssumBe); /* all even-lane results, saturated to 16 bits */
+        ssumo = vec_packs(ssumAo, ssumBo); /* all odd-lane results */
+
+        sumv = vec_packsu(ssume, ssumo); /* bytes 0-7: even results, bytes 8-15: odd results */
+        sum = vec_perm(sumv, sumv, mperm); /* re-interleave even/odd back into pixel order */
+
+        ASSERT_ALIGNED(dst);
+        vdst = vec_ld(0, dst);
+
+        OP_U8_ALTIVEC(fsum, sum, vdst); /* defined by the including file; produces fsum from the filtered row and the current dst row */
+
+        vec_st(fsum, 0, dst);
+
+        dst += dstStride;
+    }
+    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/idct_altivec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/idct_altivec.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2001 Michel Lespinasse
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * NOTE: This code is based on GPL code from the libmpeg2 project.  The
+ * author, Michel Lespinasse, has given explicit permission to release
+ * under LGPL as part of FFmpeg.
+ */
+
+/*
+ * FFmpeg integration by Dieter Shirley
+ *
+ * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
+ * project.  I've deleted all of the libmpeg2-specific code, renamed the
+ * functions and reordered the function parameters.  The only change to the
+ * IDCT function itself was to factor out the partial transposition, and to
+ * perform a full transpose at the end of the function.
+ */
+
+
+#include <stdlib.h>                                      /* malloc(), free() */
+#include <string.h>
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+#include "libavcodec/dsputil.h"
+#include "types_altivec.h"
+#include "dsputil_ppc.h"
+#include "dsputil_altivec.h"
+
+#define IDCT_HALF /* one 1-D 8-point pass: reads vx0..vx7, writes vy0..vy7, clobbers t0..t8 */ \
+    /* 1st stage: odd inputs vx1, vx7, vx3, vx5 */      \
+    t1 = vec_mradds (a1, vx7, vx1 );                    \
+    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));    \
+    t7 = vec_mradds (a2, vx5, vx3);                     \
+    t3 = vec_mradds (ma2, vx3, vx5);                    \
+                                                        \
+    /* 2nd stage: even inputs vx0, vx4, vx2, vx6, plus butterflies on the stage-1 results */ \
+    t5 = vec_adds (vx0, vx4);                           \
+    t0 = vec_subs (vx0, vx4);                           \
+    t2 = vec_mradds (a0, vx6, vx2);                     \
+    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));    \
+    t6 = vec_adds (t8, t3);                             \
+    t3 = vec_subs (t8, t3);                             \
+    t8 = vec_subs (t1, t7);                             \
+    t1 = vec_adds (t1, t7);                             \
+                                                        \
+    /* 3rd stage: saturating butterflies */             \
+    t7 = vec_adds (t5, t2);                             \
+    t2 = vec_subs (t5, t2);                             \
+    t5 = vec_adds (t0, t4);                             \
+    t0 = vec_subs (t0, t4);                             \
+    t4 = vec_subs (t8, t3);                             \
+    t3 = vec_adds (t8, t3);                             \
+                                                        \
+    /* 4th stage: outputs come in sum/difference pairs (vy0,vy7) (vy1,vy6) (vy2,vy5) (vy3,vy4) */ \
+    vy0 = vec_adds (t7, t1);                            \
+    vy7 = vec_subs (t7, t1);                            \
+    vy1 = vec_mradds (c4, t3, t5);                      \
+    vy6 = vec_mradds (mc4, t3, t5);                     \
+    vy2 = vec_mradds (c4, t4, t0);                      \
+    vy5 = vec_mradds (mc4, t4, t0);                     \
+    vy3 = vec_adds (t2, t6);                            \
+    vy4 = vec_subs (t2, t6);
+
+
+#define IDCT /* full 8x8 IDCT; expects `block` (vec_s16 *) and `constants` in scope, declares its own locals, leaves the result rows in vx0..vx7 */ \
+    vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                \
+    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                \
+    vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias;                  \
+    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                    \
+    vec_u16 shift;                                                 \
+    /* broadcast the scalar coefficients stored in constants[0] */      \
+    c4 = vec_splat (constants[0], 0);                                   \
+    a0 = vec_splat (constants[0], 1);                                   \
+    a1 = vec_splat (constants[0], 2);                                   \
+    a2 = vec_splat (constants[0], 3);                                   \
+    mc4 = vec_splat (constants[0], 4);                                  \
+    ma2 = vec_splat (constants[0], 5);                                  \
+    bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3);     \
+                                                                        \
+    zero = vec_splat_s16 (0);                                           \
+    shift = vec_splat_u16 (4);                                          \
+    /* prescale: each input row is shifted left by 4, then multiplied (vec_mradds) by its per-row table */ \
+    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);    \
+    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);    \
+    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);    \
+    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);    \
+    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);    \
+    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);    \
+    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);    \
+    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);    \
+                                                                        \
+    IDCT_HALF                                                           \
+    /* three rounds of mergeh/mergel: together they transpose the 8x8 halfword matrix */ \
+    vx0 = vec_mergeh (vy0, vy4);                                        \
+    vx1 = vec_mergel (vy0, vy4);                                        \
+    vx2 = vec_mergeh (vy1, vy5);                                        \
+    vx3 = vec_mergel (vy1, vy5);                                        \
+    vx4 = vec_mergeh (vy2, vy6);                                        \
+    vx5 = vec_mergel (vy2, vy6);                                        \
+    vx6 = vec_mergeh (vy3, vy7);                                        \
+    vx7 = vec_mergel (vy3, vy7);                                        \
+                                                                        \
+    vy0 = vec_mergeh (vx0, vx4);                                        \
+    vy1 = vec_mergel (vx0, vx4);                                        \
+    vy2 = vec_mergeh (vx1, vx5);                                        \
+    vy3 = vec_mergel (vx1, vx5);                                        \
+    vy4 = vec_mergeh (vx2, vx6);                                        \
+    vy5 = vec_mergel (vx2, vx6);                                        \
+    vy6 = vec_mergeh (vx3, vx7);                                        \
+    vy7 = vec_mergel (vx3, vx7);                                        \
+    /* last merge round; bias is added to vx0 only, before the second pass */ \
+    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                       \
+    vx1 = vec_mergel (vy0, vy4);                                        \
+    vx2 = vec_mergeh (vy1, vy5);                                        \
+    vx3 = vec_mergel (vy1, vy5);                                        \
+    vx4 = vec_mergeh (vy2, vy6);                                        \
+    vx5 = vec_mergel (vy2, vy6);                                        \
+    vx6 = vec_mergeh (vy3, vy7);                                        \
+    vx7 = vec_mergel (vy3, vy7);                                        \
+                                                                        \
+    IDCT_HALF                                                           \
+    /* final scaling: arithmetic shift right by 6 */                    \
+    shift = vec_splat_u16 (6);                                          \
+    vx0 = vec_sra (vy0, shift);                                         \
+    vx1 = vec_sra (vy1, shift);                                         \
+    vx2 = vec_sra (vy2, shift);                                         \
+    vx3 = vec_sra (vy3, shift);                                         \
+    vx4 = vec_sra (vy4, shift);                                         \
+    vx5 = vec_sra (vy5, shift);                                         \
+    vx6 = vec_sra (vy6, shift);                                         \
+    vx7 = vec_sra (vy7, shift);
+
+
+static const vec_s16 constants[5] = { /* coefficient tables read by the IDCT macro */
+    {23170, 13573,  6518, 21895, -23170, -21895,    32,    31}, /* c4, a0, a1, a2, mc4, ma2, then the pair splatted as one 32-bit element to form bias */
+    {16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725}, /* prescale for block rows 0 and 4 */
+    {22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521}, /* prescale for block rows 1 and 7 */
+    {21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692}, /* prescale for block rows 2 and 6 */
+    {19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722}  /* prescale for block rows 3 and 5 */
+};
+
+void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) /* 8x8 IDCT of blk; the saturated result overwrites 8 rows of 8 bytes at dest */
+{
+POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
+    vec_s16 *block = (vec_s16*)blk; /* blk is read as eight 8-halfword vectors; presumably must be 16-byte aligned - confirm with callers */
+    vec_u8 tmp;
+    /* NOTE(review): only START_COUNT is guarded by CONFIG_POWERPC_PERF; DECLARE and STOP_COUNT are not */
+#if CONFIG_POWERPC_PERF
+POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
+#endif
+    IDCT
+    /* COPY: saturate one row to unsigned bytes (the same 8 bytes end up in both halves of tmp), then store it as two 32-bit elements */
+#define COPY(dest,src)                                          \
+    tmp = vec_packsu (src, src);                                \
+    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);       \
+    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
+    /* vec_ste stores the element matching the address's slot in its 16-byte line, so this appears to rely on dest being 8-byte aligned */
+    COPY (dest, vx0)    dest += stride;
+    COPY (dest, vx1)    dest += stride;
+    COPY (dest, vx2)    dest += stride;
+    COPY (dest, vx3)    dest += stride;
+    COPY (dest, vx4)    dest += stride;
+    COPY (dest, vx5)    dest += stride;
+    COPY (dest, vx6)    dest += stride;
+    COPY (dest, vx7)
+
+POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
+}
+
+void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) /* 8x8 IDCT of blk; the result is added to the 8x8 bytes at dest with unsigned saturation */
+{
+POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
+    vec_s16 *block = (vec_s16*)blk; /* blk is read as eight 8-halfword vectors; presumably must be 16-byte aligned - confirm with callers */
+    vec_u8 tmp;
+    vec_s16 tmp2, tmp3;
+    vec_u8 perm0;
+    vec_u8 perm1;
+    vec_u8 p0, p1, p;
+    /* NOTE(review): only START_COUNT is guarded by CONFIG_POWERPC_PERF; DECLARE and STOP_COUNT are not */
+#if CONFIG_POWERPC_PERF
+POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
+#endif
+
+    IDCT
+
+    p0 = vec_lvsl (0, dest); /* byte indices of the row at dest inside its 16-byte line */
+    p1 = vec_lvsl (stride, dest); /* same for the row at dest + stride */
+    p = vec_splat_u8 (-1); /* index 0xFF: vec_perm uses the low 5 bits (31), i.e. the last byte of its second operand */
+    perm0 = vec_mergeh (p, p0); /* 0xFF, p0[0], 0xFF, p0[1], ...: pairs a byte of the second operand (zero in ADD) with each pixel */
+    perm1 = vec_mergeh (p, p1);
+    /* ADD: load the line holding dest, widen 8 pixels to 16 bits, saturating-add one IDCT row, pack back to bytes, store 8 bytes */
+#define ADD(dest,src,perm)                                              \
+    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
+    tmp = vec_ld (0, dest);                                             \
+    tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm);       \
+    tmp3 = vec_adds (tmp2, src);                                        \
+    tmp = vec_packsu (tmp3, tmp3);                                      \
+    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);               \
+    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
+    /* perm0 serves rows 0, 2, 4, 6 and perm1 rows 1, 3, 5, 7; this assumes row alignment repeats every 2 * stride - TODO confirm */
+    ADD (dest, vx0, perm0)      dest += stride;
+    ADD (dest, vx1, perm1)      dest += stride;
+    ADD (dest, vx2, perm0)      dest += stride;
+    ADD (dest, vx3, perm1)      dest += stride;
+    ADD (dest, vx4, perm0)      dest += stride;
+    ADD (dest, vx5, perm1)      dest += stride;
+    ADD (dest, vx6, perm0)      dest += stride;
+    ADD (dest, vx7, perm1)
+
+POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
+}
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/mathops.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/mathops.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,79 @@
+/*
+ * simple math operations
+ * Copyright (c) 2001, 2002 Fabrice Bellard
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PPC_MATHOPS_H
+#define AVCODEC_PPC_MATHOPS_H
+
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/common.h"
+
+#if HAVE_PPC4XX
+/* signed 16x16 -> 32 multiply add accumulate: rt += ra * rb; expands to one expression statement, the caller supplies the ';' */
+#define MAC16(rt, ra, rb) \
+    __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb))
+
+/* signed 16x16 -> 32 multiply: evaluates to ra * rb (GCC statement expression) */
+#define MUL16(ra, rb) \
+    ({ int __rt; \
+    __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
+    __rt; })
+#endif
+
+#define MULH MULH /* defined as itself; presumably lets an #ifndef MULH fallback elsewhere be skipped -- confirm */
+static inline av_const int MULH(int a, int b){ /* returns the high 32 bits of the signed 64-bit product a*b */
+    int r;
+    __asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); /* mulhw: multiply high word, signed */
+    return r;
+}
+
+#if !ARCH_PPC64
+static inline av_const int64_t MAC64(int64_t d, int a, int b)
+{ /* returns d + a*b (full 64-bit result) using 32-bit instructions */
+    union { uint64_t x; unsigned hl[2]; } x = { d }; /* hl[0] is treated as the high word, hl[1] as the low word (big-endian layout) */
+    int h, l; /* high and low 32 bits of a*b */
+    __asm__ ("mullw %3, %4, %5   \n\t" /* l = low 32 bits of a*b */
+             "mulhw %2, %4, %5   \n\t" /* h = high 32 bits of a*b (signed) */
+             "addc  %1, %1, %3   \n\t" /* low word += l, carry recorded */
+             "adde  %0, %0, %2   \n\t" /* high word += h + carry */
+             : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
+             : "r"(a), "r"(b));
+    return x.x;
+}
+#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) /* macro form stores the result back into d */
+
+static inline av_const int64_t MLS64(int64_t d, int a, int b)
+{ /* returns d - a*b (full 64-bit result) using 32-bit instructions */
+    union { uint64_t x; unsigned hl[2]; } x = { d }; /* hl[0] is treated as the high word, hl[1] as the low word (big-endian layout) */
+    int h, l; /* high and low 32 bits of a*b */
+    __asm__ ("mullw %3, %4, %5   \n\t" /* l = low 32 bits of a*b */
+             "mulhw %2, %4, %5   \n\t" /* h = high 32 bits of a*b (signed) */
+             "subfc %1, %3, %1   \n\t" /* low word -= l, borrow recorded in the carry bit */
+             "subfe %0, %2, %0   \n\t" /* high word -= h, minus the borrow */
+             : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
+             : "r"(a), "r"(b));
+    return x.x;
+}
+#define MLS64(d, a, b) ((d) = MLS64(d, a, b)) /* macro form stores the result back into d */
+#endif
+
+#endif /* AVCODEC_PPC_MATHOPS_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/types_altivec.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/types_altivec.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PPC_TYPES_ALTIVEC_H
+#define AVCODEC_PPC_TYPES_ALTIVEC_H
+
+/***********************************************************************
+ * Vector types
+ **********************************************************************/
+#define vec_u8  vector unsigned char
+#define vec_s8  vector signed char
+#define vec_u16 vector unsigned short
+#define vec_s16 vector signed short
+#define vec_u32 vector unsigned int
+#define vec_s32 vector signed int
+
+/***********************************************************************
+ * Null vector: LOAD_ZERO declares the all-zero byte vector `zerov`
+ **********************************************************************/
+#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 )
+/* The zero_*v macros expand to a cast of zerov, so LOAD_ZERO must be in scope wherever they are used. */
+#define zero_u8v  (vec_u8)  zerov
+#define zero_s8v  (vec_s8)  zerov
+#define zero_u16v (vec_u16) zerov
+#define zero_s16v (vec_s16) zerov
+#define zero_u32v (vec_u32) zerov
+#define zero_s32v (vec_s32) zerov
+
+#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/util_altivec.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/util_altivec.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,105 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Contains misc utility macros and inline functions
+ */
+
+#ifndef AVCODEC_PPC_UTIL_ALTIVEC_H
+#define AVCODEC_PPC_UTIL_ALTIVEC_H
+
+#include <stdint.h>
+
+#include "config.h"
+
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+// used to build registers permutation vectors (vcprm)
+// the 's' are for words in the _s_econd vector
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
+#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
+
+// vcprmle keeps the same word indices as the SSE version:
+// it is the same as vcprm with the argument order reversed
+// ('le' stands for Little Endian)
+#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
+
+// used to build inverse/identity vectors (vcii)
+// n is _n_egative, p is _p_ositive
+#define FLOAT_n -1.
+#define FLOAT_p 1.
+
+
+// Transpose 8x8 matrix of 16-bit elements (in-place); a..h are the eight rows and must be assignable vector signed short lvalues
+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
+do { \
+    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
+    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
+    /* pass 1: interleave each row with the row four below it */ \
+    A1 = vec_mergeh (a, e); \
+    B1 = vec_mergel (a, e); \
+    C1 = vec_mergeh (b, f); \
+    D1 = vec_mergel (b, f); \
+    E1 = vec_mergeh (c, g); \
+    F1 = vec_mergel (c, g); \
+    G1 = vec_mergeh (d, h); \
+    H1 = vec_mergel (d, h); \
+    /* pass 2: the same high/low interleave applied to the pass-1 results */ \
+    A2 = vec_mergeh (A1, E1); \
+    B2 = vec_mergel (A1, E1); \
+    C2 = vec_mergeh (B1, F1); \
+    D2 = vec_mergel (B1, F1); \
+    E2 = vec_mergeh (C1, G1); \
+    F2 = vec_mergel (C1, G1); \
+    G2 = vec_mergeh (D1, H1); \
+    H2 = vec_mergel (D1, H1); \
+    /* pass 3: final interleave, written back into the caller's a..h */ \
+    a = vec_mergeh (A2, E2); \
+    b = vec_mergel (A2, E2); \
+    c = vec_mergeh (B2, F2); \
+    d = vec_mergel (B2, F2); \
+    e = vec_mergeh (C2, G2); \
+    f = vec_mergel (C2, G2); \
+    g = vec_mergeh (D2, H2); \
+    h = vec_mergel (D2, H2); \
+} while (0)
+
+
+/** \brief loads the 16 bytes at \a src + \a offset, which need not be
+    16-byte aligned, and returns them as a vector */
+static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
+{
+    register vector unsigned char first = vec_ld(offset, src); /* aligned 16-byte block containing src+offset */
+    register vector unsigned char second = vec_ld(offset+15, src); /* following block, or the same block when src+offset is already aligned */
+    register vector unsigned char mask = vec_lvsl(offset, src); /* permute indices derived from the low address bits */
+    return vec_perm(first, second, mask); /* select the 16 bytes starting at src+offset */
+}
+
+#endif /* AVCODEC_PPC_UTIL_ALTIVEC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/raw.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/raw.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,39 @@
+/*
+ * Raw Video Codec
+ * Copyright (c) 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Raw Video Codec
+ */
+
+#ifndef AVCODEC_RAW_H
+#define AVCODEC_RAW_H
+
+#include "avcodec.h"
+
+typedef struct PixelFormatTag { /* one pixel-format <-> tag pair; ff_raw_pixelFormatTags[] is an array of these */
+    enum PixelFormat pix_fmt; /* pixel format of this entry */
+    unsigned int fourcc; /* tag paired with pix_fmt (presumably a FourCC code, going by the field name) */
+} PixelFormatTag;
+
+extern const PixelFormatTag ff_raw_pixelFormatTags[];
+int raw_init_encoder(AVCodecContext *avctx);
+#endif /* AVCODEC_RAW_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/rectangle.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/rectangle.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,92 @@
+/*
+ * rectangle filling function
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * useful rectangle filling function
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_RECTANGLE_H
+#define AVCODEC_RECTANGLE_H
+
+#include <assert.h>
+//#include "config.h"
+#include "libavutil/common.h"
+#include "dsputil.h"
+
+/** Fill a rectangle of w x h elements, each size bytes wide, with val.
+ * @param vp first element of the rectangle; stride is the row pitch in elements
+ * @param h height of the rectangle (1, 2 or 4 rows), should be a constant
+ * @param w width of the rectangle in elements (at most 4), should be a constant
+ * @param size the size of val (1, 2 or 4), should be a constant
+ */
+static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
+    uint8_t *p= (uint8_t*)vp;
+    assert(size==1 || size==2 || size==4);
+    assert(w<=4);
+    /* from here on w and stride are measured in bytes */
+    w      *= size;
+    stride *= size;
+    /* vp must be aligned to the row width in bytes (capped at STRIDE_ALIGN); uintptr_t avoids truncating the pointer where long is narrower */
+    assert((((uintptr_t)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
+    assert((stride&(w-1))==0);
+    if(w==2){
+        const uint16_t v= size==2 ? val : val*0x0101; /* 2-byte rows: one 16-bit val, or a 1-byte val replicated twice */
+        *(uint16_t*)(p + 0*stride)= v;
+        if(h==1) return;
+        *(uint16_t*)(p + 1*stride)= v;
+        if(h==2) return;
+        *(uint16_t*)(p + 2*stride)= v;
+        *(uint16_t*)(p + 3*stride)= v;
+    }else if(w==4){
+        const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101;
+        *(uint32_t*)(p + 0*stride)= v;
+        if(h==1) return;
+        *(uint32_t*)(p + 1*stride)= v;
+        if(h==2) return;
+        *(uint32_t*)(p + 2*stride)= v;
+        *(uint32_t*)(p + 3*stride)= v;
+    }else if(w==8){
+        const uint64_t v=  size==2 ? val*0x0001000100010001ULL : val*0x0100000001ULL; /* 8-byte rows: four 16-bit or two 32-bit copies */
+        *(uint64_t*)(p + 0*stride)= v;
+        if(h==1) return;
+        *(uint64_t*)(p + 1*stride)= v;
+        if(h==2) return;
+        *(uint64_t*)(p + 2*stride)= v;
+        *(uint64_t*)(p + 3*stride)= v;
+    }else if(w==16){ /* 16-byte rows (four 32-bit elements); only h==2 and h==4 are handled */
+        const uint64_t v= val*0x0100000001ULL;
+        *(uint64_t*)(p + 0+0*stride)= v;
+        *(uint64_t*)(p + 8+0*stride)= v;
+        *(uint64_t*)(p + 0+1*stride)= v;
+        *(uint64_t*)(p + 8+1*stride)= v;
+        if(h==2) return;
+        *(uint64_t*)(p + 0+2*stride)= v;
+        *(uint64_t*)(p + 8+2*stride)= v;
+        *(uint64_t*)(p + 0+3*stride)= v;
+        *(uint64_t*)(p + 8+3*stride)= v;
+    }else
+        assert(0); /* any other byte width is unsupported */
+    assert(h==4); /* reached only after all four rows were written */
+}
+
+#endif /* AVCODEC_RECTANGLE_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/scratch.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/scratch.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,295 @@
+static void *entropy_thread(void *arg){
+	H264Context *h = (H264Context *) arg;
+	EDSlice *s;
+	/* Worker loop: takes slices from h->ed_q, runs decode_slice_entropy on them and queues them on h->mbdec_q. */
+	H264Cabac hcabac;
+	CABACContext cabac;
+	
+	ff_init_cabac_states();
+	
+	if (init_cabac(h, &hcabac)<0)
+		return NULL;
+	/* NOTE(review): the early return above does not queue anything on mbdec_q, unlike the normal exit path below. */
+	for(;;){
+		{ /* dequeue: wait until ed_cnt > 0, then take the slot at ed_fo */
+			pthread_mutex_lock(&h->lock[ENTROPY]);
+			while (h->ed_cnt<=0)
+				pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]);
+			s= &h->ed_q[h->ed_fo];
+			pthread_mutex_unlock(&h->lock[ENTROPY]);
+			h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT; /* advanced after unlocking; ed_cnt itself is only decremented once the slice is done */
+		}
+		if (s->state<0) /* a negative state ends the loop */
+			break;
+		
+		decode_slice_entropy(&hcabac, &cabac, s);
+		
+		{ /* enqueue: wait for room, then copy the leading MBSlice part of the slice into mbdec_q */
+			pthread_mutex_lock(&h->lock[MBDEC]);
+			while (h->mbdec_cnt >= MAX_SLICE_COUNT)
+				pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
+			h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s);
+			h->mbdec_cnt++;
+			h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
+			pthread_cond_signal(&h->cond[MBDEC]);
+			pthread_mutex_unlock(&h->lock[MBDEC]);
+		}
+		{ /* release the ed_q slot and wake a waiter on the ENTROPY condition */
+			pthread_mutex_lock(&h->lock[ENTROPY]);
+			h->ed_cnt--;
+			pthread_cond_signal(&h->cond[ENTROPY]);
+			pthread_mutex_unlock(&h->lock[ENTROPY]);
+		}
+	}
+	
+	{ /* forward the slice that ended the loop (state < 0) to mbdec_q as well */
+		pthread_mutex_lock(&h->lock[MBDEC]);
+		while (h->mbdec_cnt >= MAX_SLICE_COUNT)
+			pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
+		h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s);
+		h->mbdec_cnt++;
+		h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
+		pthread_cond_signal(&h->cond[MBDEC]);
+		pthread_mutex_unlock(&h->lock[MBDEC]);
+		
+	}
+	
+	free_cabac(&hcabac);
+	
+	pthread_exit(NULL);
+	return NULL; /* not reached: pthread_exit does not return */
+	
+}
+/*
+* Decodes ifile to ofile: sets up the decoder, runs entropy_mbd_thread to completion, then tears the decoder down.
+*/
+int av_transcode_1ed(int ifile, int ofile, int frame_width, int frame_height) {
+	H264Context *h;
+	pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; /* only entropy_thr is used; the other stages are commented out below */
+	
+	h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height);
+	/* NOTE(review): neither h nor the pthread_create result is checked before use. */
+	timer_start = av_gettime();
+	
+	//    pthread_create(&read_thr, NULL, read_thread, h);
+	//    pthread_create(&parsenal_thr, NULL, parsenal_thread, h);
+	pthread_create(&entropy_thr, NULL, entropy_mbd_thread, h);
+	
+	// pthread_create(&mbdec_thr, NULL, mbdec_thread, h);
+	
+	//   pthread_create(&write_thr, NULL, write_thread, h);
+	
+	//   pthread_join(read_thr, NULL);
+	//    pthread_join(parsenal_thr, NULL);
+	pthread_join(entropy_thr, NULL);
+	//    pthread_join(mbdec_thr, NULL);
+	//	printf("before write_thr\n");
+	//    pthread_join(write_thr, NULL);
+	
+	/* finished ! */
+	ff_h264_decode_end(h);
+	/* always reports success */
+	return 0;
+}
+
+static void reset_h264mb(EDSlice *s, int mb_width, int mb_height){
+	for (int i=0; i<mb_height; i++){
+		for (int j=0; j<mb_width; j++){
+			H264Mb *m = &s->mbs[i*mb_width + j];
+			/* zero both neighbour indices of every entry in the mb_width x mb_height array s->mbs */
+			m->left_mb_xy=0;
+			m->top_mb_xy = 0;
+		}
+	}
+}
+
+static void *entropy_mbd_thread(void *arg){
+	H264Context *h = (H264Context *) arg;
+	/* Whole pipeline in one thread: read a frame, parse its NAL units, entropy-decode, decode macroblocks, output. */
+	EDSlice slice, *s=&slice;
+	MBSlice mbslice, *s2=&mbslice;
+	H264Cabac hcabac;
+	CABACContext cabac;
+	int frames =0; /* NOTE(review): never used below */
+	MBDecContext mbdec, *d=&mbdec;
+	int size=h->width*h->height;
+	WriteContext write, *w=&write;
+	AVCodecParserContext parser, *pc= &parser;
+	NalContext nal, *n=&nal;
+	/* parser context: zeroed, with a 2048-byte data buffer (plus padding) and h->ifile as input */
+
+	memset(pc, 0, sizeof(AVCodecParserContext));
+	pc->buffer_size = 2048;
+	pc->final_frame = 0;
+	pc->cur_len= 0;
+	pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE);
+	pc->size = 2048;
+	pc->eof_reached =0;
+	pc->ifile = h->ifile;
+
+	//init parse
+	memset(n, 0, sizeof(NalContext));
+	n->width = h->width;
+	n->height = h->height;
+	n->mb_height = h->mb_height;
+	n->mb_width  = h->mb_width;
+	n->b4_stride = n->mb_width*4 + 1;
+	n->mb_stride = n->mb_width + 1;
+	n->outputed_poc = INT_MIN;
+// 	memset(s, 0, sizeof(EDSlice));
+// 	ff_init_slice(n, s);
+//
+	/* output context: bit_buffer is the larger of 256 KiB and 6*width*height+200 bytes */
+	memset(w, 0, sizeof(WriteContext));
+	w->bit_buffer_size= FFMAX(1024*256, 6*size + 200);
+	w->bit_buffer=  av_mallocz(w->bit_buffer_size);
+	/* NOTE(review): none of the av_mallocz results in this function are checked. */
+
+	/* macroblock decoding context: function tables plus geometry derived from h->width / h->height */
+	ff_h264dsp_init(&d->hdsp);
+	ff_h264_pred_init(&d->hpc);
+	dsputil_init(&d->dsp);
+	d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab;
+	d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab;
+	d->mb_height = (h->height + 15) / 16;
+	d->mb_width  = (h->width  + 15) / 16;
+	d->linesize = h->width + EDGE_WIDTH*2;
+	d->uvlinesize = d->linesize>>1;
+	/* block_offset: entries 0-15 are computed with linesize, entries 16-19 and 20-23 with uvlinesize, all from scan8 */
+	for(int i=0; i<16; i++){
+		d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3);
+	}
+	for(int i=0; i<4; i++){
+		d->block_offset[16+i]=
+		d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3);
+	}
+
+	d->scratchpad= av_mallocz((h->width+64)*4*16*2*sizeof(uint8_t));
+
+	ff_init_cabac_states();
+	/* NOTE(review): returning here leaves pc->data, w->bit_buffer and d->scratchpad allocated. */
+	if (init_cabac(h, &hcabac)<0)
+		return NULL;
+	/* frames_max is not declared in this function (presumably file scope); the loop also stops once it reaches 1000 */
+	while(!pc->final_frame && frames_max++ < 1000){
+		Picture *out;
+
+		RawFrame *frm;
+		Picture *pic=NULL;
+
+		RawFrame frm_read;
+		frm_read.state =0;
+		av_read_frame_internal(pc, &frm_read);
+		frm = &frm_read;
+
+		if (frm->state < 0) /* a negative state from the reader ends the loop */
+			break;
+/*
+		{
+			pthread_mutex_lock(&h->lock[PARSE2]);
+			while (h->slice_cnt<=0)
+				pthread_cond_wait(&h->cond[PARSE2], &h->lock[PARSE2]);
+			h->slice_cnt--;
+			s= &h->slices[h->slice_next++];
+			h->slice_next %= MAX_SLICE_COUNT;
+			pthread_mutex_unlock(&h->lock[PARSE2]);
+		}*/
+		ff_init_slice(n, s);
+		reset_h264mb(s, n->mb_width, n->mb_height);
+		for(int i=0; i<MAX_PIC_COUNT; i++){ /* pick the first picture whose reference field is 0 */
+			if(h->picture[i].reference==0){
+				pic= &h->picture[i];
+				break;
+			}
+		} /* NOTE(review): pic stays NULL when no picture has reference==0, and is still passed on below */
+// 		{
+// 			pthread_mutex_lock(&h->lock[PARSE3]);
+// 			while (h->free_pic_cnt<=0)
+// 				pthread_cond_wait(&h->cond[PARSE3], &h->lock[PARSE3]);
+// 			h->free_pic_cnt--;
+// 			/* use first free picture */
+// 			for(int i=0; i<MAX_PIC_COUNT; i++){
+// 				if(h->picture[i].reference==0){
+// 					pic= &h->picture[i];
+// 					break;
+// 				}
+// 			}
+// 			pthread_mutex_unlock(&h->lock[PARSE3]);
+// 		}
+		ff_alloc_picture(n, s, pic);
+
+		decode_nal_units(n, s, frm, pic);
+
+
+		decode_slice_entropy(&hcabac, &cabac, s);
+		memcpy( s2, s, sizeof(MBSlice)); //this only copies the COMMON_SLICE part
+		av_freep(&s->gb.raw);
+		decode_slice_mb_seq(d, s2);
+
+//         if (s2->release_cnt>0) {
+//             int i;
+//             for (i=0; i<s2->release_cnt; i++){
+//                 if ((s2->release_ref[i]->reference & ~2) == 0)
+//                     default_release_buffer(h, s2->release_ref[i]);
+//                 else
+//                     s2->release_ref[i]->reference &= ~2;
+//             }
+//             s->release_cnt=0;
+//         }
+/* clear the value-2 bit of reference on every picture listed in s->release_ref, then empty the list */
+if (s->release_cnt>0) {
+	int i;
+	for (i=0; i<s->release_cnt; i++){
+		s->release_ref[i]->reference &= ~2;
+	}
+	s->release_cnt=0;
+}
+
+/* NOTE(review): the matching slice_cnt-- only exists in the commented-out block above, so slice_cnt only grows here */
+        {
+			pthread_mutex_lock(&h->lock[PARSE2]);
+			h->slice_cnt++;
+			pthread_cond_signal(&h->cond[PARSE2]);
+			pthread_mutex_unlock(&h->lock[PARSE2]);
+		}
+
+		out =output_frame(w, s2->current_picture, h->ofile, h->width, h->height);
+		print_report(w->frame_number, w->video_size, 0);
+		/* a returned picture gets the value-1 bit of reference cleared */
+		if (out){
+// 			if ((out->reference & ~1) == 0)
+// 				default_release_buffer(h, out);
+// 			else
+				out->reference &= ~1;
+		}
+		/* NOTE(review): ed_cnt is decremented every iteration but never incremented in this function */
+		{
+			pthread_mutex_lock(&h->lock[ENTROPY]);
+			h->ed_cnt--;
+			pthread_cond_signal(&h->cond[ENTROPY]);
+			pthread_mutex_unlock(&h->lock[ENTROPY]);
+		}
+	}
+	while (output_frame(w, NULL, h->ofile, h->width, h->height)); /* keep calling with a NULL picture until it returns 0/NULL */
+	print_report(w->frame_number, w->video_size, 1);
+
+	av_free(w->bit_buffer);
+	/* NOTE(review): pc->data and d->scratchpad are not freed in this function. */
+	{//propagate exit
+		pthread_mutex_lock(&h->lock[WRITE]);
+		while (h->write_cnt>= MAX_DELAYED_PIC_COUNT)
+			pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
+		last_pic.reference = -1; /* last_pic is not declared in this function (presumably file scope); queued with reference = -1 */
+		h->write_q[h->write_fi] = &last_pic;
+		h->write_cnt++;
+		h->write_fi++; h->write_fi %= MAX_DELAYED_PIC_COUNT;
+		pthread_cond_signal(&h->cond[WRITE]);
+		pthread_mutex_unlock(&h->lock[WRITE]);
+
+	}
+	free_cabac(&hcabac);
+
+	pthread_exit(NULL);
+	return NULL; /* not reached: pthread_exit does not return */
+
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/simple_idct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/simple_idct.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,372 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * simpleidct in C.
+ */
+
+/*
+  based upon some outcommented c code from mpeg2dec (idct_mmx.c
+  written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
+ */
+#include "avcodec.h"
+#include "dsputil.h"
+#include "mathops.h"
+#include "simple_idct.h"
+
+#if 0
+#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
+#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
+#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
+#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
+#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
+#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
+#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
+#define ROW_SHIFT 8
+#define COL_SHIFT 17
+#else
+#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define ROW_SHIFT 11
+#define COL_SHIFT 20 // 6
+#endif
+
+static inline void idctRowCondDC (DCTELEM * row)
+{
+        int a0, a1, a2, a3, b0, b1, b2, b3;
+        uint64_t temp;
+        /* In-place 8-point IDCT of one row; a row whose only non-zero coefficient is row[0] is filled with row[0]*8. */
+#if HAVE_BIGENDIAN
+#define ROW0_MASK 0xffff000000000000LL
+#else
+#define ROW0_MASK 0xffffLL
+#endif
+        if(sizeof(DCTELEM)==2){
+            if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) |
+                  ((uint64_t *)row)[1]) == 0) {
+                temp = (row[0] * 8) & 0xffff; /* multiply rather than <<: left-shifting a negative value is undefined */
+                temp += temp << 16;
+                temp += temp << 32;
+                ((uint64_t *)row)[0] = temp;
+                ((uint64_t *)row)[1] = temp;
+                return;
+            }
+        }else{
+            if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
+                row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] * 8;
+                return;
+            }
+        }
+        /* even part: DC term plus the rounding constant for the final >> ROW_SHIFT */
+        a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+        a1 = a0;
+        a2 = a0;
+        a3 = a0;
+
+        /* no need to optimize : gcc does it */
+        a0 += W2 * row[2];
+        a1 += W6 * row[2];
+        a2 -= W6 * row[2];
+        a3 -= W2 * row[2];
+        /* odd part from coefficients 1 and 3 */
+        b0 = MUL16(W1, row[1]);
+        MAC16(b0, W3, row[3]);
+        b1 = MUL16(W3, row[1]);
+        MAC16(b1, -W7, row[3]);
+        b2 = MUL16(W5, row[1]);
+        MAC16(b2, -W1, row[3]);
+        b3 = MUL16(W7, row[1]);
+        MAC16(b3, -W5, row[3]);
+        /* second 64-bit word of the row: covers row[4..7] when DCTELEM is 16 bits wide */
+        temp = ((uint64_t*)row)[1];
+        /* coefficients 4..7 only contribute when that word is non-zero */
+        if (temp != 0) {
+            a0 += W4*row[4] + W6*row[6];
+            a1 += - W4*row[4] - W2*row[6];
+            a2 += - W4*row[4] + W2*row[6];
+            a3 += W4*row[4] - W6*row[6];
+
+            MAC16(b0, W5, row[5]);
+            MAC16(b0, W7, row[7]);
+
+            MAC16(b1, -W1, row[5]);
+            MAC16(b1, -W5, row[7]);
+
+            MAC16(b2, W7, row[5]);
+            MAC16(b2, W3, row[7]);
+
+            MAC16(b3, W3, row[5]);
+            MAC16(b3, -W1, row[7]);
+        }
+        /* butterfly: outputs k and 7-k are (a_k + b_k) and (a_k - b_k), scaled down by ROW_SHIFT */
+        row[0] = (a0 + b0) >> ROW_SHIFT;
+        row[7] = (a0 - b0) >> ROW_SHIFT;
+        row[1] = (a1 + b1) >> ROW_SHIFT;
+        row[6] = (a1 - b1) >> ROW_SHIFT;
+        row[2] = (a2 + b2) >> ROW_SHIFT;
+        row[5] = (a2 - b2) >> ROW_SHIFT;
+        row[3] = (a3 + b3) >> ROW_SHIFT;
+        row[4] = (a3 - b3) >> ROW_SHIFT;
+}
+
+static inline void idctSparseColPut (uint8_t *dest, int line_size,
+                                     DCTELEM * col)
+{
+        int a0, a1, a2, a3, b0, b1, b2, b3;
+        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+        /* 8-point IDCT of the column col[0], col[8], ..., col[56]; the results are looked up through cm and stored to dest, one per line_size step. */
+        /* XXX: I did that only to give same values as previous code */
+        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
+        a1 = a0;
+        a2 = a0;
+        a3 = a0;
+        /* even part: coefficient 2 */
+        a0 +=  + W2*col[8*2];
+        a1 +=  + W6*col[8*2];
+        a2 +=  - W6*col[8*2];
+        a3 +=  - W2*col[8*2];
+        /* odd part: coefficients 1 and 3 */
+        b0 = MUL16(W1, col[8*1]);
+        b1 = MUL16(W3, col[8*1]);
+        b2 = MUL16(W5, col[8*1]);
+        b3 = MUL16(W7, col[8*1]);
+
+        MAC16(b0, + W3, col[8*3]);
+        MAC16(b1, - W7, col[8*3]);
+        MAC16(b2, - W1, col[8*3]);
+        MAC16(b3, - W5, col[8*3]);
+        /* coefficients 4..7 are each skipped when zero */
+        if(col[8*4]){
+            a0 += + W4*col[8*4];
+            a1 += - W4*col[8*4];
+            a2 += - W4*col[8*4];
+            a3 += + W4*col[8*4];
+        }
+
+        if (col[8*5]) {
+            MAC16(b0, + W5, col[8*5]);
+            MAC16(b1, - W1, col[8*5]);
+            MAC16(b2, + W7, col[8*5]);
+            MAC16(b3, + W3, col[8*5]);
+        }
+
+        if(col[8*6]){
+            a0 += + W6*col[8*6];
+            a1 += - W2*col[8*6];
+            a2 += + W2*col[8*6];
+            a3 += - W6*col[8*6];
+        }
+
+        if (col[8*7]) {
+            MAC16(b0, + W7, col[8*7]);
+            MAC16(b1, - W5, col[8*7]);
+            MAC16(b2, + W3, col[8*7]);
+            MAC16(b3, - W1, col[8*7]);
+        }
+        /* rows 0..3 take a+b, rows 4..7 take a-b in mirrored order; cm (ff_cropTbl offset by MAX_NEG_CROP) is presumably the clamp-to-byte table */
+        dest[0] = cm[(a0 + b0) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a1 + b1) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a2 + b2) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a3 + b3) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a3 - b3) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a2 - b2) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a1 - b1) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a0 - b0) >> COL_SHIFT];
+}
+
+static inline void idctSparseColAdd (uint8_t *dest, int line_size,
+                                     DCTELEM * col)
+{
+        int a0, a1, a2, a3, b0, b1, b2, b3;
+        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+        /* 8-point IDCT of the column col[0], col[8], ..., col[56]; each result is added to the existing dest byte, looked up through cm and stored back. */
+        /* XXX: I did that only to give same values as previous code */
+        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
+        a1 = a0;
+        a2 = a0;
+        a3 = a0;
+        /* even part: coefficient 2 */
+        a0 +=  + W2*col[8*2];
+        a1 +=  + W6*col[8*2];
+        a2 +=  - W6*col[8*2];
+        a3 +=  - W2*col[8*2];
+        /* odd part: coefficients 1 and 3 */
+        b0 = MUL16(W1, col[8*1]);
+        b1 = MUL16(W3, col[8*1]);
+        b2 = MUL16(W5, col[8*1]);
+        b3 = MUL16(W7, col[8*1]);
+
+        MAC16(b0, + W3, col[8*3]);
+        MAC16(b1, - W7, col[8*3]);
+        MAC16(b2, - W1, col[8*3]);
+        MAC16(b3, - W5, col[8*3]);
+        /* coefficients 4..7 are each skipped when zero */
+        if(col[8*4]){
+            a0 += + W4*col[8*4];
+            a1 += - W4*col[8*4];
+            a2 += - W4*col[8*4];
+            a3 += + W4*col[8*4];
+        }
+
+        if (col[8*5]) {
+            MAC16(b0, + W5, col[8*5]);
+            MAC16(b1, - W1, col[8*5]);
+            MAC16(b2, + W7, col[8*5]);
+            MAC16(b3, + W3, col[8*5]);
+        }
+
+        if(col[8*6]){
+            a0 += + W6*col[8*6];
+            a1 += - W2*col[8*6];
+            a2 += + W2*col[8*6];
+            a3 += - W6*col[8*6];
+        }
+
+        if (col[8*7]) {
+            MAC16(b0, + W7, col[8*7]);
+            MAC16(b1, - W5, col[8*7]);
+            MAC16(b2, + W3, col[8*7]);
+            MAC16(b3, - W1, col[8*7]);
+        }
+        /* rows 0..3 take a+b, rows 4..7 take a-b in mirrored order; cm (ff_cropTbl offset by MAX_NEG_CROP) is presumably the clamp-to-byte table */
+        dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
+}
+
+static inline void idctSparseCol (DCTELEM * col)
+{
+        int a0, a1, a2, a3, b0, b1, b2, b3;
+        /* In-place 8-point IDCT of the column col[0], col[8], ..., col[56]; results are written back without any table lookup. */
+        /* XXX: I did that only to give same values as previous code */
+        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
+        a1 = a0;
+        a2 = a0;
+        a3 = a0;
+        /* even part: coefficient 2 */
+        a0 +=  + W2*col[8*2];
+        a1 +=  + W6*col[8*2];
+        a2 +=  - W6*col[8*2];
+        a3 +=  - W2*col[8*2];
+        /* odd part: coefficients 1 and 3 */
+        b0 = MUL16(W1, col[8*1]);
+        b1 = MUL16(W3, col[8*1]);
+        b2 = MUL16(W5, col[8*1]);
+        b3 = MUL16(W7, col[8*1]);
+
+        MAC16(b0, + W3, col[8*3]);
+        MAC16(b1, - W7, col[8*3]);
+        MAC16(b2, - W1, col[8*3]);
+        MAC16(b3, - W5, col[8*3]);
+        /* coefficients 4..7 are each skipped when zero */
+        if(col[8*4]){
+            a0 += + W4*col[8*4];
+            a1 += - W4*col[8*4];
+            a2 += - W4*col[8*4];
+            a3 += + W4*col[8*4];
+        }
+
+        if (col[8*5]) {
+            MAC16(b0, + W5, col[8*5]);
+            MAC16(b1, - W1, col[8*5]);
+            MAC16(b2, + W7, col[8*5]);
+            MAC16(b3, + W3, col[8*5]);
+        }
+
+        if(col[8*6]){
+            a0 += + W6*col[8*6];
+            a1 += - W2*col[8*6];
+            a2 += + W2*col[8*6];
+            a3 += - W6*col[8*6];
+        }
+
+        if (col[8*7]) {
+            MAC16(b0, + W7, col[8*7]);
+            MAC16(b1, - W5, col[8*7]);
+            MAC16(b2, + W3, col[8*7]);
+            MAC16(b3, - W1, col[8*7]);
+        }
+        /* rows 0..3 take a+b, rows 4..7 take a-b in mirrored order */
+        col[0 ] = ((a0 + b0) >> COL_SHIFT);
+        col[8 ] = ((a1 + b1) >> COL_SHIFT);
+        col[16] = ((a2 + b2) >> COL_SHIFT);
+        col[24] = ((a3 + b3) >> COL_SHIFT);
+        col[32] = ((a3 - b3) >> COL_SHIFT);
+        col[40] = ((a2 - b2) >> COL_SHIFT);
+        col[48] = ((a1 - b1) >> COL_SHIFT);
+        col[56] = ((a0 - b0) >> COL_SHIFT);
+}
+
+void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
+{ /* 8x8 IDCT of block; the result overwrites the 8x8 area at dest (row pitch line_size) */
+    int i;
+    for(i=0; i<8; i++)
+        idctRowCondDC(block + i*8);
+    /* rows were transformed in place above; now each column is transformed and stored to dest */
+    for(i=0; i<8; i++)
+        idctSparseColPut(dest + i, line_size, block + i);
+}
+
+void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
+{ /* 8x8 IDCT of block; the result is added to the 8x8 area at dest (row pitch line_size) */
+    int i;
+    for(i=0; i<8; i++)
+        idctRowCondDC(block + i*8);
+    /* rows were transformed in place above; now each column is transformed and added into dest */
+    for(i=0; i<8; i++)
+        idctSparseColAdd(dest + i, line_size, block + i);
+}
+
+void ff_simple_idct(DCTELEM *block)
+{ /* in-place 8x8 IDCT of block */
+    int i;
+    for(i=0; i<8; i++)
+        idctRowCondDC(block + i*8);
+    /* rows were transformed in place above; now each column is transformed in place */
+    for(i=0; i<8; i++)
+        idctSparseCol(block + i);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/simple_idct.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/simple_idct.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,47 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * simple idct header.
+ */
+
+#ifndef AVCODEC_SIMPLE_IDCT_H
+#define AVCODEC_SIMPLE_IDCT_H
+
+#include <stdint.h>
+#include "dsputil.h"
+
+void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_simple_idct_mmx(int16_t *block);
+void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct(DCTELEM *block);
+
+void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block);
+
+void ff_simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block);
+
+#endif /* AVCODEC_SIMPLE_IDCT_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/utils.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/utils.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,68 @@
+/*
+ * utils for libavcodec
+ * Copyright (c) 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * utils.
+ */
+
+/* needed for mkstemp() */
+#define _XOPEN_SOURCE 600
+
+#include "avcodec.h"
+#include "dsputil.h"
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <limits.h>
+#include <float.h>
+//#undef NDEBUG
+#include <assert.h>
+
+#include <fcntl.h>
+
+void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size)
+{
+    unsigned int padded; /* min_size plus 1/16 and 32 extra; unsigned, so it can wrap */
+
+    if (*size > min_size) /* recorded size is already strictly larger: keep the buffer */
+        return ptr;
+    padded = 17 * min_size / 16 + 32;
+    *size  = FFMAX(padded, min_size); /* fall back to the plain request if padding wrapped */
+    ptr    = av_realloc(ptr, *size);
+    if (!ptr) /* we could record the unmodified min_size, but 0 is safer if the user lost the ptr and uses NULL now */
+        *size = 0;
+    return ptr;
+}
+
+void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size)
+{
+    void **slot = ptr; /* ptr is used as the address of the caller's buffer pointer */
+    if (*size > min_size) /* recorded size is already strictly larger: nothing to do */
+        return;
+    *size = FFMAX(17 * min_size / 16 + 32, min_size); /* padded request, never below min_size */
+    av_free(*slot); /* the old buffer is released, its contents are not copied */
+    *slot = av_malloc(*size);
+    *size = *slot ? *size : 0; /* a failed allocation is recorded as size 0 */
+}
+
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/cpuid.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/cpuid.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,135 @@
+/*
+ * CPU detection code, extracted from mmx.h
+ * (c)1997-99 by H. Dietz and R. Fisher
+ * Converted to C and improved by Fabrice Bellard.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+
+#undef printf
+
+/* Run CPUID for `index`. ebx saving is necessary for PIC (gcc seems unable to see it alone): REG_b is parked in REG_S, then swapped back, so "=S" receives cpuid's ebx. */
+#define cpuid(index,eax,ebx,ecx,edx)\
+    __asm__ volatile\
+        ("mov %%"REG_b", %%"REG_S"\n\t"\
+         "cpuid\n\t"\
+         "xchg %%"REG_b", %%"REG_S\
+         : "=a" (eax), "=S" (ebx),\
+           "=c" (ecx), "=d" (edx)\
+         : "0" (index));
+
+/* Test which multimedia instruction sets are supported; returns an OR of FF_MM_* flags (0 when CPUID is unavailable on x86-32). */
+int mm_support(void)
+{
+    int rval = 0;
+    int eax, ebx, ecx, edx;
+    int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
+
+#if ARCH_X86_32
+    x86_reg a, c;
+    __asm__ volatile (
+        /* See if CPUID instruction is supported ... */
+        /* ... Get copies of EFLAGS into eax and ecx */
+        "pushfl\n\t"
+        "pop %0\n\t"
+        "mov %0, %1\n\t"
+
+        /* ... Toggle the ID bit in one copy and store */
+        /*     to the EFLAGS reg */
+        "xor $0x200000, %0\n\t"
+        "push %0\n\t"
+        "popfl\n\t"
+
+        /* ... Get the (hopefully modified) EFLAGS */
+        "pushfl\n\t"
+        "pop %0\n\t"
+        : "=a" (a), "=c" (c)
+        :
+        : "cc"
+        );
+
+    if (a == c) /* the toggled ID bit did not stick */
+        return 0; /* CPUID not supported */
+#endif
+
+    cpuid(0, max_std_level, ebx, ecx, edx); /* eax of leaf 0 is used as the highest standard leaf */
+
+    if(max_std_level >= 1){
+        cpuid(1, eax, ebx, ecx, std_caps); /* leaf 1: edx lands in std_caps, ecx is tested below */
+        if (std_caps & (1<<23))
+            rval |= FF_MM_MMX;
+        if (std_caps & (1<<25))
+            rval |= FF_MM_MMX2
+#if HAVE_SSE
+                  | FF_MM_SSE;
+        if (std_caps & (1<<26))
+            rval |= FF_MM_SSE2;
+        if (ecx & 1)
+            rval |= FF_MM_SSE3;
+        if (ecx & 0x00000200 )
+            rval |= FF_MM_SSSE3;
+        if (ecx & 0x00080000 )
+            rval |= FF_MM_SSE4;
+        if (ecx & 0x00100000 )
+            rval |= FF_MM_SSE42;
+#endif
+                  ; /* ends the "rval |= FF_MM_MMX2" statement when HAVE_SSE is 0; an empty statement otherwise */
+    }
+
+    cpuid(0x80000000, max_ext_level, ebx, ecx, edx); /* eax is used as the highest extended leaf */
+
+    if(max_ext_level >= 0x80000001){ /* the constant is unsigned, so this comparison is unsigned */
+        cpuid(0x80000001, eax, ebx, ecx, ext_caps); /* edx lands in ext_caps */
+        if (ext_caps & (1U<<31)) /* 1U: shifting a signed 1 into bit 31 is undefined for a 32-bit int */
+            rval |= FF_MM_3DNOW;
+        if (ext_caps & (1<<30))
+            rval |= FF_MM_3DNOWEXT;
+        if (ext_caps & (1<<23))
+            rval |= FF_MM_MMX;
+        if (ext_caps & (1<<22))
+            rval |= FF_MM_MMX2;
+    }
+
+#if 0
+    av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n",
+        (rval&FF_MM_MMX) ? "MMX ":"",
+        (rval&FF_MM_MMX2) ? "MMX2 ":"",
+        (rval&FF_MM_SSE) ? "SSE ":"",
+        (rval&FF_MM_SSE2) ? "SSE2 ":"",
+        (rval&FF_MM_SSE3) ? "SSE3 ":"",
+        (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
+        (rval&FF_MM_SSE4) ? "SSE4.1 ":"",
+        (rval&FF_MM_SSE42) ? "SSE4.2 ":"",
+        (rval&FF_MM_3DNOW) ? "3DNow ":"",
+        (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":"");
+#endif
+    return rval;
+}
+
+#ifdef TEST
+/* Stand-alone check: print the mask returned by mm_support() in hex. */
+int main(void)
+{
+    const int detected = mm_support();
+    printf("mm_support = 0x%08X\n", detected);
+    return 0;
+}
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_h264_template_mmx.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/dsputil_h264_template_mmx.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
+ *                    Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MMX optimized version of (put|avg)_h264_chroma_mc8 (x and y are asserted to lie in 0..7).
+ * H264_CHROMA_MC8_TMPL must be defined to the desired function name
+ * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
+ * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function; it handles x == 0 && y == 0
+ */
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
+{
+    DECLARE_ALIGNED(8, uint64_t, AA); /* weight A, written by the bilinear setup asm below */
+    DECLARE_ALIGNED(8, uint64_t, DD); /* weight D = x*y, written by the bilinear setup asm below */
+    int i;
+
+    if(y==0 && x==0) {
+        /* no filter needed */
+        H264_CHROMA_MC8_MV0(dst, src, stride, h);
+        return;
+    }
+
+    assert(x<8 && y<8 && x>=0 && y>=0);
+
+    if(y==0 || x==0)
+    {
+        /* 1 dimensional filter only */
+        const int dxy = x ? 1 : stride; /* offset of the second tap: next pixel if x != 0, next row otherwise */
+
+        __asm__ volatile(
+            "movd %0, %%mm5\n\t"
+            "movq %1, %%mm4\n\t"
+            "movq %2, %%mm6\n\t"         /* mm6 = rnd >> 3 (read from rnd_reg[1]) */
+            "punpcklwd %%mm5, %%mm5\n\t"
+            "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x+y (one of them is 0 here) */
+            "pxor %%mm7, %%mm7\n\t"
+            "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-(x+y) */
+            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
+
+        for(i=0; i<h; i++) {
+            __asm__ volatile(
+                /* mm0 = src[0..7], mm2 = src[dxy..dxy+7] */
+                "movq %0, %%mm0\n\t"
+                "movq %1, %%mm2\n\t"
+                :: "m"(src[0]), "m"(src[dxy]));
+
+            __asm__ volatile(
+                /* [mm0,mm1] = A * src[0..7] */
+                /* [mm2,mm3] = B * src[dxy..dxy+7] */
+                "movq %%mm0, %%mm1\n\t"
+                "movq %%mm2, %%mm3\n\t"
+                "punpcklbw %%mm7, %%mm0\n\t"
+                "punpckhbw %%mm7, %%mm1\n\t"
+                "punpcklbw %%mm7, %%mm2\n\t"
+                "punpckhbw %%mm7, %%mm3\n\t"
+                "pmullw %%mm4, %%mm0\n\t"
+                "pmullw %%mm4, %%mm1\n\t"
+                "pmullw %%mm5, %%mm2\n\t"
+                "pmullw %%mm5, %%mm3\n\t"
+
+                /* dst[0..7] = (A * src[0..7] + B * src[dxy..dxy+7] + (rnd >> 3)) >> 3 */
+                "paddw %%mm6, %%mm0\n\t"
+                "paddw %%mm6, %%mm1\n\t"
+                "paddw %%mm2, %%mm0\n\t"
+                "paddw %%mm3, %%mm1\n\t"
+                "psrlw $3, %%mm0\n\t"
+                "psrlw $3, %%mm1\n\t"
+                "packuswb %%mm1, %%mm0\n\t"
+                H264_CHROMA_OP(%0, %%mm0)
+                "movq %%mm0, %0\n\t"
+                : "=m" (dst[0]));
+
+            src += stride;
+            dst += stride;
+        }
+        return;
+    }
+
+    /* general case, bilinear */
+    __asm__ volatile("movd %2, %%mm4\n\t"
+                 "movd %3, %%mm6\n\t"
+                 "punpcklwd %%mm4, %%mm4\n\t"
+                 "punpcklwd %%mm6, %%mm6\n\t"
+                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
+                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
+                 "movq %%mm4, %%mm5\n\t"
+                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
+                 "psllw $3, %%mm5\n\t"
+                 "psllw $3, %%mm6\n\t"
+                 "movq %%mm5, %%mm7\n\t"
+                 "paddw %%mm6, %%mm7\n\t"
+                 "movq %%mm4, %1\n\t"         /* DD = x * y */
+                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
+                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
+                 "paddw %4, %%mm4\n\t"
+                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
+                 "pxor %%mm7, %%mm7\n\t"
+                 "movq %%mm4, %0\n\t"
+                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
+
+    __asm__ volatile(
+        /* mm0 = src[0..7], mm1 = src[1..8] */
+        "movq %0, %%mm0\n\t"
+        "movq %1, %%mm1\n\t"
+        : : "m" (src[0]), "m" (src[1]));
+
+    for(i=0; i<h; i++) {
+        src += stride; /* mm0/mm1 still hold the previous row; src now points at the row below it */
+
+        __asm__ volatile(
+            /* mm2 = A * src[0..3] + B * src[1..4] */
+            /* mm3 = A * src[4..7] + B * src[5..8] */
+            "movq %%mm0, %%mm2\n\t"
+            "movq %%mm1, %%mm3\n\t"
+            "punpckhbw %%mm7, %%mm0\n\t"
+            "punpcklbw %%mm7, %%mm1\n\t"
+            "punpcklbw %%mm7, %%mm2\n\t"
+            "punpckhbw %%mm7, %%mm3\n\t"
+            "pmullw %0, %%mm0\n\t"
+            "pmullw %0, %%mm2\n\t"
+            "pmullw %%mm5, %%mm1\n\t"
+            "pmullw %%mm5, %%mm3\n\t"
+            "paddw %%mm1, %%mm2\n\t"
+            "paddw %%mm0, %%mm3\n\t"
+            : : "m" (AA));
+
+        __asm__ volatile(
+            /* [mm2,mm3] += C * src[0..7] */
+            "movq %0, %%mm0\n\t"
+            "movq %%mm0, %%mm1\n\t"
+            "punpcklbw %%mm7, %%mm0\n\t"
+            "punpckhbw %%mm7, %%mm1\n\t"
+            "pmullw %%mm6, %%mm0\n\t"
+            "pmullw %%mm6, %%mm1\n\t"
+            "paddw %%mm0, %%mm2\n\t"
+            "paddw %%mm1, %%mm3\n\t"
+            : : "m" (src[0]));
+
+        __asm__ volatile(
+            /* [mm2,mm3] += D * src[1..8] */
+            "movq %1, %%mm1\n\t"
+            "movq %%mm1, %%mm0\n\t"
+            "movq %%mm1, %%mm4\n\t"
+            "punpcklbw %%mm7, %%mm0\n\t"
+            "punpckhbw %%mm7, %%mm4\n\t"
+            "pmullw %2, %%mm0\n\t"
+            "pmullw %2, %%mm4\n\t"
+            "paddw %%mm0, %%mm2\n\t"
+            "paddw %%mm4, %%mm3\n\t"
+            "movq %0, %%mm0\n\t" /* reload this row's src[0..7]; with mm1 it is the previous row of the next iteration */
+            : : "m" (src[0]), "m" (src[1]), "m" (DD));
+
+        __asm__ volatile(
+            /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
+            "paddw %1, %%mm2\n\t"
+            "paddw %1, %%mm3\n\t"
+            "psrlw $6, %%mm2\n\t"
+            "psrlw $6, %%mm3\n\t"
+            "packuswb %%mm3, %%mm2\n\t"
+            H264_CHROMA_OP(%0, %%mm2)
+            "movq %%mm2, %0\n\t"
+            : "=m" (dst[0]) : "m" (*rnd_reg));
+        dst+= stride;
+    }
+}
+
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
+{ /* 4-pixel-wide variant: each row is filtered horizontally with (8-x, x), then blended with the row below using (8-y, y) */
+    __asm__ volatile(
+        "pxor   %%mm7, %%mm7        \n\t"
+        "movd %5, %%mm2             \n\t" /* mm2 = x */
+        "movd %6, %%mm3             \n\t" /* mm3 = y */
+        "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
+        "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
+        "punpcklwd %%mm2, %%mm2     \n\t"
+        "punpcklwd %%mm3, %%mm3     \n\t"
+        "punpcklwd %%mm2, %%mm2     \n\t" /* mm2 = x in all four words */
+        "punpcklwd %%mm3, %%mm3     \n\t" /* mm3 = y in all four words */
+        "psubw %%mm2, %%mm4         \n\t" /* mm4 = 8-x */
+        "psubw %%mm3, %%mm5         \n\t" /* mm5 = 8-y */
+
+        "movd  (%1), %%mm0          \n\t"
+        "movd 1(%1), %%mm6          \n\t"
+        "add %3, %1                 \n\t"
+        "punpcklbw %%mm7, %%mm0     \n\t"
+        "punpcklbw %%mm7, %%mm6     \n\t"
+        "pmullw %%mm4, %%mm0        \n\t"
+        "pmullw %%mm2, %%mm6        \n\t"
+        "paddw %%mm0, %%mm6         \n\t" /* mm6 = (8-x)*src[0..3] + x*src[1..4] for the first row */
+
+        "1:                         \n\t"
+        "movd  (%1), %%mm0          \n\t"
+        "movd 1(%1), %%mm1          \n\t"
+        "add %3, %1                 \n\t"
+        "punpcklbw %%mm7, %%mm0     \n\t"
+        "punpcklbw %%mm7, %%mm1     \n\t"
+        "pmullw %%mm4, %%mm0        \n\t"
+        "pmullw %%mm2, %%mm1        \n\t"
+        "paddw %%mm0, %%mm1         \n\t"
+        "movq %%mm1, %%mm0          \n\t" /* mm0 keeps this row's horizontal result for the second half */
+        "pmullw %%mm5, %%mm6        \n\t"
+        "pmullw %%mm3, %%mm1        \n\t"
+        "paddw %4, %%mm6            \n\t" /* add the rounding words read from *rnd_reg */
+        "paddw %%mm6, %%mm1         \n\t"
+        "psrlw $6, %%mm1            \n\t"
+        "packuswb %%mm1, %%mm1      \n\t"
+        H264_CHROMA_OP4((%0), %%mm1, %%mm6)
+        "movd %%mm1, (%0)           \n\t"
+        "add %3, %0                 \n\t"
+        "movd  (%1), %%mm6          \n\t" /* second half: same steps with the roles of mm0 and mm6 swapped */
+        "movd 1(%1), %%mm1          \n\t"
+        "add %3, %1                 \n\t"
+        "punpcklbw %%mm7, %%mm6     \n\t"
+        "punpcklbw %%mm7, %%mm1     \n\t"
+        "pmullw %%mm4, %%mm6        \n\t"
+        "pmullw %%mm2, %%mm1        \n\t"
+        "paddw %%mm6, %%mm1         \n\t"
+        "movq %%mm1, %%mm6          \n\t"
+        "pmullw %%mm5, %%mm0        \n\t"
+        "pmullw %%mm3, %%mm1        \n\t"
+        "paddw %4, %%mm0            \n\t"
+        "paddw %%mm0, %%mm1         \n\t"
+        "psrlw $6, %%mm1            \n\t"
+        "packuswb %%mm1, %%mm1      \n\t"
+        H264_CHROMA_OP4((%0), %%mm1, %%mm0)
+        "movd %%mm1, (%0)           \n\t"
+        "add %3, %0                 \n\t"
+        "sub $2, %2                 \n\t" /* two output rows per iteration; the loop exits only when h reaches exactly 0 */
+        "jnz 1b                     \n\t"
+        : "+r"(dst), "+r"(src), "+r"(h)
+        : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
+    );
+}
+
+#ifdef H264_CHROMA_MC2_TMPL
+static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    int tmp = ((1<<16)-1)*x + 8; /* = (x<<16) + (8-x): the two 16-bit words {8-x, x} */
+    int CD= tmp*y;               /* words {(8-x)*y, x*y} = {C, D} */
+    int AB= (tmp<<3) - CD;       /* words {(8-x)*(8-y), x*(8-y)} = {A, B} */
+    __asm__ volatile(
+        /* mm5 = {A,B,A,B} */
+        /* mm6 = {C,D,C,D} */
+        "movd %0, %%mm5\n\t"
+        "movd %1, %%mm6\n\t"
+        "punpckldq %%mm5, %%mm5\n\t"
+        "punpckldq %%mm6, %%mm6\n\t"
+        "pxor %%mm7, %%mm7\n\t"
+        /* mm2 = src[0,1,1,2] */
+        "movd %2, %%mm2\n\t"
+        "punpcklbw %%mm7, %%mm2\n\t"
+        "pshufw $0x94, %%mm2, %%mm2\n\t"
+        :: "r"(AB), "r"(CD), "m"(src[0]));
+
+
+    __asm__ volatile(
+        "1:\n\t"
+        "add %4, %1\n\t"
+        /* mm1 = A * src[0,1] + B * src[1,2] */
+        "movq    %%mm2, %%mm1\n\t"
+        "pmaddwd %%mm5, %%mm1\n\t"
+        /* mm0 = src[0,1,1,2] */
+        "movd (%1), %%mm0\n\t"
+        "punpcklbw %%mm7, %%mm0\n\t"
+        "pshufw $0x94, %%mm0, %%mm0\n\t"
+        /* mm1 += C * src[0,1] + D * src[1,2] */
+        "movq    %%mm0, %%mm2\n\t" /* mm2 keeps this row's shuffled pixels for the next iteration */
+        "pmaddwd %%mm6, %%mm0\n\t"
+        "paddw      %3, %%mm1\n\t"
+        "paddw   %%mm0, %%mm1\n\t"
+        /* dst[0,1] = pack((mm1 + 32) >> 6) */
+        "psrlw $6, %%mm1\n\t"
+        "packssdw %%mm7, %%mm1\n\t"
+        "packuswb %%mm7, %%mm1\n\t"
+        H264_CHROMA_OP4((%0), %%mm1, %%mm3)
+        "movd %%mm1, %%esi\n\t"
+        "movw %%si, (%0)\n\t" /* only the low two bytes are stored; esi is listed as clobbered */
+        "add %4, %0\n\t"
+        "sub $1, %2\n\t"
+        "jnz 1b\n\t"
+        : "+r" (dst), "+r"(src), "+r"(h)
+        : "m" (ff_pw_32), "r"((x86_reg)stride)
+        : "%esi");
+
+}
+#endif
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_h264_template_ssse3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/dsputil_h264_template_ssse3.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2008 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * SSSE3 optimized version of (put|avg)_h264_chroma_mc8 (x and y are asserted to lie in 0..7).
+ * H264_CHROMA_MC8_TMPL must be defined to the desired function name
+ * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function; it handles x == 0 && y == 0
+ * AVG_OP must be defined to expand to nothing for put and to its argument (the identity) for avg
+ */
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
+{
+    if(y==0 && x==0) {
+        /* no filter needed */
+        H264_CHROMA_MC8_MV0(dst, src, stride, h);
+        return;
+    }
+
+    assert(x<8 && y<8 && x>=0 && y>=0);
+
+    if(y==0 || x==0)
+    {
+        /* 1 dimensional filter only */
+        __asm__ volatile(
+            "movd %0, %%xmm7 \n\t"
+            "movq %1, %%xmm6 \n\t"
+            "pshuflw $0, %%xmm7, %%xmm7 \n\t"
+            "movlhps %%xmm6, %%xmm6 \n\t"
+            "movlhps %%xmm7, %%xmm7 \n\t"
+            :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3)) /* 255*w+8 = (w<<8) | (8-w) for w = x+y: byte pair {8-w, w} */
+        );
+
+        if(x) { /* y == 0 here: each pixel is interleaved with its right neighbour */
+            __asm__ volatile(
+                "1: \n\t"
+                "movq (%1), %%xmm0 \n\t"
+                "movq 1(%1), %%xmm1 \n\t"
+                "movq (%1,%3), %%xmm2 \n\t"
+                "movq 1(%1,%3), %%xmm3 \n\t"
+                "punpcklbw %%xmm1, %%xmm0 \n\t"
+                "punpcklbw %%xmm3, %%xmm2 \n\t"
+                "pmaddubsw %%xmm7, %%xmm0 \n\t"
+                "pmaddubsw %%xmm7, %%xmm2 \n\t"
+         AVG_OP("movq (%0), %%xmm4 \n\t")
+         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
+                "paddw %%xmm6, %%xmm0 \n\t"
+                "paddw %%xmm6, %%xmm2 \n\t"
+                "psrlw $3, %%xmm0 \n\t"
+                "psrlw $3, %%xmm2 \n\t"
+                "packuswb %%xmm2, %%xmm0 \n\t"
+         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
+                "movq %%xmm0, (%0) \n\t"
+                "movhps %%xmm0, (%0,%3) \n\t"
+                "sub $2, %2 \n\t" /* two rows per iteration; the lea instructions below do not alter the flags used by jg */
+                "lea (%1,%3,2), %1 \n\t"
+                "lea (%0,%3,2), %0 \n\t"
+                "jg 1b \n\t"
+                :"+r"(dst), "+r"(src), "+r"(h)
+                :"r"((x86_reg)stride)
+            );
+        } else { /* x == 0 here: each row is interleaved with the row below it */
+            __asm__ volatile(
+                "1: \n\t"
+                "movq (%1), %%xmm0 \n\t"
+                "movq (%1,%3), %%xmm1 \n\t"
+                "movdqa %%xmm1, %%xmm2 \n\t"
+                "movq (%1,%3,2), %%xmm3 \n\t"
+                "punpcklbw %%xmm1, %%xmm0 \n\t"
+                "punpcklbw %%xmm3, %%xmm2 \n\t"
+                "pmaddubsw %%xmm7, %%xmm0 \n\t"
+                "pmaddubsw %%xmm7, %%xmm2 \n\t"
+         AVG_OP("movq (%0), %%xmm4 \n\t")
+         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
+                "paddw %%xmm6, %%xmm0 \n\t"
+                "paddw %%xmm6, %%xmm2 \n\t"
+                "psrlw $3, %%xmm0 \n\t"
+                "psrlw $3, %%xmm2 \n\t"
+                "packuswb %%xmm2, %%xmm0 \n\t"
+         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
+                "movq %%xmm0, (%0) \n\t"
+                "movhps %%xmm0, (%0,%3) \n\t"
+                "sub $2, %2 \n\t"
+                "lea (%1,%3,2), %1 \n\t"
+                "lea (%0,%3,2), %0 \n\t"
+                "jg 1b \n\t"
+                :"+r"(dst), "+r"(src), "+r"(h)
+                :"r"((x86_reg)stride)
+            );
+        }
+        return;
+    }
+
+    /* general case, bilinear */
+    __asm__ volatile(
+        "movd %0, %%xmm7 \n\t"
+        "movd %1, %%xmm6 \n\t"
+        "movdqa %2, %%xmm5 \n\t"
+        "pshuflw $0, %%xmm7, %%xmm7 \n\t"
+        "pshuflw $0, %%xmm6, %%xmm6 \n\t"
+        "movlhps %%xmm7, %%xmm7 \n\t"
+        "movlhps %%xmm6, %%xmm6 \n\t"
+        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) /* byte pairs {(8-x)(8-y), x(8-y)} and {(8-x)y, xy} */
+    );
+
+    __asm__ volatile(
+        "movq (%1), %%xmm0 \n\t"
+        "movq 1(%1), %%xmm1 \n\t"
+        "punpcklbw %%xmm1, %%xmm0 \n\t" /* xmm0 = first row, each pixel interleaved with its right neighbour */
+        "add %3, %1 \n\t"
+        "1: \n\t"
+        "movq (%1), %%xmm1 \n\t"
+        "movq 1(%1), %%xmm2 \n\t"
+        "movq (%1,%3), %%xmm3 \n\t"
+        "movq 1(%1,%3), %%xmm4 \n\t"
+        "lea (%1,%3,2), %1 \n\t"
+        "punpcklbw %%xmm2, %%xmm1 \n\t"
+        "punpcklbw %%xmm4, %%xmm3 \n\t"
+        "movdqa %%xmm1, %%xmm2 \n\t"
+        "movdqa %%xmm3, %%xmm4 \n\t"
+        "pmaddubsw %%xmm7, %%xmm0 \n\t"
+        "pmaddubsw %%xmm6, %%xmm1 \n\t"
+        "pmaddubsw %%xmm7, %%xmm2 \n\t"
+        "pmaddubsw %%xmm6, %%xmm3 \n\t"
+        "paddw %%xmm5, %%xmm0 \n\t"
+        "paddw %%xmm5, %%xmm2 \n\t"
+        "paddw %%xmm0, %%xmm1 \n\t"
+        "paddw %%xmm2, %%xmm3 \n\t"
+        "movdqa %%xmm4, %%xmm0 \n\t" /* the last loaded row becomes the upper row of the next iteration */
+        "psrlw $6, %%xmm1 \n\t"
+        "psrlw $6, %%xmm3 \n\t"
+ AVG_OP("movq (%0), %%xmm2 \n\t")
+ AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
+        "packuswb %%xmm3, %%xmm1 \n\t"
+ AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
+        "movq %%xmm1, (%0)\n\t"
+        "movhps %%xmm1, (%0,%3)\n\t"
+        "sub $2, %2 \n\t"
+        "lea (%0,%3,2), %0 \n\t"
+        "jg 1b \n\t"
+        :"+r"(dst), "+r"(src), "+r"(h)
+        :"r"((x86_reg)stride)
+    );
+}
+
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{ /* 4-pixel-wide bilinear filter on MMX registers; always rounds with ff_pw_32 and writes two rows per iteration */
+    __asm__ volatile(
+        "movd %0, %%mm7 \n\t"
+        "movd %1, %%mm6 \n\t"
+        "movq %2, %%mm5 \n\t"
+        "pshufw $0, %%mm7, %%mm7 \n\t"
+        "pshufw $0, %%mm6, %%mm6 \n\t"
+        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) /* byte pairs {(8-x)(8-y), x(8-y)} and {(8-x)y, xy} */
+    );
+
+    __asm__ volatile(
+        "movd (%1), %%mm0 \n\t"
+        "punpcklbw 1(%1), %%mm0 \n\t" /* mm0 = first row, each pixel interleaved with its right neighbour */
+        "add %3, %1 \n\t"
+        "1: \n\t"
+        "movd (%1), %%mm1 \n\t"
+        "movd (%1,%3), %%mm3 \n\t"
+        "punpcklbw 1(%1), %%mm1 \n\t"
+        "punpcklbw 1(%1,%3), %%mm3 \n\t"
+        "lea (%1,%3,2), %1 \n\t"
+        "movq %%mm1, %%mm2 \n\t"
+        "movq %%mm3, %%mm4 \n\t"
+        "pmaddubsw %%mm7, %%mm0 \n\t"
+        "pmaddubsw %%mm6, %%mm1 \n\t"
+        "pmaddubsw %%mm7, %%mm2 \n\t"
+        "pmaddubsw %%mm6, %%mm3 \n\t"
+        "paddw %%mm5, %%mm0 \n\t"
+        "paddw %%mm5, %%mm2 \n\t"
+        "paddw %%mm0, %%mm1 \n\t"
+        "paddw %%mm2, %%mm3 \n\t"
+        "movq %%mm4, %%mm0 \n\t" /* the last loaded row becomes the upper row of the next iteration */
+        "psrlw $6, %%mm1 \n\t"
+        "psrlw $6, %%mm3 \n\t"
+        "packuswb %%mm1, %%mm1 \n\t"
+        "packuswb %%mm3, %%mm3 \n\t"
+ AVG_OP("pavgb (%0), %%mm1 \n\t")
+ AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
+        "movd %%mm1, (%0)\n\t"
+        "movd %%mm3, (%0,%3)\n\t"
+        "sub $2, %2 \n\t"
+        "lea (%0,%3,2), %0 \n\t"
+        "jg 1b \n\t"
+        :"+r"(dst), "+r"(src), "+r"(h)
+        :"r"((x86_reg)stride)
+    );
+}
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_mmx.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/dsputil_mmx.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,821 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavutil/internal.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/h264_dsp.h"
+#include "dsputil_mmx.h"
+
+
+//#undef NDEBUG
+//#include <assert.h>
+
+int mm_flags; /* multimedia extension flags */
+
+/* pixel operations: packed constants -- ff_pw_N repeats N in every 16-bit word, ff_pb_XX repeats 0xXX in every byte */
+DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
+
+DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
+{0x8000000080000000ULL, 0x8000000080000000ULL};
+
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8  ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
+DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
+
+#define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t" /* assembler directive text: .align 1 << ZEROBITS */
+#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) /* stand-alone asm statement containing only ASMALIGN(3) */
+#define MOVQ_ZERO(regd)  __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) /* regd = 0 */
+
+#define MOVQ_BFE(regd) /* regd = 0xFE in every byte */ \
+    __asm__ volatile ( \
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" /* all bits set */ \
+    "paddb %%" #regd ", %%" #regd " \n\t" ::) /* 0xFF + 0xFF wraps to 0xFE per byte */
+
+#ifndef PIC
+#define MOVQ_BONE(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) /* regd = 0x01 in every byte, loaded from ff_bone */
+#define MOVQ_WTWO(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) /* regd = 0x0002 in every word, loaded from ff_wtwo */
+#else
+// PIC build (shared library): synthesize the constants in the register instead of loading ff_bone / ff_wtwo from memory
+// pcmpeqd reg,reg sets every bit (-1); the shift/pack steps below derive the constant from that
+#define MOVQ_BONE(regd) \
+    __asm__ volatile ( \
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+    "psrlw $15, %%" #regd " \n\t" /* 0x0001 in every word */ \
+    "packuswb %%" #regd ", %%" #regd " \n\t" ::) /* 0x01 in every byte */
+
+#define MOVQ_WTWO(regd) \
+    __asm__ volatile ( \
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+    "psrlw $15, %%" #regd " \n\t" /* 0x0001 in every word */ \
+    "psllw $1, %%" #regd " \n\t"::) /* 0x0002 in every word */
+
+#endif
+
+// regr receives the result and doubles as the temporary
+// rega (first argument) is left unmodified, regb (second argument) is trashed
+// regfe is supposed to contain 0xfefefefefefefefe
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) /* per byte: (a & b) + (((a ^ b) & 0xfe) >> 1), i.e. the average rounded down */ \
+    "movq " #rega ", " #regr "  \n\t"\
+    "pand " #regb ", " #regr "  \n\t"\
+    "pxor " #rega ", " #regb "  \n\t"\
+    "pand " #regfe "," #regb "  \n\t" /* clear each low bit so the 64-bit shift cannot leak into the neighbouring byte */ \
+    "psrlq $1, " #regb "        \n\t"\
+    "paddb " #regb ", " #regr " \n\t"
+
+#define PAVGB_MMX(rega, regb, regr, regfe) /* per byte: (a | b) - (((a ^ b) & 0xfe) >> 1), i.e. the average rounded up */ \
+    "movq " #rega ", " #regr "  \n\t"\
+    "por  " #regb ", " #regr "  \n\t"\
+    "pxor " #rega ", " #regb "  \n\t"\
+    "pand " #regfe "," #regb "  \n\t"\
+    "psrlq $1, " #regb "        \n\t"\
+    "psubb " #regb ", " #regr " \n\t"
+
+// mm6 is supposed to contain 0xfefefefefefefefe; each expansion computes two averages: regr = avg(rega, regb), regp = avg(regc, regd); regb and regd are trashed
+#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) /* both averages rounded down: (a & b) + (((a ^ b) & 0xfe) >> 1) */ \
+    "movq " #rega ", " #regr "  \n\t"\
+    "movq " #regc ", " #regp "  \n\t"\
+    "pand " #regb ", " #regr "  \n\t"\
+    "pand " #regd ", " #regp "  \n\t"\
+    "pxor " #rega ", " #regb "  \n\t"\
+    "pxor " #regc ", " #regd "  \n\t"\
+    "pand %%mm6, " #regb "      \n\t"\
+    "pand %%mm6, " #regd "      \n\t"\
+    "psrlq $1, " #regb "        \n\t"\
+    "psrlq $1, " #regd "        \n\t"\
+    "paddb " #regb ", " #regr " \n\t"\
+    "paddb " #regd ", " #regp " \n\t"
+
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) /* both averages rounded up: (a | b) - (((a ^ b) & 0xfe) >> 1) */ \
+    "movq " #rega ", " #regr "  \n\t"\
+    "movq " #regc ", " #regp "  \n\t"\
+    "por  " #regb ", " #regr "  \n\t"\
+    "por  " #regd ", " #regp "  \n\t"\
+    "pxor " #rega ", " #regb "  \n\t"\
+    "pxor " #regc ", " #regd "  \n\t"\
+    "pand %%mm6, " #regb "      \n\t"\
+    "pand %%mm6, " #regd "      \n\t"\
+    "psrlq $1, " #regd "        \n\t"\
+    "psrlq $1, " #regb "        \n\t"\
+    "psubb " #regb ", " #regr " \n\t"\
+    "psubb " #regd ", " #regp " \n\t"
+
+/***********************************/
+/* MMX2 specific */
+
+#define DEF(x) x ## _mmx2
+
+/* Introduced only in MMX2 set */
+#define PAVGB "pavgb"
+#define OP_AVG PAVGB
+
+#include "dsputil_mmx_avg_template.c"
+
+#undef DEF
+#undef PAVGB
+#undef OP_AVG
+
+#define put_no_rnd_pixels16_mmx put_pixels16_mmx /* this group maps the no_rnd, mmx2 and 3dnow put_pixels names onto the plain MMX routines */
+#define put_no_rnd_pixels8_mmx put_pixels8_mmx
+#define put_pixels16_mmx2 put_pixels16_mmx
+#define put_pixels8_mmx2 put_pixels8_mmx
+#define put_pixels4_mmx2 put_pixels4_mmx
+#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx /* expands again through the alias above to put_pixels16_mmx */
+#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
+#define put_pixels16_3dnow put_pixels16_mmx
+#define put_pixels8_3dnow put_pixels8_mmx
+#define put_pixels4_3dnow put_pixels4_mmx
+#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
+#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
+
+/***********************************/
+/* standard MMX */
+
+void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+    const DCTELEM *p;
+    uint8_t *pix;
+    /* Store 8 rows of 8 values from block as bytes, each saturated to 0..255 by packuswb; dst rows are line_size apart. */
+    /* set up the source and destination cursors */
+    p = block;
+    pix = pixels;
+    /* first four rows: the source is a memory operand and 8%3, 16%3, ... prepend a displacement to it */
+        __asm__ volatile(
+                "movq   %3, %%mm0               \n\t"
+                "movq   8%3, %%mm1              \n\t"
+                "movq   16%3, %%mm2             \n\t"
+                "movq   24%3, %%mm3             \n\t"
+                "movq   32%3, %%mm4             \n\t"
+                "movq   40%3, %%mm5             \n\t"
+                "movq   48%3, %%mm6             \n\t"
+                "movq   56%3, %%mm7             \n\t"
+                "packuswb %%mm1, %%mm0          \n\t" /* two 4-word halves become one 8-byte row */
+                "packuswb %%mm3, %%mm2          \n\t"
+                "packuswb %%mm5, %%mm4          \n\t"
+                "packuswb %%mm7, %%mm6          \n\t"
+                "movq   %%mm0, (%0)             \n\t"
+                "movq   %%mm2, (%0, %1)         \n\t"
+                "movq   %%mm4, (%0, %1, 2)      \n\t"
+                "movq   %%mm6, (%0, %2)         \n\t" /* %2 = 3 * line_size */
+                ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
+                :"memory");
+        pix += line_size*4;
+        p += 32; /* skips the 64 bytes just read, so DCTELEM is presumably 16-bit -- confirm */
+
+    // last four rows: an exact copy of the block above reportedly makes the
+    // compiler generate very strange code, so here the source pointer is
+    // passed in a register ("r") instead of as a memory operand
+    __asm__ volatile(
+            "movq       (%3), %%mm0             \n\t"
+            "movq       8(%3), %%mm1            \n\t"
+            "movq       16(%3), %%mm2           \n\t"
+            "movq       24(%3), %%mm3           \n\t"
+            "movq       32(%3), %%mm4           \n\t"
+            "movq       40(%3), %%mm5           \n\t"
+            "movq       48(%3), %%mm6           \n\t"
+            "movq       56(%3), %%mm7           \n\t"
+            "packuswb %%mm1, %%mm0              \n\t"
+            "packuswb %%mm3, %%mm2              \n\t"
+            "packuswb %%mm5, %%mm4              \n\t"
+            "packuswb %%mm7, %%mm6              \n\t"
+            "movq       %%mm0, (%0)             \n\t"
+            "movq       %%mm2, (%0, %1)         \n\t"
+            "movq       %%mm4, (%0, %1, 2)      \n\t"
+            "movq       %%mm6, (%0, %2)         \n\t"
+            ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
+            :"memory");
+}
+
+DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
+  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; /* 128 in every byte */
+
+#define put_signed_pixels_clamped_mmx_half(off) /* four rows starting at byte offset off: pack 8 words per row to signed bytes, add mm0, store */ \
+            "movq    "#off"(%2), %%mm1          \n\t"\
+            "movq 16+"#off"(%2), %%mm2          \n\t"\
+            "movq 32+"#off"(%2), %%mm3          \n\t"\
+            "movq 48+"#off"(%2), %%mm4          \n\t"\
+            "packsswb  8+"#off"(%2), %%mm1      \n\t" /* second half of the row, signed saturation */ \
+            "packsswb 24+"#off"(%2), %%mm2      \n\t"\
+            "packsswb 40+"#off"(%2), %%mm3      \n\t"\
+            "packsswb 56+"#off"(%2), %%mm4      \n\t"\
+            "paddb %%mm0, %%mm1                 \n\t" /* mm0 is expected to be preloaded by the user of this macro */ \
+            "paddb %%mm0, %%mm2                 \n\t"\
+            "paddb %%mm0, %%mm3                 \n\t"\
+            "paddb %%mm0, %%mm4                 \n\t"\
+            "movq %%mm1, (%0)                   \n\t"\
+            "movq %%mm2, (%0, %3)               \n\t"\
+            "movq %%mm3, (%0, %3, 2)            \n\t"\
+            "movq %%mm4, (%0, %1)               \n\t"
+
+void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{ /* Store 8 rows of 8 values as bytes: each word is saturated to -128..127, then 128 is added bytewise. */
+    x86_reg line_skip = line_size;
+    x86_reg line_skip3; /* written inside the asm: 3 * line_skip */
+
+    __asm__ volatile (
+            "movq "MANGLE(ff_vector128)", %%mm0 \n\t" /* bias of 0x80 per byte */
+            "lea (%3, %3, 2), %1                \n\t"
+            put_signed_pixels_clamped_mmx_half(0)
+            "lea (%0, %3, 4), %0                \n\t" /* advance pixels by four rows */
+            put_signed_pixels_clamped_mmx_half(64)
+            :"+&r" (pixels), "=&r" (line_skip3)
+            :"r" (block), "r"(line_skip)
+            :"memory");
+}
+
+void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{ /* Add 8 rows of 8 words from block to the pixels in place, saturating the result to 0..255. */
+    const DCTELEM *p;
+    uint8_t *pix;
+    int i;
+
+    /* set up the source and destination cursors */
+    p = block;
+    pix = pixels;
+    MOVQ_ZERO(mm7); /* mm7 = 0 supplies the high bytes for the unpacks below; it has to survive between the asm statements */
+    i = 4; /* four passes of two rows each */
+    do {
+        __asm__ volatile(
+                "movq   (%2), %%mm0     \n\t"
+                "movq   8(%2), %%mm1    \n\t"
+                "movq   16(%2), %%mm2   \n\t"
+                "movq   24(%2), %%mm3   \n\t"
+                "movq   %0, %%mm4       \n\t"
+                "movq   %1, %%mm6       \n\t"
+                "movq   %%mm4, %%mm5    \n\t"
+                "punpcklbw %%mm7, %%mm4 \n\t" /* widen pixels 0..3 to words */
+                "punpckhbw %%mm7, %%mm5 \n\t" /* widen pixels 4..7 to words */
+                "paddsw %%mm4, %%mm0    \n\t" /* signed saturating 16-bit add */
+                "paddsw %%mm5, %%mm1    \n\t"
+                "movq   %%mm6, %%mm5    \n\t"
+                "punpcklbw %%mm7, %%mm6 \n\t"
+                "punpckhbw %%mm7, %%mm5 \n\t"
+                "paddsw %%mm6, %%mm2    \n\t"
+                "paddsw %%mm5, %%mm3    \n\t"
+                "packuswb %%mm1, %%mm0  \n\t" /* back to bytes with unsigned saturation */
+                "packuswb %%mm3, %%mm2  \n\t"
+                "movq   %%mm0, %0       \n\t"
+                "movq   %%mm2, %1       \n\t"
+                :"+m"(*pix), "+m"(*(pix+line_size))
+                :"r"(p)
+                :"memory");
+        pix += line_size*2;
+        p += 16; /* skips the 32 bytes just read, so DCTELEM is presumably 16-bit -- confirm */
+    } while (--i);
+}
+
+static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{ /* Copy an 8-byte-wide block of h rows from pixels to block; both sides use line_size as the row stride. */
+    __asm__ volatile(
+         "lea (%3, %3), %%"REG_a"       \n\t" /* REG_a = 2 * line_size */
+         ASMALIGN(3)
+         "1:                            \n\t"
+         "movq (%1), %%mm0              \n\t"
+         "movq (%1, %3), %%mm1          \n\t"
+         "movq %%mm0, (%2)              \n\t"
+         "movq %%mm1, (%2, %3)          \n\t"
+         "add %%"REG_a", %1             \n\t"
+         "add %%"REG_a", %2             \n\t"
+         "movq (%1), %%mm0              \n\t"
+         "movq (%1, %3), %%mm1          \n\t"
+         "movq %%mm0, (%2)              \n\t"
+         "movq %%mm1, (%2, %3)          \n\t"
+         "add %%"REG_a", %1             \n\t"
+         "add %%"REG_a", %2             \n\t"
+         "subl $4, %0                   \n\t" /* four rows are copied per pass */
+         "jnz 1b                        \n\t" /* stops only when h reaches exactly 0, so h has to be a positive multiple of 4 */
+         : "+g"(h), "+r" (pixels),  "+r" (block)
+         : "r"((x86_reg)line_size)
+         : "%"REG_a, "memory"
+        );
+}
+
+static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{ /* Copy a 16-byte-wide block of h rows; loads are unaligned (movdqu), stores are aligned (movdqa). */
+    __asm__ volatile(
+         "1:                            \n\t"
+         "movdqu (%1), %%xmm0           \n\t"
+         "movdqu (%1,%3), %%xmm1        \n\t"
+         "movdqu (%1,%3,2), %%xmm2      \n\t"
+         "movdqu (%1,%4), %%xmm3        \n\t" /* %4 = 3 * line_size */
+         "movdqa %%xmm0, (%2)           \n\t" /* movdqa needs every destination row 16-byte aligned */
+         "movdqa %%xmm1, (%2,%3)        \n\t"
+         "movdqa %%xmm2, (%2,%3,2)      \n\t"
+         "movdqa %%xmm3, (%2,%4)        \n\t"
+         "subl $4, %0                   \n\t" /* four rows per pass; the lea instructions below leave these flags intact */
+         "lea (%1,%3,4), %1             \n\t"
+         "lea (%2,%3,4), %2             \n\t"
+         "jnz 1b                        \n\t" /* h has to be a positive multiple of 4 */
+         : "+g"(h), "+r" (pixels),  "+r" (block)
+         : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+         : "memory"
+        );
+}
+
+static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{ /* Average a 16-byte-wide block of h rows from pixels into block (pavgb), writing the result back to block. */
+    __asm__ volatile(
+         "1:                            \n\t"
+         "movdqu (%1), %%xmm0           \n\t"
+         "movdqu (%1,%3), %%xmm1        \n\t"
+         "movdqu (%1,%3,2), %%xmm2      \n\t"
+         "movdqu (%1,%4), %%xmm3        \n\t" /* %4 = 3 * line_size */
+         "pavgb  (%2), %%xmm0           \n\t" /* memory operand and the movdqa stores need block rows 16-byte aligned */
+         "pavgb  (%2,%3), %%xmm1        \n\t"
+         "pavgb  (%2,%3,2), %%xmm2      \n\t"
+         "pavgb  (%2,%4), %%xmm3        \n\t"
+         "movdqa %%xmm0, (%2)           \n\t"
+         "movdqa %%xmm1, (%2,%3)        \n\t"
+         "movdqa %%xmm2, (%2,%3,2)      \n\t"
+         "movdqa %%xmm3, (%2,%4)        \n\t"
+         "subl $4, %0                   \n\t" /* four rows per pass; the lea instructions below leave these flags intact */
+         "lea (%1,%3,4), %1             \n\t"
+         "lea (%2,%3,4), %2             \n\t"
+         "jnz 1b                        \n\t" /* h has to be a positive multiple of 4 */
+         : "+g"(h), "+r" (pixels),  "+r" (block)
+         : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+         : "memory"
+        );
+}
+
+static void clear_block_sse(DCTELEM *block)
+{ /* Zero 128 bytes at block with eight 16-byte movaps stores; movaps needs block 16-byte aligned. */
+    __asm__ volatile(
+        "xorps  %%xmm0, %%xmm0  \n"
+        "movaps %%xmm0,    (%0) \n"
+        "movaps %%xmm0,  16(%0) \n"
+        "movaps %%xmm0,  32(%0) \n"
+        "movaps %%xmm0,  48(%0) \n"
+        "movaps %%xmm0,  64(%0) \n"
+        "movaps %%xmm0,  80(%0) \n"
+        "movaps %%xmm0,  96(%0) \n"
+        "movaps %%xmm0, 112(%0) \n"
+        :: "r"(block)
+        : "memory" /* tells the compiler that the asm writes memory */
+    );
+}
+
+static void clear_blocks_sse(DCTELEM *blocks)
+{ /* Zero 6 * 128 = 768 bytes starting at blocks, 128 bytes per loop pass; movaps needs blocks 16-byte aligned. */
+    __asm__ volatile(
+        "xorps  %%xmm0, %%xmm0  \n"
+        "mov     %1, %%"REG_a"  \n" /* index starts at -768 and counts up towards 0 */
+        "1:                     \n"
+        "movaps %%xmm0,    (%0, %%"REG_a") \n" /* %0 points one past the end of the area */
+        "movaps %%xmm0,  16(%0, %%"REG_a") \n"
+        "movaps %%xmm0,  32(%0, %%"REG_a") \n"
+        "movaps %%xmm0,  48(%0, %%"REG_a") \n"
+        "movaps %%xmm0,  64(%0, %%"REG_a") \n"
+        "movaps %%xmm0,  80(%0, %%"REG_a") \n"
+        "movaps %%xmm0,  96(%0, %%"REG_a") \n"
+        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
+        "add $128, %%"REG_a"    \n"
+        " js 1b                 \n" /* loop while the index is still negative */
+        : : "r" (((uint8_t *)blocks)+128*6),
+            "i" (-128*6)
+        : "%"REG_a, "memory" /* "memory": the stores are otherwise invisible to the compiler, as in clear_block_sse */
+    );
+}
+
+static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ /* Transpose a 4x4 byte block: column c of src becomes row c of dst. */
+    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
+        "movd  %4, %%mm0                \n\t" /* mm0..mm3 = src rows 0..3 */
+        "movd  %5, %%mm1                \n\t"
+        "movd  %6, %%mm2                \n\t"
+        "movd  %7, %%mm3                \n\t"
+        "punpcklbw %%mm1, %%mm0         \n\t" /* bytes of rows 0 and 1 interleaved */
+        "punpcklbw %%mm3, %%mm2         \n\t" /* bytes of rows 2 and 3 interleaved */
+        "movq %%mm0, %%mm1              \n\t"
+        "punpcklwd %%mm2, %%mm0         \n\t" /* mm0 = src columns 0 and 1 */
+        "punpckhwd %%mm2, %%mm1         \n\t" /* mm1 = src columns 2 and 3 */
+        "movd  %%mm0, %0                \n\t"
+        "punpckhdq %%mm0, %%mm0         \n\t" /* bring the upper dword down for the next store */
+        "movd  %%mm0, %1                \n\t"
+        "movd  %%mm1, %2                \n\t"
+        "punpckhdq %%mm1, %%mm1         \n\t"
+        "movd  %%mm1, %3                \n\t"
+
+        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
+          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
+          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
+          "=m" (*(uint32_t*)(dst + 3*dst_stride))
+        :  "m" (*(uint32_t*)(src + 0*src_stride)),
+           "m" (*(uint32_t*)(src + 1*src_stride)),
+           "m" (*(uint32_t*)(src + 2*src_stride)),
+           "m" (*(uint32_t*)(src + 3*src_stride))
+    );
+}
+
+#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
+/* Generates 16 OPNAME##qpel8_mcXY_##MMX and 16 OPNAME##qpel16_mcXY_##MMX wrappers; X picks the horizontal case and Y the vertical one. ROUNDER and OP are not referenced in the body. */\
+static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
+}\
+/* mc10 / mc30: horizontal lowpass into a temporary, then pixels8_l2 with src or src+1 */\
+static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t temp[8];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t temp[8];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
+}\
+/* mc01 / mc03: vertical lowpass into a temporary, then pixels8_l2 with src or src+stride */\
+static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t temp[8];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t temp[8];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[8 + 9]; /* first 64 bytes: halfHV (8 rows of 8), then 72 bytes: halfH (9 rows of 8) */\
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[8 + 9];\
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[8 + 9];\
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8); /* halfH+8 starts at the second row of halfH */\
+}\
+static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[8 + 9];\
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[8 + 9];\
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[8 + 9];\
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[8 + 9]; /* only halfH lives here; it starts at offset 0 and uses 72 bytes */\
+    uint8_t * const halfH= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[8 + 9];\
+    uint8_t * const halfH= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[9];\
+    uint8_t * const halfH= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
+}\
+/* the 16-wide wrappers below follow the 8-wide ones, with 16-wide helpers and 16- or 17-row temporaries */\
+static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t temp[32];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t temp[32];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t temp[32];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t temp[32];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[16*2 + 17*2]; /* first 256 bytes: halfHV (16 rows of 16), then 272 bytes: halfH (17 rows of 16) */\
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[16*2 + 17*2];\
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[16*2 + 17*2];\
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[16*2 + 17*2];\
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[16*2 + 17*2];\
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[16*2 + 17*2];\
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[17*2];\
+    uint8_t * const halfH= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[17*2];\
+    uint8_t * const halfH= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    uint64_t half[17*2];\
+    uint8_t * const halfH= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}
+
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t" /* plain move of a to b; temp is not used */
+#define AVG_3DNOW_OP(a,b,temp, size) /* a = pavgusb(a, b) via temp, then a is moved to b */ \
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgusb " #temp ", " #a "        \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) /* same sequence with pavgb in place of pavgusb */ \
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgb " #temp ", " #a "          \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
+
+#define PREFETCH(name, op) /* defines name(): issues op on one address per row, for h rows that are stride bytes apart */ \
+static void name(void *mem, int stride, int h){\
+    const uint8_t *p= mem;\
+    do{\
+        __asm__ volatile(#op" %0" :: "m"(*p));\
+        p+= stride;\
+    }while(--h); /* body runs at least once; NOTE(review): h == 0 on entry would not stop at zero -- confirm callers pass h > 0 */ \
+}
+PREFETCH(prefetch_mmx2,  prefetcht0)
+#undef PREFETCH 
+
+#include "h264dsp_mmx.c"
+
+void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
+void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
+void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
+void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+
+void dsputil_init_mmx(DSPContext* c)
+{ /* Detect the CPU's SIMD flags (stored in the global mm_flags) and install matching routines in c. */
+    mm_flags = mm_support();
+
+    if (mm_flags & FF_MM_MMX) {
+        c->clear_block  = clear_block_sse;
+        c->clear_blocks = clear_blocks_sse;
+        c->prefetch = prefetch_mmx2;
+        /* NOTE(review): the three hooks above use SSE instructions (xorps/movaps, prefetcht0) but are gated only on FF_MM_MMX -- confirm this is intended. */
+        /* H264_QPEL_FUNCS(x, y, CPU) fills entry x + 4*y of the put/avg tables: table [0] with the qpel16 routine, table [1] with the qpel8 routine. */
+#define H264_QPEL_FUNCS(x, y, CPU)\
+            c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
+            c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
+            c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
+            c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
+
+        if((mm_flags & FF_MM_SSE2)){
+            c->put_pixels_tab[0][0] = put_pixels16_sse2;
+            c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
+
+        }
+        if(mm_flags & FF_MM_SSE2){ /* same condition as the block above; covers every position with y != 0 */
+            H264_QPEL_FUNCS(0, 1, sse2);
+            H264_QPEL_FUNCS(0, 2, sse2);
+            H264_QPEL_FUNCS(0, 3, sse2);
+            H264_QPEL_FUNCS(1, 1, sse2);
+            H264_QPEL_FUNCS(1, 2, sse2);
+            H264_QPEL_FUNCS(1, 3, sse2);
+            H264_QPEL_FUNCS(2, 1, sse2);
+            H264_QPEL_FUNCS(2, 2, sse2);
+            H264_QPEL_FUNCS(2, 3, sse2);
+            H264_QPEL_FUNCS(3, 1, sse2);
+            H264_QPEL_FUNCS(3, 2, sse2);
+            H264_QPEL_FUNCS(3, 3, sse2);
+        }
+#if HAVE_SSSE3
+        if(mm_flags & FF_MM_SSSE3){ /* runs after the SSE2 block, so these entries replace the SSE2 ones and add x = 1..3 with y = 0 */
+            H264_QPEL_FUNCS(1, 0, ssse3);
+            H264_QPEL_FUNCS(1, 1, ssse3);
+            H264_QPEL_FUNCS(1, 2, ssse3);
+            H264_QPEL_FUNCS(1, 3, ssse3);
+            H264_QPEL_FUNCS(2, 0, ssse3);
+            H264_QPEL_FUNCS(2, 1, ssse3);
+            H264_QPEL_FUNCS(2, 2, ssse3);
+            H264_QPEL_FUNCS(2, 3, ssse3);
+            H264_QPEL_FUNCS(3, 0, ssse3);
+            H264_QPEL_FUNCS(3, 1, ssse3);
+            H264_QPEL_FUNCS(3, 2, ssse3);
+            H264_QPEL_FUNCS(3, 3, ssse3);
+            /* chroma motion compensation: slot [0] takes the mc8 routines, slot [1] the mc4 routines */
+            c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
+            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
+            c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
+            c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
+        }
+#endif
+
+
+    }
+}
+
+void ff_h264dsp_init_x86(H264DSPContext *c)
+{
+    /* Fill c with the x86 SIMD H.264 routines permitted by the detected CPU flags. */
+    mm_flags = mm_support();
+    if (!(mm_flags & FF_MM_MMX))
+        return; /* every routine below needs at least MMX */
+
+    /* MMX baseline: the *_dc_add slots start out on the full IDCT routines. */
+    c->h264_idct_add     = ff_h264_idct_add_mmx;
+    c->h264_idct_dc_add  = ff_h264_idct_add_mmx;
+    c->h264_idct8_add    = ff_h264_idct8_add_mmx;
+    c->h264_idct8_dc_add = ff_h264_idct8_add_mmx;
+
+    if (mm_flags & FF_MM_MMX2) {
+        c->h264_idct_dc_add     = ff_h264_idct_dc_add_mmx2;
+        c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_mmx2;
+        c->h264_idct_add8       = ff_h264_idct_add8_mmx2;
+        c->h264_idct_add16      = ff_h264_idct_add16_mmx2;
+        c->h264_idct_add16intra = ff_h264_idct_add16intra_mmx2;
+        c->h264_idct8_add4      = ff_h264_idct8_add4_mmx2;
+        /* loop filters */
+        c->h264_v_loop_filter_luma         = h264_v_loop_filter_luma_mmx2;
+        c->h264_h_loop_filter_luma         = h264_h_loop_filter_luma_mmx2;
+        c->h264_v_loop_filter_chroma       = h264_v_loop_filter_chroma_mmx2;
+        c->h264_h_loop_filter_chroma       = h264_h_loop_filter_chroma_mmx2;
+        c->h264_v_loop_filter_chroma_intra = h264_v_loop_filter_chroma_intra_mmx2;
+        c->h264_h_loop_filter_chroma_intra = h264_h_loop_filter_chroma_intra_mmx2;
+        c->h264_loop_filter_strength       = h264_loop_filter_strength_mmx2;
+        /* weighted prediction, one slot per block size from 16x16 down to 4x2 */
+        c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_mmx2;
+        c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_mmx2;
+        c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_mmx2;
+        c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_mmx2;
+        c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_mmx2;
+        c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_mmx2;
+        c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_mmx2;
+        c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_mmx2;
+        /* bi-weighted prediction, same block-size order */
+        c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_mmx2;
+        c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_mmx2;
+        c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_mmx2;
+        c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_mmx2;
+        c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_mmx2;
+        c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_mmx2;
+        c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_mmx2;
+        c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_mmx2;
+    }
+    if (mm_flags & FF_MM_SSE2) { /* checked last so these two replace the MMX/MMX2 choices */
+        c->h264_idct8_add  = ff_h264_idct8_add_sse2;
+        c->h264_idct8_add4 = ff_h264_idct8_add4_sse2;
+    }
+}
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_mmx.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/dsputil_mmx.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,170 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2007  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DSPUTIL_MMX_H
+#define AVCODEC_X86_DSPUTIL_MMX_H
+
+#include <stdint.h>
+#include "libavcodec/dsputil.h"
+
+typedef struct { uint64_t a, b; } xmm_reg; /* 16 bytes as two 64-bit halves; type of the ff_pw_* constants declared as xmm_reg below */
+
+extern const uint64_t ff_bone;
+extern const uint64_t ff_wtwo;
+
+extern const uint64_t ff_pdw_80000000[2];
+
+extern const uint64_t ff_pw_3;
+extern const uint64_t ff_pw_4;
+extern const xmm_reg  ff_pw_5;
+extern const xmm_reg  ff_pw_8;
+extern const uint64_t ff_pw_15;
+extern const xmm_reg  ff_pw_16;
+extern const uint64_t ff_pw_20;
+extern const xmm_reg  ff_pw_28;
+extern const xmm_reg  ff_pw_32;
+extern const uint64_t ff_pw_42;
+extern const xmm_reg  ff_pw_64;
+extern const uint64_t ff_pw_96;
+extern const uint64_t ff_pw_128;
+extern const uint64_t ff_pw_255;
+
+extern const uint64_t ff_pb_1;
+extern const uint64_t ff_pb_3;
+extern const uint64_t ff_pb_7;
+extern const uint64_t ff_pb_1F;
+extern const uint64_t ff_pb_3F;
+extern const uint64_t ff_pb_81;
+extern const uint64_t ff_pb_A1;
+extern const uint64_t ff_pb_FC;
+
+extern const double ff_pd_1[2];
+extern const double ff_pd_2[2];
+
+#define LOAD4(stride,in,a,b,c,d) /* asm text: movq from in+0*stride .. in+3*stride into a,b,c,d; stride is pasted into the operand text */\
+    "movq 0*"#stride"+"#in", "#a"\n\t"\
+    "movq 1*"#stride"+"#in", "#b"\n\t"\
+    "movq 2*"#stride"+"#in", "#c"\n\t"\
+    "movq 3*"#stride"+"#in", "#d"\n\t"
+
+#define STORE4(stride,out,a,b,c,d) /* asm text: movq a,b,c,d to out+0*stride .. out+3*stride; stride is pasted into the operand text */\
+    "movq "#a", 0*"#stride"+"#out"\n\t"\
+    "movq "#b", 1*"#stride"+"#out"\n\t"\
+    "movq "#c", 2*"#stride"+"#out"\n\t"\
+    "movq "#d", 3*"#stride"+"#out"\n\t"
+
+/* in/out (per packed 16-bit word): a = a+b, b = b-a, both computed from the values on entry; b-a is formed as 2*b-(a+b) */
+#define SUMSUB_BA( a, b ) \
+    "paddw "#b", "#a" \n\t"\
+    "paddw "#b", "#b" \n\t"\
+    "psubw "#a", "#b" \n\t"
+
+#define SBUTTERFLY(a,b,t,n,m) /* a = low-half interleave of (a,b), t = high-half interleave; n = punpck size suffix, m = mov suffix */\
+    "mov" #m " " #a ", " #t "         \n\t" /* abcd */\
+    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
+    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */
+
+#define TRANSPOSE4(a,b,c,d,t) /* 4x4 transpose of 16-bit words, rows in a,b,c,d; transposed rows come out in a,d,t,c and b is left holding scratch data */\
+    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
+    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
+    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
+    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
+
+// e,f,g,h can be memory (they are only used as punpcklbw source operands)
+// out: a,d,t,c (b is overwritten with intermediate data)
+#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
+    "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
+    "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
+    "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 c3 g3 */\
+    "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
+    SBUTTERFLY(a, b, t, bw, q)   /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
+                                 /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
+    SBUTTERFLY(c, d, b, bw, q)   /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
+                                 /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
+    SBUTTERFLY(a, c, d, wd, q)   /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
+                                 /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
+    SBUTTERFLY(t, b, c, wd, q)   /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
+                                 /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
+
+#if ARCH_X86_64 /* this variant uses %xmm8 as its scratch register and never references t */
+// permutes 01234567 -> 05736421
+#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
+    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
+    SBUTTERFLY(c,d,b,wd,dqa)\
+    SBUTTERFLY(e,f,d,wd,dqa)\
+    SBUTTERFLY(g,h,f,wd,dqa)\
+    SBUTTERFLY(a,c,h,dq,dqa)\
+    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
+    SBUTTERFLY(e,g,b,dq,dqa)\
+    SBUTTERFLY(d,f,g,dq,dqa)\
+    SBUTTERFLY(a,e,f,qdq,dqa)\
+    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
+    SBUTTERFLY(h,b,d,qdq,dqa)\
+    SBUTTERFLY(c,g,b,qdq,dqa)\
+    "movdqa %%xmm8, "#g"              \n\t"
+#else /* this variant spills through memory instead: t is a memory operand, and both t and 16+t are written with movdqa */
+#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
+    "movdqa "#h", "#t"                \n\t" /* park h so that h can serve as the scratch register */\
+    SBUTTERFLY(a,b,h,wd,dqa)\
+    "movdqa "#h", 16"#t"              \n\t"\
+    "movdqa "#t", "#h"                \n\t"\
+    SBUTTERFLY(c,d,b,wd,dqa)\
+    SBUTTERFLY(e,f,d,wd,dqa)\
+    SBUTTERFLY(g,h,f,wd,dqa)\
+    SBUTTERFLY(a,c,h,dq,dqa)\
+    "movdqa "#h", "#t"                \n\t"\
+    "movdqa 16"#t", "#h"              \n\t"\
+    SBUTTERFLY(h,b,c,dq,dqa)\
+    SBUTTERFLY(e,g,b,dq,dqa)\
+    SBUTTERFLY(d,f,g,dq,dqa)\
+    SBUTTERFLY(a,e,f,qdq,dqa)\
+    SBUTTERFLY(h,d,e,qdq,dqa)\
+    "movdqa "#h", 16"#t"              \n\t"\
+    "movdqa "#t", "#h"                \n\t"\
+    SBUTTERFLY(h,b,d,qdq,dqa)\
+    SBUTTERFLY(c,g,b,qdq,dqa)\
+    "movdqa 16"#t", "#g"              \n\t"
+#endif /* ARCH_X86_64 */
+
+#define MOVQ_WONE(regd) /* standalone asm statement: set every 16-bit word of regd to 1 (all ones, then shift right by 15); no clobber is declared */ \
+    __asm__ volatile ( \
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+    "psrlw $15, %%" #regd ::)
+
+void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
+void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
+void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
+
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
+void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
+
+void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag,
+                                   double *autoc);
+
+void ff_mmx_idct(DCTELEM *block);
+void ff_mmxext_idct(DCTELEM *block);
+
+#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_mmx_avg_template.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/dsputil_mmx_avg_template.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,250 @@
+/*
+ * DSP utils : average functions are compiled twice for 3dnow/mmx2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = PAVGB(src1 row, src2 row), 8 bytes per row; src1/dst advance by their strides, src2 rows are packed 8 bytes apart */
+    __asm__ volatile(
+        "testl $1, %0                   \n\t" /* %0=h %1=src1 %2=src2 %3=dst %4=src1Stride %5=dstStride */
+            " jz 1f                     \n\t" /* even h: go straight to the 4-row loop */
+        "movq   (%1), %%mm0             \n\t" /* odd h: handle one row up front */
+        "movq   (%2), %%mm1             \n\t"
+        "add    %4, %1                  \n\t"
+        "add    $8, %2                  \n\t"
+        PAVGB" %%mm1, %%mm0             \n\t" /* PAVGB is expected to be defined by the file that includes this template */
+        "movq   %%mm0, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        "decl   %0                      \n\t"
+        "1:                             \n\t" /* main loop: 4 rows per iteration */
+        "movq   (%1), %%mm0             \n\t"
+        "add    %4, %1                  \n\t"
+        "movq   (%1), %%mm1             \n\t"
+        "add    %4, %1                  \n\t"
+        PAVGB" (%2), %%mm0              \n\t"
+        PAVGB" 8(%2), %%mm1             \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        "movq   %%mm1, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "add    %4, %1                  \n\t"
+        "movq   (%1), %%mm1             \n\t"
+        "add    %4, %1                  \n\t"
+        PAVGB" 16(%2), %%mm0            \n\t"
+        PAVGB" 24(%2), %%mm1            \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        "movq   %%mm1, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        "add    $32, %2                 \n\t" /* four packed src2 rows consumed */
+        "subl   $4, %0                  \n\t" /* loop ends only when the count reaches exactly 0: what is left after the odd row must be a positive multiple of 4 */
+        "jnz    1b                      \n\t"
+
+        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+        :"memory");
+//the following should be used, though better not with gcc ...
+/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+        :"r"(src1Stride), "r"(dstStride)
+        :"memory");*/
+}
+
+static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = PAVGB(PAVGB(src1 row, src2 row), old dst row), 8 bytes per row; src2 rows are packed 8 bytes apart */
+    __asm__ volatile(
+        "testl $1, %0                   \n\t" /* %0=h %1=src1 %2=src2 %3=dst %4=src1Stride %5=dstStride */
+            " jz 1f                     \n\t" /* even h: go straight to the 4-row loop */
+        "movq   (%1), %%mm0             \n\t" /* odd h: handle one row up front */
+        "movq   (%2), %%mm1             \n\t"
+        "add    %4, %1                  \n\t"
+        "add    $8, %2                  \n\t"
+        PAVGB" %%mm1, %%mm0             \n\t"
+        PAVGB" (%3), %%mm0              \n\t" /* blend with what is already in dst */
+        "movq   %%mm0, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        "decl   %0                      \n\t"
+        "1:                             \n\t" /* main loop: 4 rows per iteration */
+        "movq   (%1), %%mm0             \n\t"
+        "add    %4, %1                  \n\t"
+        "movq   (%1), %%mm1             \n\t"
+        "add    %4, %1                  \n\t"
+        PAVGB" (%2), %%mm0              \n\t"
+        PAVGB" 8(%2), %%mm1             \n\t"
+        PAVGB" (%3), %%mm0              \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        PAVGB" (%3), %%mm1              \n\t"
+        "movq   %%mm1, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "add    %4, %1                  \n\t"
+        "movq   (%1), %%mm1             \n\t"
+        "add    %4, %1                  \n\t"
+        PAVGB" 16(%2), %%mm0            \n\t"
+        PAVGB" 24(%2), %%mm1            \n\t"
+        PAVGB" (%3), %%mm0              \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        PAVGB" (%3), %%mm1              \n\t"
+        "movq   %%mm1, (%3)             \n\t"
+        "add    %5, %3                  \n\t"
+        "add    $32, %2                 \n\t" /* four packed src2 rows consumed */
+        "subl   $4, %0                  \n\t" /* loop ends only when the count reaches exactly 0: what is left after the odd row must be a positive multiple of 4 */
+        "jnz    1b                      \n\t"
+
+        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+        :"memory");
+//the following should be used, though better not with gcc ...
+/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+        :"r"(src1Stride), "r"(dstStride)
+        :"memory");*/
+}
+
+
+static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = PAVGB(src1 row, src2 row), 16 bytes per row as two quadwords; src2 rows are packed 16 bytes apart */
+    __asm__ volatile(
+        "testl $1, %0                   \n\t" /* %0=h %1=src1 %2=src2 %3=dst %4=src1Stride %5=dstStride */
+            " jz 1f                     \n\t" /* even h: go straight to the 2-row loop */
+        "movq   (%1), %%mm0             \n\t" /* odd h: handle one row up front */
+        "movq   8(%1), %%mm1            \n\t"
+        PAVGB" (%2), %%mm0              \n\t"
+        PAVGB" 8(%2), %%mm1             \n\t"
+        "add    %4, %1                  \n\t"
+        "add    $16, %2                 \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "movq   %%mm1, 8(%3)            \n\t"
+        "add    %5, %3                  \n\t"
+        "decl   %0                      \n\t"
+        "1:                             \n\t" /* main loop: 2 rows per iteration */
+        "movq   (%1), %%mm0             \n\t"
+        "movq   8(%1), %%mm1            \n\t"
+        "add    %4, %1                  \n\t"
+        PAVGB" (%2), %%mm0              \n\t"
+        PAVGB" 8(%2), %%mm1             \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "movq   %%mm1, 8(%3)            \n\t"
+        "add    %5, %3                  \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   8(%1), %%mm1            \n\t"
+        "add    %4, %1                  \n\t"
+        PAVGB" 16(%2), %%mm0            \n\t"
+        PAVGB" 24(%2), %%mm1            \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "movq   %%mm1, 8(%3)            \n\t"
+        "add    %5, %3                  \n\t"
+        "add    $32, %2                 \n\t" /* two packed src2 rows consumed */
+        "subl   $2, %0                  \n\t" /* loop ends only when the count reaches exactly 0: what is left after the odd row must be a positive even number */
+        "jnz    1b                      \n\t"
+
+        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+
+        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+        :"memory");
+//the following should be used, though better not with gcc ...
+/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+        :"r"(src1Stride), "r"(dstStride)
+        :"memory");*/
+}
+
+static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = PAVGB(PAVGB(src1 row, src2 row), old dst row), 16 bytes per row; src2 rows are packed 16 bytes apart */
+    __asm__ volatile(
+        "testl $1, %0                   \n\t" /* %0=h %1=src1 %2=src2 %3=dst %4=src1Stride %5=dstStride */
+            " jz 1f                     \n\t" /* even h: go straight to the 2-row loop */
+        "movq   (%1), %%mm0             \n\t" /* odd h: handle one row up front */
+        "movq   8(%1), %%mm1            \n\t"
+        PAVGB" (%2), %%mm0              \n\t"
+        PAVGB" 8(%2), %%mm1             \n\t"
+        "add    %4, %1                  \n\t"
+        "add    $16, %2                 \n\t"
+        PAVGB" (%3), %%mm0              \n\t" /* blend with what is already in dst */
+        PAVGB" 8(%3), %%mm1             \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "movq   %%mm1, 8(%3)            \n\t"
+        "add    %5, %3                  \n\t"
+        "decl   %0                      \n\t"
+        "1:                             \n\t" /* main loop: 2 rows per iteration */
+        "movq   (%1), %%mm0             \n\t"
+        "movq   8(%1), %%mm1            \n\t"
+        "add    %4, %1                  \n\t"
+        PAVGB" (%2), %%mm0              \n\t"
+        PAVGB" 8(%2), %%mm1             \n\t"
+        PAVGB" (%3), %%mm0              \n\t"
+        PAVGB" 8(%3), %%mm1             \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "movq   %%mm1, 8(%3)            \n\t"
+        "add    %5, %3                  \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   8(%1), %%mm1            \n\t"
+        "add    %4, %1                  \n\t"
+        PAVGB" 16(%2), %%mm0            \n\t"
+        PAVGB" 24(%2), %%mm1            \n\t"
+        PAVGB" (%3), %%mm0              \n\t"
+        PAVGB" 8(%3), %%mm1             \n\t"
+        "movq   %%mm0, (%3)             \n\t"
+        "movq   %%mm1, 8(%3)            \n\t"
+        "add    %5, %3                  \n\t"
+        "add    $32, %2                 \n\t" /* two packed src2 rows consumed */
+        "subl   $2, %0                  \n\t" /* loop ends only when the count reaches exactly 0: what is left after the odd row must be a positive even number */
+        "jnz    1b                      \n\t"
+
+        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+        :"memory");
+//the following should be used, though better not with gcc ...
+/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+        :"r"(src1Stride), "r"(dstStride)
+        :"memory");*/
+}
+
+static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{ /* block row = PAVGB(pixels row, block row), 8 bytes per row; both buffers use line_size as row stride */
+    __asm__ volatile(
+        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size; %0=h %1=pixels %2=block %3=line_size */
+        "1:                             \n\t" /* 4 rows per iteration, handled as two pairs */
+        "movq (%2), %%mm0               \n\t"
+        "movq (%2, %3), %%mm1           \n\t"
+        PAVGB" (%1), %%mm0              \n\t"
+        PAVGB" (%1, %3), %%mm1          \n\t"
+        "movq %%mm0, (%2)               \n\t"
+        "movq %%mm1, (%2, %3)           \n\t"
+        "add %%"REG_a", %1              \n\t"
+        "add %%"REG_a", %2              \n\t"
+        "movq (%2), %%mm0               \n\t"
+        "movq (%2, %3), %%mm1           \n\t"
+        PAVGB" (%1), %%mm0              \n\t"
+        PAVGB" (%1, %3), %%mm1          \n\t"
+        "add %%"REG_a", %1              \n\t"
+        "movq %%mm0, (%2)               \n\t"
+        "movq %%mm1, (%2, %3)           \n\t"
+        "add %%"REG_a", %2              \n\t"
+        "subl $4, %0                    \n\t" /* loop ends only when h reaches exactly 0: h must be a positive multiple of 4 */
+        "jnz 1b                         \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r" ((x86_reg)line_size)
+        :"%"REG_a, "memory");
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/h264dsp_mmx.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/h264dsp_mmx.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1741 @@
+/*
+ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_mmx.h"
+
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1  ) = 0x0103010301030103ULL; /* eight bytes: 3,1 repeated, lowest byte first */
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3  ) = 0x0307030703070307ULL; /* eight bytes: 7,3 repeated, lowest byte first */
+
+/***********************************/
+/* IDCT */
+
+#define SUMSUB_BADC( a, b, c, d ) /* two interleaved SUMSUB_BA steps on packed words: a=a+b, b=b-a and c=c+d, d=d-c (entry values) */ \
+    "paddw "#b", "#a" \n\t"\
+    "paddw "#d", "#c" \n\t"\
+    "paddw "#b", "#b" \n\t"\
+    "paddw "#d", "#d" \n\t"\
+    "psubw "#a", "#b" \n\t"\
+    "psubw "#c", "#d" \n\t"
+
+#define SUMSUBD2_AB( a, b, t ) /* on packed words, from entry values: b = a + (b>>1), a = (a>>1) - b; t receives a copy of the entry b */ \
+    "movq  "#b", "#t" \n\t"\
+    "psraw  $1 , "#b" \n\t"\
+    "paddw "#a", "#b" \n\t"\
+    "psraw  $1 , "#a" \n\t"\
+    "psubw "#t", "#a" \n\t"
+
+#define IDCT4_1D( s02, s13, d02, d13, t ) /* one 4-point 1-D pass built from the three butterflies below; results stay in the four argument registers, t is scratch */ \
+    SUMSUB_BA  ( s02, d02 )\
+    SUMSUBD2_AB( s13, d13, t )\
+    SUMSUB_BADC( d13, s02, s13, d02 )
+
+#define STORE_DIFF_4P( p, t, z ) /* 4 bytes at (%0) = clip_uint8(bytes at (%0) + (p>>6)); z must hold zero, t is scratch; uses asm operand %0 as the pixel pointer */ \
+    "psraw      $6,     "#p" \n\t"\
+    "movd       (%0),   "#t" \n\t"\
+    "punpcklbw "#z",    "#t" \n\t"\
+    "paddsw    "#t",    "#p" \n\t"\
+    "packuswb  "#z",    "#p" \n\t"\
+    "movd      "#p",    (%0) \n\t"
+
+static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{ /* 4x4 inverse transform of block (16 coefficients), rounded with +32 and >>6, added to the 4x4 pixels at dst with clipping */
+    /* Load dct coeffs */
+    __asm__ volatile(
+        "movq   (%0), %%mm0 \n\t"
+        "movq  8(%0), %%mm1 \n\t"
+        "movq 16(%0), %%mm2 \n\t"
+        "movq 24(%0), %%mm3 \n\t"
+    :: "r"(block) : "memory" ); /* "memory": block is read through a bare register pointer */
+
+    __asm__ volatile( /* the three asm statements hand their data to each other in mm0-mm7 */
+        /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
+        IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
+
+        "movq      %0,    %%mm6 \n\t"
+        /* in: 1,4,0,2  out: 1,2,3,0 */
+        TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
+
+        "paddw     %%mm6, %%mm3 \n\t" /* add the ff_pw_32 rounding term before the second pass */
+
+        /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
+        IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
+
+        "pxor %%mm7, %%mm7    \n\t" /* zero register for STORE_DIFF_4P */
+    :: "m"(ff_pw_32));
+
+    __asm__ volatile(
+    STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
+        "add %1, %0             \n\t" /* next pixel row */
+    STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
+        "add %1, %0             \n\t"
+    STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
+        "add %1, %0             \n\t"
+    STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
+        : "+r"(dst)
+        : "r" ((x86_reg)stride)
+        : "memory" ); /* "memory": the pixels at dst are read and rewritten through a bare register pointer */
+}
+
+static inline void h264_idct8_1d(int16_t *block)
+{ /* one 8-point 1-D pass over four columns: reads 8 quadwords spaced 16 bytes apart from block and leaves the 8 results in mm0-mm7 for the caller */
+    __asm__ volatile(
+        "movq 112(%0), %%mm7  \n\t" /* odd rows 7,5,3,1 */
+        "movq  80(%0), %%mm0  \n\t"
+        "movq  48(%0), %%mm3  \n\t"
+        "movq  16(%0), %%mm5  \n\t"
+
+        "movq   %%mm0, %%mm4  \n\t"
+        "movq   %%mm5, %%mm1  \n\t"
+        "psraw  $1,    %%mm4  \n\t"
+        "psraw  $1,    %%mm1  \n\t"
+        "paddw  %%mm0, %%mm4  \n\t"
+        "paddw  %%mm5, %%mm1  \n\t"
+        "paddw  %%mm7, %%mm4  \n\t"
+        "paddw  %%mm0, %%mm1  \n\t"
+        "psubw  %%mm5, %%mm4  \n\t"
+        "paddw  %%mm3, %%mm1  \n\t"
+
+        "psubw  %%mm3, %%mm5  \n\t"
+        "psubw  %%mm3, %%mm0  \n\t"
+        "paddw  %%mm7, %%mm5  \n\t"
+        "psubw  %%mm7, %%mm0  \n\t"
+        "psraw  $1,    %%mm3  \n\t"
+        "psraw  $1,    %%mm7  \n\t"
+        "psubw  %%mm3, %%mm5  \n\t"
+        "psubw  %%mm7, %%mm0  \n\t"
+
+        "movq   %%mm4, %%mm3  \n\t"
+        "movq   %%mm1, %%mm7  \n\t"
+        "psraw  $2,    %%mm1  \n\t"
+        "psraw  $2,    %%mm3  \n\t"
+        "paddw  %%mm5, %%mm3  \n\t"
+        "psraw  $2,    %%mm5  \n\t"
+        "paddw  %%mm0, %%mm1  \n\t"
+        "psraw  $2,    %%mm0  \n\t"
+        "psubw  %%mm4, %%mm5  \n\t"
+        "psubw  %%mm0, %%mm7  \n\t"
+
+        "movq  32(%0), %%mm2  \n\t" /* even rows 2 and 6 */
+        "movq  96(%0), %%mm6  \n\t"
+        "movq   %%mm2, %%mm4  \n\t"
+        "movq   %%mm6, %%mm0  \n\t"
+        "psraw  $1,    %%mm4  \n\t"
+        "psraw  $1,    %%mm6  \n\t"
+        "psubw  %%mm0, %%mm4  \n\t"
+        "paddw  %%mm2, %%mm6  \n\t"
+
+        "movq    (%0), %%mm2  \n\t" /* even rows 0 and 4 */
+        "movq  64(%0), %%mm0  \n\t"
+        SUMSUB_BA( %%mm0, %%mm2 )
+        SUMSUB_BA( %%mm6, %%mm0 )
+        SUMSUB_BA( %%mm4, %%mm2 )
+        SUMSUB_BA( %%mm7, %%mm6 )
+        SUMSUB_BA( %%mm5, %%mm4 )
+        SUMSUB_BA( %%mm3, %%mm2 )
+        SUMSUB_BA( %%mm1, %%mm0 )
+        :: "r"(block)
+        : "memory"); /* block is read through a bare register pointer; earlier C stores to it (e.g. a caller's block[0] += 32) must be complete */
+}
+
+static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{ /* 8x8 inverse transform of block done as two 1-D passes of 4 columns each, then added to dst via add_pixels_clamped_mmx */
+    int i;
+    DECLARE_ALIGNED(8, int16_t, b2)[64]; /* holds the transposed first-pass output, then the final residual */
+
+    block[0] += 32; /* rounding term for the final >>6; note this modifies the caller's block */
+
+    for(i=0; i<2; i++){ /* first pass: one half (4 columns) of block per iteration */
+        DECLARE_ALIGNED(8, uint64_t, tmp); /* spill slot for mm7 while it serves as TRANSPOSE4 scratch */
+
+        h264_idct8_1d(block+4*i); /* leaves its 8 result quadwords in mm0-mm7 */
+
+        __asm__ volatile(
+            "movq   %%mm7,    %0   \n\t"
+            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
+            "movq   %%mm0,  8(%1)  \n\t"
+            "movq   %%mm6, 24(%1)  \n\t"
+            "movq   %%mm7, 40(%1)  \n\t"
+            "movq   %%mm4, 56(%1)  \n\t"
+            "movq    %0,    %%mm7  \n\t"
+            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
+            "movq   %%mm7,   (%1)  \n\t"
+            "movq   %%mm1, 16(%1)  \n\t"
+            "movq   %%mm0, 32(%1)  \n\t"
+            "movq   %%mm3, 48(%1)  \n\t"
+            : "=m"(tmp)
+            : "r"(b2+32*i)
+            : "memory"
+        );
+    }
+
+    for(i=0; i<2; i++){ /* second pass over the transposed data, in place in b2 */
+        h264_idct8_1d(b2+4*i);
+
+        __asm__ volatile(
+            "psraw     $6, %%mm7  \n\t" /* scale all eight result registers by >>6 */
+            "psraw     $6, %%mm6  \n\t"
+            "psraw     $6, %%mm5  \n\t"
+            "psraw     $6, %%mm4  \n\t"
+            "psraw     $6, %%mm3  \n\t"
+            "psraw     $6, %%mm2  \n\t"
+            "psraw     $6, %%mm1  \n\t"
+            "psraw     $6, %%mm0  \n\t"
+
+            "movq   %%mm7,    (%0)  \n\t" /* rows 0..7 are stored from mm7,mm5,mm3,mm1,mm0,mm2,mm4,mm6 */
+            "movq   %%mm5,  16(%0)  \n\t"
+            "movq   %%mm3,  32(%0)  \n\t"
+            "movq   %%mm1,  48(%0)  \n\t"
+            "movq   %%mm0,  64(%0)  \n\t"
+            "movq   %%mm2,  80(%0)  \n\t"
+            "movq   %%mm4,  96(%0)  \n\t"
+            "movq   %%mm6, 112(%0)  \n\t"
+            :: "r"(b2+4*i)
+            : "memory"
+        );
+    }
+
+    add_pixels_clamped_mmx(b2, dst, stride);
+}
+
+#define STORE_DIFF_8P( p, d, t, z ) /* 8 bytes at d = clip_uint8(bytes at d + (p>>6)); d is a memory operand, z must hold zero, t is scratch */\
+        "movq       "#d", "#t" \n"\
+        "psraw       $6,  "#p" \n"\
+        "punpcklbw  "#z", "#t" \n"\
+        "paddsw     "#t", "#p" \n"\
+        "packuswb   "#p", "#p" \n"\
+        "movq       "#p", "#d" \n"
+
+#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h) /* 8-point 1-D pass: b,c,d,f,g,h are register inputs, a and e are overwritten before being read, and the remaining two rows are loaded from 0x00(%1) and 0x40(%1) */\
+        "movdqa     "#c", "#a" \n"\
+        "movdqa     "#g", "#e" \n"\
+        "psraw       $1,  "#c" \n"\
+        "psraw       $1,  "#g" \n"\
+        "psubw      "#e", "#c" \n"\
+        "paddw      "#a", "#g" \n"\
+        "movdqa     "#b", "#e" \n"\
+        "psraw       $1,  "#e" \n"\
+        "paddw      "#b", "#e" \n"\
+        "paddw      "#d", "#e" \n"\
+        "paddw      "#f", "#e" \n"\
+        "movdqa     "#f", "#a" \n"\
+        "psraw       $1,  "#a" \n"\
+        "paddw      "#f", "#a" \n"\
+        "paddw      "#h", "#a" \n"\
+        "psubw      "#b", "#a" \n"\
+        "psubw      "#d", "#b" \n"\
+        "psubw      "#d", "#f" \n"\
+        "paddw      "#h", "#b" \n"\
+        "psubw      "#h", "#f" \n"\
+        "psraw       $1,  "#d" \n"\
+        "psraw       $1,  "#h" \n"\
+        "psubw      "#d", "#b" \n"\
+        "psubw      "#h", "#f" \n"\
+        "movdqa     "#e", "#d" \n"\
+        "movdqa     "#a", "#h" \n"\
+        "psraw       $2,  "#d" \n"\
+        "psraw       $2,  "#h" \n"\
+        "paddw      "#f", "#d" \n"\
+        "paddw      "#b", "#h" \n"\
+        "psraw       $2,  "#f" \n"\
+        "psraw       $2,  "#b" \n"\
+        "psubw      "#f", "#e" \n"\
+        "psubw      "#a", "#b" \n"\
+        "movdqa 0x00(%1), "#a" \n" /* rows 0 and 4 come from memory, not from registers */\
+        "movdqa 0x40(%1), "#f" \n"\
+        SUMSUB_BA(f, a)\
+        SUMSUB_BA(g, f)\
+        SUMSUB_BA(c, a)\
+        SUMSUB_BA(e, g)\
+        SUMSUB_BA(b, c)\
+        SUMSUB_BA(h, a)\
+        SUMSUB_BA(d, f)
+
+static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
+{ /* 8x8 inverse transform added to dst with clipping; block is accessed with movdqa (needs 16-byte alignment) and is overwritten as scratch */
+    __asm__ volatile(
+        "movdqa   0x10(%1), %%xmm1 \n" /* rows 1,2,3,5,6,7; rows 0 and 4 are loaded inside H264_IDCT8_1D_SSE2 */
+        "movdqa   0x20(%1), %%xmm2 \n"
+        "movdqa   0x30(%1), %%xmm3 \n"
+        "movdqa   0x50(%1), %%xmm5 \n"
+        "movdqa   0x60(%1), %%xmm6 \n"
+        "movdqa   0x70(%1), %%xmm7 \n"
+        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
+        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
+        "paddw          %4, %%xmm4 \n" /* add the ff_pw_32 rounding term before the second pass */
+        "movdqa     %%xmm4, 0x00(%1) \n" /* put rows 0 and 4 back where the second pass loads them from */
+        "movdqa     %%xmm2, 0x40(%1) \n"
+        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
+        "movdqa     %%xmm6, 0x60(%1) \n" /* park two result rows: xmm6/xmm7 become scratch and zero below */
+        "movdqa     %%xmm7, 0x70(%1) \n"
+        "pxor       %%xmm7, %%xmm7 \n"
+        STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
+        "lea     (%0,%2,4), %0 \n" /* advance dst by four rows */
+        STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
+        "movdqa   0x60(%1), %%xmm0 \n" /* reload the two parked rows */
+        "movdqa   0x70(%1), %%xmm1 \n"
+        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
+        :"+r"(dst)
+        :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
+        :"memory"); /* block and dst are rewritten through bare register pointers; NOTE(review): the xmm registers used (incl. xmm8 on x86-64) are still not declared as clobbered */
+}
+
+static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{ /* DC-only case: adds the single rounded value (block[0]+32)>>6 to a 4x4 pixel area at dst, saturating to 0..255 */
+    int dc = (block[0] + 32) >> 6;
+    __asm__ volatile(
+        "movd          %0, %%mm0 \n\t"
+        "pshufw $0, %%mm0, %%mm0 \n\t" /* broadcast dc to all four words */
+        "pxor       %%mm1, %%mm1 \n\t"
+        "psubw      %%mm0, %%mm1 \n\t" /* mm1 = -dc */
+        "packuswb   %%mm0, %%mm0 \n\t" /* mm0 bytes = dc if positive, else 0 (unsigned saturation) */
+        "packuswb   %%mm1, %%mm1 \n\t" /* mm1 bytes = -dc if dc is negative, else 0 */
+        ::"r"(dc)
+    );
+    __asm__ volatile( /* relies on mm0/mm1 still holding the values set up by the previous asm statement */
+        "movd          %0, %%mm2 \n\t"
+        "movd          %1, %%mm3 \n\t"
+        "movd          %2, %%mm4 \n\t"
+        "movd          %3, %%mm5 \n\t"
+        "paddusb    %%mm0, %%mm2 \n\t" /* add the positive part ... */
+        "paddusb    %%mm0, %%mm3 \n\t"
+        "paddusb    %%mm0, %%mm4 \n\t"
+        "paddusb    %%mm0, %%mm5 \n\t"
+        "psubusb    %%mm1, %%mm2 \n\t" /* ... then subtract the negative part; only one of the two is nonzero */
+        "psubusb    %%mm1, %%mm3 \n\t"
+        "psubusb    %%mm1, %%mm4 \n\t"
+        "psubusb    %%mm1, %%mm5 \n\t"
+        "movd       %%mm2, %0    \n\t"
+        "movd       %%mm3, %1    \n\t"
+        "movd       %%mm4, %2    \n\t"
+        "movd       %%mm5, %3    \n\t"
+        :"+m"(*(uint32_t*)(dst+0*stride)),
+         "+m"(*(uint32_t*)(dst+1*stride)),
+         "+m"(*(uint32_t*)(dst+2*stride)),
+         "+m"(*(uint32_t*)(dst+3*stride))
+    );
+}
+
+static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{ /* DC-only case: adds the single rounded value (block[0]+32)>>6 to an 8x8 pixel area at dst, saturating to 0..255 */
+    int dc = (block[0] + 32) >> 6;
+    int y;
+    __asm__ volatile(
+        "movd          %0, %%mm0 \n\t"
+        "pshufw $0, %%mm0, %%mm0 \n\t" /* broadcast dc to all four words */
+        "pxor       %%mm1, %%mm1 \n\t"
+        "psubw      %%mm0, %%mm1 \n\t" /* mm1 = -dc */
+        "packuswb   %%mm0, %%mm0 \n\t" /* mm0 bytes = dc if positive, else 0 (unsigned saturation) */
+        "packuswb   %%mm1, %%mm1 \n\t" /* mm1 bytes = -dc if dc is negative, else 0 */
+        ::"r"(dc)
+    );
+    for(y=2; y--; dst += 4*stride){ /* two iterations of four 8-byte rows each */
+    __asm__ volatile( /* relies on mm0/mm1 still holding the values set up before the loop */
+        "movq          %0, %%mm2 \n\t"
+        "movq          %1, %%mm3 \n\t"
+        "movq          %2, %%mm4 \n\t"
+        "movq          %3, %%mm5 \n\t"
+        "paddusb    %%mm0, %%mm2 \n\t" /* add the positive part ... */
+        "paddusb    %%mm0, %%mm3 \n\t"
+        "paddusb    %%mm0, %%mm4 \n\t"
+        "paddusb    %%mm0, %%mm5 \n\t"
+        "psubusb    %%mm1, %%mm2 \n\t" /* ... then subtract the negative part; only one of the two is nonzero */
+        "psubusb    %%mm1, %%mm3 \n\t"
+        "psubusb    %%mm1, %%mm4 \n\t"
+        "psubusb    %%mm1, %%mm5 \n\t"
+        "movq       %%mm2, %0    \n\t"
+        "movq       %%mm3, %1    \n\t"
+        "movq       %%mm4, %2    \n\t"
+        "movq       %%mm5, %3    \n\t"
+        :"+m"(*(uint64_t*)(dst+0*stride)),
+         "+m"(*(uint64_t*)(dst+1*stride)),
+         "+m"(*(uint64_t*)(dst+2*stride)),
+         "+m"(*(uint64_t*)(dst+3*stride))
+    );
+    }
+}
+
+//FIXME this table is a duplicate of the one in h264data.h and will be removed once the h264 tables have been split
+static const uint8_t scan8[16 + 2*4]={ /* block number -> index into the nnzc[6*8] arrays below; entries are written as x + y*8 */
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
+};
+
+static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int idx; /* 4x4 block number; block is advanced in step so it always points at that block's 16 coefficients */
+    for(idx=0; idx<16; idx++, block+=16){
+        const int count = nnzc[ scan8[idx] ];
+        if(!count)
+            continue; /* nnzc entry is zero: nothing is added for this block */
+        if(count==1 && block[0]) ff_h264_idct_dc_add_mmx2(dst + block_offset[idx], block, stride); /* DC-only shortcut */
+        else                     ff_h264_idct_add_mmx    (dst + block_offset[idx], block, stride);
+    }
+}
+
+static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int idx; /* 4x4 block number; block is advanced in step so it always points at that block's 16 coefficients */
+    for(idx=0; idx<16; idx++, block+=16){
+        if     (nnzc[ scan8[idx] ]) ff_h264_idct_add_mmx    (dst + block_offset[idx], block, stride);
+        else if(block[0])           ff_h264_idct_dc_add_mmx2(dst + block_offset[idx], block, stride); /* nnzc is zero but a DC value is present */
+    }
+}
+
+static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int idx; /* visits 0,4,8,12: one 8x8 block (64 coefficients) per step, with block advanced to match */
+    for(idx=0; idx<16; idx+=4, block+=64){
+        const int count = nnzc[ scan8[idx] ];
+        if(!count)
+            continue; /* nnzc entry is zero: nothing is added for this block */
+        if(count==1 && block[0]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[idx], block, stride); /* DC-only shortcut */
+        else                     ff_h264_idct8_add_mmx    (dst + block_offset[idx], block, stride);
+    }
+}
+
+static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int blk;
+    /* Indices 0, 4, 8, 12; the DC-only case still goes through the mmx2 routine, everything else through the SSE2 one. */
+    for(blk=0; blk<16; blk+=4){
+        const int coded = nnzc[ scan8[blk] ];
+        if(!coded) continue;
+        if(coded==1 && block[blk*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[blk], block + blk*16, stride);
+        else                          ff_h264_idct8_add_sse2   (dst + block_offset[blk], block + blk*16, stride);
+    }
+}
+
+static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int blk;
+    for(blk=16; blk<16+8; blk++){
+        const int plane = (blk&4)>>2; /* blocks 16..19 use dest[0], blocks 20..23 use dest[1] */
+        DCTELEM *const coef = block + blk*16;
+        if(nnzc[ scan8[blk] ]) ff_h264_idct_add_mmx    (dest[plane] + block_offset[blk], coef, stride);
+        else if(coef[0])       ff_h264_idct_dc_add_mmx2(dest[plane] + block_offset[blk], coef, stride);
+    }
+}
+
+/***********************************/
+/* deblocking */
+
+// out: o = |x-y|>a per unsigned byte: nonzero where true, 0 elsewhere (o holds the saturated |x-y|-a, not a 0xFF mask)
+// clobbers: t
+#define DIFF_GT_MMX(x,y,a,o,t)\
+    "movq     "#y", "#t"  \n\t"\
+    "movq     "#x", "#o"  \n\t"\
+    "psubusb  "#x", "#t"  \n\t" /* t = max(y-x, 0) */\
+    "psubusb  "#y", "#o"  \n\t" /* o = max(x-y, 0) */\
+    "por      "#t", "#o"  \n\t" /* o = |x-y|, since one of the two terms is 0 */\
+    "psubusb  "#a", "#o"  \n\t" /* o = max(|x-y|-a, 0) */
+
+// out: o = 0xFF in every byte where |x-y|<=a and 0 where |x-y|>a (the inverse of the condition DIFF_GT_MMX flags)
+// clobbers: t
+#define DIFF_GT2_MMX(x,y,a,o,t)\
+    "movq     "#y", "#t"  \n\t"\
+    "movq     "#x", "#o"  \n\t"\
+    "psubusb  "#x", "#t"  \n\t" /* t = max(y-x, 0) */\
+    "psubusb  "#y", "#o"  \n\t" /* o = max(x-y, 0); at least one of t, o is 0 */\
+    "psubusb  "#a", "#t"  \n\t"\
+    "psubusb  "#a", "#o"  \n\t"\
+    "pcmpeqb  "#t", "#o"  \n\t" /* equal only when both saturated to 0, i.e. |x-y|<=a */\
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1; alpha1/beta1 = operands whose low 16-bit word is replicated into every byte
+// out: mm5=beta-1, mm7=mask (0xFF in bytes where none of the three differences exceeds its limit, 0 elsewhere)
+// clobbers: mm4,mm6
+#define H264_DEBLOCK_MASK(alpha1, beta1) \
+    "pshufw $0, "#alpha1", %%mm4 \n\t"\
+    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
+    "packuswb  %%mm4, %%mm4      \n\t"\
+    "packuswb  %%mm5, %%mm5      \n\t"\
+    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
+    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
+    "por       %%mm4, %%mm7      \n\t"\
+    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
+    "por       %%mm4, %%mm7      \n\t"\
+    "pxor      %%mm6, %%mm6      \n\t"\
+    "pcmpeqb   %%mm6, %%mm7      \n\t" /* turn "no test fired" (byte == 0) into 0xFF */
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask); pb_01 = operand ANDed with p0^q0 (both callers in this file pass ff_bone)
+// out: mm1=p0' mm2=q0'
+// clobbers: mm0,3-6 (note: pb_3f is not referenced by the expansion; ff_pb_3 and ff_pb_A1 are addressed via MANGLE)
+#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
+        "movq    %%mm1              , %%mm5 \n\t"\
+        "pxor    %%mm2              , %%mm5 \n\t" /* p0^q0*/\
+        "pand    "#pb_01"           , %%mm5 \n\t" /* (p0^q0)&1*/\
+        "pcmpeqb %%mm4              , %%mm4 \n\t" /* mm4 = all ones */\
+        "pxor    %%mm4              , %%mm3 \n\t" /* mm3 = ~q1 */\
+        "pavgb   %%mm0              , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
+        "pavgb   "MANGLE(ff_pb_3)"  , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
+        "pxor    %%mm1              , %%mm4 \n\t" /* mm4 = ~p0 */\
+        "pavgb   %%mm2              , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
+        "pavgb   %%mm5              , %%mm3 \n\t"\
+        "paddusb %%mm4              , %%mm3 \n\t" /* d+128+33*/\
+        "movq    "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
+        "psubusb %%mm3              , %%mm6 \n\t"\
+        "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
+        "pminub  %%mm7              , %%mm6 \n\t" /* both correction magnitudes are limited by mm7 */\
+        "pminub  %%mm7              , %%mm3 \n\t"\
+        "psubusb %%mm6              , %%mm1 \n\t"\
+        "psubusb %%mm3              , %%mm2 \n\t"\
+        "paddusb %%mm3              , %%mm1 \n\t"\
+        "paddusb %%mm6              , %%mm2 \n\t"
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %9=ff_bone; used for both sides: the p-side call passes p1/p2 as p1/q2
+// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+// clobbers: q2, tmp, tc0
+#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
+        "movq     %%mm1,  "#tmp"   \n\t"\
+        "pavgb    %%mm2,  "#tmp"   \n\t"\
+        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
+        "pxor   "q2addr", "#tmp"   \n\t"\
+        "pand     %9,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
+        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
+        "movq     "#p1",  "#tmp"   \n\t"\
+        "psubusb  "#tc0", "#tmp"   \n\t" /* lower clip bound, saturated at 0 */\
+        "paddusb  "#p1",  "#tc0"   \n\t" /* upper clip bound, saturated at 255 */\
+        "pmaxub   "#tmp", "#q2"    \n\t"\
+        "pminub   "#tc0", "#q2"    \n\t"\
+        "movq     "#q2",  "q1addr" \n\t"
+
+static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
+{ // filters 8 bytes per row: reads rows pix-3*stride..pix+2*stride (p2,p1,p0,q0,q1,q2) and rewrites p1, p0, q0, q1
+    DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; // spill slots: [0] = mask & (tc0 > -1), [1] = expanded tc0
+
+    __asm__ volatile( // operands: %0,%1=tmp0[0],tmp0[1]  %2=pix-3*stride  %3=pix  %4=stride  %6=4 bytes at tc0  %7=alpha1  %8=beta1  %9=ff_bone
+        "movq    (%2,%4), %%mm0    \n\t" //p1
+        "movq    (%2,%4,2), %%mm1  \n\t" //p0
+        "movq    (%3),    %%mm2    \n\t" //q0
+        "movq    (%3,%4), %%mm3    \n\t" //q1
+        H264_DEBLOCK_MASK(%7, %8)
+
+        "movd      %6,    %%mm4    \n\t"
+        "punpcklbw %%mm4, %%mm4    \n\t"
+        "punpcklwd %%mm4, %%mm4    \n\t" // tc0[0] and tc0[1] are now each replicated over 4 bytes
+        "pcmpeqb   %%mm3, %%mm3    \n\t" // mm3 = all ones (-1 in every byte)
+        "movq      %%mm4, %%mm6    \n\t"
+        "pcmpgtb   %%mm3, %%mm4    \n\t" // mm4 = (tc0 > -1)
+        "movq      %%mm6, %1       \n\t"
+        "pand      %%mm4, %%mm7    \n\t"
+        "movq      %%mm7, %0       \n\t"
+
+        /* filter p1 */
+        "movq     (%2),   %%mm3    \n\t" //p2
+        DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // mm6 = 0xFF where |p2-p0|<=beta-1 (DIFF_GT2_MMX yields the "not greater" mask)
+        "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
+        "pand     %1,     %%mm7    \n\t" // mask & tc0
+        "movq     %%mm7,  %%mm4    \n\t"
+        "psubb    %%mm6,  %%mm7    \n\t" // subtracting 0xFF (-1) adds 1 to tc where p1 is filtered
+        "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
+        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)
+
+        /* filter q1 */
+        "movq    (%3,%4,2), %%mm4  \n\t" //q2
+        DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // mm6 = 0xFF where |q2-q0|<=beta-1
+        "pand     %0,     %%mm6    \n\t"
+        "movq     %1,     %%mm5    \n\t" // could be merged with the pand below, but that is slower
+        "pand     %%mm6,  %%mm5    \n\t"
+        "psubb    %%mm6,  %%mm7    \n\t"
+        "movq    (%3,%4), %%mm3    \n\t" // reload q1 (mm3 was clobbered above)
+        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
+
+        /* filter p0, q0 */
+        H264_DEBLOCK_P0_Q0(%9, unused)
+        "movq      %%mm1, (%2,%4,2) \n\t"
+        "movq      %%mm2, (%3)      \n\t"
+
+        : "=m"(tmp0[0]), "=m"(tmp0[1])
+        : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
+          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
+          "m"(ff_bone)
+    );
+}
+
+static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    int half; /* pass 0 uses pix and tc0[0..1], pass 1 uses pix+8 and tc0[2..3] */
+    for(half=0; half<2; half++, pix+=8, tc0+=2)
+        if((tc0[0] & tc0[1]) >= 0) /* skipped only when both tc0 entries are negative */
+            h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
+}
+static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    //FIXME: could cut some load/stores by merging transpose with filter
+    // also, it only needs to transpose 6x8
+    DECLARE_ALIGNED(8, uint8_t, trans)[8*8]; // 8x8 byte scratch block, addressed with a row stride of 8
+    int i;
+    for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { // two passes, each 8 rows further down and using the next two tc0 entries
+        if((tc0[0] & tc0[1]) < 0) // both tc0 entries negative: nothing to filter in this pass
+            continue;
+        transpose4x4(trans,       pix-4,          8, stride); // NOTE(review): transpose4x4 is defined elsewhere; arguments look like (dst, src, dst_stride, src_stride) - confirm
+        transpose4x4(trans  +4*8, pix,            8, stride);
+        transpose4x4(trans+4,     pix-4+4*stride, 8, stride);
+        transpose4x4(trans+4+4*8, pix  +4*stride, 8, stride);
+        h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); // run the shared filter on the scratch block (stride 8)
+        transpose4x4(pix-2,          trans  +2*8, stride, 8); // presumably writes scratch rows 2..5 back to the 4 bytes starting at pix-2
+        transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
+    }
+}
+
+static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
+{ // filters 8 bytes per row: reads rows pix-2*stride..pix+stride (p1,p0,q0,q1) and rewrites p0 and q0
+    __asm__ volatile( // operands: %0=pix-2*stride  %1=pix  %2=stride  %3=4 bytes of tc0 in a register  %4=alpha1  %5=beta1  %6=ff_bone  %7=ff_pb_3F
+        "movq    (%0),    %%mm0     \n\t" //p1
+        "movq    (%0,%2), %%mm1     \n\t" //p0
+        "movq    (%1),    %%mm2     \n\t" //q0
+        "movq    (%1,%2), %%mm3     \n\t" //q1
+        H264_DEBLOCK_MASK(%4, %5)
+        "movd      %3,    %%mm6     \n\t"
+        "punpcklbw %%mm6, %%mm6     \n\t" // each of the 4 tc0 bytes is duplicated to cover 2 bytes
+        "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
+        H264_DEBLOCK_P0_Q0(%6, %7)
+        "movq      %%mm1, (%0,%2)   \n\t"
+        "movq      %%mm2, (%1)      \n\t"
+
+        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
+           "r"(*(uint32_t*)tc0),
+           "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
+    );
+}
+
+static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); // the shared routine takes alpha-1 and beta-1
+}
+
+static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    //FIXME: could cut some load/stores by merging transpose with filter
+    DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; // 4 rows of 8 bytes scratch, row stride 8
+    transpose4x4(trans, pix-2, 8, stride); // NOTE(review): transpose4x4 is defined elsewhere; arguments look like (dst, src, dst_stride, src_stride) - confirm
+    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
+    h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); // filter the scratch copy; trans+2*8 is its third row
+    transpose4x4(pix-2, trans, stride, 8); // presumably writes the filtered scratch block back to the picture
+    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
+}
+
+// p0 = (p0 + q1 + 2*p1 + 2) >> 2, per byte; the result replaces p0, and mm4 is clobbered
+#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
+    "movq    "#p0", %%mm4  \n\t"\
+    "pxor    "#q1", %%mm4  \n\t"\
+    "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
+    "pavgb   "#q1", "#p0"  \n\t"\
+    "psubusb %%mm4, "#p0"  \n\t" /* removes the round-up pavgb applied, giving (p0+q1)>>1 */\
+    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
+
+static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
+{ // filters 8 bytes per row: reads rows pix-2*stride..pix+stride and rewrites the two middle rows where the mask allows
+    __asm__ volatile( // operands: %0=pix-2*stride  %1=pix  %2=stride  %3=alpha1  %4=beta1  %5=ff_bone
+        "movq    (%0),    %%mm0     \n\t"
+        "movq    (%0,%2), %%mm1     \n\t"
+        "movq    (%1),    %%mm2     \n\t"
+        "movq    (%1,%2), %%mm3     \n\t"
+        H264_DEBLOCK_MASK(%3, %4)
+        "movq    %%mm1,   %%mm5     \n\t" // keep the unfiltered p0
+        "movq    %%mm2,   %%mm6     \n\t" // keep the unfiltered q0
+        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
+        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
+        "psubb   %%mm5,   %%mm1     \n\t" // delta = filtered - original
+        "psubb   %%mm6,   %%mm2     \n\t"
+        "pand    %%mm7,   %%mm1     \n\t" // zero the delta where the mask is 0
+        "pand    %%mm7,   %%mm2     \n\t"
+        "paddb   %%mm5,   %%mm1     \n\t" // original + masked delta
+        "paddb   %%mm6,   %%mm2     \n\t"
+        "movq    %%mm1,   (%0,%2)   \n\t"
+        "movq    %%mm2,   (%1)      \n\t"
+        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
+           "m"(alpha1), "m"(beta1), "m"(ff_bone)
+    );
+}
+
+static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); // the shared routine takes alpha-1 and beta-1
+}
+
+static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+{
+    //FIXME: could cut some load/stores by merging transpose with filter
+    DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; // 4 rows of 8 bytes scratch, row stride 8
+    transpose4x4(trans, pix-2, 8, stride); // NOTE(review): transpose4x4 is defined elsewhere; arguments look like (dst, src, dst_stride, src_stride) - confirm
+    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
+    h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); // filter the scratch copy; trans+2*8 is its third row
+    transpose4x4(pix-2, trans, stride, 8); // presumably writes the filtered scratch block back to the picture
+    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
+}
+
+static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
+                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
+    int dir; // 1 first, then 0; MMX registers mm5..mm7 set up below stay live across the separate asm statements
+    __asm__ volatile(
+        "movq %0, %%mm7 \n" // mm7 = ff_pb_1
+        "movq %1, %%mm6 \n" // mm6 = ff_pb_3
+        ::"m"(ff_pb_1), "m"(ff_pb_3)
+    );
+    if(field)
+        __asm__ volatile(
+            "movq %0, %%mm6 \n" // field: mm6 = ff_pb_3_1 instead
+            ::"m"(ff_pb_3_1)
+        );
+    __asm__ volatile(
+        "movq  %%mm6, %%mm5 \n"
+        "paddb %%mm5, %%mm5 \n" // mm5 = 2*mm6 per byte
+    :);
+
+    // could do a special case for dir==0 && edges==1, but it only reduces the
+    // average filter time by 1.2%
+    for( dir=1; dir>=0; dir-- ) {
+        const x86_reg d_idx = dir ? -8 : -1; // offset from entry b to the neighbouring entry bn
+        const int mask_mv = dir ? mask_mv1 : mask_mv0;
+        DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
+        int b_idx, edge;
+        for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
+            __asm__ volatile(
+                "pand %0, %%mm0 \n\t" // dir==1: clears mm0; dir==0: leaves the previous mm0 unchanged
+                ::"m"(mask_dir)
+            );
+            if(!(mask_mv & edge)) { // ref/mv comparison is skipped for edges selected by mask_mv
+                if(bidir) {
+                    __asm__ volatile(
+                        "movd         (%1,%0), %%mm2 \n"
+                        "punpckldq  40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] }
+                        "pshufw $0x44,   (%1), %%mm0 \n" // { ref0[b], ref0[b] }
+                        "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] }
+                        "pshufw $0x4E, %%mm2, %%mm3 \n"
+                        "psubb         %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
+                        "psubb         %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
+                        "1: \n"
+                        "por           %%mm1, %%mm0 \n"
+                        "movq      (%2,%0,4), %%mm1 \n"
+                        "movq     8(%2,%0,4), %%mm2 \n"
+                        "movq          %%mm1, %%mm3 \n"
+                        "movq          %%mm2, %%mm4 \n"
+                        "psubw          (%2), %%mm1 \n"
+                        "psubw         8(%2), %%mm2 \n"
+                        "psubw       160(%2), %%mm3 \n"
+                        "psubw       168(%2), %%mm4 \n"
+                        "packsswb      %%mm2, %%mm1 \n"
+                        "packsswb      %%mm4, %%mm3 \n"
+                        "paddb         %%mm6, %%mm1 \n"
+                        "paddb         %%mm6, %%mm3 \n"
+                        "psubusb       %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
+                        "psubusb       %%mm5, %%mm3 \n"
+                        "packsswb      %%mm3, %%mm1 \n"
+                        "add $40, %0 \n" // second pass of the loop at label 1 uses the entries 40 further on
+                        "cmp $40, %0 \n"
+                        "jl 1b \n"
+                        "sub $80, %0 \n" // restore %0 to d_idx
+                        "pshufw $0x4E, %%mm1, %%mm1 \n"
+                        "por           %%mm1, %%mm0 \n"
+                        "pshufw $0x4E, %%mm0, %%mm1 \n"
+                        "pminub        %%mm1, %%mm0 \n"
+                        ::"r"(d_idx),
+                          "r"(ref[0]+b_idx),
+                          "r"(mv[0]+b_idx)
+                    );
+                } else {
+                    __asm__ volatile(
+                        "movd        (%1), %%mm0 \n"
+                        "psubb    (%1,%0), %%mm0 \n" // ref[b] != ref[bn]
+                        "movq        (%2), %%mm1 \n"
+                        "movq       8(%2), %%mm2 \n"
+                        "psubw  (%2,%0,4), %%mm1 \n"
+                        "psubw 8(%2,%0,4), %%mm2 \n"
+                        "packsswb   %%mm2, %%mm1 \n"
+                        "paddb      %%mm6, %%mm1 \n"
+                        "psubusb    %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
+                        "packsswb   %%mm1, %%mm1 \n"
+                        "por        %%mm1, %%mm0 \n"
+                        ::"r"(d_idx),
+                          "r"(ref[0]+b_idx),
+                          "r"(mv[0]+b_idx)
+                    );
+                }
+            }
+            __asm__ volatile(
+                "movd %0, %%mm1 \n"
+                "por  %1, %%mm1 \n" // nnz[b] || nnz[bn]
+                ::"m"(nnz[b_idx]),
+                  "m"(nnz[b_idx+d_idx])
+            );
+            __asm__ volatile(
+                "pminub    %%mm7, %%mm1 \n" // clamp both flag sets to at most mm7 (ff_pb_1) per byte
+                "pminub    %%mm7, %%mm0 \n"
+                "psllw        $1, %%mm1 \n" // nnz flag now counts double
+                "pxor      %%mm2, %%mm2 \n"
+                "pmaxub    %%mm0, %%mm1 \n"
+                "punpcklbw %%mm2, %%mm1 \n" // widen the low 4 bytes to 4 words
+                "movq      %%mm1, %0    \n" // store 4 int16 values at bS[dir][edge]
+                :"=m"(*bS[dir][edge])
+                ::"memory"
+            );
+        }
+        edges = 4; // the remaining dir==0 pass always walks 4 edges with step 1
+        step = 1;
+    }
+    __asm__ volatile( // NOTE(review): TRANSPOSE4 is defined elsewhere; this presumably transposes the 4x4 words of bS[0] in place - confirm
+        "movq   (%0), %%mm0 \n\t"
+        "movq  8(%0), %%mm1 \n\t"
+        "movq 16(%0), %%mm2 \n\t"
+        "movq 24(%0), %%mm3 \n\t"
+        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
+        "movq %%mm0,   (%0) \n\t"
+        "movq %%mm3,  8(%0) \n\t"
+        "movq %%mm4, 16(%0) \n\t"
+        "movq %%mm2, 24(%0) \n\t"
+        ::"r"(bS[0])
+        :"memory"
+    );
+}
+
+/***********************************/
+/* motion compensation */
+
+#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q) /* one output row: T = (A + F + %5 + %4*(4*(C+D) - B - E)) >> 5; F is loaded from (%0), Z must be zero */\
+        "mov"#q" "#C", "#T"         \n\t"\
+        "mov"#d" (%0), "#F"         \n\t"\
+        "paddw "#D", "#T"           \n\t"\
+        "psllw $2, "#T"             \n\t"\
+        "psubw "#B", "#T"           \n\t"\
+        "psubw "#E", "#T"           \n\t"\
+        "punpcklbw "#Z", "#F"       \n\t" /* widen the freshly loaded row to words */\
+        "pmullw %4, "#T"            \n\t"\
+        "paddw %5, "#A"             \n\t"\
+        "add %2, %0                 \n\t" /* advance the source pointer by operand %2 */\
+        "paddw "#F", "#A"           \n\t"\
+        "paddw "#A", "#T"           \n\t"\
+        "psraw $5, "#T"             \n\t"\
+        "packuswb "#T", "#T"        \n\t"\
+        OP(T, (%1), A, d)\
+        "add %3, %1                 \n\t" /* advance the destination pointer by operand %3 */
+
+#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q) /* like QPEL_H264V_MM but without shift/pack: stores the word sums T at offset OF from %1 */\
+        "mov"#q" "#C", "#T"         \n\t"\
+        "mov"#d" (%0), "#F"         \n\t"\
+        "paddw "#D", "#T"           \n\t"\
+        "psllw $2, "#T"             \n\t"\
+        "paddw %4, "#A"             \n\t"\
+        "psubw "#B", "#T"           \n\t"\
+        "psubw "#E", "#T"           \n\t"\
+        "punpcklbw "#Z", "#F"       \n\t"\
+        "pmullw %3, "#T"            \n\t" /* note: the multiplier is operand %3 here and the addend %4 (they are %4 and %5 in QPEL_H264V_MM) */\
+        "paddw "#F", "#A"           \n\t"\
+        "add %2, %0                 \n\t"\
+        "paddw "#A", "#T"           \n\t"\
+        "mov"#q" "#T", "#OF"(%1)    \n\t"
+
+#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) /* MMX form: T=mm6, Z=mm7, movd load, movq copy */
+#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) /* MMX form: T=mm6, Z=mm7, movd load, movq copy/store */
+#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) /* SSE form: T=xmm6, Z=xmm7, movq load, movdqa copy */
+#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) /* SSE form: T=xmm6, Z=xmm7, movq load, movdqa copy/store */
+
+
+#define QPEL_H264(OPNAME, OP, MMX) /* generates five OPNAME-prefixed, MMX-suffixed helpers; OP is the macro used to write each result */\
+/* 8 rows x 8 bytes: horizontal 6-tap sum from src (reads src-2..src+10), >>5, averaged with the src2 row, written via OP */\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    int h=8;\
+    __asm__ volatile(\
+        "pxor %%mm7, %%mm7          \n\t"\
+        "movq %0, %%mm6             \n\t"\
+        :: "m"(ff_pw_5)\
+    );\
+    do{\
+    __asm__ volatile(\
+        "movq    (%0), %%mm0        \n\t"\
+        "movq   1(%0), %%mm2        \n\t"\
+        "movq %%mm0, %%mm1          \n\t"\
+        "movq %%mm2, %%mm3          \n\t"\
+        "punpcklbw %%mm7, %%mm0     \n\t"\
+        "punpckhbw %%mm7, %%mm1     \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpckhbw %%mm7, %%mm3     \n\t"\
+        "paddw %%mm2, %%mm0         \n\t"\
+        "paddw %%mm3, %%mm1         \n\t"\
+        "psllw $2, %%mm0            \n\t"\
+        "psllw $2, %%mm1            \n\t"\
+        "movq   -1(%0), %%mm2       \n\t"\
+        "movq    2(%0), %%mm4       \n\t"\
+        "movq %%mm2, %%mm3          \n\t"\
+        "movq %%mm4, %%mm5          \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpckhbw %%mm7, %%mm3     \n\t"\
+        "punpcklbw %%mm7, %%mm4     \n\t"\
+        "punpckhbw %%mm7, %%mm5     \n\t"\
+        "paddw %%mm4, %%mm2         \n\t"\
+        "paddw %%mm3, %%mm5         \n\t"\
+        "psubw %%mm2, %%mm0         \n\t"\
+        "psubw %%mm5, %%mm1         \n\t"\
+        "pmullw %%mm6, %%mm0        \n\t"\
+        "pmullw %%mm6, %%mm1        \n\t"\
+        "movd   -2(%0), %%mm2       \n\t"\
+        "movd    7(%0), %%mm5       \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpcklbw %%mm7, %%mm5     \n\t"\
+        "paddw %%mm3, %%mm2         \n\t"\
+        "paddw %%mm5, %%mm4         \n\t"\
+        "movq %5, %%mm5             \n\t"\
+        "paddw %%mm5, %%mm2         \n\t"\
+        "paddw %%mm5, %%mm4         \n\t"\
+        "paddw %%mm2, %%mm0         \n\t"\
+        "paddw %%mm4, %%mm1         \n\t"\
+        "psraw $5, %%mm0            \n\t"\
+        "psraw $5, %%mm1            \n\t"\
+        "movq (%2), %%mm4           \n\t"\
+        "packuswb %%mm1, %%mm0      \n\t"\
+        PAVGB" %%mm4, %%mm0         \n\t"\
+        OP(%%mm0, (%1),%%mm5, q)\
+        "add %4, %0                 \n\t" /* src and dst both advance by dstStride (operand %4) */\
+        "add %4, %1                 \n\t"\
+        "add %3, %2                 \n\t"\
+        : "+a"(src), "+c"(dst), "+d"(src2)\
+        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+          "m"(ff_pw_16)\
+        : "memory"\
+    );\
+    }while(--h);\
+}\
+/* second (horizontal) stage over 16-bit intermediates in tmp: rows are 48 bytes apart, 8 output bytes per row, size rows per column strip */\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+    int w = size>>4; /* the do/while below runs w+1 times: once for size 8, twice for size 16 */\
+    do{\
+    int h = size;\
+    __asm__ volatile(\
+        "1:                         \n\t"\
+        "movq     (%0), %%mm0       \n\t"\
+        "movq    8(%0), %%mm3       \n\t"\
+        "movq    2(%0), %%mm1       \n\t"\
+        "movq   10(%0), %%mm4       \n\t"\
+        "paddw   %%mm4, %%mm0       \n\t"\
+        "paddw   %%mm3, %%mm1       \n\t"\
+        "paddw  18(%0), %%mm3       \n\t"\
+        "paddw  16(%0), %%mm4       \n\t"\
+        "movq    4(%0), %%mm2       \n\t"\
+        "movq   12(%0), %%mm5       \n\t"\
+        "paddw   6(%0), %%mm2       \n\t"\
+        "paddw  14(%0), %%mm5       \n\t"\
+        "psubw %%mm1, %%mm0         \n\t"\
+        "psubw %%mm4, %%mm3         \n\t"\
+        "psraw $2, %%mm0            \n\t"\
+        "psraw $2, %%mm3            \n\t"\
+        "psubw %%mm1, %%mm0         \n\t"\
+        "psubw %%mm4, %%mm3         \n\t"\
+        "paddsw %%mm2, %%mm0        \n\t"\
+        "paddsw %%mm5, %%mm3        \n\t"\
+        "psraw $2, %%mm0            \n\t"\
+        "psraw $2, %%mm3            \n\t"\
+        "paddw %%mm2, %%mm0         \n\t"\
+        "paddw %%mm5, %%mm3         \n\t"\
+        "psraw $6, %%mm0            \n\t"\
+        "psraw $6, %%mm3            \n\t"\
+        "packuswb %%mm3, %%mm0      \n\t"\
+        OP(%%mm0, (%1),%%mm7, q)\
+        "add $48, %0                \n\t"\
+        "add %3, %1                 \n\t"\
+        "decl %2                    \n\t"\
+        " jnz 1b                    \n\t"\
+        : "+a"(tmp), "+c"(dst), "+g"(h)\
+        : "S"((x86_reg)dstStride)\
+        : "memory"\
+    );\
+    tmp += 8 - size*24; /* back to the first row (24 int16 = 48 bytes per row), 8 elements to the right */\
+    dst += 8 - size*dstStride;\
+    }while(w--);\
+}\
+/* 16x16 variant built from four 8x8 calls; src is advanced by dstStride, matching the 8-wide routine's own stepping */\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+    src += 8*dstStride;\
+    dst += 8*dstStride;\
+    src2 += 8*src2Stride;\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}\
+static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{ /* two rows per iteration: (src16 >> 5) packed to bytes, averaged with src8, written via OP; h is expected to be even */\
+    do{\
+    __asm__ volatile(\
+        "movq      (%1), %%mm0          \n\t"\
+        "movq     8(%1), %%mm1          \n\t"\
+        "movq    48(%1), %%mm2          \n\t"\
+        "movq  8+48(%1), %%mm3          \n\t"\
+        "psraw      $5,  %%mm0          \n\t"\
+        "psraw      $5,  %%mm1          \n\t"\
+        "psraw      $5,  %%mm2          \n\t"\
+        "psraw      $5,  %%mm3          \n\t"\
+        "packuswb %%mm1, %%mm0          \n\t"\
+        "packuswb %%mm3, %%mm2          \n\t"\
+        PAVGB"     (%0), %%mm0          \n\t"\
+        PAVGB"  (%0,%3), %%mm2          \n\t"\
+        OP(%%mm0, (%2), %%mm5, q)\
+        OP(%%mm2, (%2,%4), %%mm5, q)\
+        ::"a"(src8), "c"(src16), "d"(dst),\
+          "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
+        :"memory");\
+        src8 += 2L*src8Stride;\
+        src16 += 48; /* 48 int16 = two 48-byte rows */\
+        dst += 2L*dstStride;\
+    }while(h-=2);\
+}\
+static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{ /* 16-wide version: the 8-wide routine applied to the left and right halves */\
+    OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
+    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
+}\
+
+
+#if ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX) /* x86-64 only: single 16-wide loop using xmm0..xmm15 */\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    int h=16;\
+    __asm__ volatile(\
+        "pxor %%xmm15, %%xmm15      \n\t"\
+        "movdqa %6, %%xmm14         \n\t" /* xmm14 = ff_pw_5 */\
+        "movdqa %7, %%xmm13         \n\t" /* xmm13 = ff_pw_16 */\
+        "1:                         \n\t"\
+        "lddqu    6(%0), %%xmm1     \n\t"\
+        "lddqu   -2(%0), %%xmm7     \n\t"\
+        "movdqa  %%xmm1, %%xmm0     \n\t"\
+        "punpckhbw %%xmm15, %%xmm1  \n\t"\
+        "punpcklbw %%xmm15, %%xmm0  \n\t"\
+        "punpcklbw %%xmm15, %%xmm7  \n\t"\
+        "movdqa  %%xmm1, %%xmm2     \n\t"\
+        "movdqa  %%xmm0, %%xmm6     \n\t"\
+        "movdqa  %%xmm1, %%xmm3     \n\t"\
+        "movdqa  %%xmm0, %%xmm8     \n\t"\
+        "movdqa  %%xmm1, %%xmm4     \n\t"\
+        "movdqa  %%xmm0, %%xmm9     \n\t"\
+        "movdqa  %%xmm0, %%xmm12    \n\t"\
+        "movdqa  %%xmm1, %%xmm11    \n\t"\
+        "palignr $10,%%xmm0, %%xmm11\n\t"\
+        "palignr $10,%%xmm7, %%xmm12\n\t"\
+        "palignr $2, %%xmm0, %%xmm4 \n\t"\
+        "palignr $2, %%xmm7, %%xmm9 \n\t"\
+        "palignr $4, %%xmm0, %%xmm3 \n\t"\
+        "palignr $4, %%xmm7, %%xmm8 \n\t"\
+        "palignr $6, %%xmm0, %%xmm2 \n\t"\
+        "palignr $6, %%xmm7, %%xmm6 \n\t"\
+        "paddw   %%xmm0 ,%%xmm11    \n\t"\
+        "palignr $8, %%xmm0, %%xmm1 \n\t"\
+        "palignr $8, %%xmm7, %%xmm0 \n\t"\
+        "paddw   %%xmm12,%%xmm7     \n\t"\
+        "paddw   %%xmm3, %%xmm2     \n\t"\
+        "paddw   %%xmm8, %%xmm6     \n\t"\
+        "paddw   %%xmm4, %%xmm1     \n\t"\
+        "paddw   %%xmm9, %%xmm0     \n\t"\
+        "psllw   $2,     %%xmm2     \n\t"\
+        "psllw   $2,     %%xmm6     \n\t"\
+        "psubw   %%xmm1, %%xmm2     \n\t"\
+        "psubw   %%xmm0, %%xmm6     \n\t"\
+        "paddw   %%xmm13,%%xmm11    \n\t"\
+        "paddw   %%xmm13,%%xmm7     \n\t"\
+        "pmullw  %%xmm14,%%xmm2     \n\t"\
+        "pmullw  %%xmm14,%%xmm6     \n\t"\
+        "lddqu   (%2),   %%xmm3     \n\t"\
+        "paddw   %%xmm11,%%xmm2     \n\t"\
+        "paddw   %%xmm7, %%xmm6     \n\t"\
+        "psraw   $5,     %%xmm2     \n\t"\
+        "psraw   $5,     %%xmm6     \n\t"\
+        "packuswb %%xmm2,%%xmm6     \n\t"\
+        "pavgb   %%xmm3, %%xmm6     \n\t" /* average with the src2 row */\
+        OP(%%xmm6, (%1), %%xmm4, dqa)\
+        "add %5, %0                 \n\t" /* src and dst both advance by dstStride (operand %5) */\
+        "add %5, %1                 \n\t"\
+        "add %4, %2                 \n\t"\
+        "decl %3                    \n\t"\
+        "jg 1b                      \n\t"\
+        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
+        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+          "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+    );\
+}
+#else // ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX) /* fallback: the 16x16 case is split into four calls of the 8x8 routine */\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+    src += 8*dstStride;\
+    dst += 8*dstStride;\
+    src2 += 8*src2Stride;\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}
+#endif // ARCH_X86_64
+
+#define QPEL_H264_H_XMM(OPNAME, OP, MMX) /* horizontal lowpass helpers using xmm registers and palignr; also instantiates QPEL_H264_H16_XMM */\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    int h=8;\
+    __asm__ volatile(\
+        "pxor %%xmm7, %%xmm7        \n\t"\
+        "movdqa %0, %%xmm6          \n\t"\
+        :: "m"(ff_pw_5)\
+    );\
+    do{\
+    __asm__ volatile(\
+        "lddqu   -2(%0), %%xmm1     \n\t" /* 16 source bytes starting at src-2 */\
+        "movdqa  %%xmm1, %%xmm0     \n\t"\
+        "punpckhbw %%xmm7, %%xmm1   \n\t"\
+        "punpcklbw %%xmm7, %%xmm0   \n\t"\
+        "movdqa  %%xmm1, %%xmm2     \n\t"\
+        "movdqa  %%xmm1, %%xmm3     \n\t"\
+        "movdqa  %%xmm1, %%xmm4     \n\t"\
+        "movdqa  %%xmm1, %%xmm5     \n\t"\
+        "palignr $2, %%xmm0, %%xmm4 \n\t"\
+        "palignr $4, %%xmm0, %%xmm3 \n\t"\
+        "palignr $6, %%xmm0, %%xmm2 \n\t"\
+        "palignr $8, %%xmm0, %%xmm1 \n\t"\
+        "palignr $10,%%xmm0, %%xmm5 \n\t"\
+        "paddw   %%xmm5, %%xmm0     \n\t"\
+        "paddw   %%xmm3, %%xmm2     \n\t"\
+        "paddw   %%xmm4, %%xmm1     \n\t"\
+        "psllw   $2,     %%xmm2     \n\t"\
+        "movq    (%2),   %%xmm3     \n\t"\
+        "psubw   %%xmm1, %%xmm2     \n\t"\
+        "paddw   %5,     %%xmm0     \n\t"\
+        "pmullw  %%xmm6, %%xmm2     \n\t"\
+        "paddw   %%xmm0, %%xmm2     \n\t"\
+        "psraw   $5,     %%xmm2     \n\t"\
+        "packuswb %%xmm2, %%xmm2    \n\t"\
+        "pavgb   %%xmm3, %%xmm2     \n\t" /* average with the src2 row */\
+        OP(%%xmm2, (%1), %%xmm4, q)\
+        "add %4, %0                 \n\t" /* src and dst both advance by dstStride (operand %4) */\
+        "add %4, %1                 \n\t"\
+        "add %3, %2                 \n\t"\
+        : "+a"(src), "+c"(dst), "+d"(src2)\
+        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+          "m"(ff_pw_16)\
+        : "memory"\
+    );\
+    }while(--h);\
+}\
+QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+/* same filter without the src2 average; src and dst have separate strides here */\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    int h=8;\
+    __asm__ volatile(\
+        "pxor %%xmm7, %%xmm7        \n\t"\
+        "movdqa %5, %%xmm6          \n\t" /* xmm6 = ff_pw_5 */\
+        "1:                         \n\t"\
+        "lddqu   -2(%0), %%xmm1     \n\t"\
+        "movdqa  %%xmm1, %%xmm0     \n\t"\
+        "punpckhbw %%xmm7, %%xmm1   \n\t"\
+        "punpcklbw %%xmm7, %%xmm0   \n\t"\
+        "movdqa  %%xmm1, %%xmm2     \n\t"\
+        "movdqa  %%xmm1, %%xmm3     \n\t"\
+        "movdqa  %%xmm1, %%xmm4     \n\t"\
+        "movdqa  %%xmm1, %%xmm5     \n\t"\
+        "palignr $2, %%xmm0, %%xmm4 \n\t"\
+        "palignr $4, %%xmm0, %%xmm3 \n\t"\
+        "palignr $6, %%xmm0, %%xmm2 \n\t"\
+        "palignr $8, %%xmm0, %%xmm1 \n\t"\
+        "palignr $10,%%xmm0, %%xmm5 \n\t"\
+        "paddw   %%xmm5, %%xmm0     \n\t"\
+        "paddw   %%xmm3, %%xmm2     \n\t"\
+        "paddw   %%xmm4, %%xmm1     \n\t"\
+        "psllw   $2,     %%xmm2     \n\t"\
+        "psubw   %%xmm1, %%xmm2     \n\t"\
+        "paddw   %6,     %%xmm0     \n\t" /* add ff_pw_16 before the >>5 */\
+        "pmullw  %%xmm6, %%xmm2     \n\t"\
+        "paddw   %%xmm0, %%xmm2     \n\t"\
+        "psraw   $5,     %%xmm2     \n\t"\
+        "packuswb %%xmm2, %%xmm2    \n\t"\
+        OP(%%xmm2, (%1), %%xmm4, q)\
+        "add %3, %0                 \n\t"\
+        "add %4, %1                 \n\t"\
+        "decl %2                    \n\t"\
+        " jnz 1b                    \n\t"\
+        : "+a"(src), "+c"(dst), "+g"(h)\
+        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
+          "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+    );\
+}\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride); /* 16x16 as four 8x8 quadrants */\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+    src += 8*srcStride;\
+    dst += 8*dstStride;\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define QPEL_H264_V_XMM(OPNAME, OP, MMX) /* generates the vertical lowpass routines (8 or 16 rows, 8-byte-wide column) for one OPNAME/OP pair */\
+static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+    src -= 2*srcStride; /* start two rows above the first output row */\
+    \
+    __asm__ volatile(\
+        "pxor %%xmm7, %%xmm7        \n\t" /* xmm7 = 0 for byte->word widening */\
+        "movq (%0), %%xmm0          \n\t" /* load five consecutive 8-byte rows into xmm0..xmm4 */\
+        "add %2, %0                 \n\t" /* src += srcStride */\
+        "movq (%0), %%xmm1          \n\t"\
+        "add %2, %0                 \n\t"\
+        "movq (%0), %%xmm2          \n\t"\
+        "add %2, %0                 \n\t"\
+        "movq (%0), %%xmm3          \n\t"\
+        "add %2, %0                 \n\t"\
+        "movq (%0), %%xmm4          \n\t"\
+        "add %2, %0                 \n\t"\
+        "punpcklbw %%xmm7, %%xmm0   \n\t" /* widen the five rows to 16-bit words */\
+        "punpcklbw %%xmm7, %%xmm1   \n\t"\
+        "punpcklbw %%xmm7, %%xmm2   \n\t"\
+        "punpcklbw %%xmm7, %%xmm3   \n\t"\
+        "punpcklbw %%xmm7, %%xmm4   \n\t"\
+        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP) /* helper defined outside this view; presumably one output row per call - confirm */\
+        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP) /* register arguments rotate by one on every call */\
+        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+         \
+        : "+a"(src), "+c"(dst)\
+        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+    );\
+    if(h==16){ /* eight more helper invocations for the 16-row case */\
+        __asm__ volatile( /* NOTE(review): loads no xmm register itself, so it relies on xmm0-xmm5 still holding what the previous asm statement left */\
+            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+            \
+            : "+a"(src), "+c"(dst)\
+            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+            : "memory"\
+        );\
+    }\
+}\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* single column with h = 8 */\
+    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
+}\
+static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* two side-by-side columns (offset 0 and 8) with h = 16 */\
+    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
+    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}
+
+static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ /* first (vertical) pass of the HV filter, writing 16-bit values to tmp; tmpStride is not used in this body */
+    int w = (size+8)>>3; /* number of 8-column strips: 2 for size 8, 3 for size 16 */
+    src -= 2*srcStride+2; /* start two rows up and two pixels left of the block */
+    while(w--){
+        __asm__ volatile(
+            "pxor %%xmm7, %%xmm7        \n\t" /* xmm7 = 0 for byte->word widening */
+            "movq (%0), %%xmm0          \n\t" /* load five consecutive 8-byte rows into xmm0..xmm4 */
+            "add %2, %0                 \n\t" /* src += srcStride */
+            "movq (%0), %%xmm1          \n\t"
+            "add %2, %0                 \n\t"
+            "movq (%0), %%xmm2          \n\t"
+            "add %2, %0                 \n\t"
+            "movq (%0), %%xmm3          \n\t"
+            "add %2, %0                 \n\t"
+            "movq (%0), %%xmm4          \n\t"
+            "add %2, %0                 \n\t"
+            "punpcklbw %%xmm7, %%xmm0   \n\t" /* widen the five rows to 16-bit words */
+            "punpcklbw %%xmm7, %%xmm1   \n\t"
+            "punpcklbw %%xmm7, %%xmm2   \n\t"
+            "punpcklbw %%xmm7, %%xmm3   \n\t"
+            "punpcklbw %%xmm7, %%xmm4   \n\t"
+            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) /* helper defined outside this view; last argument is presumably a byte offset into tmp (row k at k*48) - confirm */
+            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) /* register arguments rotate by one on every call */
+            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
+            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
+            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
+            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
+            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
+            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
+            : "+a"(src)
+            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+            : "memory"
+        );
+        if(size==16){ /* rows 8..15 of the 16-row case */
+            __asm__ volatile( /* NOTE(review): loads no xmm register itself, so it relies on xmm0-xmm5 still holding what the previous asm statement left */
+                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
+                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
+                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
+                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
+                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
+                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
+                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
+                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
+                : "+a"(src)
+                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+                : "memory"
+            );
+        }
+        tmp += 8; /* next strip: 8 int16 entries (16 bytes) further into tmp */
+        src += 8 - (size+5)*srcStride; /* 8 pixels right and (size+5) rows back up; presumably undoes the row advance made by the asm - confirm against QPEL_H264HV_XMM */
+    }
+}
+
+#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX) /* generates the second (horizontal) pass of the HV filter, reading 16-bit rows from tmp */\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){ /* tmpStride is not used in this body; tmp rows are stepped by a fixed 48 bytes */\
+    int h = size; /* row counter, decremented by the asm loops */\
+    if(size == 16){\
+        __asm__ volatile(\
+            "1:                         \n\t"\
+            "movdqa 32(%0), %%xmm4      \n\t" /* xmm4 = words 16..23 of the tmp row */\
+            "movdqa 16(%0), %%xmm5      \n\t" /* xmm5 = words 8..15 */\
+            "movdqa   (%0), %%xmm7      \n\t" /* xmm7 = words 0..7 */\
+            "movdqa %%xmm4, %%xmm3      \n\t"\
+            "movdqa %%xmm4, %%xmm2      \n\t"\
+            "movdqa %%xmm4, %%xmm1      \n\t"\
+            "movdqa %%xmm4, %%xmm0      \n\t"\
+            "palignr $10, %%xmm5, %%xmm0 \n\t" /* upper half: shifted windows w13.., w12.., w11.., w10.., w9.. */\
+            "palignr  $8, %%xmm5, %%xmm1 \n\t"\
+            "palignr  $6, %%xmm5, %%xmm2 \n\t"\
+            "palignr  $4, %%xmm5, %%xmm3 \n\t"\
+            "palignr  $2, %%xmm5, %%xmm4 \n\t"\
+            "paddw  %%xmm5, %%xmm0      \n\t" /* xmm0 = a = w[i] + w[i+5] for outputs 8..15 */\
+            "paddw  %%xmm4, %%xmm1      \n\t" /* xmm1 = b = w[i+1] + w[i+4] */\
+            "paddw  %%xmm3, %%xmm2      \n\t" /* xmm2 = c = w[i+2] + w[i+3] */\
+            "movdqa %%xmm5, %%xmm6      \n\t"\
+            "movdqa %%xmm5, %%xmm4      \n\t"\
+            "movdqa %%xmm5, %%xmm3      \n\t"\
+            "palignr  $8, %%xmm7, %%xmm4 \n\t" /* lower half: same windows built from words 0..15 */\
+            "palignr  $2, %%xmm7, %%xmm6 \n\t"\
+            "palignr $10, %%xmm7, %%xmm3 \n\t"\
+            "paddw  %%xmm6, %%xmm4      \n\t" /* xmm4 = b for outputs 0..7 */\
+            "movdqa %%xmm5, %%xmm6      \n\t"\
+            "palignr  $6, %%xmm7, %%xmm5 \n\t"\
+            "palignr  $4, %%xmm7, %%xmm6 \n\t"\
+            "paddw  %%xmm7, %%xmm3      \n\t" /* xmm3 = a for outputs 0..7 */\
+            "paddw  %%xmm6, %%xmm5      \n\t" /* xmm5 = c for outputs 0..7 */\
+            \
+            "psubw  %%xmm1, %%xmm0      \n\t" /* both halves in lockstep: ((((a-b)>>2) - b + c) >> 2) + c, then >> 6 */\
+            "psubw  %%xmm4, %%xmm3      \n\t"\
+            "psraw      $2, %%xmm0      \n\t"\
+            "psraw      $2, %%xmm3      \n\t"\
+            "psubw  %%xmm1, %%xmm0      \n\t"\
+            "psubw  %%xmm4, %%xmm3      \n\t"\
+            "paddw  %%xmm2, %%xmm0      \n\t"\
+            "paddw  %%xmm5, %%xmm3      \n\t"\
+            "psraw      $2, %%xmm0      \n\t"\
+            "psraw      $2, %%xmm3      \n\t"\
+            "paddw  %%xmm2, %%xmm0      \n\t"\
+            "paddw  %%xmm5, %%xmm3      \n\t"\
+            "psraw      $6, %%xmm0      \n\t"\
+            "psraw      $6, %%xmm3      \n\t"\
+            "packuswb %%xmm0, %%xmm3    \n\t" /* xmm3 = 16 saturated bytes: outputs 0..7 low, 8..15 high */\
+            OP(%%xmm3, (%1), %%xmm7, dqa) /* OP is a macro parameter; dqa size suffix, so it accesses dst as an aligned 16-byte row */\
+            "add $48, %0                \n\t" /* tmp advances 48 bytes per row */\
+            "add %3, %1                 \n\t" /* dst += dstStride */\
+            "decl %2                    \n\t" /* h-- */\
+            " jnz 1b                    \n\t"\
+            : "+a"(tmp), "+c"(dst), "+g"(h)\
+            : "S"((x86_reg)dstStride)\
+            : "memory"\
+        );\
+    }else{ /* 8-wide case: a single 8-output group per row */\
+        __asm__ volatile(\
+            "1:                         \n\t"\
+            "movdqa 16(%0), %%xmm1      \n\t" /* xmm1 = words 8..15 of the tmp row */\
+            "movdqa   (%0), %%xmm0      \n\t" /* xmm0 = words 0..7 */\
+            "movdqa %%xmm1, %%xmm2      \n\t"\
+            "movdqa %%xmm1, %%xmm3      \n\t"\
+            "movdqa %%xmm1, %%xmm4      \n\t"\
+            "movdqa %%xmm1, %%xmm5      \n\t"\
+            "palignr $10, %%xmm0, %%xmm5 \n\t" /* xmm5 = w5..w12 */\
+            "palignr  $8, %%xmm0, %%xmm4 \n\t" /* xmm4 = w4..w11 */\
+            "palignr  $6, %%xmm0, %%xmm3 \n\t" /* xmm3 = w3..w10 */\
+            "palignr  $4, %%xmm0, %%xmm2 \n\t" /* xmm2 = w2..w9 */\
+            "palignr  $2, %%xmm0, %%xmm1 \n\t" /* xmm1 = w1..w8 */\
+            "paddw  %%xmm5, %%xmm0      \n\t" /* xmm0 = a = w[i] + w[i+5] */\
+            "paddw  %%xmm4, %%xmm1      \n\t" /* xmm1 = b = w[i+1] + w[i+4] */\
+            "paddw  %%xmm3, %%xmm2      \n\t" /* xmm2 = c = w[i+2] + w[i+3] */\
+            "psubw  %%xmm1, %%xmm0      \n\t" /* ((((a-b)>>2) - b + c) >> 2) + c, then >> 6 */\
+            "psraw      $2, %%xmm0      \n\t"\
+            "psubw  %%xmm1, %%xmm0      \n\t"\
+            "paddw  %%xmm2, %%xmm0      \n\t"\
+            "psraw      $2, %%xmm0      \n\t"\
+            "paddw  %%xmm2, %%xmm0      \n\t"\
+            "psraw      $6, %%xmm0      \n\t"\
+            "packuswb %%xmm0, %%xmm0    \n\t" /* saturate the 8 words to unsigned bytes */\
+            OP(%%xmm0, (%1), %%xmm7, q) /* q size suffix: 8-byte row */\
+            "add $48, %0                \n\t" /* tmp advances 48 bytes per row */\
+            "add %3, %1                 \n\t" /* dst += dstStride */\
+            "decl %2                    \n\t" /* h-- */\
+            " jnz 1b                    \n\t"\
+            : "+a"(tmp), "+c"(dst), "+g"(h)\
+            : "S"((x86_reg)dstStride)\
+            : "memory"\
+        );\
+    }\
+}
+
+#define QPEL_H264_HV_XMM(OPNAME, OP, MMX) /* generates the combined two-pass HV lowpass entry points for sizes 8 and 16 */\
+static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size); /* first pass is always the put_/sse2 routine, whatever OPNAME and MMX are */\
+    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size); /* second pass reads tmp and applies OPNAME to dst */\
+}\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* size = 8 wrapper */\
+    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
+}\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* size = 16 wrapper */\
+    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
+}\
+
+#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 /* the _sse2/_ssse3 pixels*_l2 names are plain aliases of the mmx2 routines, so the token-pasting macros can use one suffix */
+#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
+#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
+#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
+
+#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 /* same aliasing for the _l2_shift5 helpers */
+#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
+#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
+#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
+
+#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 /* sse2 h_lowpass_l2 names map to the mmx2 routines */
+#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
+#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
+#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
+
+#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 /* ssse3 vertical filters map to the sse2 ones generated by QPEL_H264_V_XMM */
+#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
+#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
+#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
+
+#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 /* sse2 HV second pass maps to the mmx2 routine */
+#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
+
+#define H264_MC(OPNAME, SIZE, MMX, ALIGN) /* expands all four motion-compensation groups (C, V, H, HV) for one OPNAME/SIZE/MMX combination */ \
+H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
+
+// static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+//     put_pixels16_sse2(dst, src, stride, 16);
+// }
+// static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+//     avg_pixels16_sse2(dst, src, stride, 16);
+// }
+#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 /* the 8-wide mc00 sse2 names are aliases of the mmx2 functions */
+#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
+
+#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) /* mc00: no filtering, forwards to OPNAME##pixels##SIZE##_##MMX with SIZE as the row count; ALIGN is unused here */ \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
+}\
+
+#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) /* mc10/mc20/mc30: horizontal-only positions; ALIGN is unused here */ \
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride); /* _l2 routine is defined elsewhere; presumably blends the filtered row with the second source (here src itself) - confirm */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride); /* plain horizontal lowpass, same stride for dst and src */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride); /* as mc10, but the second source is one pixel to the right */\
+}\
+
+#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) /* mc01/mc02/mc03: vertical-only positions */ \
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE]; /* scratch block with row pitch SIZE */\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride); /* vertical lowpass into temp (always the put_ variant) */\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE); /* _l2 helper defined elsewhere; presumably combines src and temp into dst - confirm */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride); /* plain vertical lowpass straight into dst */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE); /* as mc01, but paired with the source one row below */\
+}\
+
+#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) /* diagonal and centre positions (mc11..mc33, mc22, mc21/23/12/32) built from the v/h/hv lowpass routines */ \
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE]; /* scratch block with row pitch SIZE */\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride); /* vertical lowpass of src into temp */\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE); /* horizontal lowpass of src paired with temp (the _l2 routine is defined elsewhere) */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride); /* vertical pass taken one pixel to the right */\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE); /* horizontal pass taken one row below */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride); /* vertical pass one pixel right ... */\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE); /* ... horizontal pass one row below */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)]; /* 16-bit scratch for the intermediate pass */\
+    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride); /* two-pass HV lowpass straight into dst */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]; /* one buffer: SIZE*SIZE bytes of halfHV followed by the 16-bit halfV area */\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    assert(((uintptr_t)temp & 7) == 0); /* temp must be 8-byte aligned; uintptr_t avoids truncating the pointer on 64-bit targets */\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride); /* halfV receives the intermediate pass, halfHV the final HV result */\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    assert(((uintptr_t)temp & 7) == 0); /* same alignment check as in mc21 */\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE); /* as mc21, but the horizontal pass starts one row below */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    assert(((uintptr_t)temp & 7) == 0); /* same alignment check as in mc21 */\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE); /* pairs the 16-bit intermediate at column offset 2 with halfHV (helper defined elsewhere) */\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    assert(((uintptr_t)temp & 7) == 0); /* same alignment check as in mc21 */\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE); /* as mc12, with the intermediate taken at column offset 3 */\
+}\
+
+#define H264_MC_4816(MMX) /* expands H264_MC for put_ and avg_ at sizes 4, 8 and 16, all with ALIGN = 8 */\
+H264_MC(put_, 4, MMX, 8)\
+H264_MC(put_, 8, MMX, 8)\
+H264_MC(put_, 16,MMX, 8)\
+H264_MC(avg_, 4, MMX, 8)\
+H264_MC(avg_, 8, MMX, 8)\
+H264_MC(avg_, 16,MMX, 8)\
+
+#define H264_MC_816(QPEL, XMM) /* applies the generator macro QPEL to put_/avg_ at sizes 8 and 16 with ALIGN = 16 */\
+QPEL(put_, 8, XMM, 16)\
+QPEL(put_, 16,XMM, 16)\
+QPEL(avg_, 8, XMM, 16)\
+QPEL(avg_, 16,XMM, 16)\
+
+
+#define AVG_3DNOW_OP(a,b,temp, size) /* asm fragment: temp = b; a = pavgusb(a, temp); b = a -- averaging store, 3DNow! flavour */ \
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgusb " #temp ", " #a "        \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) /* same fragment using pavgb; size is the mov suffix (e.g. q, dqa) */ \
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgb " #temp ", " #a "          \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
+
+/// NOTE: 3DNow! support is not detected correctly by this build; uncomment the two QPEL_H264 lines below when building on an AMD machine.
+#ifdef HAVE_AMD3DNOW
+#define PAVGB "pavgusb"
+//QPEL_H264(put_,       PUT_OP, 3dnow)
+//QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
+#undef PAVGB
+#endif
+
+#define PAVGB "pavgb" /* averaging mnemonic in effect while the macros below are expanded */
+QPEL_H264(put_,       PUT_OP, mmx2)
+QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
+QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
+QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
+QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
+QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
+#if HAVE_SSSE3
+QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
+QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
+QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
+QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
+QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
+QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
+#endif
+#undef PAVGB
+
+H264_MC_816(H264_MC_V, sse2) /* mcXY entry points for sizes 8 and 16 */
+H264_MC_816(H264_MC_HV, sse2)
+#if HAVE_SSSE3
+H264_MC_816(H264_MC_H, ssse3)
+H264_MC_816(H264_MC_HV, ssse3)
+#endif
+
+/* Rounding constants: each rnd entry is followed by rnd div 8, so use p+1 to reach the div-8 value (0x20 -> 0x04, 0x1C -> 0x03). */
+DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = {
+    0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
+};
+
+#if HAVE_SSSE3
+#define AVG_OP(X) /* put variant: AVG_OP expands to nothing */
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
+#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
+#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
+#include "dsputil_h264_template_ssse3.c" /* template (not visible here) is configured through the macros above */
+static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); /* final argument fixed to 1; presumably the rounding flag given the _rnd suffix - confirm in the template */
+}
+
+#undef AVG_OP
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
+#define AVG_OP(X) X /* avg variant: AVG_OP keeps its argument */
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
+#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
+#include "dsputil_h264_template_ssse3.c" /* second inclusion of the same template, now producing the avg_ functions */
+static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); /* same fixed final argument as the put wrapper */
+}
+#undef AVG_OP
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
+#endif
+
+/***********************************/
+/* weighted prediction */
+
+static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
+{ /* In-place weighting of a w x h block: dst = sat_u8((dst*weight + offset') >> log2_denom), with 16-bit intermediates. */
+    int x, y;
+    offset <<= log2_denom; /* scale the offset to the pre-shift domain ... */
+    offset += (1 << log2_denom) >> 1; /* ... and fold in the rounding term */
+    __asm__ volatile(
+        "movd    %0, %%mm4        \n\t"
+        "movd    %1, %%mm5        \n\t"
+        "movd    %2, %%mm6        \n\t" /* mm6 = shift count (log2_denom) */
+        "pshufw  $0, %%mm4, %%mm4 \n\t" /* mm4 = weight replicated into 4 words */
+        "pshufw  $0, %%mm5, %%mm5 \n\t" /* mm5 = offset replicated into 4 words */
+        "pxor    %%mm7, %%mm7     \n\t" /* mm7 = 0 */
+        :: "g"(weight), "g"(offset), "g"(log2_denom)
+    ); /* NOTE(review): no clobbers are declared; the loop below relies on mm4-mm7 keeping these values */
+    for(y=0; y<h; y+=2){ /* two rows per iteration */
+        for(x=0; x<w; x+=4){ /* four pixels per row per asm statement */
+            __asm__ volatile(
+                "movd      %0,    %%mm0 \n\t" /* 4 pixels of the current row */
+                "movd      %1,    %%mm1 \n\t" /* 4 pixels of the row below */
+                "punpcklbw %%mm7, %%mm0 \n\t" /* widen bytes to words */
+                "punpcklbw %%mm7, %%mm1 \n\t"
+                "pmullw    %%mm4, %%mm0 \n\t" /* * weight (low 16 bits) */
+                "pmullw    %%mm4, %%mm1 \n\t"
+                "paddsw    %%mm5, %%mm0 \n\t" /* + offset, signed saturation */
+                "paddsw    %%mm5, %%mm1 \n\t"
+                "psraw     %%mm6, %%mm0 \n\t" /* >> log2_denom */
+                "psraw     %%mm6, %%mm1 \n\t"
+                "packuswb  %%mm7, %%mm0 \n\t" /* saturate back to unsigned bytes */
+                "packuswb  %%mm7, %%mm1 \n\t"
+                "movd      %%mm0, %0    \n\t"
+                "movd      %%mm1, %1    \n\t"
+                : "+m"(*(uint32_t*)(dst+x)),
+                  "+m"(*(uint32_t*)(dst+x+stride))
+            );
+        }
+        dst += 2*stride;
+    }
+}
+
+static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
+{ /* In-place bi-weighting: dst = sat_u8((dst*weightd + src*weights + offset') >> (log2_denom+1)), with 16-bit intermediates. */
+    int x, y;
+    offset = ((offset + 1) | 1) << log2_denom; /* offset' used by the asm below */
+    __asm__ volatile(
+        "movd    %0, %%mm3        \n\t"
+        "movd    %1, %%mm4        \n\t"
+        "movd    %2, %%mm5        \n\t"
+        "movd    %3, %%mm6        \n\t" /* mm6 = shift count (log2_denom+1) */
+        "pshufw  $0, %%mm3, %%mm3 \n\t" /* mm3 = weightd replicated into 4 words */
+        "pshufw  $0, %%mm4, %%mm4 \n\t" /* mm4 = weights replicated into 4 words */
+        "pshufw  $0, %%mm5, %%mm5 \n\t" /* mm5 = offset replicated into 4 words */
+        "pxor    %%mm7, %%mm7     \n\t" /* mm7 = 0 */
+        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
+    ); /* NOTE(review): no clobbers are declared; the loop below relies on mm3-mm7 keeping these values */
+    for(y=0; y<h; y++){ /* one row per iteration */
+        for(x=0; x<w; x+=4){ /* four pixels per asm statement */
+            __asm__ volatile(
+                "movd      %0,    %%mm0 \n\t" /* 4 dst pixels */
+                "movd      %1,    %%mm1 \n\t" /* 4 src pixels */
+                "punpcklbw %%mm7, %%mm0 \n\t" /* widen bytes to words */
+                "punpcklbw %%mm7, %%mm1 \n\t"
+                "pmullw    %%mm3, %%mm0 \n\t" /* dst * weightd */
+                "pmullw    %%mm4, %%mm1 \n\t" /* src * weights */
+                "paddsw    %%mm1, %%mm0 \n\t" /* saturating sum of both products */
+                "paddsw    %%mm5, %%mm0 \n\t" /* + offset */
+                "psraw     %%mm6, %%mm0 \n\t" /* >> (log2_denom+1) */
+                "packuswb  %%mm0, %%mm0 \n\t" /* saturate back to unsigned bytes */
+                "movd      %%mm0, %0    \n\t"
+                : "+m"(*(uint32_t*)(dst+x))
+                :  "m"(*(uint32_t*)(src+x))
+            );
+        }
+        src += stride; /* src and dst share the same stride */
+        dst += stride;
+    }
+}
+
+#define H264_WEIGHT(W,H) /* emits fixed-size W x H wrappers around the generic weight/biweight routines */ \
+static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+} \
+static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
+    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
+}
+
+H264_WEIGHT(16,16) /* one wrapper pair per block size listed below */
+H264_WEIGHT(16, 8)
+H264_WEIGHT( 8,16)
+H264_WEIGHT( 8, 8)
+H264_WEIGHT( 8, 4)
+H264_WEIGHT( 4, 8)
+H264_WEIGHT( 4, 4)
+H264_WEIGHT( 4, 2)
+
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/mathops.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/mathops.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,67 @@
+/*
+ * simple math operations
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_MATHOPS_H
+#define AVCODEC_X86_MATHOPS_H
+
+#include "config.h"
+#include "libavutil/common.h"
+
+#if ARCH_X86_32
+#define MULL(ra, rb, shift) /* low 32 bits of ((64-bit product of ra and rb) >> shift); shift must be a compile-time constant ("i" constraint) */ \
+        ({ int rt, dummy; __asm__ (\
+            "imull %3               \n\t" /* edx:eax = eax * rb */\
+            "shrdl %4, %%edx, %%eax \n\t" /* shift eax right by shift, filling from edx */\
+            : "=a"(rt), "=d"(dummy)\
+            : "a" ((int)(ra)), "rm" ((int)(rb)), "i"(shift)); /* arguments parenthesized so the cast covers a whole expression */\
+         rt; })
+
+#define MULH(ra, rb) /* high 32 bits of the signed 64-bit product (edx after imull) */ \
+    ({ int rt, dummy;\
+     __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)(ra)), "rm" ((int)(rb)));\
+     rt; })
+
+#define MUL64(ra, rb) /* full signed 64-bit product; "=A" is the edx:eax pair on 32-bit x86 */ \
+    ({ int64_t rt;\
+     __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)(ra)), "g" ((int)(rb)));\
+     rt; })
+#endif
+
+// avoid +32 for shift optimization (gcc should do that ...): the count passed to sarl is -s truncated to 8 bits
+#define NEG_SSR32 NEG_SSR32 /* self-define; presumably lets a generic header detect this override - confirm */
+static inline  int32_t NEG_SSR32( int32_t a, int8_t s){
+    __asm__ ("sarl %1, %0\n\t" /* arithmetic (sign-propagating) right shift of a */
+         : "+r" (a)
+         : "ic" ((uint8_t)(-s)) /* count is either an immediate or in cl */
+    );
+    return a;
+}
+
+#define NEG_USR32 NEG_USR32 /* self-define; presumably lets a generic header detect this override - confirm */
+static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
+    __asm__ ("shrl %1, %0\n\t" /* logical (zero-filling) right shift of a */
+         : "+r" (a)
+         : "ic" ((uint8_t)(-s)) /* count is -s truncated to 8 bits, passed as an immediate or in cl */
+    );
+    return a;
+}
+
+#endif /* AVCODEC_X86_MATHOPS_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/mmx.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/mmx.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,267 @@
+/*
+ * mmx.h
+ * Copyright (C) 1997-2001 H. Dietz and R. Fisher
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_X86_MMX_H
+#define AVCODEC_X86_MMX_H
+
+#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected.
+
+
+#define         mmx_i2r(op,imm,reg) /* emits: op $imm, %reg */ \
+        __asm__ volatile (#op " %0, %%" #reg \
+                              : /* nothing */ \
+                              : "i" (imm) )
+
+#define         mmx_m2r(op,mem,reg) /* emits: op mem, %reg (mem is read) */ \
+        __asm__ volatile (#op " %0, %%" #reg \
+                              : /* nothing */ \
+                              : "m" (mem))
+
+#define         mmx_r2m(op,reg,mem) /* emits: op %reg, mem (mem is written) */ \
+        __asm__ volatile (#op " %%" #reg ", %0" \
+                              : "=m" (mem) \
+                              : /* nothing */ )
+
+#define         mmx_r2r(op,regs,regd) /* emits: op %regs, %regd; no operand list, so a single % names a register */ \
+        __asm__ volatile (#op " %" #regs ", %" #regd)
+
+
+#define         emms() __asm__ volatile ("emms") /* resets the MMX state so x87 floating point can be used again */
+/* Wrapper naming: <insn>_m2r = memory to register, <insn>_r2m = register to memory, <insn>_r2r = register to register, <insn>_i2r = immediate to register. */
+#define         movd_m2r(var,reg)           mmx_m2r (movd, var, reg)
+#define         movd_r2m(reg,var)           mmx_r2m (movd, reg, var)
+#define         movd_r2r(regs,regd)         mmx_r2r (movd, regs, regd)
+
+#define         movq_m2r(var,reg)           mmx_m2r (movq, var, reg)
+#define         movq_r2m(reg,var)           mmx_r2m (movq, reg, var)
+#define         movq_r2r(regs,regd)         mmx_r2r (movq, regs, regd)
+
+#define         packssdw_m2r(var,reg)       mmx_m2r (packssdw, var, reg)
+#define         packssdw_r2r(regs,regd)     mmx_r2r (packssdw, regs, regd)
+#define         packsswb_m2r(var,reg)       mmx_m2r (packsswb, var, reg)
+#define         packsswb_r2r(regs,regd)     mmx_r2r (packsswb, regs, regd)
+
+#define         packuswb_m2r(var,reg)       mmx_m2r (packuswb, var, reg)
+#define         packuswb_r2r(regs,regd)     mmx_r2r (packuswb, regs, regd)
+
+#define         paddb_m2r(var,reg)          mmx_m2r (paddb, var, reg)
+#define         paddb_r2r(regs,regd)        mmx_r2r (paddb, regs, regd)
+#define         paddd_m2r(var,reg)          mmx_m2r (paddd, var, reg)
+#define         paddd_r2r(regs,regd)        mmx_r2r (paddd, regs, regd)
+#define         paddw_m2r(var,reg)          mmx_m2r (paddw, var, reg)
+#define         paddw_r2r(regs,regd)        mmx_r2r (paddw, regs, regd)
+
+#define         paddsb_m2r(var,reg)         mmx_m2r (paddsb, var, reg)
+#define         paddsb_r2r(regs,regd)       mmx_r2r (paddsb, regs, regd)
+#define         paddsw_m2r(var,reg)         mmx_m2r (paddsw, var, reg)
+#define         paddsw_r2r(regs,regd)       mmx_r2r (paddsw, regs, regd)
+
+#define         paddusb_m2r(var,reg)        mmx_m2r (paddusb, var, reg)
+#define         paddusb_r2r(regs,regd)      mmx_r2r (paddusb, regs, regd)
+#define         paddusw_m2r(var,reg)        mmx_m2r (paddusw, var, reg)
+#define         paddusw_r2r(regs,regd)      mmx_r2r (paddusw, regs, regd)
+
+#define         pand_m2r(var,reg)           mmx_m2r (pand, var, reg)
+#define         pand_r2r(regs,regd)         mmx_r2r (pand, regs, regd)
+
+#define         pandn_m2r(var,reg)          mmx_m2r (pandn, var, reg)
+#define         pandn_r2r(regs,regd)        mmx_r2r (pandn, regs, regd)
+
+#define         pcmpeqb_m2r(var,reg)        mmx_m2r (pcmpeqb, var, reg)
+#define         pcmpeqb_r2r(regs,regd)      mmx_r2r (pcmpeqb, regs, regd)
+#define         pcmpeqd_m2r(var,reg)        mmx_m2r (pcmpeqd, var, reg)
+#define         pcmpeqd_r2r(regs,regd)      mmx_r2r (pcmpeqd, regs, regd)
+#define         pcmpeqw_m2r(var,reg)        mmx_m2r (pcmpeqw, var, reg)
+#define         pcmpeqw_r2r(regs,regd)      mmx_r2r (pcmpeqw, regs, regd)
+
+#define         pcmpgtb_m2r(var,reg)        mmx_m2r (pcmpgtb, var, reg)
+#define         pcmpgtb_r2r(regs,regd)      mmx_r2r (pcmpgtb, regs, regd)
+#define         pcmpgtd_m2r(var,reg)        mmx_m2r (pcmpgtd, var, reg)
+#define         pcmpgtd_r2r(regs,regd)      mmx_r2r (pcmpgtd, regs, regd)
+#define         pcmpgtw_m2r(var,reg)        mmx_m2r (pcmpgtw, var, reg)
+#define         pcmpgtw_r2r(regs,regd)      mmx_r2r (pcmpgtw, regs, regd)
+
+#define         pmaddwd_m2r(var,reg)        mmx_m2r (pmaddwd, var, reg)
+#define         pmaddwd_r2r(regs,regd)      mmx_r2r (pmaddwd, regs, regd)
+
+#define         pmulhw_m2r(var,reg)         mmx_m2r (pmulhw, var, reg)
+#define         pmulhw_r2r(regs,regd)       mmx_r2r (pmulhw, regs, regd)
+
+#define         pmullw_m2r(var,reg)         mmx_m2r (pmullw, var, reg)
+#define         pmullw_r2r(regs,regd)       mmx_r2r (pmullw, regs, regd)
+
+#define         por_m2r(var,reg)            mmx_m2r (por, var, reg)
+#define         por_r2r(regs,regd)          mmx_r2r (por, regs, regd)
+
+#define         pslld_i2r(imm,reg)          mmx_i2r (pslld, imm, reg) /* _i2r forms: shift count given as an immediate */
+#define         pslld_m2r(var,reg)          mmx_m2r (pslld, var, reg)
+#define         pslld_r2r(regs,regd)        mmx_r2r (pslld, regs, regd)
+#define         psllq_i2r(imm,reg)          mmx_i2r (psllq, imm, reg)
+#define         psllq_m2r(var,reg)          mmx_m2r (psllq, var, reg)
+#define         psllq_r2r(regs,regd)        mmx_r2r (psllq, regs, regd)
+#define         psllw_i2r(imm,reg)          mmx_i2r (psllw, imm, reg)
+#define         psllw_m2r(var,reg)          mmx_m2r (psllw, var, reg)
+#define         psllw_r2r(regs,regd)        mmx_r2r (psllw, regs, regd)
+
+#define         psrad_i2r(imm,reg)          mmx_i2r (psrad, imm, reg)
+#define         psrad_m2r(var,reg)          mmx_m2r (psrad, var, reg)
+#define         psrad_r2r(regs,regd)        mmx_r2r (psrad, regs, regd)
+#define         psraw_i2r(imm,reg)          mmx_i2r (psraw, imm, reg)
+#define         psraw_m2r(var,reg)          mmx_m2r (psraw, var, reg)
+#define         psraw_r2r(regs,regd)        mmx_r2r (psraw, regs, regd)
+
+#define         psrld_i2r(imm,reg)          mmx_i2r (psrld, imm, reg)
+#define         psrld_m2r(var,reg)          mmx_m2r (psrld, var, reg)
+#define         psrld_r2r(regs,regd)        mmx_r2r (psrld, regs, regd)
+#define         psrlq_i2r(imm,reg)          mmx_i2r (psrlq, imm, reg)
+#define         psrlq_m2r(var,reg)          mmx_m2r (psrlq, var, reg)
+#define         psrlq_r2r(regs,regd)        mmx_r2r (psrlq, regs, regd)
+#define         psrlw_i2r(imm,reg)          mmx_i2r (psrlw, imm, reg)
+#define         psrlw_m2r(var,reg)          mmx_m2r (psrlw, var, reg)
+#define         psrlw_r2r(regs,regd)        mmx_r2r (psrlw, regs, regd)
+
+#define         psubb_m2r(var,reg)          mmx_m2r (psubb, var, reg)
+#define         psubb_r2r(regs,regd)        mmx_r2r (psubb, regs, regd)
+#define         psubd_m2r(var,reg)          mmx_m2r (psubd, var, reg)
+#define         psubd_r2r(regs,regd)        mmx_r2r (psubd, regs, regd)
+#define         psubw_m2r(var,reg)          mmx_m2r (psubw, var, reg)
+#define         psubw_r2r(regs,regd)        mmx_r2r (psubw, regs, regd)
+
+#define         psubsb_m2r(var,reg)         mmx_m2r (psubsb, var, reg)
+#define         psubsb_r2r(regs,regd)       mmx_r2r (psubsb, regs, regd)
+#define         psubsw_m2r(var,reg)         mmx_m2r (psubsw, var, reg)
+#define         psubsw_r2r(regs,regd)       mmx_r2r (psubsw, regs, regd)
+
+#define         psubusb_m2r(var,reg)        mmx_m2r (psubusb, var, reg)
+#define         psubusb_r2r(regs,regd)      mmx_r2r (psubusb, regs, regd)
+#define         psubusw_m2r(var,reg)        mmx_m2r (psubusw, var, reg)
+#define         psubusw_r2r(regs,regd)      mmx_r2r (psubusw, regs, regd)
+
+#define         punpckhbw_m2r(var,reg)      mmx_m2r (punpckhbw, var, reg)
+#define         punpckhbw_r2r(regs,regd)    mmx_r2r (punpckhbw, regs, regd)
+#define         punpckhdq_m2r(var,reg)      mmx_m2r (punpckhdq, var, reg)
+#define         punpckhdq_r2r(regs,regd)    mmx_r2r (punpckhdq, regs, regd)
+#define         punpckhwd_m2r(var,reg)      mmx_m2r (punpckhwd, var, reg)
+#define         punpckhwd_r2r(regs,regd)    mmx_r2r (punpckhwd, regs, regd)
+
+#define         punpcklbw_m2r(var,reg)      mmx_m2r (punpcklbw, var, reg)
+#define         punpcklbw_r2r(regs,regd)    mmx_r2r (punpcklbw, regs, regd)
+#define         punpckldq_m2r(var,reg)      mmx_m2r (punpckldq, var, reg)
+#define         punpckldq_r2r(regs,regd)    mmx_r2r (punpckldq, regs, regd)
+#define         punpcklwd_m2r(var,reg)      mmx_m2r (punpcklwd, var, reg)
+#define         punpcklwd_r2r(regs,regd)    mmx_r2r (punpcklwd, regs, regd)
+
+#define         pxor_m2r(var,reg)           mmx_m2r (pxor, var, reg)
+#define         pxor_r2r(regs,regd)         mmx_r2r (pxor, regs, regd)
+
+
+/* 3DNOW extensions (AMD 3DNow! instruction set) */
+
+#define         pavgusb_m2r(var,reg)        mmx_m2r (pavgusb, var, reg)
+#define         pavgusb_r2r(regs,regd)      mmx_r2r (pavgusb, regs, regd)
+
+
+/* AMD MMX extensions - also available in intel SSE */
+
+
+#define         mmx_m2ri(op,mem,reg,imm) /* emits: op $imm, mem, %reg */ \
+        __asm__ volatile (#op " %1, %0, %%" #reg \
+                              : /* nothing */ \
+                              : "m" (mem), "i" (imm))
+#define         mmx_r2ri(op,regs,regd,imm) /* emits: op $imm, %regs, %regd; takes exactly four arguments */ \
+        __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \
+                              : /* nothing */ \
+                              : "i" (imm) )
+
+#define         mmx_fetch(mem,hint) /* emits: prefetch<hint> mem, with hint pasted onto the mnemonic (t0, t1, t2, nta) */ \
+        __asm__ volatile ("prefetch" #hint " %0" \
+                              : /* nothing */ \
+                              : "m" (mem))
+
+/* maskmovq takes two registers and no immediate, so it goes through mmx_r2r (mmx_r2ri needs four arguments). NOTE(review): confirm the AT&T operand order of regs/maskreg. */
+#define         maskmovq(regs,maskreg)      mmx_r2r (maskmovq, regs, maskreg)
+
+#define         movntq_r2m(mmreg,var)       mmx_r2m (movntq, mmreg, var) /* store with a non-temporal (cache-bypass) hint */
+
+#define         pavgb_m2r(var,reg)          mmx_m2r (pavgb, var, reg)
+#define         pavgb_r2r(regs,regd)        mmx_r2r (pavgb, regs, regd)
+#define         pavgw_m2r(var,reg)          mmx_m2r (pavgw, var, reg)
+#define         pavgw_r2r(regs,regd)        mmx_r2r (pavgw, regs, regd)
+
+#define         pextrw_r2r(mmreg,reg,imm)   mmx_r2ri (pextrw, mmreg, reg, imm) /* imm selects the 16-bit word to extract */
+
+#define         pinsrw_r2r(reg,mmreg,imm)   mmx_r2ri (pinsrw, reg, mmreg, imm) /* imm selects the 16-bit word to replace */
+
+#define         pmaxsw_m2r(var,reg)         mmx_m2r (pmaxsw, var, reg)
+#define         pmaxsw_r2r(regs,regd)       mmx_r2r (pmaxsw, regs, regd)
+
+#define         pmaxub_m2r(var,reg)         mmx_m2r (pmaxub, var, reg)
+#define         pmaxub_r2r(regs,regd)       mmx_r2r (pmaxub, regs, regd)
+
+#define         pminsw_m2r(var,reg)         mmx_m2r (pminsw, var, reg)
+#define         pminsw_r2r(regs,regd)       mmx_r2r (pminsw, regs, regd)
+
+#define         pminub_m2r(var,reg)         mmx_m2r (pminub, var, reg)
+#define         pminub_r2r(regs,regd)       mmx_r2r (pminub, regs, regd)
+
+#define         pmovmskb(mmreg,reg) /* collects the sign bit of every byte of mmreg into integer register reg */ \
+        __asm__ volatile ("pmovmskb %" #mmreg ", %" #reg)
+
+#define         pmulhuw_m2r(var,reg)        mmx_m2r (pmulhuw, var, reg)
+#define         pmulhuw_r2r(regs,regd)      mmx_r2r (pmulhuw, regs, regd)
+
+#define         prefetcht0(mem)             mmx_fetch (mem, t0) /* the hint suffix is pasted onto the prefetch mnemonic */
+#define         prefetcht1(mem)             mmx_fetch (mem, t1)
+#define         prefetcht2(mem)             mmx_fetch (mem, t2)
+#define         prefetchnta(mem)            mmx_fetch (mem, nta)
+
+#define         psadbw_m2r(var,reg)         mmx_m2r (psadbw, var, reg)
+#define         psadbw_r2r(regs,regd)       mmx_r2r (psadbw, regs, regd)
+
+#define         pshufw_m2r(var,reg,imm)     mmx_m2ri(pshufw, var, reg, imm)
+#define         pshufw_r2r(regs,regd,imm)   mmx_r2ri(pshufw, regs, regd, imm)
+
+#define         sfence() __asm__ volatile ("sfence\n\t") /* store fence instruction */
+
+/* SSE2 */
+#define         pshufhw_m2r(var,reg,imm)    mmx_m2ri(pshufhw, var, reg, imm)
+#define         pshufhw_r2r(regs,regd,imm)  mmx_r2ri(pshufhw, regs, regd, imm)
+#define         pshuflw_m2r(var,reg,imm)    mmx_m2ri(pshuflw, var, reg, imm)
+#define         pshuflw_r2r(regs,regd,imm)  mmx_r2ri(pshuflw, regs, regd, imm)
+
+#define         pshufd_r2r(regs,regd,imm)   mmx_r2ri(pshufd, regs, regd, imm)
+
+#define         movdqa_m2r(var,reg)         mmx_m2r (movdqa, var, reg) /* movdqa is the aligned 128-bit move */
+#define         movdqa_r2m(reg,var)         mmx_r2m (movdqa, reg, var)
+#define         movdqa_r2r(regs,regd)       mmx_r2r (movdqa, regs, regd)
+#define         movdqu_m2r(var,reg)         mmx_m2r (movdqu, var, reg) /* movdqu is the unaligned 128-bit move */
+#define         movdqu_r2m(reg,var)         mmx_r2m (movdqu, reg, var)
+#define         movdqu_r2r(regs,regd)       mmx_r2r (movdqu, regs, regd)
+
+#define         pmullw_r2m(reg,var)         mmx_r2m (pmullw, reg, var) /* NOTE(review): pmullw has no memory-destination form -- verify this macro can assemble */
+
+#define         pslldq_i2r(imm,reg)         mmx_i2r (pslldq, imm, reg)
+#define         psrldq_i2r(imm,reg)         mmx_i2r (psrldq, imm, reg)
+
+#define         punpcklqdq_r2r(regs,regd)   mmx_r2r (punpcklqdq, regs, regd)
+#define         punpckhqdq_r2r(regs,regd)   mmx_r2r (punpckhqdq, regs, regd)
+
+
+#endif /* AVCODEC_X86_MMX_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/arm/bswap.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/arm/bswap.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,72 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_ARM_BSWAP_H
+#define AVUTIL_ARM_BSWAP_H
+
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/attributes.h"
+
+#ifdef __ARMCC_VERSION
+
+#if HAVE_ARMV6
+#define bswap_16 bswap_16 /* defined so that the #ifndef bswap_16 fallback in libavutil/bswap.h is skipped */
+static av_always_inline av_const unsigned bswap_16(unsigned x)
+{
+    __asm { rev16 x, x } /* armcc inline assembler: rev16 swaps the two bytes inside each 16-bit half */
+    return x;
+}
+
+#define bswap_32 bswap_32 /* defined so that the #ifndef bswap_32 fallback in libavutil/bswap.h is skipped */
+static av_always_inline av_const uint32_t bswap_32(uint32_t x)
+{
+    return __rev(x); /* armcc intrinsic; presumably maps to the REV byte-reverse instruction -- confirm */
+}
+#endif /* HAVE_ARMV6 */
+
+#elif HAVE_INLINE_ASM
+
+#if HAVE_ARMV6
+#define bswap_16 bswap_16 /* defined so that the #ifndef bswap_16 fallback in libavutil/bswap.h is skipped */
+static av_always_inline av_const unsigned bswap_16(unsigned x)
+{
+    __asm__("rev16 %0, %0" : "+r"(x)); /* rev16 swaps the two bytes inside each 16-bit half, in place */
+    return x;
+}
+#endif
+
+#define bswap_32 bswap_32 /* defined so that the #ifndef bswap_32 fallback in libavutil/bswap.h is skipped */
+static av_always_inline av_const uint32_t bswap_32(uint32_t x)
+{
+#if HAVE_ARMV6
+    __asm__("rev %0, %0" : "+r"(x)); /* single-instruction byte reverse */
+#else
+    uint32_t t;
+    __asm__ ("eor %1, %0, %0, ror #16 \n\t" /* t  = x ^ ror(x, 16) */
+             "bic %1, %1, #0xFF0000   \n\t" /* t &= ~0x00FF0000    */
+             "mov %0, %0, ror #8      \n\t" /* x  = ror(x, 8)      */
+             "eor %0, %0, %1, lsr #8  \n\t" /* x ^= t >> 8         */
+             : "+r"(x), "=&r"(t)); /* t is early-clobber: it is written while x is still needed */
+#endif /* HAVE_ARMV6 */
+    return x;
+}
+
+#endif /* __ARMCC_VERSION */
+
+#endif /* AVUTIL_ARM_BSWAP_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/arm/intreadwrite.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/arm/intreadwrite.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,78 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_ARM_INTREADWRITE_H
+#define AVUTIL_ARM_INTREADWRITE_H
+
+#include <stdint.h>
+#include "config.h"
+
+#if HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM
+
+#define AV_RN16 AV_RN16 /* self-named macro: presumably tested by the generic intreadwrite.h -- confirm */
+static av_always_inline uint16_t AV_RN16(const void *p)
+{
+    uint16_t v;
+    __asm__ ("ldrh %0, %1" : "=r"(v) : "m"(*(const uint16_t *)p)); /* one halfword load from *p, native byte order */
+    return v;
+}
+
+#define AV_WN16 AV_WN16 /* self-named macro: presumably tested by the generic intreadwrite.h -- confirm */
+static av_always_inline void AV_WN16(void *p, uint16_t v)
+{
+    __asm__ ("strh %1, %0" : "=m"(*(uint16_t *)p) : "r"(v)); /* one halfword store to *p, native byte order */
+}
+
+#define AV_RN32 AV_RN32 /* self-named macro: presumably tested by the generic intreadwrite.h -- confirm */
+static av_always_inline uint32_t AV_RN32(const void *p)
+{
+    uint32_t v;
+    __asm__ ("ldr  %0, %1" : "=r"(v) : "m"(*(const uint32_t *)p)); /* one word load from *p, native byte order */
+    return v;
+}
+
+#define AV_WN32 AV_WN32 /* self-named macro: presumably tested by the generic intreadwrite.h -- confirm */
+static av_always_inline void AV_WN32(void *p, uint32_t v)
+{
+    __asm__ ("str  %1, %0" : "=m"(*(uint32_t *)p) : "r"(v)); /* one word store to *p, native byte order */
+}
+
+#define AV_RN64 AV_RN64 /* self-named macro: presumably tested by the generic intreadwrite.h -- confirm */
+static av_always_inline uint64_t AV_RN64(const void *p)
+{
+    union { uint64_t v; uint32_t hl[2]; } v; /* the two 32-bit loads are returned as one 64-bit value */
+    __asm__ ("ldr   %0, %2  \n\t"            /* word at p     goes to hl[0] */
+             "ldr   %1, %3  \n\t"            /* word at p + 4 goes to hl[1] */
+             : "=&r"(v.hl[0]), "=r"(v.hl[1]) /* early-clobber: %0 is written before operand %3 is read */
+             : "m"(*(const uint32_t*)p), "m"(*((const uint32_t*)p+1)));
+    return v.v;
+}
+
+#define AV_WN64 AV_WN64 /* self-named macro: presumably tested by the generic intreadwrite.h -- confirm */
+static av_always_inline void AV_WN64(void *p, uint64_t v)
+{
+    union { uint64_t v; uint32_t hl[2]; } vv = { v }; /* splits v into two 32-bit words in memory order */
+    __asm__ ("str  %2, %0  \n\t"                      /* hl[0] goes to the word at p     */
+             "str  %3, %1  \n\t"                      /* hl[1] goes to the word at p + 4 */
+             : "=m"(*(uint32_t*)p), "=m"(*((uint32_t*)p+1))
+             : "r"(vv.hl[0]), "r"(vv.hl[1]));
+}
+
+#endif /* HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM */
+
+#endif /* AVUTIL_ARM_INTREADWRITE_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/arm/timer.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/arm/timer.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_ARM_TIMER_H
+#define AVUTIL_ARM_TIMER_H
+
+#include <stdint.h>
+#include "config.h"
+
+#if HAVE_INLINE_ASM && defined(__ARM_ARCH_7A__)
+
+#define AV_READ_TIME read_time /* timer hook name; presumably picked up by the generic timer header -- confirm */
+
+static inline uint64_t read_time(void)
+{
+    unsigned cc;
+    __asm__ volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc)); /* reads coprocessor 15 register c9,c13,0: the ARMv7-A cycle counter (PMCCNTR) */
+    return cc; /* the unsigned count is widened to 64 bits without sign extension */
+}
+
+#endif /* HAVE_INLINE_ASM && __ARM_ARCH_7A__ */
+
+#endif /* AVUTIL_ARM_TIMER_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/attributes.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/attributes.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,113 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Macro definitions for various function/variable attributes
+ */
+
+#ifndef AVUTIL_ATTRIBUTES_H
+#define AVUTIL_ATTRIBUTES_H
+
+#ifdef __GNUC__ /* AV_GCC_VERSION_AT_LEAST(x,y): non-zero when the compiler reports GCC version x.y or newer */
+#    define AV_GCC_VERSION_AT_LEAST(x,y) (__GNUC__ > (x) || (__GNUC__ == (x) && __GNUC_MINOR__ >= (y)))
+#else
+#    define AV_GCC_VERSION_AT_LEAST(x,y) 0
+#endif
+
+#ifndef av_always_inline /* force inlining where supported; every macro below can be predefined to override it */
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define av_always_inline __attribute__((always_inline)) inline
+#else
+#    define av_always_inline inline
+#endif
+#endif
+
+#ifndef av_noinline /* forbid inlining of the function */
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define av_noinline __attribute__((noinline))
+#else
+#    define av_noinline
+#endif
+#endif
+
+#ifndef av_pure /* function has no side effects; its result may still depend on global memory */
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define av_pure __attribute__((pure))
+#else
+#    define av_pure
+#endif
+#endif
+
+#ifndef av_const /* function result depends only on its arguments */
+#if AV_GCC_VERSION_AT_LEAST(2,6)
+#    define av_const __attribute__((const))
+#else
+#    define av_const
+#endif
+#endif
+
+#ifndef av_cold /* function is unlikely to be executed; not used for ICC versions up to 1110 */
+#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,3)
+#    define av_cold __attribute__((cold))
+#else
+#    define av_cold
+#endif
+#endif
+
+#ifndef av_flatten /* inline every call made inside the function; not used for ICC versions up to 1110 */
+#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,1)
+#    define av_flatten __attribute__((flatten))
+#else
+#    define av_flatten
+#endif
+#endif
+
+#ifndef attribute_deprecated /* make the compiler warn when the marked entity is used */
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define attribute_deprecated __attribute__((deprecated))
+#else
+#    define attribute_deprecated
+#endif
+#endif
+
+#ifndef av_unused /* suppress unused-entity warnings for the marked declaration */
+#if defined(__GNUC__)
+#    define av_unused __attribute__((unused))
+#else
+#    define av_unused
+#endif
+#endif
+
+#ifndef av_uninit /* expands to a self-initialisation x=x; presumably silences may-be-uninitialized warnings -- confirm */
+#if defined(__GNUC__) && !defined(__ICC)
+#    define av_uninit(x) x=x
+#else
+#    define av_uninit(x) x
+#endif
+#endif
+
+#ifdef __GNUC__ /* av_builtin_constant_p(x): compile-time constant test, always 0 without GCC */
+#    define av_builtin_constant_p __builtin_constant_p
+#else
+#    define av_builtin_constant_p(x) 0
+#endif
+
+#endif /* AVUTIL_ATTRIBUTES_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/bswap.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/bswap.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,95 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * byte swapping routines
+ */
+
+#ifndef AVUTIL_BSWAP_H
+#define AVUTIL_BSWAP_H
+
+#include <stdint.h>
+#include "config.h"
+#include "attributes.h"
+
+#if   ARCH_ARM
+#   include "arm/bswap.h"
+#elif ARCH_X86
+#   include "x86/bswap.h"
+#endif
+
+#ifndef bswap_16 /* generic fallback, used when no arch header defined bswap_16 */
+static av_always_inline av_const uint16_t bswap_16(uint16_t x)
+{
+    const unsigned low_to_high = x << 8, high_to_low = x >> 8;
+    return low_to_high | high_to_low; /* truncated to 16 bits by the return type */
+}
+#endif
+
+#ifndef bswap_32 /* generic fallback, used when no arch header defined bswap_32 */
+static av_always_inline av_const uint32_t bswap_32(uint32_t x)
+{
+    /* swap the two bytes inside each 16-bit half, then swap the halves */
+    const uint32_t pairs = ((x >> 8) & 0x00FF00FF) | ((x << 8) & 0xFF00FF00);
+    return (pairs << 16) | (pairs >> 16);
+}
+#endif
+
+#ifndef bswap_64 /* generic fallback, used when no arch header defined bswap_64 */
+static inline uint64_t av_const bswap_64(uint64_t x)
+{
+#if 0 /* disabled shift-and-mask variant, kept for reference */
+    x= ((x<< 8)&0xFF00FF00FF00FF00ULL) | ((x>> 8)&0x00FF00FF00FF00FFULL);
+    x= ((x<<16)&0xFFFF0000FFFF0000ULL) | ((x>>16)&0x0000FFFF0000FFFFULL);
+    return (x>>32) | (x<<32);
+#else
+    union {
+        uint64_t ll;
+        uint32_t l[2];
+    } w, r; /* w holds the input, r the result; both are viewed as two 32-bit halves */
+    w.ll = x;
+    r.l[0] = bswap_32 (w.l[1]); /* each half is byte-swapped and stored in the opposite slot */
+    r.l[1] = bswap_32 (w.l[0]);
+    return r.ll;
+#endif
+}
+#endif
+
+// be2me ... big-endian to machine-endian: identity when HAVE_BIGENDIAN is set, byte swap otherwise
+// le2me ... little-endian to machine-endian: byte swap when HAVE_BIGENDIAN is set, identity otherwise
+
+#if HAVE_BIGENDIAN
+#define be2me_16(x) (x)
+#define be2me_32(x) (x)
+#define be2me_64(x) (x)
+#define le2me_16(x) bswap_16(x)
+#define le2me_32(x) bswap_32(x)
+#define le2me_64(x) bswap_64(x)
+#else
+#define be2me_16(x) bswap_16(x)
+#define be2me_32(x) bswap_32(x)
+#define be2me_64(x) bswap_64(x)
+#define le2me_16(x) (x)
+#define le2me_32(x) (x)
+#define le2me_64(x) (x)
+#endif
+
+#endif /* AVUTIL_BSWAP_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/common.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/common.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,298 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * common internal and external API header
+ */
+
+#ifndef AVUTIL_COMMON_H
+#define AVUTIL_COMMON_H
+
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "attributes.h"
+
+//rounded division & shift; the macros below expand their arguments more than once, so pass expressions without side effects
+#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b))
+/* assume b>0 */
+#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))
+#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
+#define FFSIGN(a) ((a) > 0 ? 1 : -1) /* note: yields -1 for zero */
+
+#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
+#define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c)
+#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
+#define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c)
+
+#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0) /* exchanges two lvalues of the given type */
+#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0])) /* element count; only meaningful for true arrays, not pointers */
+#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1)) /* rounds x up to a multiple of a; the mask only works when a is a power of two */
+
+/* misc math functions */
+extern const uint8_t ff_log2_tab[256];
+
+static inline av_const int av_log2_c(unsigned int v)
+{
+    /* Reduce v to a single byte, tracking how many bits were shifted out. */
+    int shifted = 0;
+    if (v & 0xffff0000) {
+        shifted = 16;
+        v >>= 16;
+    }
+    if (v & 0xff00) {
+        shifted += 8;
+        v >>= 8;
+    }
+    /* ff_log2_tab supplies the entry for the remaining byte value */
+    return shifted + ff_log2_tab[v];
+}
+
+static inline av_const int av_log2_16bit_c(unsigned int v)
+{
+    /* Single reduction step: values of 0x10000 and above would index past the 256-entry table. */
+    int shifted = 0;
+    if (v & 0xff00) {
+        shifted = 8;
+        v >>= 8;
+    }
+    /* ff_log2_tab supplies the entry for the remaining byte value */
+    return shifted + ff_log2_tab[v];
+}
+
+#ifdef HAVE_AV_CONFIG_H
+#   include "config.h"
+#endif
+
+/**
+ * Limits an int to the closed interval [amin, amax].
+ * @param a    value to limit
+ * @param amin lower bound, tested first
+ * @param amax upper bound
+ * @return amin if a is below it, else amax if a is above it, else a
+ */
+static inline av_const int av_clip(int a, int amin, int amax)
+{
+    if (a < amin)
+        return amin;
+    return (a > amax) ? amax : a;
+}
+
+/**
+ * Clips a signed integer value into the 0-255 range.
+ * @param a value to clip
+ * @return 0 for negative a, 255 for a above 255, otherwise a
+ */
+static inline av_const uint8_t av_clip_uint8(int a)
+{
+    if (a&(~0xFF)) return (~a)>>31; /* ~a cannot overflow (-a does for INT_MIN); relies on arithmetic >>, giving 0 or -1, i.e. 0xFF */
+    else           return a;
+}
+
+/**
+ * Clips a signed integer value into the 0-65535 range.
+ * @param a value to clip
+ * @return 0 for negative a, 65535 for a above 65535, otherwise a
+ */
+static inline av_const uint16_t av_clip_uint16(int a)
+{
+    if (a&(~0xFFFF)) return (~a)>>31; /* ~a cannot overflow (-a does for INT_MIN); relies on arithmetic >>, giving 0 or -1, i.e. 0xFFFF */
+    else             return a;
+}
+
+/**
+ * Clips a signed integer value into the -32768,32767 range.
+ * @param a value to clip
+ * @return -32768 for a below the range, 32767 for a above it, otherwise a
+ */
+static inline av_const int16_t av_clip_int16(int a)
+{
+    if ((a+0x8000U) & ~0xFFFF) return (a>>31) ^ 0x7FFF; /* unsigned add avoids signed overflow for a near INT_MAX */
+    else                      return a;
+}
+
+/**
+ * Clips a signed 64-bit integer value into the -2147483648,2147483647 range.
+ * @param a value to clip
+ * @return INT32_MIN for a below the range, INT32_MAX for a above it, otherwise a
+ */
+static inline av_const int32_t av_clipl_int32(int64_t a)
+{
+    if (((uint64_t)a+0x80000000u) & ~UINT64_C(0xFFFFFFFF)) return (a>>63) ^ 0x7FFFFFFF; /* unsigned add avoids overflow near INT64_MAX */
+    else                                         return a;
+}
+
+/**
+ * Limits a float to the closed interval [amin, amax].
+ * @param a    value to limit
+ * @param amin lower bound, tested first
+ * @param amax upper bound
+ * @return amin if a is below it, else amax if a is above it, else a
+ */
+static inline av_const float av_clipf(float a, float amin, float amax)
+{
+    if (a < amin)
+        return amin;
+    return (a > amax) ? amax : a;
+}
+
+/** Computes ceil(log2(x)) by passing (x - 1) << 1 to av_log2_c.
+ * @param x value used to compute ceil(log2(x)); NOTE(review): x <= 0 left-shifts a negative number -- confirm callers pass x >= 1
+ * @return computed ceiling of log2(x)
+ */
+static inline av_const int av_ceil_log2(int x)
+{
+    return av_log2_c((x - 1) << 1);
+}
+
+#define MKTAG(a,b,c,d) ((a) | ((b) << 8) | ((c) << 16) | ((unsigned)(d) << 24)) /* a lands in the lowest byte; the unsigned cast keeps d >= 0x80 from overflowing int */
+#define MKBETAG(a,b,c,d) ((d) | ((c) << 8) | ((b) << 16) | ((unsigned)(a) << 24)) /* a lands in the highest byte; same cast for a >= 0x80 */
+
+/*!
+ * \def GET_UTF8(val, GET_BYTE, ERROR)
+ * Converts a UTF-8 character (up to 4 bytes long) to its 32-bit UCS-4 encoded form
+ * \param val is the output and should be of type uint32_t. It holds the converted
+ * UCS-4 character and should be a left value.
+ * \param GET_BYTE gets UTF-8 encoded bytes from any proper source. It can be
+ * a function or a statement whose return value or evaluated value is of type
+ * uint8_t. It will be executed up to 4 times for values in the valid UTF-8 range,
+ * and up to 7 times in the general case.
+ * \param ERROR action that should be taken when an invalid UTF-8 byte is returned
+ * from GET_BYTE. It should be a statement that jumps out of the macro,
+ * like exit(), goto, return, break, or continue.
+ */
+#define GET_UTF8(val, GET_BYTE, ERROR)\
+    val= GET_BYTE;\
+    {\
+        int ones= 7 - av_log2(val ^ 255); /* count of leading 1 bits in the first byte, assuming av_log2 is floor(log2) */\
+        if(ones==1) /* 10xxxxxx is a continuation byte and cannot start a character */\
+            ERROR\
+        val&= 127>>ones; /* keep only the payload bits of the first byte */\
+        while(--ones > 0){\
+            int tmp= GET_BYTE - 128;\
+            if(tmp>>6) /* a continuation byte must be 10xxxxxx, i.e. 0..63 after subtracting 128 */\
+                ERROR\
+            val= (val<<6) + tmp;\
+        }\
+    }
+
+/*!
+ * \def GET_UTF16(val, GET_16BIT, ERROR)
+ * Converts a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form
+ * \param val is the output and should be of type uint32_t. It holds the converted
+ * UCS-4 character and should be a left value.
+ * \param GET_16BIT gets two bytes of UTF-16 encoded data converted to native endianness.
+ * It can be a function or a statement whose return value or evaluated value is of type
+ * uint16_t. It will be executed up to 2 times.
+ * \param ERROR action that should be taken when an invalid UTF-16 surrogate is
+ * returned from GET_16BIT. It should be a statement that jumps out of the macro,
+ * like exit(), goto, return, break, or continue.
+ */
+#define GET_UTF16(val, GET_16BIT, ERROR)\
+    val = GET_16BIT;\
+    {\
+        unsigned int hi = val - 0xD800; /* below 0x800 exactly when val is a surrogate (0xD800..0xDFFF) */\
+        if (hi < 0x800) {\
+            val = GET_16BIT - 0xDC00;\
+            if (val > 0x3FFU || hi > 0x3FFU) /* need a high surrogate followed by a low surrogate */\
+                ERROR\
+            val += (hi<<10) + 0x10000;\
+        }\
+    }\
+
+/*!
+ * \def PUT_UTF8(val, tmp, PUT_BYTE)
+ * Converts a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long).
+ * \param val is an input-only argument and should be of type uint32_t. It holds
+ * a UCS-4 encoded Unicode character that is to be converted to UTF-8. If
+ * val is given as a function it is executed only once.
+ * \param tmp is a temporary variable and should be of type uint8_t. It
+ * represents an intermediate value during conversion that is to be
+ * output by PUT_BYTE.
+ * \param PUT_BYTE writes the converted UTF-8 bytes to any proper destination.
+ * It could be a function or a statement, and uses tmp as the input byte.
+ * For example, PUT_BYTE could be "*output++ = tmp;" PUT_BYTE will be
+ * executed up to 4 times for values in the valid UTF-8 range and up to
+ * 7 times in the general case, depending on the length of the converted
+ * Unicode character.
+ */
+#define PUT_UTF8(val, tmp, PUT_BYTE)\
+    {\
+        int bytes, shift;\
+        uint32_t in = val;\
+        if (in < 0x80) { /* 7-bit values are emitted as a single byte */\
+            tmp = in;\
+            PUT_BYTE\
+        } else {\
+            bytes = (av_log2(in) + 4) / 5; /* length of the encoding, assuming av_log2 is floor(log2) */\
+            shift = (bytes - 1) * 6;\
+            tmp = (256 - (256 >> bytes)) | (in >> shift); /* lead byte: one 1 bit per encoded byte, then the top payload bits */\
+            PUT_BYTE\
+            while (shift >= 6) {\
+                shift -= 6;\
+                tmp = 0x80 | ((in >> shift) & 0x3f); /* continuation byte: 10xxxxxx */\
+                PUT_BYTE\
+            }\
+        }\
+    }
+
+/*!
+ * \def PUT_UTF16(val, tmp, PUT_16BIT)
+ * Converts a 32-bit Unicode character to its UTF-16 encoded form (2 or 4 bytes).
+ * \param val is an input-only argument and should be of type uint32_t. It holds
+ * a UCS-4 encoded Unicode character that is to be converted to UTF-16. If
+ * val is given as a function it is executed only once.
+ * \param tmp is a temporary variable and should be of type uint16_t. It
+ * represents an intermediate value during conversion that is to be
+ * output by PUT_16BIT.
+ * \param PUT_16BIT writes the converted UTF-16 data to any proper destination
+ * in desired endianness. It could be a function or a statement, and uses tmp
+ * as the input word.  For example, PUT_16BIT could be "*output++ = tmp;"
+ * PUT_16BIT will be executed 1 or 2 times depending on input character.
+ */
+#define PUT_UTF16(val, tmp, PUT_16BIT)\
+    {\
+        uint32_t in = val;\
+        if (in < 0x10000) { /* fits in one 16-bit unit */\
+            tmp = in;\
+            PUT_16BIT\
+        } else {\
+            tmp = 0xD800 | ((in - 0x10000) >> 10); /* first unit: 0xD800 plus the upper bits of in - 0x10000 */\
+            PUT_16BIT\
+            tmp = 0xDC00 | ((in - 0x10000) & 0x3FF); /* second unit: 0xDC00 plus the lower 10 bits */\
+            PUT_16BIT\
+        }\
+    }\
+
+
+
+#include "mem.h"
+
+#ifdef HAVE_AV_CONFIG_H
+#    include "internal.h"
+#endif /* HAVE_AV_CONFIG_H */
+
+#endif /* AVUTIL_COMMON_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/error.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/error.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,53 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * error code definitions
+ */
+
+#ifndef AVUTIL_ERROR_H
+#define AVUTIL_ERROR_H
+
+#include <errno.h>
+#include "common.h"
+
+/* error handling: library functions report failures as negative error codes */
+#if EDOM > 0 /* errno constants are positive here, so negation converts in both directions */
+#define AVERROR(e) (-(e))   ///< Returns a negative error code from a POSIX error code, to return from library functions.
+#define AVUNERROR(e) (-(e)) ///< Returns a POSIX error code from a library function error return value.
+#else
+/* Some platforms have E* and errno already negated. */
+#define AVERROR(e) (e)      ///< E* is already negative on this branch: passed through unchanged.
+#define AVUNERROR(e) (e)    ///< Identity, for the same reason.
+#endif
+
+#define AVERROR_EOF         AVERROR(EPIPE)   ///< End of file (reported with the EPIPE error code)
+
+/**
+ * Puts a description of the AVERROR code errnum in errbuf.
+ * In case of failure the global variable errno is set to indicate the error.
+ *
+ * @param errnum      the AVERROR code to describe
+ * @param errbuf      the buffer that receives the description
+ * @param errbuf_size the size in bytes of errbuf
+ * @return 0 on success, a negative value otherwise
+ */
+int av_strerror(int errnum, char *errbuf, size_t errbuf_size);
+
+#endif /* AVUTIL_ERROR_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/internal.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/internal.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,168 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * common internal API header
+ */
+
+#ifndef AVUTIL_INTERNAL_H
+#define AVUTIL_INTERNAL_H
+
+#if !defined(DEBUG) && !defined(NDEBUG)
+#    define NDEBUG
+#endif
+
+#include <limits.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <assert.h>
+#include "config.h"
+#include "attributes.h"
+#include "timer.h"
+
+
+/* Fallbacks for integer-limit macros; each applies only if the headers included above left the name undefined. */
+#ifndef INT16_MIN
+#define INT16_MIN       (-0x7fff - 1)
+#endif
+
+#ifndef INT16_MAX
+#define INT16_MAX       0x7fff
+#endif
+
+#ifndef INT32_MIN
+#define INT32_MIN       (-0x7fffffff - 1)
+#endif
+
+#ifndef INT32_MAX
+#define INT32_MAX       0x7fffffff
+#endif
+
+#ifndef UINT32_MAX
+#define UINT32_MAX      0xffffffff
+#endif
+
+#ifndef INT64_MIN
+#define INT64_MIN       (-0x7fffffffffffffffLL - 1)
+#endif
+
+#ifndef INT64_MAX
+#define INT64_MAX INT64_C(9223372036854775807)
+#endif
+
+#ifndef UINT64_MAX
+#define UINT64_MAX UINT64_C(0xFFFFFFFFFFFFFFFF)
+#endif
+
+#ifndef INT_BIT
+#    define INT_BIT (CHAR_BIT * sizeof(int)) /* width of int in bits; the expression has type size_t */
+#endif
+/* Fallback offsetof(): byte offset of field F inside type T, as a size_t (an unsigned int cast would truncate 64-bit pointers). */
+#ifndef offsetof
+#    define offsetof(T, F) ((size_t)((char *)&((T *)0)->F - (char *)0))
+#endif
+
+/* Use to export labels from asm. All three variants currently just stringize their argument. */
+#define LABEL_MANGLE(a) #a
+#define LOCAL_MANGLE(a) #a
+#define MANGLE(a) #a
+
+// Use rip-relative addressing if compiling PIC code on x86-64.
+// #if ARCH_X86_64 && defined(PIC)
+// #    define LOCAL_MANGLE(a) #a "(%%rip)"
+// #else
+// #    define LOCAL_MANGLE(a) #a
+// #endif
+// 
+// #define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
+
+/* debug stuff */
+/* dprintf(pctx, fmt, ...): debug-level logging, compiled out unless DEBUG is defined.
+ * pctx is accepted for source compatibility but unused: av_log() here takes (level, fmt, ...). */
+#ifdef DEBUG
+#    define dprintf(pctx, ...) av_log(AV_LOG_DEBUG, __VA_ARGS__)
+#else
+#    define dprintf(pctx, ...)
+#endif
+/* Logs the call site (file and line) at error level, then terminates through abort(). */
+#define av_abort()      do { av_log(AV_LOG_ERROR, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0)
+
+/* math */
+
+
+/* avoid usage of dangerous/inappropriate system functions: each name is redefined to an identifier naming the reason or the replacement (presumably undeclared, so any use fails to build) */
+// #undef  malloc
+// #define malloc please_use_av_malloc
+// #undef  free
+// #define free please_use_av_free
+#undef  realloc
+#define realloc please_use_av_realloc
+#undef  time
+#define time time_is_forbidden_due_to_security_issues
+#undef  rand
+#define rand rand_is_forbidden_due_to_state_trashing_use_av_lfg_get
+#undef  srand
+#define srand srand_is_forbidden_due_to_state_trashing_use_av_lfg_init
+#undef  random
+#define random random_is_forbidden_due_to_state_trashing_use_av_lfg_get
+#undef  sprintf
+#define sprintf sprintf_is_forbidden_due_to_security_issues_use_snprintf
+//#undef  exit
+//#define exit exit_is_forbidden
+#ifndef LIBAVFORMAT_BUILD
+/* puts()/perror() stay usable when LIBAVFORMAT_BUILD is defined */
+#undef  puts
+#define puts please_use_av_log_instead_of_puts
+#undef  perror
+#define perror please_use_av_log_instead_of_perror
+#endif
+/* Allocates size bytes into p with av_malloc(). If that yields NULL for a
+ * non-zero size, logs an error and jumps to label (size is then evaluated twice). */
+#define FF_ALLOC_OR_GOTO(p, size, label)\
+{\
+    if (!((p) = av_malloc(size)) && (size) != 0) {\
+        av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\
+        goto label;\
+    }\
+}
+/* Same contract as FF_ALLOC_OR_GOTO but allocates through av_mallocz(): on NULL for
+ * a non-zero size, logs an error and jumps to label (size is then evaluated twice). */
+#define FF_ALLOCZ_OR_GOTO(p, size, label)\
+{\
+    if (!((p) = av_mallocz(size)) && (size) != 0) {\
+        av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\
+        goto label;\
+    }\
+}
+
+
+/**
+ * Expands to NULL if CONFIG_SMALL is true, otherwise to the argument
+ * without modification. Used to disable the definition of strings
+ * (for example AVCodec long_names).
+ */
+#if CONFIG_SMALL
+#   define NULL_IF_CONFIG_SMALL(x) NULL
+#else
+#   define NULL_IF_CONFIG_SMALL(x) x
+#endif
+
+#endif /* AVUTIL_INTERNAL_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/intreadwrite.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/intreadwrite.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,498 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_INTREADWRITE_H
+#define AVUTIL_INTREADWRITE_H
+
+#include <stdint.h>
+#include "config.h"
+#include "bswap.h"
+#include "common.h"
+/* Overlay unions for 64/32/16-bit values; the AV_RNA/AV_WNA/AV_COPY/AV_SWAP/AV_ZERO macros below access memory through them. */
+typedef union {
+    uint64_t u64;      /* the member the u##n accessors select for n = 64 */
+    uint32_t u32[2];
+    uint16_t u16[4];
+    uint8_t  u8 [8];
+    double   f64;
+    float    f32[2];
+} __attribute__((__may_alias__)) av_alias64; /* may_alias: accesses through this type may alias objects of any type */
+
+typedef union {
+    uint32_t u32;      /* selected by the u##n accessors for n = 32 */
+    uint16_t u16[2];
+    uint8_t  u8 [4];
+    float    f32;
+} __attribute__((__may_alias__)) av_alias32;
+
+typedef union {
+    uint16_t u16;      /* selected by the u##n accessors for n = 16 */
+    uint8_t  u8 [2];
+} __attribute__((__may_alias__)) av_alias16  ;
+
+/*
+ * Arch-specific headers can provide any combination of
+ * AV_[RW][BLN](16|24|32|64) and AV_(COPY|SWAP|ZERO)(64|128) macros.
+ * Preprocessor symbols must be defined, even if these are implemented
+ * as inline functions.
+ */
+
+#if   ARCH_ARM
+#   include "arm/intreadwrite.h"
+#elif ARCH_PPC
+#   include "ppc/intreadwrite.h"
+#elif ARCH_X86
+#   include "x86/intreadwrite.h"
+#endif
+
+/*
+ * Map AV_RNXX <-> AV_R[BL]XX for all variants provided by per-arch headers.
+ */
+
+#if HAVE_BIGENDIAN
+
+#   if    defined(AV_RN16) && !defined(AV_RB16)
+#       define AV_RB16(p) AV_RN16(p)
+#   elif !defined(AV_RN16) &&  defined(AV_RB16)
+#       define AV_RN16(p) AV_RB16(p)
+#   endif
+
+#   if    defined(AV_WN16) && !defined(AV_WB16)
+#       define AV_WB16(p, v) AV_WN16(p, v)
+#   elif !defined(AV_WN16) &&  defined(AV_WB16)
+#       define AV_WN16(p, v) AV_WB16(p, v)
+#   endif
+
+#   if    defined(AV_RN24) && !defined(AV_RB24)
+#       define AV_RB24(p) AV_RN24(p)
+#   elif !defined(AV_RN24) &&  defined(AV_RB24)
+#       define AV_RN24(p) AV_RB24(p)
+#   endif
+
+#   if    defined(AV_WN24) && !defined(AV_WB24)
+#       define AV_WB24(p, v) AV_WN24(p, v)
+#   elif !defined(AV_WN24) &&  defined(AV_WB24)
+#       define AV_WN24(p, v) AV_WB24(p, v)
+#   endif
+
+#   if    defined(AV_RN32) && !defined(AV_RB32)
+#       define AV_RB32(p) AV_RN32(p)
+#   elif !defined(AV_RN32) &&  defined(AV_RB32)
+#       define AV_RN32(p) AV_RB32(p)
+#   endif
+
+#   if    defined(AV_WN32) && !defined(AV_WB32)
+#       define AV_WB32(p, v) AV_WN32(p, v)
+#   elif !defined(AV_WN32) &&  defined(AV_WB32)
+#       define AV_WN32(p, v) AV_WB32(p, v)
+#   endif
+
+#   if    defined(AV_RN64) && !defined(AV_RB64)
+#       define AV_RB64(p) AV_RN64(p)
+#   elif !defined(AV_RN64) &&  defined(AV_RB64)
+#       define AV_RN64(p) AV_RB64(p)
+#   endif
+
+#   if    defined(AV_WN64) && !defined(AV_WB64)
+#       define AV_WB64(p, v) AV_WN64(p, v)
+#   elif !defined(AV_WN64) &&  defined(AV_WB64)
+#       define AV_WN64(p, v) AV_WB64(p, v)
+#   endif
+
+#else /* HAVE_BIGENDIAN */
+
+#   if    defined(AV_RN16) && !defined(AV_RL16)
+#       define AV_RL16(p) AV_RN16(p)
+#   elif !defined(AV_RN16) &&  defined(AV_RL16)
+#       define AV_RN16(p) AV_RL16(p)
+#   endif
+
+#   if    defined(AV_WN16) && !defined(AV_WL16)
+#       define AV_WL16(p, v) AV_WN16(p, v)
+#   elif !defined(AV_WN16) &&  defined(AV_WL16)
+#       define AV_WN16(p, v) AV_WL16(p, v)
+#   endif
+
+#   if    defined(AV_RN24) && !defined(AV_RL24)
+#       define AV_RL24(p) AV_RN24(p)
+#   elif !defined(AV_RN24) &&  defined(AV_RL24)
+#       define AV_RN24(p) AV_RL24(p)
+#   endif
+
+#   if    defined(AV_WN24) && !defined(AV_WL24)
+#       define AV_WL24(p, v) AV_WN24(p, v)
+#   elif !defined(AV_WN24) &&  defined(AV_WL24)
+#       define AV_WN24(p, v) AV_WL24(p, v)
+#   endif
+
+#   if    defined(AV_RN32) && !defined(AV_RL32)
+#       define AV_RL32(p) AV_RN32(p)
+#   elif !defined(AV_RN32) &&  defined(AV_RL32)
+#       define AV_RN32(p) AV_RL32(p)
+#   endif
+
+#   if    defined(AV_WN32) && !defined(AV_WL32)
+#       define AV_WL32(p, v) AV_WN32(p, v)
+#   elif !defined(AV_WN32) &&  defined(AV_WL32)
+#       define AV_WN32(p, v) AV_WL32(p, v)
+#   endif
+
+#   if    defined(AV_RN64) && !defined(AV_RL64)
+#       define AV_RL64(p) AV_RN64(p)
+#   elif !defined(AV_RN64) &&  defined(AV_RL64)
+#       define AV_RN64(p) AV_RL64(p)
+#   endif
+
+#   if    defined(AV_WN64) && !defined(AV_WL64)
+#       define AV_WL64(p, v) AV_WN64(p, v)
+#   elif !defined(AV_WN64) &&  defined(AV_WL64)
+#       define AV_WN64(p, v) AV_WL64(p, v)
+#   endif
+
+#endif /* !HAVE_BIGENDIAN */
+
+/*
+ * Define AV_[RW]N helper macros to simplify definitions not provided
+ * by per-arch headers.
+ */
+
+
+
+#if defined(__DECC)
+
+#   define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p)))
+#   define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v))
+
+#else
+
+#ifndef AV_RB16
+#   define AV_RB16(x)                           \
+    ((((const uint8_t*)(x))[0] << 8) |          \
+      ((const uint8_t*)(x))[1])
+#endif
+#ifndef AV_WB16
+#   define AV_WB16(p, d) do {                   \
+        ((uint8_t*)(p))[1] = (d);               \
+        ((uint8_t*)(p))[0] = (d)>>8;            \
+    } while(0)
+#endif
+
+#ifndef AV_RL16
+#   define AV_RL16(x)                           \
+    ((((const uint8_t*)(x))[1] << 8) |          \
+      ((const uint8_t*)(x))[0])
+#endif
+#ifndef AV_WL16
+#   define AV_WL16(p, d) do {                   \
+        ((uint8_t*)(p))[0] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+    } while(0)
+#endif
+
+#ifndef AV_RB32
+#   define AV_RB32(x)                           \
+    ((((const uint8_t*)(x))[0] << 24) |         \
+     (((const uint8_t*)(x))[1] << 16) |         \
+     (((const uint8_t*)(x))[2] <<  8) |         \
+      ((const uint8_t*)(x))[3])
+#endif
+#ifndef AV_WB32
+#   define AV_WB32(p, d) do {                   \
+        ((uint8_t*)(p))[3] = (d);               \
+        ((uint8_t*)(p))[2] = (d)>>8;            \
+        ((uint8_t*)(p))[1] = (d)>>16;           \
+        ((uint8_t*)(p))[0] = (d)>>24;           \
+    } while(0)
+#endif
+
+#ifndef AV_RL32
+#   define AV_RL32(x)                           \
+    ((((const uint8_t*)(x))[3] << 24) |         \
+     (((const uint8_t*)(x))[2] << 16) |         \
+     (((const uint8_t*)(x))[1] <<  8) |         \
+      ((const uint8_t*)(x))[0])
+#endif
+#ifndef AV_WL32
+#   define AV_WL32(p, d) do {                   \
+        ((uint8_t*)(p))[0] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+        ((uint8_t*)(p))[2] = (d)>>16;           \
+        ((uint8_t*)(p))[3] = (d)>>24;           \
+    } while(0)
+#endif
+
+#ifndef AV_RB64
+#   define AV_RB64(x)                                   \
+    (((uint64_t)((const uint8_t*)(x))[0] << 56) |       \
+     ((uint64_t)((const uint8_t*)(x))[1] << 48) |       \
+     ((uint64_t)((const uint8_t*)(x))[2] << 40) |       \
+     ((uint64_t)((const uint8_t*)(x))[3] << 32) |       \
+     ((uint64_t)((const uint8_t*)(x))[4] << 24) |       \
+     ((uint64_t)((const uint8_t*)(x))[5] << 16) |       \
+     ((uint64_t)((const uint8_t*)(x))[6] <<  8) |       \
+      (uint64_t)((const uint8_t*)(x))[7])
+#endif
+#ifndef AV_WB64
+#   define AV_WB64(p, d) do {                   \
+        ((uint8_t*)(p))[7] = (d);               \
+        ((uint8_t*)(p))[6] = (d)>>8;            \
+        ((uint8_t*)(p))[5] = (d)>>16;           \
+        ((uint8_t*)(p))[4] = (d)>>24;           \
+        ((uint8_t*)(p))[3] = (d)>>32;           \
+        ((uint8_t*)(p))[2] = (d)>>40;           \
+        ((uint8_t*)(p))[1] = (d)>>48;           \
+        ((uint8_t*)(p))[0] = (d)>>56;           \
+    } while(0)
+#endif
+
+#ifndef AV_RL64
+#   define AV_RL64(x)                                   \
+    (((uint64_t)((const uint8_t*)(x))[7] << 56) |       \
+     ((uint64_t)((const uint8_t*)(x))[6] << 48) |       \
+     ((uint64_t)((const uint8_t*)(x))[5] << 40) |       \
+     ((uint64_t)((const uint8_t*)(x))[4] << 32) |       \
+     ((uint64_t)((const uint8_t*)(x))[3] << 24) |       \
+     ((uint64_t)((const uint8_t*)(x))[2] << 16) |       \
+     ((uint64_t)((const uint8_t*)(x))[1] <<  8) |       \
+      (uint64_t)((const uint8_t*)(x))[0])
+#endif
+#ifndef AV_WL64
+#   define AV_WL64(p, d) do {                   \
+        ((uint8_t*)(p))[0] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+        ((uint8_t*)(p))[2] = (d)>>16;           \
+        ((uint8_t*)(p))[3] = (d)>>24;           \
+        ((uint8_t*)(p))[4] = (d)>>32;           \
+        ((uint8_t*)(p))[5] = (d)>>40;           \
+        ((uint8_t*)(p))[6] = (d)>>48;           \
+        ((uint8_t*)(p))[7] = (d)>>56;           \
+    } while(0)
+#endif
+
+#if HAVE_BIGENDIAN
+#   define AV_RN(s, p)    AV_RB##s(p)
+#   define AV_WN(s, p, v) AV_WB##s(p, v)
+#else
+#   define AV_RN(s, p)    AV_RL##s(p)
+#   define AV_WN(s, p, v) AV_WL##s(p, v)
+#endif
+
+#endif /* defined(__DECC) */
+
+#ifndef AV_RN16
+#   define AV_RN16(p) AV_RN(16, p)
+#endif
+
+#ifndef AV_RN32
+#   define AV_RN32(p) AV_RN(32, p)
+#endif
+
+#ifndef AV_RN64
+#   define AV_RN64(p) AV_RN(64, p)
+#endif
+
+#ifndef AV_WN16
+#   define AV_WN16(p, v) AV_WN(16, p, v)
+#endif
+
+#ifndef AV_WN32
+#   define AV_WN32(p, v) AV_WN(32, p, v)
+#endif
+
+#ifndef AV_WN64
+#   define AV_WN64(p, v) AV_WN(64, p, v)
+#endif
+
+#if HAVE_BIGENDIAN
+#   define AV_RB(s, p)    AV_RN##s(p)
+#   define AV_WB(s, p, v) AV_WN##s(p, v)
+#   define AV_RL(s, p)    bswap_##s(AV_RN##s(p))
+#   define AV_WL(s, p, v) AV_WN##s(p, bswap_##s(v))
+#else
+#   define AV_RB(s, p)    bswap_##s(AV_RN##s(p))
+#   define AV_WB(s, p, v) AV_WN##s(p, bswap_##s(v))
+#   define AV_RL(s, p)    AV_RN##s(p)
+#   define AV_WL(s, p, v) AV_WN##s(p, v)
+#endif
+
+#define AV_RB8(x)     (((const uint8_t*)(x))[0])
+#define AV_WB8(p, d)  do { ((uint8_t*)(p))[0] = (d); } while(0)
+
+#define AV_RL8(x)     AV_RB8(x)
+#define AV_WL8(p, d)  AV_WB8(p, d)
+
+#ifndef AV_RB16
+#   define AV_RB16(p)    AV_RB(16, p)
+#endif
+#ifndef AV_WB16
+#   define AV_WB16(p, v) AV_WB(16, p, v)
+#endif
+
+#ifndef AV_RL16
+#   define AV_RL16(p)    AV_RL(16, p)
+#endif
+#ifndef AV_WL16
+#   define AV_WL16(p, v) AV_WL(16, p, v)
+#endif
+
+#ifndef AV_RB32
+#   define AV_RB32(p)    AV_RB(32, p)
+#endif
+#ifndef AV_WB32
+#   define AV_WB32(p, v) AV_WB(32, p, v)
+#endif
+
+#ifndef AV_RL32
+#   define AV_RL32(p)    AV_RL(32, p)
+#endif
+#ifndef AV_WL32
+#   define AV_WL32(p, v) AV_WL(32, p, v)
+#endif
+
+#ifndef AV_RB64
+#   define AV_RB64(p)    AV_RB(64, p)
+#endif
+#ifndef AV_WB64
+#   define AV_WB64(p, v) AV_WB(64, p, v)
+#endif
+
+#ifndef AV_RL64
+#   define AV_RL64(p)    AV_RL(64, p)
+#endif
+#ifndef AV_WL64
+#   define AV_WL64(p, v) AV_WL(64, p, v)
+#endif
+
+#ifndef AV_RB24
+#   define AV_RB24(x)                           \
+    ((((const uint8_t*)(x))[0] << 16) |         \
+     (((const uint8_t*)(x))[1] <<  8) |         \
+      ((const uint8_t*)(x))[2])
+#endif
+#ifndef AV_WB24
+#   define AV_WB24(p, d) do {                   \
+        ((uint8_t*)(p))[2] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+        ((uint8_t*)(p))[0] = (d)>>16;           \
+    } while(0)
+#endif
+
+#ifndef AV_RL24
+#   define AV_RL24(x)                           \
+    ((((const uint8_t*)(x))[2] << 16) |         \
+     (((const uint8_t*)(x))[1] <<  8) |         \
+      ((const uint8_t*)(x))[0])
+#endif
+#ifndef AV_WL24
+#   define AV_WL24(p, d) do {                   \
+        ((uint8_t*)(p))[0] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+        ((uint8_t*)(p))[2] = (d)>>16;           \
+    } while(0)
+#endif
+
+/*
+ * The AV_[RW]NA macros access naturally aligned data
+ * in a type-safe way.
+ */
+
+#define AV_RNA(s, p)    (((const av_alias##s*)(p))->u##s)
+#define AV_WNA(s, p, v) (((av_alias##s*)(p))->u##s = (v))
+
+#ifndef AV_RN16A
+#   define AV_RN16A(p) AV_RNA(16, p)
+#endif
+
+#ifndef AV_RN32A
+#   define AV_RN32A(p) AV_RNA(32, p)
+#endif
+
+#ifndef AV_RN64A
+#   define AV_RN64A(p) AV_RNA(64, p)
+#endif
+
+#ifndef AV_WN16A
+#   define AV_WN16A(p, v) AV_WNA(16, p, v)
+#endif
+
+#ifndef AV_WN32A
+#   define AV_WN32A(p, v) AV_WNA(32, p, v)
+#endif
+
+#ifndef AV_WN64A
+#   define AV_WN64A(p, v) AV_WNA(64, p, v)
+#endif
+
+/* Parameters for AV_COPY*, AV_SWAP*, AV_ZERO* must be
+ * naturally aligned. They may be implemented using MMX,
+ * so emms_c() must be called before using any float code
+ * afterwards.
+ */
+
+#define AV_COPY(n, d, s) \
+    (((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n)
+
+#ifndef AV_COPY16
+#   define AV_COPY16(d, s) AV_COPY(16, d, s)
+#endif
+
+#ifndef AV_COPY32
+#   define AV_COPY32(d, s) AV_COPY(32, d, s)
+#endif
+
+#ifndef AV_COPY64
+#   define AV_COPY64(d, s) AV_COPY(64, d, s)
+#endif
+
+#ifndef AV_COPY128
+#   define AV_COPY128(d, s)                    \
+    do {                                       \
+        AV_COPY64(d, s);                       \
+        AV_COPY64((char*)(d)+8, (char*)(s)+8); \
+    } while(0)
+#endif
+
+#define AV_SWAP(n, a, b) FFSWAP(av_alias##n, *(av_alias##n*)(a), *(av_alias##n*)(b))
+
+#ifndef AV_SWAP64
+#   define AV_SWAP64(a, b) AV_SWAP(64, a, b)
+#endif
+
+#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0)
+
+#ifndef AV_ZERO16
+#   define AV_ZERO16(d) AV_ZERO(16, d)
+#endif
+
+#ifndef AV_ZERO32
+#   define AV_ZERO32(d) AV_ZERO(32, d)
+#endif
+
+#ifndef AV_ZERO64
+#   define AV_ZERO64(d) AV_ZERO(64, d)
+#endif
+
+#ifndef AV_ZERO128
+#   define AV_ZERO128(d)         \
+    do {                         \
+        AV_ZERO64(d);            \
+        AV_ZERO64((char*)(d)+8); \
+    } while(0)
+#endif
+
+#endif /* AVUTIL_INTREADWRITE_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/log.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/log.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,111 @@
+/*
+ * log functions
+ * Copyright (c) 2003 Michel Bardiaux
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * logging functions
+ */
+#include "error.h"
+#include <unistd.h>
+#include <stdlib.h>
+#include "log.h"
+
+
+static int av_log_level = AV_LOG_INFO;
+
+static int use_ansi_color=-1; /* negative: not decided yet, 0: plain output, non-zero: ANSI colours */
+
+#undef fprintf
+/* Writes str to stderr, wrapped in an ANSI colour sequence derived from color when colouring is on. */
+static void colored_fputs(int color, const char *str){
+    if (use_ansi_color < 0) { /* first call: decide once and remember the answer */
+#if HAVE_ISATTY && !defined(_WIN32)
+        use_ansi_color = getenv("TERM") && !getenv("NO_COLOR") && isatty(2);
+#else
+        use_ansi_color = 0;
+#endif
+    }
+    if (!use_ansi_color) {
+        fputs(str, stderr);
+        return;
+    }
+    fprintf(stderr, "\033[%d;3%dm", color>>4, color&15); /* color>>4: attribute, color&15: 3x foreground code */
+    fputs(str, stderr);
+    fprintf(stderr, "\033[0m"); /* reset attributes after the message */
+}
+/* Default log sink: formats the message, folds consecutive identical newline-terminated
+ * messages into a "Last message repeated N times" note and prints to stderr, coloured by level. */
+void av_log_default_callback(int level, const char* fmt, va_list vl)
+{
+    static int print_prefix=1;
+    static int count;                    /* repeats of prev suppressed so far */
+    static char line[1024], prev[1024];  /* current message / last message printed */
+    static const uint8_t color[]={0x41,0x41,0x11,0x03,9,9,9}; /* indexed by level>>3, clipped to 0..6 */
+    size_t len;
+    if(level>av_log_level)
+        return;
+#undef fprintf
+    line[0]=0;
+    vsnprintf(line, sizeof(line), fmt, vl);
+    len = strlen(line);
+    /* an empty message has no last character; line[len-1] would read before the buffer */
+    print_prefix= len > 0 && line[len-1] == '\n';
+    if(print_prefix && !strcmp(line, prev)){
+        count++;
+        return;
+    }
+    if(count>0){
+        fprintf(stderr, "    Last message repeated %d times\n", count);
+        count=0;
+    }
+    colored_fputs(color[av_clip(level>>3, 0, 6)], line);
+    strcpy(prev, line);
+}
+
+static void (*av_log_callback)(int, const char*, va_list) = av_log_default_callback;
+/* Variadic front end: collects the arguments after fmt into a va_list and forwards to av_vlog(). */
+void av_log(int level, const char *fmt, ...)
+{
+    va_list vl;
+    va_start(vl, fmt);
+    av_vlog(level, fmt, vl);
+    va_end(vl);
+}
+/* Hands the message to the installed callback; after av_log_set_callback(NULL) the message is dropped instead of calling through NULL. */
+void av_vlog(int level, const char *fmt, va_list vl)
+{
+    if (av_log_callback) av_log_callback(level, fmt, vl);
+}
+/* Returns the current threshold; the default callback drops messages whose level is greater than it. */
+int av_log_get_level(void)
+{
+    return av_log_level;
+}
+/* Sets the threshold consulted by the default callback; level is stored without validation. */
+void av_log_set_level(int level)
+{
+    av_log_level = level;
+}
+/* Replaces the function av_vlog() forwards to; callback is stored as given, this function does not check it. */
+void av_log_set_callback(void (*callback)(int, const char*, va_list))
+{
+    av_log_callback = callback;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavutil/log.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/log.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,120 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_LOG_H
+#define AVUTIL_LOG_H
+
+#include <stdarg.h>
+//#include "avutil.h"
+
+/**
+ * Describes the class of an AVClass context structure. That is an
+ * arbitrary struct of which the first field is a pointer to an
+ * AVClass struct (e.g. AVCodecContext, AVFormatContext etc.).
+ */
+typedef struct {
+    /**
+     * The name of the class; usually it is the same name as the
+     * context structure type to which the AVClass is associated.
+     */
+    const char* class_name;
+
+    /**
+     * A pointer to a function which returns the name of a context
+     * instance ctx associated with the class.
+     */
+    const char* (*item_name)(void* ctx);
+
+    /**
+     * a pointer to the first option specified in the class if any or NULL
+     *
+     * @see av_set_default_options()
+     */
+    const struct AVOption *option;
+
+    /**
+     * LIBAVUTIL_VERSION with which this structure was created.
+     * This is used to allow fields to be added without requiring major
+     * version bumps everywhere.
+     */
+
+    int version;
+} AVClass;
+
+/* av_log API: levels are spaced 8 apart; a lower value means higher importance */
+
+#define AV_LOG_QUIET    -8 ///< lower than every level defined below; as a threshold it hides all of them
+
+/**
+ * Something went really wrong and we will crash now.
+ */
+#define AV_LOG_PANIC     0
+
+/**
+ * Something went wrong and recovery is not possible.
+ * For example, no header was found for a format which depends
+ * on headers or an illegal combination of parameters is used.
+ */
+#define AV_LOG_FATAL     8
+
+/**
+ * Something went wrong and cannot losslessly be recovered.
+ * However, not all future data is affected.
+ */
+#define AV_LOG_ERROR    16
+
+/**
+ * Something somehow does not look correct. This may or may not
+ * lead to problems. An example would be the use of '-vstrict -2'.
+ */
+#define AV_LOG_WARNING  24
+
+#define AV_LOG_INFO     32 ///< the initial threshold set in log.c
+#define AV_LOG_VERBOSE  40 ///< numerically above AV_LOG_INFO, so hidden at the initial threshold
+
+/**
+ * Stuff which is only useful for libav* developers.
+ */
+#define AV_LOG_DEBUG    48
+
+/**
+ * Sends the specified message to the log if the level is less than or equal
+ * to the current av_log_level. By default, all logging messages are sent to
+ * stderr. This behavior can be altered by installing a different callback
+ * with av_log_set_callback().
+ *
+ * This variant takes no AVClass context argument: the level comes first.
+ *
+ * @param level The importance level of the message, lower values signifying
+ * higher importance.
+ * @param fmt The format string (printf-compatible) that specifies how
+ * subsequent arguments are converted to output.
+ * @see av_vlog
+ */
+
+void av_log(int level, const char *fmt, ...);
+
+void av_vlog(int level, const char *fmt, va_list);                    /**< va_list form of av_log(); forwards to the installed callback */
+int av_log_get_level(void);                                           /**< returns the current level threshold */
+void av_log_set_level(int);                                           /**< sets the level threshold */
+void av_log_set_callback(void (*)(int, const char*, va_list));        /**< replaces the callback that av_vlog() forwards to */
+void av_log_default_callback(int level, const char* fmt, va_list vl); /**< built-in callback; writes to stderr */
+
+#endif /* AVUTIL_LOG_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/mem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/mem.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,127 @@
+/*
+ * default memory allocator for libavutil
+ * Copyright (c) 2002 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * default memory allocator for libavutil
+ */
+
+#include "config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#if HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#include "mem.h"
+
+/* here we can use OS-dependent allocation functions */
+#undef free
+#undef malloc
+#undef realloc
+
+#ifdef MALLOC_PREFIX
+
+#define malloc         AV_JOIN(MALLOC_PREFIX, malloc)
+#define memalign       AV_JOIN(MALLOC_PREFIX, memalign)
+#define posix_memalign AV_JOIN(MALLOC_PREFIX, posix_memalign)
+#define realloc        AV_JOIN(MALLOC_PREFIX, realloc)
+#define free           AV_JOIN(MALLOC_PREFIX, free)
+
+void *malloc(size_t size);
+void *memalign(size_t align, size_t size);
+int   posix_memalign(void **ptr, size_t align, size_t size);
+void *realloc(void *ptr, size_t size);
+void  free(void *ptr);
+
+#endif /* MALLOC_PREFIX */
+
+
+/* You can redefine av_malloc and av_free in your project to use your
+   memory allocator. You do not need to suppress this file because the
+   linker will do it automatically. */
+
+/* Returns size bytes (16-byte aligned when an aligned allocator was configured), or NULL on failure or for an oversized request. */
+void *av_malloc(unsigned int size)
+{
+    /* let's disallow possible ambiguous cases: nothing within 16 bytes of INT_MAX */
+    if (size > INT_MAX - 16)
+        return NULL;
+
+//FIXME: when no aligned mallocs vector code should be disabled.
+#if HAVE_POSIX_MEMALIGN
+    void *block = NULL;
+    /* any non-zero status from posix_memalign() is reported as NULL */
+    return posix_memalign(&block, 16, size) ? NULL : block;
+#elif HAVE_MEMALIGN
+    return memalign(16, size);
+#else
+    return malloc(size); /* only malloc()'s own alignment in this configuration */
+#endif
+}
+
+/* realloc() wrapper: an oversized request returns NULL without touching ptr. */
+void *av_realloc(void *ptr, unsigned int size)
+{
+    const unsigned int limit = INT_MAX - 16;
+
+    if (size <= limit)
+        return realloc(ptr, size);
+    return NULL; /* let's disallow possible ambiguous cases */
+}
+
+/* Releases ptr with free(); a NULL ptr is accepted and ignored. */
+void av_free(void *ptr)
+{
+    if (!ptr)
+        return; /* XXX: this test should not be needed on most libcs */
+    free(ptr);
+}
+/* arg is the address of a pointer: frees what that pointer refers to, then sets it to NULL. */
+void av_freep(void *arg)
+{
+    void **slot = arg;
+    av_free(*slot);
+    *slot = NULL;
+}
+/* av_malloc() followed by a zero fill of all size bytes; returns NULL when the allocation fails. */
+void *av_mallocz(unsigned int size)
+{
+    void *zeroed = av_malloc(size);
+    if (!zeroed)
+        return NULL;
+    return memset(zeroed, 0, size);
+}
+/* Copies s, terminator included, into av_malloc()ed memory. Returns NULL if s is NULL, too long, or allocation fails. */
+char *av_strdup(const char *s)
+{
+    /* size_t, not int: an int length could wrap for huge strings and yield a short, unterminated copy */
+    size_t len = s ? strlen(s) + 1 : 0;
+    char *ptr = NULL;
+    if (len && len <= (size_t)INT_MAX - 16)
+        ptr = av_malloc(len);
+    if (ptr)
+        memcpy(ptr, s, len);
+    return ptr;
+}
diff -r 11d15c47beaf -r 897f711a7157 libavutil/mem.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/mem.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,143 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * memory handling functions
+ */
+
+#ifndef AVUTIL_MEM_H
+#define AVUTIL_MEM_H
+
+#include "attributes.h"
+#include "config.h"
+
+#define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v /* declare v with type t, aligned to n bytes */
+#define DECLARE_ALIGNED_16(t,v)      t __attribute__ ((aligned (16))) v /* same, with the alignment fixed at 16 bytes */
+#define DECLARE_ASM_CONST(n,t,v)    static const t __attribute__((used)) __attribute__ ((aligned (n))) v /* static const, n-byte aligned, marked "used" so it is emitted even if unreferenced from C */
+
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+    #define av_malloc_attrib __attribute__((__malloc__)) /* GCC "malloc" function attribute, applied to the allocator prototypes */
+#else
+    #define av_malloc_attrib /* expands to nothing when the version check above fails */
+#endif
+
+/**
+ * Allocates a block of size bytes with alignment suitable for all
+ * memory accesses (including vectors if available on the CPU).
+ * @param size Size in bytes for the memory block to be allocated.
+ * @return Pointer to the allocated block, NULL if the block cannot
+ * be allocated.
+ * @see av_mallocz()
+ */
+void *av_malloc(unsigned int size) av_malloc_attrib;
+
+/**
+ * Allocates or reallocates a block of memory.
+ * If ptr is NULL and size > 0, allocates a new block. If
+ * size is zero, frees the memory block pointed to by ptr.
+ * @param size Size in bytes for the memory block to be allocated or
+ * reallocated.
+ * @param ptr Pointer to a memory block already allocated with
+ * av_malloc(z)() or av_realloc() or NULL.
+ * @return Pointer to a newly reallocated block or NULL if the block
+ * cannot be reallocated or the function is used to free the memory block.
+ * @see av_fast_realloc()
+ */
+void *av_realloc(void *ptr, unsigned int size);
+
+/**
+ * Reallocates the given block if it is not large enough, otherwise it
+ * does nothing.
+ *
+ * @see av_realloc
+ */
+void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size);
+
+/**
+ * Allocates a buffer, reusing the given one if large enough.
+ *
+ * Contrary to av_fast_realloc the current buffer contents might not be
+ * preserved and on error the old buffer is freed, thus no special
+ * handling to avoid memleaks is necessary.
+ *
+ * @param ptr pointer to pointer to already allocated buffer, overwritten with pointer to new buffer
+ * @param size size of the buffer *ptr points to
+ * @param min_size minimum size of *ptr buffer after returning, *ptr will be NULL and
+ *                 *size 0 if an error occurred.
+ */
+void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size);
+
+/**
+ * Frees a memory block which has been allocated with av_malloc(z)() or
+ * av_realloc().
+ * @param ptr Pointer to the memory block which should be freed.
+ * @note ptr = NULL is explicitly allowed.
+ * @note It is recommended that you use av_freep() instead.
+ * @see av_freep()
+ */
+
+void av_free(void *ptr);
+
+/**
+ * Allocates a block of size bytes with alignment suitable for all
+ * memory accesses (including vectors if available on the CPU) and
+ * zeroes all the bytes of the block.
+ * @param size Size in bytes for the memory block to be allocated.
+ * @return Pointer to the allocated block, NULL if it cannot be allocated.
+ * @see av_malloc()
+ */
+void *av_mallocz(unsigned int size) av_malloc_attrib;
+
+/**
+ * Duplicates the string s.
+ * @param s string to be duplicated
+ * @return Pointer to a newly allocated string containing a
+ * copy of s or NULL if the string cannot be allocated.
+ */
+char *av_strdup(const char *s) av_malloc_attrib;
+
+/**
+ * Frees a memory block which has been allocated with av_malloc(z)() or
+ * av_realloc() and sets the pointer pointing to it to NULL.
+ * @param ptr Pointer to the pointer to the memory block which should
+ * be freed.
+ * @see av_free()
+ */
+void av_freep(void *ptr);
+
+
+static av_always_inline uint32_t pack16to32(int a, int b){ /* pack the low 16 bits of a and b into one word; a ends up in the lower-addressed half when stored */
+#if HAVE_BIGENDIAN
+   return (b&0xFFFF) + ((unsigned)a<<16); /* shift as unsigned: << on a negative int is undefined behaviour */
+#else
+   return (a&0xFFFF) + ((unsigned)b<<16); /* shift as unsigned: << on a negative int is undefined behaviour */
+#endif
+}
+
+static av_always_inline uint16_t pack8to16(int a, int b){ /* pack the low 8 bits of a and b into 16 bits; a ends up in the lower-addressed byte when stored */
+#if HAVE_BIGENDIAN
+   return (b&0xFF) + ((unsigned)a<<8); /* shift as unsigned: << on a negative int is undefined behaviour */
+#else
+   return (a&0xFF) + ((unsigned)b<<8); /* shift as unsigned: << on a negative int is undefined behaviour */
+#endif
+}
+
+#endif /* AVUTIL_MEM_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/pixfmt.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/pixfmt.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,161 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PIXFMT_H
+#define AVUTIL_PIXFMT_H
+
+/**
+ * @file
+ * pixel format definitions
+ *
+ * @warning This file has to be considered an internal but installed
+ * header, so it should not be directly included in your projects.
+ */
+
+/**
+ * Pixel format. Notes:
+ *
+ * PIX_FMT_RGB32 is handled in an endian-specific manner. An RGBA
+ * color is put together as:
+ *  (A << 24) | (R << 16) | (G << 8) | B
+ * This is stored as BGRA on little-endian CPU architectures and ARGB on
+ * big-endian CPUs.
+ *
+ * When the pixel format is palettized RGB (PIX_FMT_PAL8), the palettized
+ * image data is stored in AVFrame.data[0]. The palette is transported in
+ * AVFrame.data[1], is 1024 bytes long (256 4-byte entries) and is
+ * formatted the same as in PIX_FMT_RGB32 described above (i.e., it is
+ * also endian-specific). Note also that the individual RGB palette
+ * components stored in AVFrame.data[1] should be in the range 0..255.
+ * This is important as many custom PAL8 video codecs that were designed
+ * to run on the IBM VGA graphics adapter use 6-bit palette components.
+ *
+ * For all the 8bit per pixel formats, an RGB32 palette is in data[1] like
+ * for pal8. This palette is filled in automatically by the function
+ * allocating the picture.
+ *
+ * Note, make sure that all newly added big endian formats have pix_fmt&1==1
+ *       and that all newly added little endian formats have pix_fmt&1==0
+ *       this allows simpler detection of big vs little endian.
+ */
+enum PixelFormat {
+    PIX_FMT_NONE= -1,  ///< explicit -1, so the first real format below gets the value 0
+    PIX_FMT_YUV420P,   ///< planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
+    PIX_FMT_YUYV422,   ///< packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
+    PIX_FMT_RGB24,     ///< packed RGB 8:8:8, 24bpp, RGBRGB...
+    PIX_FMT_BGR24,     ///< packed RGB 8:8:8, 24bpp, BGRBGR...
+    PIX_FMT_YUV422P,   ///< planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
+    PIX_FMT_YUV444P,   ///< planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples)
+    PIX_FMT_YUV410P,   ///< planar YUV 4:1:0,  9bpp, (1 Cr & Cb sample per 4x4 Y samples)
+    PIX_FMT_YUV411P,   ///< planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples)
+    PIX_FMT_GRAY8,     ///<        Y        ,  8bpp
+    PIX_FMT_MONOWHITE, ///<        Y        ,  1bpp, 0 is white, 1 is black
+    PIX_FMT_MONOBLACK, ///<        Y        ,  1bpp, 0 is black, 1 is white
+    PIX_FMT_PAL8,      ///< 8 bit with PIX_FMT_RGB32 palette
+    PIX_FMT_YUVJ420P,  ///< planar YUV 4:2:0, 12bpp, full scale (JPEG)
+    PIX_FMT_YUVJ422P,  ///< planar YUV 4:2:2, 16bpp, full scale (JPEG)
+    PIX_FMT_YUVJ444P,  ///< planar YUV 4:4:4, 24bpp, full scale (JPEG)
+    PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing
+    PIX_FMT_XVMC_MPEG2_IDCT, ///< NOTE(review): undocumented; presumably the IDCT-level XvMC counterpart of the entry above - confirm
+    PIX_FMT_UYVY422,   ///< packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
+    PIX_FMT_UYYVYY411, ///< packed YUV 4:1:1, 12bpp, Cb Y0 Y1 Cr Y2 Y3
+    PIX_FMT_BGR8,      ///< packed RGB 3:3:2,  8bpp, (msb)2B 3G 3R(lsb)
+    PIX_FMT_BGR4,      ///< packed RGB 1:2:1,  4bpp, (msb)1B 2G 1R(lsb)
+    PIX_FMT_BGR4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1B 2G 1R(lsb)
+    PIX_FMT_RGB8,      ///< packed RGB 3:3:2,  8bpp, (msb)2R 3G 3B(lsb)
+    PIX_FMT_RGB4,      ///< packed RGB 1:2:1,  4bpp, (msb)1R 2G 1B(lsb)
+    PIX_FMT_RGB4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1R 2G 1B(lsb)
+    PIX_FMT_NV12,      ///< planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 for UV
+    PIX_FMT_NV21,      ///< as above, but U and V bytes are swapped
+
+    PIX_FMT_ARGB,      ///< packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
+    PIX_FMT_RGBA,      ///< packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
+    PIX_FMT_ABGR,      ///< packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
+    PIX_FMT_BGRA,      ///< packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
+
+    PIX_FMT_GRAY16BE,  ///<        Y        , 16bpp, big-endian
+    PIX_FMT_GRAY16LE,  ///<        Y        , 16bpp, little-endian
+    PIX_FMT_YUV440P,   ///< planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples)
+    PIX_FMT_YUVJ440P,  ///< planar YUV 4:4:0 full scale (JPEG)
+    PIX_FMT_YUVA420P,  ///< planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples)
+    PIX_FMT_VDPAU_H264,///< H.264 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    PIX_FMT_VDPAU_MPEG1,///< MPEG-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    PIX_FMT_VDPAU_MPEG2,///< MPEG-2 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    PIX_FMT_VDPAU_WMV3,///< WMV3 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    PIX_FMT_VDPAU_VC1, ///< VC-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    PIX_FMT_RGB48BE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, big-endian
+    PIX_FMT_RGB48LE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, little-endian
+
+    PIX_FMT_RGB565BE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), big-endian
+    PIX_FMT_RGB565LE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), little-endian
+    PIX_FMT_RGB555BE,  ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), big-endian, most significant bit to 0
+    PIX_FMT_RGB555LE,  ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), little-endian, most significant bit to 0
+
+    PIX_FMT_BGR565BE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), big-endian
+    PIX_FMT_BGR565LE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), little-endian
+    PIX_FMT_BGR555BE,  ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), big-endian, most significant bit to 1
+    PIX_FMT_BGR555LE,  ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), little-endian, most significant bit to 1
+
+    PIX_FMT_VAAPI_MOCO, ///< HW acceleration through VA API at motion compensation entry-point, Picture.data[3] contains a vaapi_render_state struct which contains macroblocks as well as various fields extracted from headers
+    PIX_FMT_VAAPI_IDCT, ///< HW acceleration through VA API at IDCT entry-point, Picture.data[3] contains a vaapi_render_state struct which contains fields extracted from headers
+    PIX_FMT_VAAPI_VLD,  ///< HW decoding through VA API, Picture.data[3] contains a vaapi_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+
+    PIX_FMT_YUV420P16LE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
+    PIX_FMT_YUV420P16BE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
+    PIX_FMT_YUV422P16LE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
+    PIX_FMT_YUV422P16BE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+    PIX_FMT_YUV444P16LE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
+    PIX_FMT_YUV444P16BE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
+    PIX_FMT_VDPAU_MPEG4,  ///< MPEG4 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    PIX_FMT_DXVA2_VLD,    ///< HW decoding through DXVA2, Picture.data[3] contains a LPDIRECT3DSURFACE9 pointer
+
+    PIX_FMT_RGB444BE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), big-endian, most significant bits to 0
+    PIX_FMT_RGB444LE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), little-endian, most significant bits to 0
+    PIX_FMT_BGR444BE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), big-endian, most significant bits to 1
+    PIX_FMT_BGR444LE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), little-endian, most significant bits to 1
+    PIX_FMT_Y400A,     ///< 8bit gray, 8bit alpha
+    PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
+};
+
+#if HAVE_BIGENDIAN
+#   define PIX_FMT_NE(be, le) PIX_FMT_##be /* native-endian selector: expands to the first (big-endian) name */
+#else
+#   define PIX_FMT_NE(be, le) PIX_FMT_##le /* native-endian selector: expands to the second (little-endian) name */
+#endif
+
+#define PIX_FMT_RGB32   PIX_FMT_NE(ARGB, BGRA) /* ARGB on big-endian builds, BGRA on little-endian builds */
+#define PIX_FMT_RGB32_1 PIX_FMT_NE(RGBA, ABGR)
+#define PIX_FMT_BGR32   PIX_FMT_NE(ABGR, RGBA)
+#define PIX_FMT_BGR32_1 PIX_FMT_NE(BGRA, ARGB)
+
+#define PIX_FMT_GRAY16 PIX_FMT_NE(GRAY16BE, GRAY16LE) /* from here on: the BE or LE enumerator matching the build's endianness */
+#define PIX_FMT_RGB48  PIX_FMT_NE(RGB48BE,  RGB48LE)
+#define PIX_FMT_RGB565 PIX_FMT_NE(RGB565BE, RGB565LE)
+#define PIX_FMT_RGB555 PIX_FMT_NE(RGB555BE, RGB555LE)
+#define PIX_FMT_RGB444 PIX_FMT_NE(RGB444BE, RGB444LE)
+#define PIX_FMT_BGR565 PIX_FMT_NE(BGR565BE, BGR565LE)
+#define PIX_FMT_BGR555 PIX_FMT_NE(BGR555BE, BGR555LE)
+#define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE)
+
+#define PIX_FMT_YUV420P16 PIX_FMT_NE(YUV420P16BE, YUV420P16LE)
+#define PIX_FMT_YUV422P16 PIX_FMT_NE(YUV422P16BE, YUV422P16LE)
+#define PIX_FMT_YUV444P16 PIX_FMT_NE(YUV444P16BE, YUV444P16LE)
+
+#endif /* AVUTIL_PIXFMT_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/ppc/intreadwrite.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/ppc/intreadwrite.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PPC_INTREADWRITE_H
+#define AVUTIL_PPC_INTREADWRITE_H
+
+#include <stdint.h>
+#include "config.h"
+
+#if HAVE_XFORM_ASM
+
+#define AV_RL16 AV_RL16 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline uint16_t AV_RL16(const void *p) /* 16-bit load from p with the two bytes swapped */
+{
+    uint16_t v;
+    __asm__ ("lhbrx   %0, %y1" : "=r"(v) : "Z"(*(const uint16_t*)p)); /* lhbrx: load halfword byte-reversed */
+    return v;
+}
+
+#define AV_WL16 AV_WL16 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline void AV_WL16(void *p, uint16_t v) /* 16-bit store of v to p with the two bytes swapped */
+{
+    __asm__ ("sthbrx  %1, %y0" : "=Z"(*(uint16_t*)p) : "r"(v)); /* sthbrx: store halfword byte-reversed */
+}
+
+#define AV_RL32 AV_RL32 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline uint32_t AV_RL32(const void *p) /* 32-bit load from p with the byte order reversed */
+{
+    uint32_t v;
+    __asm__ ("lwbrx   %0, %y1" : "=r"(v) : "Z"(*(const uint32_t*)p)); /* lwbrx: load word byte-reversed */
+    return v;
+}
+
+#define AV_WL32 AV_WL32 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline void AV_WL32(void *p, uint32_t v) /* 32-bit store of v to p with the byte order reversed */
+{
+    __asm__ ("stwbrx  %1, %y0" : "=Z"(*(uint32_t*)p) : "r"(v)); /* stwbrx: store word byte-reversed */
+}
+
+#if HAVE_LDBRX
+
+#define AV_RL64 AV_RL64 /* HAVE_LDBRX variant: a single 64-bit byte-reversed load */
+static av_always_inline uint64_t AV_RL64(const void *p) /* 64-bit load from p with the byte order reversed */
+{
+    uint64_t v;
+    __asm__ ("ldbrx   %0, %y1" : "=r"(v) : "Z"(*(const uint64_t*)p)); /* ldbrx: load doubleword byte-reversed */
+    return v;
+}
+
+#define AV_WL64 AV_WL64 /* HAVE_LDBRX variant: a single 64-bit byte-reversed store */
+static av_always_inline void AV_WL64(void *p, uint64_t v) /* 64-bit store of v to p with the byte order reversed */
+{
+    __asm__ ("stdbrx  %1, %y0" : "=Z"(*(uint64_t*)p) : "r"(v)); /* stdbrx: store doubleword byte-reversed */
+}
+
+#else
+
+#define AV_RL64 AV_RL64 /* fallback without ldbrx: two 32-bit byte-reversed loads */
+static av_always_inline uint64_t AV_RL64(const void *p) /* assembles a 64-bit value from the two words at p */
+{
+    union { uint64_t v; uint32_t hl[2]; } v; /* hl[] gives access to the two 32-bit halves of v */
+    __asm__ ("lwbrx   %0, %y2  \n\t" /* word at p, byte-reversed, goes to hl[1] */
+             "lwbrx   %1, %y3  \n\t" /* word at p+4, byte-reversed, goes to hl[0] */
+             : "=&r"(v.hl[1]), "=r"(v.hl[0]) /* "&": the first output is written before the second load uses its address */
+             : "Z"(*(const uint32_t*)p), "Z"(*((const uint32_t*)p+1)));
+    return v.v; /* NOTE(review): the hl[] index order assumes hl[0] is the high half (big-endian host) - confirm */
+}
+
+#define AV_WL64 AV_WL64 /* fallback without stdbrx: two 32-bit byte-reversed stores */
+static av_always_inline void AV_WL64(void *p, uint64_t v) /* writes v to p as two byte-reversed words */
+{
+    union { uint64_t v; uint32_t hl[2]; } vv = { v }; /* hl[] gives access to the two 32-bit halves of v */
+    __asm__ ("stwbrx  %2, %y0  \n\t" /* hl[1], byte-reversed, goes to the word at p */
+             "stwbrx  %3, %y1  \n\t" /* hl[0], byte-reversed, goes to the word at p+4 */
+             : "=Z"(*(uint32_t*)p), "=Z"(*((uint32_t*)p+1))
+             : "r"(vv.hl[1]), "r"(vv.hl[0])); /* NOTE(review): index order assumes hl[0] is the high half (big-endian host) - confirm */
+}
+
+#endif /* HAVE_LDBRX */
+
+#endif /* HAVE_XFORM_ASM */
+
+/*
+ * GCC fails miserably on the packed struct version which is used by
+ * default, so we override it here.
+ */
+
+#define AV_RB64(p) (*(const uint64_t *)(p)) /* plain 64-bit load through a cast pointer, no byte swapping */
+#define AV_WB64(p, v) (*(uint64_t *)(p) = (v)) /* plain 64-bit store through a cast pointer, no byte swapping */
+
+#endif /* AVUTIL_PPC_INTREADWRITE_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/ppc/timer.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/ppc/timer.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2005 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PPC_TIMER_H
+#define AVUTIL_PPC_TIMER_H
+
+#include <stdint.h>
+
+#define AV_READ_TIME read_time
+
+static inline uint64_t read_time(void) /* returns the two time-base halves combined as (upper << 32) | lower */
+{
+    uint32_t tbu, tbl, temp;
+
+     /* from section 2.2.1 of the 32-bit PowerPC PEM */
+     __asm__ volatile(
+         "1:\n"
+         "mftbu  %2\n"        /* temp = upper half */
+         "mftb   %0\n"        /* tbl  = lower half */
+         "mftbu  %1\n"        /* tbu  = upper half, read a second time */
+         "cmpw   %2,%1\n"     /* compare the two upper-half reads */
+         "bne    1b\n"        /* they differ: the upper half changed mid-read, so retry */
+     : "=r"(tbl), "=r"(tbu), "=r"(temp)
+     :
+     : "cc");                 /* cmpw modifies the condition register */
+
+     return (((uint64_t)tbu)<<32) | (uint64_t)tbl;
+}
+
+#endif /* AVUTIL_PPC_TIMER_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/timer.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/timer.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,69 @@
+/**
+ * @file
+ * high precision timer, useful to profile code
+ *
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_TIMER_H
+#define AVUTIL_TIMER_H
+
+#include <stdlib.h>
+#include <stdint.h>
+#include "config.h"
+
+#if   ARCH_ARM
+#   include "arm/timer.h"
+#elif ARCH_PPC
+#   include "ppc/timer.h"
+#elif ARCH_X86
+#   include "x86/timer.h"
+#endif
+
+#if !defined(AV_READ_TIME) && HAVE_GETHRTIME
+#   define AV_READ_TIME gethrtime
+#endif
+
+#ifdef AV_READ_TIME /* a time source was selected above: real profiling macros */
+#define START_TIMER \
+uint64_t tend;\
+uint64_t tstart= AV_READ_TIME();\
+
+#define STOP_TIMER(id) \
+tend= AV_READ_TIME();\
+{\
+    static uint64_t tsum=0;\
+    static int tcount=0;\
+    static int tskip_count=0;\
+    if(tcount<2 || tend - tstart < 8*tsum/tcount || tend - tstart < 2000){ /* accept unless the sample is >= 8x the running mean and >= 2000 ticks */\
+        tsum+= tend - tstart;\
+        tcount++;\
+    }else\
+        tskip_count++; /* rejected sample: counted as a skip only */\
+    if(((tcount+tskip_count)&(tcount+tskip_count-1))==0){ /* report when the total number of calls is a power of two */\
+        av_log(NULL, AV_LOG_ERROR, "%"PRIu64" dezicycles in %s, %d runs, %d skips\n",\
+               tsum*10/tcount, id, tcount, tskip_count);\
+    }\
+}
+#else /* no time source: both macros compile to nothing */
+#define START_TIMER
+#define STOP_TIMER(id) {}
+#endif
+
+#endif /* AVUTIL_TIMER_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/x86/bswap.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/x86/bswap.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,61 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * byte swapping routines
+ */
+
+#ifndef AVUTIL_X86_BSWAP_H
+#define AVUTIL_X86_BSWAP_H
+
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/attributes.h"
+
+#define bswap_16 bswap_16 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline av_const uint16_t bswap_16(uint16_t x) /* returns x with its two bytes exchanged */
+{
+    __asm__("rorw $8, %0" : "+r"(x)); /* rotating a 16-bit value by 8 swaps its bytes */
+    return x;
+}
+
+#define bswap_32 bswap_32 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline av_const uint32_t bswap_32(uint32_t x) /* returns x with its four bytes in reverse order */
+{
+// #if HAVE_BSWAP  -- NOTE(review): this conditional is commented out, so bswap is used unconditionally
+    __asm__("bswap   %0" : "+r" (x));
+// #else
+//     __asm__("rorw    $8,  %w0 \n\t"
+//             "rorl    $16, %0  \n\t"
+//             "rorw    $8,  %w0"
+//             : "+r"(x));
+// #endif
+    return x;
+}
+
+#if ARCH_X86_64 /* the 64-bit override exists only for x86-64 builds */
+#define bswap_64 bswap_64
+static inline uint64_t av_const bswap_64(uint64_t x) /* returns x with its eight bytes in reverse order */
+{
+    __asm__("bswap  %0": "=r" (x) : "0" (x)); /* "0" ties the input to the output register, so x is swapped in place */
+    return x;
+}
+#endif
+
+#endif /* AVUTIL_X86_BSWAP_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/x86/intreadwrite.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/x86/intreadwrite.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_INTREADWRITE_H
+#define AVUTIL_X86_INTREADWRITE_H
+
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/attributes.h"
+
+#if HAVE_MMX
+
+#if defined(__MMX__)
+
+#define AV_COPY64 AV_COPY64 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline void AV_COPY64(void *d, const void *s) /* copies 8 bytes from s to d through mm0 */
+{
+    __asm__("movq   %1, %%mm0  \n\t" /* load the 8 bytes at s */
+            "movq   %%mm0, %0  \n\t" /* store them at d */
+            : "=m"(*(uint64_t*)d)
+            : "m" (*(const uint64_t*)s)
+            : "mm0"); /* NOTE(review): no emms is issued here - callers presumably handle MMX/x87 state; confirm */
+}
+
+#define AV_SWAP64 AV_SWAP64 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline void AV_SWAP64(void *a, void *b) /* exchanges the 8 bytes at a with the 8 bytes at b */
+{
+    __asm__("movq   %1, %%mm0  \n\t" /* mm0 = *b */
+            "movq   %0, %%mm1  \n\t" /* mm1 = *a */
+            "movq   %%mm0, %0  \n\t" /* *a = old *b */
+            "movq   %%mm1, %1  \n\t" /* *b = old *a */
+            : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b)
+            ::"mm0", "mm1"); /* NOTE(review): no emms is issued here - confirm callers handle MMX/x87 state */
+}
+
+#define AV_ZERO64 AV_ZERO64 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline void AV_ZERO64(void *d) /* writes 8 zero bytes to d */
+{
+    __asm__("pxor %%mm0, %%mm0  \n\t" /* xor of a register with itself gives zero */
+            "movq %%mm0, %0     \n\t" /* store the zeroed register at d */
+            : "=m"(*(uint64_t*)d)
+            :: "mm0"); /* NOTE(review): no emms is issued here - confirm callers handle MMX/x87 state */
+}
+
+#endif /* defined(__MMX__) */
+
+#ifdef __SSE__
+
+#define AV_COPY128 AV_COPY128 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline void AV_COPY128(void *d, const void *s) /* copies 16 bytes from s to d through xmm0 */
+{
+    struct v {uint64_t v[2];}; /* 16-byte type so the memory operands cover the whole transfer */
+
+    __asm__("movaps   %1, %%xmm0  \n\t" /* aligned 16-byte load: s must be 16-byte aligned */
+            "movaps   %%xmm0, %0  \n\t" /* aligned 16-byte store: d must be 16-byte aligned */
+            : "=m"(*(struct v*)d)
+            : "m" (*(const struct v*)s)
+            : "xmm0");
+}
+
+#endif /* __SSE__ */
+
+#ifdef __SSE2__
+
+#define AV_ZERO128 AV_ZERO128 /* defined as itself so the preprocessor can tell this override exists */
+static av_always_inline void AV_ZERO128(void *d) /* writes 16 zero bytes to d */
+{
+    struct v {uint64_t v[2];}; /* 16-byte type so the memory operand covers the whole store */
+
+    __asm__("pxor %%xmm0, %%xmm0  \n\t" /* xor of a register with itself gives zero */
+            "movdqa   %%xmm0, %0  \n\t" /* aligned 16-byte store: d must be 16-byte aligned */
+            : "=m"(*(struct v*)d)
+            :: "xmm0");
+}
+
+#endif /* __SSE2__ */
+
+#endif /* HAVE_MMX */
+
+#endif /* AVUTIL_X86_INTREADWRITE_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/x86/timer.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/x86/timer.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,35 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_TIMER_H
+#define AVUTIL_X86_TIMER_H
+
+#include <stdint.h>
+
+#define AV_READ_TIME read_time
+
+static inline uint64_t read_time(void) /* returns the 64-bit time-stamp counter value */
+{
+    uint32_t a, d; /* rdtsc delivers the low half in eax ("=a") and the high half in edx ("=d") */
+    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
+    return ((uint64_t)d << 32) + a; /* recombine the two halves */
+}
+
+#endif /* AVUTIL_X86_TIMER_H */
diff -r 11d15c47beaf -r 897f711a7157 libavutil/x86_cpu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavutil/x86_cpu.h	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,73 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_CPU_H
+#define AVUTIL_X86_CPU_H
+
+#include <stdint.h>
+#include "config.h"
+
+#if ARCH_X86_64 /* 64-bit build: names of the 64-bit general-purpose registers */
+#    define REG_a "rax"
+#    define REG_b "rbx"
+#    define REG_c "rcx"
+#    define REG_d "rdx"
+#    define REG_D "rdi"
+#    define REG_S "rsi"
+#    define PTR_SIZE "8" /* pointer size in bytes, as a string */
+typedef int64_t x86_reg; /* signed integer as wide as the registers named here */
+
+#    define REG_SP "rsp"
+#    define REG_BP "rbp"
+#    define REGBP   rbp /* REGxx: the same names as bare tokens instead of string literals */
+#    define REGa    rax
+#    define REGb    rbx
+#    define REGc    rcx
+#    define REGd    rdx
+#    define REGSP   rsp
+
+#elif ARCH_X86_32 /* 32-bit build: same macro set with the 32-bit register names */
+
+#    define REG_a "eax"
+#    define REG_b "ebx"
+#    define REG_c "ecx"
+#    define REG_d "edx"
+#    define REG_D "edi"
+#    define REG_S "esi"
+#    define PTR_SIZE "4" /* pointer size in bytes, as a string */
+typedef int32_t x86_reg; /* signed integer as wide as the registers named here */
+
+#    define REG_SP "esp"
+#    define REG_BP "ebp"
+#    define REGBP   ebp /* REGxx: the same names as bare tokens instead of string literals */
+#    define REGa    eax
+#    define REGb    ebx
+#    define REGc    ecx
+#    define REGd    edx
+#    define REGSP   esp
+#else /* neither x86 variant configured: only the typedef exists, no register-name macros */
+typedef int x86_reg;
+#endif
+
+// #if ARCH_X86_64 && defined(PIC)
+// #    define BROKEN_RELOCATIONS 1
+// #endif
+
+#endif /* AVUTIL_X86_CPU_H */